path: root/media
author     Job Bautista <jobbautista9@protonmail.com>  2022-12-22 12:55:25 +0800
committer  Job Bautista <jobbautista9@protonmail.com>  2022-12-22 12:55:25 +0800
commit     ada110f57919f20740557929fb782352900e0af7 (patch)
tree       b49ee6c17d6c139a21173c7e0cbbb6f61219c42d /media
parent     1356ecde966f1565f2beee191a32eab835a3794e (diff)
download   uxp-ada110f57919f20740557929fb782352900e0af7.tar.gz
Issue #2061 - Part 3: Update highway source to 1.0.2.
Diffstat (limited to 'media')
-rw-r--r--  media/highway/src/BUILD | 8
-rw-r--r--  media/highway/src/CMakeLists.txt | 128
-rw-r--r--  media/highway/src/README.md | 47
-rw-r--r--  media/highway/src/debian/changelog | 42
-rw-r--r--  media/highway/src/hwy/aligned_allocator_test.cc | 3
-rw-r--r--  media/highway/src/hwy/base.h | 179
-rw-r--r--  media/highway/src/hwy/base_test.cc | 123
-rw-r--r--  media/highway/src/hwy/cache_control.h | 10
-rw-r--r--  media/highway/src/hwy/contrib/algo/copy-inl.h | 2
-rw-r--r--  media/highway/src/hwy/contrib/algo/copy_test.cc | 2
-rw-r--r--  media/highway/src/hwy/contrib/algo/find_test.cc | 2
-rw-r--r--  media/highway/src/hwy/contrib/algo/transform_test.cc | 6
-rw-r--r--  media/highway/src/hwy/contrib/dot/dot_test.cc | 2
-rw-r--r--  media/highway/src/hwy/contrib/image/image.cc | 2
-rw-r--r--  media/highway/src/hwy/contrib/image/image.h | 21
-rw-r--r--  media/highway/src/hwy/contrib/image/image_test.cc | 14
-rw-r--r--  media/highway/src/hwy/contrib/math/math_test.cc | 6
-rw-r--r--  media/highway/src/hwy/contrib/sort/BUILD | 90
-rw-r--r--  media/highway/src/hwy/contrib/sort/README.md | 87
-rw-r--r--  media/highway/src/hwy/contrib/sort/algo-inl.h | 263
-rw-r--r--  media/highway/src/hwy/contrib/sort/bench_parallel.cc | 42
-rw-r--r--  media/highway/src/hwy/contrib/sort/bench_sort.cc | 193
-rw-r--r--  media/highway/src/hwy/contrib/sort/disabled_targets.h | 31
-rw-r--r--  media/highway/src/hwy/contrib/sort/result-inl.h | 81
-rw-r--r--  media/highway/src/hwy/contrib/sort/shared-inl.h | 15
-rw-r--r--  media/highway/src/hwy/contrib/sort/sort_test.cc | 439
-rw-r--r--  media/highway/src/hwy/contrib/sort/sorting_networks-inl.h | 16
-rw-r--r--  media/highway/src/hwy/contrib/sort/traits-inl.h | 254
-rw-r--r--  media/highway/src/hwy/contrib/sort/traits128-inl.h | 293
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort-inl.h | 1276
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort.cc | 10
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort.h | 24
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_128a.cc | 10
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_128d.cc | 10
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_f32a.cc | 5
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_f32d.cc | 5
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_f64a.cc | 5
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_f64d.cc | 5
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_i16a.cc | 10
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_i16d.cc | 10
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_i32a.cc | 5
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_i32d.cc | 5
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_i64a.cc | 5
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_i64d.cc | 5
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_kv128a.cc | 65
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_kv128d.cc | 65
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_kv64a.cc | 65
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_kv64d.cc | 65
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_u16a.cc | 10
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_u16d.cc | 11
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_u32a.cc | 5
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_u32d.cc | 6
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_u64a.cc | 5
-rw-r--r--  media/highway/src/hwy/contrib/sort/vqsort_u64d.cc | 6
-rw-r--r--  media/highway/src/hwy/detect_compiler_arch.h | 79
-rw-r--r--  media/highway/src/hwy/detect_targets.h | 233
-rw-r--r--  media/highway/src/hwy/examples/benchmark.cc | 18
-rw-r--r--  media/highway/src/hwy/examples/skeleton-inl.h | 19
-rw-r--r--  media/highway/src/hwy/examples/skeleton.cc | 33
-rw-r--r--  media/highway/src/hwy/examples/skeleton_test.cc | 12
-rw-r--r--  media/highway/src/hwy/foreach_target.h | 26
-rw-r--r--  media/highway/src/hwy/highway.h | 146
-rw-r--r--  media/highway/src/hwy/highway_test.cc | 5
-rw-r--r--  media/highway/src/hwy/nanobenchmark.cc | 21
-rw-r--r--  media/highway/src/hwy/nanobenchmark_test.cc | 5
-rw-r--r--  media/highway/src/hwy/ops/arm_neon-inl.h | 901
-rw-r--r--  media/highway/src/hwy/ops/arm_sve-inl.h | 1054
-rw-r--r--  media/highway/src/hwy/ops/emu128-inl.h | 529
-rw-r--r--  media/highway/src/hwy/ops/generic_ops-inl.h | 127
-rw-r--r--  media/highway/src/hwy/ops/rvv-inl.h | 510
-rw-r--r--  media/highway/src/hwy/ops/scalar-inl.h | 172
-rw-r--r--  media/highway/src/hwy/ops/set_macros-inl.h | 41
-rw-r--r--  media/highway/src/hwy/ops/shared-inl.h | 17
-rw-r--r--  media/highway/src/hwy/ops/wasm_128-inl.h | 618
-rw-r--r--  media/highway/src/hwy/ops/wasm_256-inl.h | 143
-rw-r--r--  media/highway/src/hwy/ops/x86_128-inl.h | 996
-rw-r--r--  media/highway/src/hwy/ops/x86_256-inl.h | 849
-rw-r--r--  media/highway/src/hwy/ops/x86_512-inl.h | 394
-rw-r--r--  media/highway/src/hwy/per_target.cc | 50
-rw-r--r--  media/highway/src/hwy/per_target.h | 37
-rw-r--r--  media/highway/src/hwy/print-inl.h | 28
-rw-r--r--  media/highway/src/hwy/print.cc | 30
-rw-r--r--  media/highway/src/hwy/print.h | 15
-rw-r--r--  media/highway/src/hwy/targets.cc | 251
-rw-r--r--  media/highway/src/hwy/targets.h | 227
-rw-r--r--  media/highway/src/hwy/targets_test.cc | 39
-rw-r--r--  media/highway/src/hwy/tests/arithmetic_test.cc | 225
-rw-r--r--  media/highway/src/hwy/tests/blockwise_shift_test.cc | 25
-rw-r--r--  media/highway/src/hwy/tests/blockwise_test.cc | 61
-rw-r--r--  media/highway/src/hwy/tests/combine_test.cc | 12
-rw-r--r--  media/highway/src/hwy/tests/compare_test.cc | 233
-rw-r--r--  media/highway/src/hwy/tests/compress_test.cc | 467
-rw-r--r--  media/highway/src/hwy/tests/convert_test.cc | 121
-rw-r--r--  media/highway/src/hwy/tests/crypto_test.cc | 6
-rw-r--r--  media/highway/src/hwy/tests/demote_test.cc | 18
-rw-r--r--  media/highway/src/hwy/tests/float_test.cc | 10
-rw-r--r--  media/highway/src/hwy/tests/hwy_gtest.h | 16
-rw-r--r--  media/highway/src/hwy/tests/if_test.cc | 175
-rw-r--r--  media/highway/src/hwy/tests/interleaved_test.cc | 256
-rw-r--r--  media/highway/src/hwy/tests/list_targets.cc | 43
-rw-r--r--  media/highway/src/hwy/tests/logical_test.cc | 87
-rw-r--r--  media/highway/src/hwy/tests/mask_mem_test.cc | 197
-rw-r--r--  media/highway/src/hwy/tests/mask_test.cc | 227
-rw-r--r--  media/highway/src/hwy/tests/memory_test.cc | 205
-rw-r--r--  media/highway/src/hwy/tests/mul_test.cc | 96
-rw-r--r--  media/highway/src/hwy/tests/reduction_test.cc | 227
-rw-r--r--  media/highway/src/hwy/tests/reverse_test.cc | 2
-rw-r--r--  media/highway/src/hwy/tests/shift_test.cc | 14
-rw-r--r--  media/highway/src/hwy/tests/swizzle_test.cc | 2
-rw-r--r--  media/highway/src/hwy/tests/test_util-inl.h | 13
-rw-r--r--  media/highway/src/hwy/tests/test_util.cc | 11
-rw-r--r--  media/highway/src/hwy/tests/test_util.h | 4
-rw-r--r--  media/highway/src/hwy/tests/test_util_test.cc | 2
-rw-r--r--  media/highway/src/libhwy-test.pc.in | 1
-rwxr-xr-x  media/highway/src/run_tests.sh | 6
115 files changed, 10773 insertions, 3708 deletions
diff --git a/media/highway/src/BUILD b/media/highway/src/BUILD
index e41966c8ba..1928c32759 100644
--- a/media/highway/src/BUILD
+++ b/media/highway/src/BUILD
@@ -141,6 +141,7 @@ cc_library(
name = "hwy",
srcs = [
"hwy/aligned_allocator.cc",
+ "hwy/per_target.cc",
"hwy/print.cc",
"hwy/targets.cc",
],
@@ -160,9 +161,12 @@ cc_library(
# These are textual because config macros influence them:
"hwy/detect_targets.h", # private
"hwy/targets.h",
+ # This .cc file #includes itself through foreach_target.h
+ "hwy/per_target.cc",
# End of list
"hwy/highway.h", # public
"hwy/foreach_target.h", # public
+ "hwy/per_target.h", # public
"hwy/print-inl.h", # public
"hwy/highway_export.h", # public
"hwy/ops/arm_neon-inl.h",
@@ -321,10 +325,14 @@ HWY_TESTS = [
("hwy/tests/", "crypto_test"),
("hwy/tests/", "demote_test"),
("hwy/tests/", "float_test"),
+ ("hwy/tests/", "if_test"),
+ ("hwy/tests/", "interleaved_test"),
("hwy/tests/", "logical_test"),
("hwy/tests/", "mask_test"),
+ ("hwy/tests/", "mask_mem_test"),
("hwy/tests/", "memory_test"),
("hwy/tests/", "mul_test"),
+ ("hwy/tests/", "reduction_test"),
("hwy/tests/", "reverse_test"),
("hwy/tests/", "shift_test"),
("hwy/tests/", "swizzle_test"),
diff --git a/media/highway/src/CMakeLists.txt b/media/highway/src/CMakeLists.txt
index 520e4af415..b6b14ab833 100644
--- a/media/highway/src/CMakeLists.txt
+++ b/media/highway/src/CMakeLists.txt
@@ -19,7 +19,13 @@ if(POLICY CMP0083)
cmake_policy(SET CMP0083 NEW)
endif()
-project(hwy VERSION 0.17.0) # Keep in sync with highway.h version
+# Workaround for 3.19 raising error 'IMPORTED_LOCATION not set for imported
+# target "GTest::gtest_main"'.
+if(POLICY CMP0111)
+ cmake_policy(SET CMP0111 OLD)
+endif()
+
+project(hwy VERSION 1.0.2) # Keep in sync with highway.h version
# Directly define the ABI version from the cmake project() version values:
set(LIBRARY_VERSION "${hwy_VERSION}")
@@ -27,6 +33,10 @@ set(LIBRARY_SOVERSION ${hwy_VERSION_MAJOR})
set(CMAKE_CXX_EXTENSIONS OFF)
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+# Search for Atomics implementation:
+find_package(Atomics REQUIRED)
+
# Enabled PIE binaries by default if supported.
include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED)
if(CHECK_PIE_SUPPORTED)
@@ -51,6 +61,7 @@ set(HWY_WARNINGS_ARE_ERRORS OFF CACHE BOOL "Add -Werror flag?")
set(HWY_ENABLE_CONTRIB ON CACHE BOOL "Include contrib/")
set(HWY_ENABLE_EXAMPLES ON CACHE BOOL "Build examples")
set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install library")
+set(HWY_ENABLE_TESTS ON CACHE BOOL "Enable HWY tests")
include(CheckCXXSourceCompiles)
check_cxx_source_compiles(
@@ -63,13 +74,25 @@ check_cxx_source_compiles(
HWY_EMSCRIPTEN
)
+check_cxx_source_compiles(
+ "int main() {
+ #if !defined(__riscv)
+ static_assert(false, \"__riscv is not defined\");
+ #endif
+ return 0;
+ }"
+ HWY_RISCV
+)
+
if (HWY_ENABLE_CONTRIB)
-set(HWY_CONTRIB_SOURCES
+# Glob all the traits so we don't need to modify this file when adding
+# additional special cases.
+file(GLOB HWY_CONTRIB_SOURCES "hwy/contrib/sort/vqsort_*.cc")
+list(APPEND HWY_CONTRIB_SOURCES
hwy/contrib/dot/dot-inl.h
hwy/contrib/image/image.cc
hwy/contrib/image/image.h
hwy/contrib/math/math-inl.h
- hwy/contrib/sort/disabled_targets.h
hwy/contrib/sort/shared-inl.h
hwy/contrib/sort/sorting_networks-inl.h
hwy/contrib/sort/traits-inl.h
@@ -77,24 +100,9 @@ set(HWY_CONTRIB_SOURCES
hwy/contrib/sort/vqsort-inl.h
hwy/contrib/sort/vqsort.cc
hwy/contrib/sort/vqsort.h
- hwy/contrib/sort/vqsort_128a.cc
- hwy/contrib/sort/vqsort_128d.cc
- hwy/contrib/sort/vqsort_f32a.cc
- hwy/contrib/sort/vqsort_f32d.cc
- hwy/contrib/sort/vqsort_f64a.cc
- hwy/contrib/sort/vqsort_f64d.cc
- hwy/contrib/sort/vqsort_i16a.cc
- hwy/contrib/sort/vqsort_i16d.cc
- hwy/contrib/sort/vqsort_i32a.cc
- hwy/contrib/sort/vqsort_i32d.cc
- hwy/contrib/sort/vqsort_i64a.cc
- hwy/contrib/sort/vqsort_i64d.cc
- hwy/contrib/sort/vqsort_u16a.cc
- hwy/contrib/sort/vqsort_u16d.cc
- hwy/contrib/sort/vqsort_u32a.cc
- hwy/contrib/sort/vqsort_u32d.cc
- hwy/contrib/sort/vqsort_u64a.cc
- hwy/contrib/sort/vqsort_u64d.cc
+ hwy/contrib/algo/copy-inl.h
+ hwy/contrib/algo/find-inl.h
+ hwy/contrib/algo/transform-inl.h
)
endif() # HWY_ENABLE_CONTRIB
@@ -114,6 +122,7 @@ set(HWY_SOURCES
hwy/ops/arm_sve-inl.h
hwy/ops/emu128-inl.h
hwy/ops/generic_ops-inl.h
+ hwy/ops/rvv-inl.h
hwy/ops/scalar-inl.h
hwy/ops/set_macros-inl.h
hwy/ops/shared-inl.h
@@ -121,6 +130,8 @@ set(HWY_SOURCES
hwy/ops/x86_128-inl.h
hwy/ops/x86_256-inl.h
hwy/ops/x86_512-inl.h
+ hwy/per_target.cc
+ hwy/per_target.h
hwy/print-inl.h
hwy/print.cc
hwy/print.h
@@ -225,10 +236,26 @@ else()
)
endif() # HWY_CMAKE_ARM7
+ if(HWY_RISCV)
+ if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+ # Not yet supported by GCC. When runtime dispatch is supported and
+ # implemented, we will remove v from the required flags. Until then, using
+ # clang for RISC-V will require the CPU to support the V extension (1.0).
+ list(APPEND HWY_FLAGS -march=rv64gcv1p0)
+ list(APPEND HWY_FLAGS -menable-experimental-extensions)
+ endif()
+ endif()
+
if (HWY_WARNINGS_ARE_ERRORS)
list(APPEND HWY_FLAGS -Werror)
endif()
+ # Prevent "wasm-ld: error: --shared-memory is disallowed by targets.cc.o
+ # because it was not compiled with 'atomics' or 'bulk-memory' features."
+ if (HWY_EMSCRIPTEN)
+ list(APPEND HWY_FLAGS -matomics)
+ endif()
+
endif() # !MSVC
# By default prefer STATIC build (legacy behavior)
@@ -265,12 +292,29 @@ target_include_directories(hwy PUBLIC
target_compile_features(hwy PUBLIC cxx_std_11)
set_target_properties(hwy PROPERTIES
LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
-# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
+# For GCC __atomic_store_8, see #887
+target_link_libraries(hwy PRIVATE ${ATOMICS_LIBRARIES})
if(UNIX AND NOT APPLE)
+ # not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
set_property(TARGET hwy APPEND_STRING PROPERTY
LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
endif()
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown")
+ # uname -p is broken on this system. Try uname -m
+ EXECUTE_PROCESS( COMMAND uname -m
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ ERROR_QUIET
+ OUTPUT_VARIABLE HWY_ARCH)
+else (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown")
+ set(HWY_ARCH ${CMAKE_SYSTEM_PROCESSOR})
+endif (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown")
+message(STATUS "Architecture: " ${HWY_ARCH})
+if (HWY_ARCH MATCHES "mips")
+ target_link_options(hwy PUBLIC "LINKER:-z,noexecstack")
+endif (HWY_ARCH MATCHES "mips")
+
+
if (HWY_ENABLE_CONTRIB)
add_library(hwy_contrib ${HWY_LIBRARY_TYPE} ${HWY_CONTRIB_SOURCES})
target_link_libraries(hwy_contrib hwy)
@@ -281,6 +325,13 @@ target_include_directories(hwy_contrib PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
target_compile_features(hwy_contrib PUBLIC cxx_std_11)
+set_target_properties(hwy_contrib PROPERTIES
+ LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
+# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
+if(UNIX AND NOT APPLE)
+ set_property(TARGET hwy_contrib APPEND_STRING PROPERTY
+ LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
+endif()
endif() # HWY_ENABLE_CONTRIB
add_library(hwy_test ${HWY_LIBRARY_TYPE} ${HWY_TEST_SOURCES})
@@ -292,6 +343,13 @@ target_include_directories(hwy_test PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
target_compile_features(hwy_test PUBLIC cxx_std_11)
+set_target_properties(hwy_test PROPERTIES
+ LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
+# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
+if(UNIX AND NOT APPLE)
+ set_property(TARGET hwy_test APPEND_STRING PROPERTY
+ LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
+endif()
# -------------------------------------------------------- hwy_list_targets
# Generate a tool to print the compiled-in targets as defined by the current
@@ -384,8 +442,8 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
add_executable(hwy_benchmark hwy/examples/benchmark.cc)
target_sources(hwy_benchmark PRIVATE
hwy/nanobenchmark.h)
-# Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to
-# observe the difference in targets printed.
+# Try adding one of -DHWY_COMPILE_ONLY_SCALAR, -DHWY_COMPILE_ONLY_EMU128 or
+# -DHWY_COMPILE_ONLY_STATIC to observe the difference in targets printed.
target_compile_options(hwy_benchmark PRIVATE ${HWY_FLAGS})
target_link_libraries(hwy_benchmark hwy)
set_target_properties(hwy_benchmark
@@ -396,7 +454,7 @@ endif() # HWY_ENABLE_EXAMPLES
include(CTest)
-if(BUILD_TESTING)
+if(BUILD_TESTING AND HWY_ENABLE_TESTS)
enable_testing()
include(GoogleTest)
@@ -428,13 +486,6 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
${CMAKE_CURRENT_BINARY_DIR}/googletest-build
EXCLUDE_FROM_ALL)
-
-# The gtest/gtest_main targets carry header search path
-# dependencies automatically when using CMake 2.8.11 or
-# later. Otherwise we have to add them here ourselves.
-if (CMAKE_VERSION VERSION_LESS 2.8.11)
- include_directories("${gtest_SOURCE_DIR}/include")
-endif()
endif() # HWY_SYSTEM_GTEST
set(HWY_TEST_FILES
@@ -444,6 +495,7 @@ set(HWY_TEST_FILES
hwy/aligned_allocator_test.cc
hwy/base_test.cc
hwy/highway_test.cc
+ hwy/nanobenchmark_test.cc
hwy/targets_test.cc
hwy/examples/skeleton_test.cc
hwy/tests/arithmetic_test.cc
@@ -451,14 +503,20 @@ set(HWY_TEST_FILES
hwy/tests/blockwise_shift_test.cc
hwy/tests/combine_test.cc
hwy/tests/compare_test.cc
+ hwy/tests/compress_test.cc
hwy/tests/convert_test.cc
hwy/tests/crypto_test.cc
hwy/tests/demote_test.cc
hwy/tests/float_test.cc
+ hwy/tests/if_test.cc
+ hwy/tests/interleaved_test.cc
hwy/tests/logical_test.cc
hwy/tests/mask_test.cc
+ hwy/tests/mask_mem_test.cc
hwy/tests/memory_test.cc
hwy/tests/mul_test.cc
+ hwy/tests/reduction_test.cc
+ hwy/tests/reverse_test.cc
hwy/tests/shift_test.cc
hwy/tests/swizzle_test.cc
hwy/tests/test_util_test.cc
@@ -480,7 +538,11 @@ list(APPEND HWY_TEST_FILES
endif() # HWY_ENABLE_CONTRIB
if(HWY_SYSTEM_GTEST)
- set(HWY_GTEST_LIBS GTest::GTest GTest::Main)
+ if (CMAKE_VERSION VERSION_LESS 3.20)
+ set(HWY_GTEST_LIBS GTest::GTest GTest::Main)
+ else()
+ set(HWY_GTEST_LIBS GTest::gtest GTest::gtest_main)
+ endif()
else()
set(HWY_GTEST_LIBS gtest gtest_main)
endif()
diff --git a/media/highway/src/README.md b/media/highway/src/README.md
index 1383b27c9f..969f32950c 100644
--- a/media/highway/src/README.md
+++ b/media/highway/src/README.md
@@ -55,7 +55,8 @@ layouts, and aligned/padded allocations.
Online demos using Compiler Explorer:
-- [generating code for multiple targets](https://gcc.godbolt.org/z/n6rx6xK5h) (recommended)
+- [multiple targets with dynamic dispatch](https://gcc.godbolt.org/z/zP7MYe9Yf)
+ (recommended)
- [single target using -m flags](https://gcc.godbolt.org/z/rGnjMevKG)
Projects using Highway: (to add yours, feel free to raise an issue or contact us
@@ -83,19 +84,19 @@ incrementing MINOR after backward-compatible additions and PATCH after
backward-compatible fixes. We recommend using releases (rather than the Git tip)
because they are tested more extensively, see below.
-Version 0.11 is considered stable enough to use in other projects.
-Version 1.0 will signal an increased focus on backwards compatibility and is
-planned for 2022H1 now that all targets are feature-complete.
+The current version 1.0 signals an increased focus on backwards compatibility.
+Applications using documented functionality will remain compatible with future
+updates that have the same major version number.
### Testing
Continuous integration tests build with a recent version of Clang (running on
-native x86, Spike for RVV, and QEMU for ARM) and MSVC from VS2015 (running on
-native x86).
+native x86, or QEMU for RVV and ARM) and MSVC 2019 (v19.28, running on native
+x86).
-Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via
-GCC cross-compile and QEMU. See the
-[testing process](g3doc/release_testing_process.md) for details.
+Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via GCC
+cross-compile. See the [testing process](g3doc/release_testing_process.md) for
+details.
### Related modules
@@ -142,6 +143,9 @@ A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations
and their parameters, and the [instruction_matrix](g3doc/instruction_matrix.pdf)
indicates the number of instructions per operation.
+The [FAQ](g3doc/faq.md) answers questions about portability, API design and
+where to find more information.
+
We recommend using full SIMD vectors whenever possible for maximum performance
portability. To obtain them, pass a `ScalableTag<float>` (or equivalently
`HWY_FULL(float)`) tag to functions such as `Zero/Set/Load`. There are two
@@ -163,8 +167,8 @@ Due to ADL restrictions, user code calling Highway ops must either:
hn::Add()`; or
* add using-declarations for each op used: `using hwy::HWY_NAMESPACE::Add;`.
-Additionally, each function that calls Highway ops must either be prefixed with
-`HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and
+Additionally, each function that calls Highway ops (such as `Load`) must either
+be prefixed with `HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and
`HWY_AFTER_NAMESPACE()`. Lambda functions currently require `HWY_ATTR` before
their opening brace.
@@ -186,6 +190,27 @@ they use static or dynamic dispatch.
[quick-reference](g3doc/quick_reference.md)) if `HWY_TARGET_INCLUDE` is
defined and `foreach_target.h` is included.
+When using dynamic dispatch, `foreach_target.h` is included from translation
+units (.cc files), not headers. Headers containing vector code shared between
+several translation units require a special include guard, for example the
+following taken from `examples/skeleton-inl.h`:
+
+```
+#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#else
+#define HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#endif
+
+#include "hwy/highway.h"
+// Your vector code
+#endif
+```
+
+By convention, we name such headers `-inl.h` because their contents (often
+function templates) are usually inlined.
+
## Compiler flags
Applications should be compiled with optimizations enabled - without inlining,
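
The README's description of dynamic dispatch (defining `HWY_TARGET_INCLUDE`, including `foreach_target.h` from a .cc file, and guarding shared `-inl.h` headers) is easier to follow with a complete translation unit in view. The sketch below is modeled on the `examples/skeleton.cc` layout the README references; the file name `skeleton_sketch.cc`, the function `MulAddLoop`, and the assumption that `count` is a multiple of the vector length are illustrative only, not part of this patch.

```
// skeleton_sketch.cc -- illustrative sketch, not part of this commit.
#include <stddef.h>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "skeleton_sketch.cc"  // this file is re-included per target
#include "hwy/foreach_target.h"                  // must come before highway.h
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace skeleton {
namespace HWY_NAMESPACE {  // expands to a unique name per target
namespace hn = hwy::HWY_NAMESPACE;

// out[i] += x[i] * mul, using full (scalable) vectors.
// Assumes count is a multiple of hn::Lanes(d), for brevity.
void MulAddLoop(const float* HWY_RESTRICT x, float mul, size_t count,
                float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> d;
  const auto vmul = hn::Set(d, mul);
  for (size_t i = 0; i < count; i += hn::Lanes(d)) {
    const auto v = hn::Load(d, x + i);
    hn::Store(hn::MulAdd(v, vmul, hn::Load(d, out + i)), d, out + i);
  }
}

}  // namespace HWY_NAMESPACE
}  // namespace skeleton
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace skeleton {
HWY_EXPORT(MulAddLoop);  // table of per-target implementations

void CallMulAddLoop(const float* x, float mul, size_t count, float* out) {
  HWY_DYNAMIC_DISPATCH(MulAddLoop)(x, mul, count, out);
}
}  // namespace skeleton
#endif  // HWY_ONCE
```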
diff --git a/media/highway/src/debian/changelog b/media/highway/src/debian/changelog
index 71f2d6e3da..36d0c1de03 100644
--- a/media/highway/src/debian/changelog
+++ b/media/highway/src/debian/changelog
@@ -1,3 +1,45 @@
+highway (1.0.2-1) UNRELEASED; urgency=medium
+
+* Add ExclusiveNeither, FindKnownFirstTrue, Ne128
+* Add 16-bit SumOfLanes/ReorderWidenMulAccumulate/ReorderDemote2To
+* Faster sort for low-entropy input, improved pivot selection
+* Add GN build system, Highway FAQ, k32v32 type to vqsort
+* CMake: Support find_package(GTest), add rvv-inl.h, add HWY_ENABLE_TESTS
+* Fix MIPS and C++20 build, Apple LLVM 10.3 detection, EMU128 AllTrue on RVV
+* Fix missing exec_prefix, RVV build, warnings, libatomic linking
+* Work around GCC 10.4 issue, disabled RDCYCLE, arm7 with vfpv3
+* Documentation/example improvements
+* Support static dispatch to SVE2_128 and SVE_256
+
+ -- Jan Wassenberg <janwas@google.com> Thu, 27 Oct 2022 17:00:00 +0200
+
+highway (1.0.1-1) UNRELEASED; urgency=medium
+
+* Add Eq128, i64 Mul, unsigned->float ConvertTo
+* Faster sort for few unique keys, more robust pivot selection
+* Fix: floating-point generator for sort tests, Min/MaxOfLanes for i16
+* Fix: avoid always_inline in debug, link atomic
+* GCC warnings: string.h, maybe-uninitialized, ignored-attributes
+* GCC warnings: preprocessor int overflow, spurious use-after-free/overflow
+* Doc: <=HWY_AVX3, Full32/64/128, how to use generic-inl
+
+ -- Jan Wassenberg <janwas@google.com> Tue, 23 Aug 2022 10:00:00 +0200
+
+highway (1.0.0-1) UNRELEASED; urgency=medium
+
+* ABI change: 64-bit target values, more room for expansion
+* Add CompressBlocksNot, CompressNot, Lt128Upper, Min/Max128Upper, TruncateTo
+* Add HWY_SVE2_128 target
+* Sort speedups especially for 128-bit
+* Documentation clarifications
+* Faster NEON CountTrue/FindFirstTrue/AllFalse/AllTrue
+* Improved SVE codegen
+* Fix u16x8 ConcatEven/Odd, SSSE3 i64 Lt
+* MSVC 2017 workarounds
+* Support for runtime dispatch on Arm/GCC/Linux
+
+ -- Jan Wassenberg <janwas@google.com> Wed, 27 Jul 2022 10:00:00 +0200
+
highway (0.17.0-1) UNRELEASED; urgency=medium
* Add ExtractLane, InsertLane, IsInf, IsFinite, IsNaN
diff --git a/media/highway/src/hwy/aligned_allocator_test.cc b/media/highway/src/hwy/aligned_allocator_test.cc
index 4654eaacf9..ced08e7bda 100644
--- a/media/highway/src/hwy/aligned_allocator_test.cc
+++ b/media/highway/src/hwy/aligned_allocator_test.cc
@@ -23,7 +23,6 @@
#include <vector>
#include "gtest/gtest.h"
-#include "hwy/base.h"
namespace {
@@ -70,8 +69,8 @@ class FakeAllocator {
void Free(void* memory) {
if (!memory) return;
EXPECT_NE(allocs_.end(), allocs_.find(memory));
- free(memory);
allocs_.erase(memory);
+ free(memory);
}
std::set<void*> allocs_;
diff --git a/media/highway/src/hwy/base.h b/media/highway/src/hwy/base.h
index 0e2465cefb..0a4491eb71 100644
--- a/media/highway/src/hwy/base.h
+++ b/media/highway/src/hwy/base.h
@@ -24,6 +24,9 @@
#include "hwy/detect_compiler_arch.h"
#include "hwy/highway_export.h"
+#if HWY_COMPILER_MSVC
+#include <string.h> // memcpy
+#endif
#if HWY_ARCH_X86
#include <atomic>
#endif
@@ -59,7 +62,13 @@
#else
#define HWY_RESTRICT __restrict__
+// force inlining without optimization enabled creates very inefficient code
+// that can cause compiler timeout
+#ifdef __OPTIMIZE__
#define HWY_INLINE inline __attribute__((always_inline))
+#else
+#define HWY_INLINE inline
+#endif
#define HWY_NOINLINE __attribute__((noinline))
#define HWY_FLATTEN __attribute__((flatten))
#define HWY_NORETURN __attribute__((noreturn))
@@ -125,6 +134,19 @@
#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
+#if HWY_COMPILER_GCC_ACTUAL
+// nielskm: GCC does not support '#pragma GCC unroll' without the factor.
+#define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor)
+#define HWY_DEFAULT_UNROLL HWY_UNROLL(4)
+#elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
+#define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor)
+#define HWY_DEFAULT_UNROLL HWY_UNROLL()
+#else
+#define HWY_UNROLL(factor)
+#define HWY_DEFAULT_UNROLL
+#endif
+
+
// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
// does, without generating code.
@@ -229,19 +251,17 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
// by concatenating base type and bits.
-#if HWY_ARCH_ARM && (__ARM_FP & 2)
-#define HWY_NATIVE_FLOAT16 1
-#else
-#define HWY_NATIVE_FLOAT16 0
-#endif
-
#pragma pack(push, 1)
-#if HWY_NATIVE_FLOAT16
+// ACLE (https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html):
+// always supported on aarch64, for v7 only if -mfp16-format is given.
+#if ((HWY_ARCH_ARM_A64 || (__ARM_FP & 2)) && HWY_COMPILER_GCC)
using float16_t = __fp16;
-// Clang does not allow __fp16 arguments, but scalar.h requires LaneType
-// arguments, so use a wrapper.
-// TODO(janwas): replace with _Float16 when that is supported?
+// C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets.
+// Required for Clang RVV if the float16 extension is used.
+#elif HWY_ARCH_RVV && HWY_COMPILER_CLANG && defined(__riscv_zvfh)
+using float16_t = _Float16;
+// Otherwise emulate
#else
struct float16_t {
uint16_t bits;
@@ -257,6 +277,65 @@ struct bfloat16_t {
using float32_t = float;
using float64_t = double;
+#pragma pack(push, 1)
+
+// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
+// https://reviews.llvm.org/D86310
+struct alignas(16) uint128_t {
+ uint64_t lo; // little-endian layout
+ uint64_t hi;
+};
+
+// 64 bit key plus 64 bit value. Faster than using uint128_t when only the key
+// field is to be compared (Lt128Upper instead of Lt128).
+struct alignas(16) K64V64 {
+ uint64_t value; // little-endian layout
+ uint64_t key;
+};
+
+// 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier
+// than when considering both to be a 64-bit key.
+struct alignas(8) K32V32 {
+ uint32_t value; // little-endian layout
+ uint32_t key;
+};
+
+#pragma pack(pop)
+
+static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
+ const uint128_t& b) {
+ return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
+}
+// Required for std::greater.
+static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
+ const uint128_t& b) {
+ return b < a;
+}
+static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
+ const uint128_t& b) {
+ return a.lo == b.lo && a.hi == b.hi;
+}
+
+static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
+ const K64V64& b) {
+ return a.key < b.key;
+}
+// Required for std::greater.
+static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
+ const K64V64& b) {
+ return b < a;
+}
+
+static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
+ const K32V32& b) {
+ return a.key < b.key;
+}
+// Required for std::greater.
+static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a,
+ const K32V32& b) {
+ return b < a;
+}
+
//------------------------------------------------------------------------------
// Controlling overload resolution (SFINAE)
@@ -309,6 +388,8 @@ HWY_API constexpr bool IsSame() {
hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
#define HWY_IF_NOT_LANE_SIZE(T, bytes) \
hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
+#define HWY_IF_LANE_SIZE_LT(T, bytes) \
+ hwy::EnableIf<sizeof(T) < (bytes)>* = nullptr
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
@@ -341,12 +422,14 @@ struct Relations<uint8_t> {
using Unsigned = uint8_t;
using Signed = int8_t;
using Wide = uint16_t;
+ enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<int8_t> {
using Unsigned = uint8_t;
using Signed = int8_t;
using Wide = int16_t;
+ enum { is_signed = 1, is_float = 0 };
};
template <>
struct Relations<uint16_t> {
@@ -354,6 +437,7 @@ struct Relations<uint16_t> {
using Signed = int16_t;
using Wide = uint32_t;
using Narrow = uint8_t;
+ enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<int16_t> {
@@ -361,6 +445,7 @@ struct Relations<int16_t> {
using Signed = int16_t;
using Wide = int32_t;
using Narrow = int8_t;
+ enum { is_signed = 1, is_float = 0 };
};
template <>
struct Relations<uint32_t> {
@@ -369,6 +454,7 @@ struct Relations<uint32_t> {
using Float = float;
using Wide = uint64_t;
using Narrow = uint16_t;
+ enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<int32_t> {
@@ -377,13 +463,16 @@ struct Relations<int32_t> {
using Float = float;
using Wide = int64_t;
using Narrow = int16_t;
+ enum { is_signed = 1, is_float = 0 };
};
template <>
struct Relations<uint64_t> {
using Unsigned = uint64_t;
using Signed = int64_t;
using Float = double;
+ using Wide = uint128_t;
using Narrow = uint32_t;
+ enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<int64_t> {
@@ -391,6 +480,13 @@ struct Relations<int64_t> {
using Signed = int64_t;
using Float = double;
using Narrow = int32_t;
+ enum { is_signed = 1, is_float = 0 };
+};
+template <>
+struct Relations<uint128_t> {
+ using Unsigned = uint128_t;
+ using Narrow = uint64_t;
+ enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<float16_t> {
@@ -398,12 +494,14 @@ struct Relations<float16_t> {
using Signed = int16_t;
using Float = float16_t;
using Wide = float;
+ enum { is_signed = 1, is_float = 1 };
};
template <>
struct Relations<bfloat16_t> {
using Unsigned = uint16_t;
using Signed = int16_t;
using Wide = float;
+ enum { is_signed = 1, is_float = 1 };
};
template <>
struct Relations<float> {
@@ -412,6 +510,7 @@ struct Relations<float> {
using Float = float;
using Wide = double;
using Narrow = float16_t;
+ enum { is_signed = 1, is_float = 1 };
};
template <>
struct Relations<double> {
@@ -419,6 +518,7 @@ struct Relations<double> {
using Signed = int64_t;
using Float = double;
using Narrow = float;
+ enum { is_signed = 1, is_float = 1 };
};
template <size_t N>
@@ -445,6 +545,10 @@ struct TypeFromSize<8> {
using Signed = int64_t;
using Float = double;
};
+template <>
+struct TypeFromSize<16> {
+ using Unsigned = uint128_t;
+};
} // namespace detail
@@ -470,6 +574,24 @@ using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
template <size_t N>
using FloatFromSize = typename detail::TypeFromSize<N>::Float;
+// Avoid confusion with SizeTag where the parameter is a lane size.
+using UnsignedTag = SizeTag<0>;
+using SignedTag = SizeTag<0x100>; // integer
+using FloatTag = SizeTag<0x200>;
+
+template <typename T, class R = detail::Relations<T>>
+constexpr auto TypeTag() -> hwy::SizeTag<((R::is_signed + R::is_float) << 8)> {
+ return hwy::SizeTag<((R::is_signed + R::is_float) << 8)>();
+}
+
+// For when we only want to distinguish FloatTag from everything else.
+using NonFloatTag = SizeTag<0x400>;
+
+template <typename T, class R = detail::Relations<T>>
+constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
+ return hwy::SizeTag<(R::is_float ? 0x200 : 0x400)>();
+}
+
//------------------------------------------------------------------------------
// Type traits
@@ -535,6 +657,20 @@ constexpr double HighestValue<double>() {
return 1.7976931348623158e+308;
}
+// Difference between 1.0 and the next representable value.
+template <typename T>
+HWY_API constexpr T Epsilon() {
+ return 1;
+}
+template <>
+constexpr float Epsilon<float>() {
+ return 1.192092896e-7f;
+}
+template <>
+constexpr double Epsilon<double>() {
+ return 2.2204460492503131e-16;
+}
+
// Returns width in bits of the mantissa field in IEEE binary32/64.
template <typename T>
constexpr int MantissaBits() {
@@ -640,7 +776,7 @@ HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
#else // HWY_ARCH_X86_64
// _BitScanForward64 not available
uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
- unsigned long index;
+ unsigned long index; // NOLINT
if (lsb == 0) {
uint32_t msb = static_cast<uint32_t>(x >> 32u);
_BitScanForward(&index, msb);
@@ -675,7 +811,7 @@ HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
#else // HWY_ARCH_X86_64
// _BitScanReverse64 not available
const uint32_t msb = static_cast<uint32_t>(x >> 32u);
- unsigned long index;
+ unsigned long index; // NOLINT
if (msb == 0) {
const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
_BitScanReverse(&index, lsb);
@@ -691,7 +827,7 @@ HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
}
HWY_API size_t PopCount(uint64_t x) {
-#if HWY_COMPILER_CLANG || HWY_COMPILER_GCC
+#if HWY_COMPILER_GCC // includes clang
return static_cast<size_t>(__builtin_popcountll(x));
// This instruction has a separate feature flag, but is often called from
// non-SIMD code, so we don't want to require dynamic dispatch. It was first
@@ -700,7 +836,8 @@ HWY_API size_t PopCount(uint64_t x) {
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
return _mm_popcnt_u64(x);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
- return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32));
+ return _mm_popcnt_u32(static_cast<uint32_t>(x & 0xFFFFFFFFu)) +
+ _mm_popcnt_u32(static_cast<uint32_t>(x >> 32));
#else
x -= ((x >> 1) & 0x5555555555555555ULL);
x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
@@ -764,10 +901,18 @@ HWY_API void CopyBytes(const From* from, To* to) {
#if HWY_COMPILER_MSVC
memcpy(to, from, kBytes);
#else
- __builtin_memcpy(to, from, kBytes);
+ __builtin_memcpy(
+ static_cast<void*>(to), static_cast<const void*>(from), kBytes);
#endif
}
+// Same as CopyBytes, but for same-sized objects; avoids a size argument.
+template <typename From, typename To>
+HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
+ static_assert(sizeof(From) == sizeof(To), "");
+ CopyBytes<sizeof(From)>(from, to);
+}
+
template <size_t kBytes, typename To>
HWY_API void ZeroBytes(To* to) {
#if HWY_COMPILER_MSVC
@@ -781,13 +926,13 @@ HWY_API float F32FromBF16(bfloat16_t bf) {
uint32_t bits = bf.bits;
bits <<= 16;
float f;
- CopyBytes<4>(&bits, &f);
+ CopySameSize(&bits, &f);
return f;
}
HWY_API bfloat16_t BF16FromF32(float f) {
uint32_t bits;
- CopyBytes<4>(&f, &bits);
+ CopySameSize(&f, &bits);
bfloat16_t bf;
bf.bits = static_cast<uint16_t>(bits >> 16);
return bf;
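
The new 128-bit and key/value types added above are plain structs whose `operator<` compares only the `key` field (or the high/low halves for `uint128_t`), and `CopySameSize` is a size-checked wrapper over `CopyBytes`, as used by `F32FromBF16`/`BF16FromF32`. A small standalone sketch, illustrative only and not part of this patch, assuming `hwy/base.h` is on the include path:

```
#include <algorithm>
#include <cstdint>
#include <cstdio>

#include "hwy/base.h"

int main() {
  // operator< for K64V64 compares only the key field, so the sort below
  // orders by key; values ride along with their keys.
  hwy::K64V64 pairs[3] = {{/*value=*/10, /*key=*/2},
                          {/*value=*/20, /*key=*/1},
                          {/*value=*/30, /*key=*/1}};
  std::sort(pairs, pairs + 3);  // keys now 1, 1, 2

  // CopySameSize: like CopyBytes<4>, but its static_assert rejects
  // mismatched sizes at compile time (the same pattern F32FromBF16 uses).
  float f = 1.5f;
  uint32_t bits;
  hwy::CopySameSize(&f, &bits);

  printf("smallest key: %llu, bits of 1.5f: 0x%08x\n",
         static_cast<unsigned long long>(pairs[0].key),
         static_cast<unsigned>(bits));
  return 0;
}
```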
diff --git a/media/highway/src/hwy/base_test.cc b/media/highway/src/hwy/base_test.cc
index d95b0ddc7b..baca70b6f1 100644
--- a/media/highway/src/hwy/base_test.cc
+++ b/media/highway/src/hwy/base_test.cc
@@ -22,7 +22,7 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "base_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
@@ -31,25 +31,26 @@ namespace hwy {
namespace HWY_NAMESPACE {
HWY_NOINLINE void TestAllLimits() {
- HWY_ASSERT_EQ(uint8_t(0), LimitsMin<uint8_t>());
- HWY_ASSERT_EQ(uint16_t(0), LimitsMin<uint16_t>());
- HWY_ASSERT_EQ(uint32_t(0), LimitsMin<uint32_t>());
- HWY_ASSERT_EQ(uint64_t(0), LimitsMin<uint64_t>());
-
- HWY_ASSERT_EQ(int8_t(-128), LimitsMin<int8_t>());
- HWY_ASSERT_EQ(int16_t(-32768), LimitsMin<int16_t>());
- HWY_ASSERT_EQ(int32_t(0x80000000u), LimitsMin<int32_t>());
- HWY_ASSERT_EQ(int64_t(0x8000000000000000ull), LimitsMin<int64_t>());
-
- HWY_ASSERT_EQ(uint8_t(0xFF), LimitsMax<uint8_t>());
- HWY_ASSERT_EQ(uint16_t(0xFFFF), LimitsMax<uint16_t>());
- HWY_ASSERT_EQ(uint32_t(0xFFFFFFFFu), LimitsMax<uint32_t>());
- HWY_ASSERT_EQ(uint64_t(0xFFFFFFFFFFFFFFFFull), LimitsMax<uint64_t>());
-
- HWY_ASSERT_EQ(int8_t(0x7F), LimitsMax<int8_t>());
- HWY_ASSERT_EQ(int16_t(0x7FFF), LimitsMax<int16_t>());
- HWY_ASSERT_EQ(int32_t(0x7FFFFFFFu), LimitsMax<int32_t>());
- HWY_ASSERT_EQ(int64_t(0x7FFFFFFFFFFFFFFFull), LimitsMax<int64_t>());
+ HWY_ASSERT_EQ(uint8_t{0}, LimitsMin<uint8_t>());
+ HWY_ASSERT_EQ(uint16_t{0}, LimitsMin<uint16_t>());
+ HWY_ASSERT_EQ(uint32_t{0}, LimitsMin<uint32_t>());
+ HWY_ASSERT_EQ(uint64_t{0}, LimitsMin<uint64_t>());
+
+ HWY_ASSERT_EQ(int8_t{-128}, LimitsMin<int8_t>());
+ HWY_ASSERT_EQ(int16_t{-32768}, LimitsMin<int16_t>());
+ HWY_ASSERT_EQ(static_cast<int32_t>(0x80000000u), LimitsMin<int32_t>());
+ HWY_ASSERT_EQ(static_cast<int64_t>(0x8000000000000000ull),
+ LimitsMin<int64_t>());
+
+ HWY_ASSERT_EQ(uint8_t{0xFF}, LimitsMax<uint8_t>());
+ HWY_ASSERT_EQ(uint16_t{0xFFFF}, LimitsMax<uint16_t>());
+ HWY_ASSERT_EQ(uint32_t{0xFFFFFFFFu}, LimitsMax<uint32_t>());
+ HWY_ASSERT_EQ(uint64_t{0xFFFFFFFFFFFFFFFFull}, LimitsMax<uint64_t>());
+
+ HWY_ASSERT_EQ(int8_t{0x7F}, LimitsMax<int8_t>());
+ HWY_ASSERT_EQ(int16_t{0x7FFF}, LimitsMax<int16_t>());
+ HWY_ASSERT_EQ(int32_t{0x7FFFFFFFu}, LimitsMax<int32_t>());
+ HWY_ASSERT_EQ(int64_t{0x7FFFFFFFFFFFFFFFull}, LimitsMax<int64_t>());
}
struct TestLowestHighest {
@@ -89,6 +90,10 @@ HWY_NOINLINE void TestAllType() {
ForUnsignedTypes(TestIsUnsigned());
ForSignedTypes(TestIsSigned());
ForFloatTypes(TestIsFloat());
+
+ static_assert(sizeof(MakeUnsigned<hwy::uint128_t>) == 16, "");
+ static_assert(sizeof(MakeWide<uint64_t>) == 16, "Expected uint128_t");
+ static_assert(sizeof(MakeNarrow<hwy::uint128_t>) == 8, "Expected uint64_t");
}
struct TestIsSame {
@@ -103,54 +108,54 @@ struct TestIsSame {
HWY_NOINLINE void TestAllIsSame() { ForAllTypes(TestIsSame()); }
HWY_NOINLINE void TestAllBitScan() {
- HWY_ASSERT_EQ(size_t(0), Num0BitsAboveMS1Bit_Nonzero32(0x80000000u));
- HWY_ASSERT_EQ(size_t(0), Num0BitsAboveMS1Bit_Nonzero32(0xFFFFFFFFu));
- HWY_ASSERT_EQ(size_t(1), Num0BitsAboveMS1Bit_Nonzero32(0x40000000u));
- HWY_ASSERT_EQ(size_t(1), Num0BitsAboveMS1Bit_Nonzero32(0x40108210u));
- HWY_ASSERT_EQ(size_t(30), Num0BitsAboveMS1Bit_Nonzero32(2u));
- HWY_ASSERT_EQ(size_t(30), Num0BitsAboveMS1Bit_Nonzero32(3u));
- HWY_ASSERT_EQ(size_t(31), Num0BitsAboveMS1Bit_Nonzero32(1u));
-
- HWY_ASSERT_EQ(size_t(0),
+ HWY_ASSERT_EQ(size_t{0}, Num0BitsAboveMS1Bit_Nonzero32(0x80000000u));
+ HWY_ASSERT_EQ(size_t{0}, Num0BitsAboveMS1Bit_Nonzero32(0xFFFFFFFFu));
+ HWY_ASSERT_EQ(size_t{1}, Num0BitsAboveMS1Bit_Nonzero32(0x40000000u));
+ HWY_ASSERT_EQ(size_t{1}, Num0BitsAboveMS1Bit_Nonzero32(0x40108210u));
+ HWY_ASSERT_EQ(size_t{30}, Num0BitsAboveMS1Bit_Nonzero32(2u));
+ HWY_ASSERT_EQ(size_t{30}, Num0BitsAboveMS1Bit_Nonzero32(3u));
+ HWY_ASSERT_EQ(size_t{31}, Num0BitsAboveMS1Bit_Nonzero32(1u));
+
+ HWY_ASSERT_EQ(size_t{0},
Num0BitsAboveMS1Bit_Nonzero64(0x8000000000000000ull));
- HWY_ASSERT_EQ(size_t(0),
+ HWY_ASSERT_EQ(size_t{0},
Num0BitsAboveMS1Bit_Nonzero64(0xFFFFFFFFFFFFFFFFull));
- HWY_ASSERT_EQ(size_t(1),
+ HWY_ASSERT_EQ(size_t{1},
Num0BitsAboveMS1Bit_Nonzero64(0x4000000000000000ull));
- HWY_ASSERT_EQ(size_t(1),
+ HWY_ASSERT_EQ(size_t{1},
Num0BitsAboveMS1Bit_Nonzero64(0x4010821004200011ull));
- HWY_ASSERT_EQ(size_t(62), Num0BitsAboveMS1Bit_Nonzero64(2ull));
- HWY_ASSERT_EQ(size_t(62), Num0BitsAboveMS1Bit_Nonzero64(3ull));
- HWY_ASSERT_EQ(size_t(63), Num0BitsAboveMS1Bit_Nonzero64(1ull));
-
- HWY_ASSERT_EQ(size_t(0), Num0BitsBelowLS1Bit_Nonzero32(1u));
- HWY_ASSERT_EQ(size_t(1), Num0BitsBelowLS1Bit_Nonzero32(2u));
- HWY_ASSERT_EQ(size_t(30), Num0BitsBelowLS1Bit_Nonzero32(0xC0000000u));
- HWY_ASSERT_EQ(size_t(31), Num0BitsBelowLS1Bit_Nonzero32(0x80000000u));
-
- HWY_ASSERT_EQ(size_t(0), Num0BitsBelowLS1Bit_Nonzero64(1ull));
- HWY_ASSERT_EQ(size_t(1), Num0BitsBelowLS1Bit_Nonzero64(2ull));
- HWY_ASSERT_EQ(size_t(62),
+ HWY_ASSERT_EQ(size_t{62}, Num0BitsAboveMS1Bit_Nonzero64(2ull));
+ HWY_ASSERT_EQ(size_t{62}, Num0BitsAboveMS1Bit_Nonzero64(3ull));
+ HWY_ASSERT_EQ(size_t{63}, Num0BitsAboveMS1Bit_Nonzero64(1ull));
+
+ HWY_ASSERT_EQ(size_t{0}, Num0BitsBelowLS1Bit_Nonzero32(1u));
+ HWY_ASSERT_EQ(size_t{1}, Num0BitsBelowLS1Bit_Nonzero32(2u));
+ HWY_ASSERT_EQ(size_t{30}, Num0BitsBelowLS1Bit_Nonzero32(0xC0000000u));
+ HWY_ASSERT_EQ(size_t{31}, Num0BitsBelowLS1Bit_Nonzero32(0x80000000u));
+
+ HWY_ASSERT_EQ(size_t{0}, Num0BitsBelowLS1Bit_Nonzero64(1ull));
+ HWY_ASSERT_EQ(size_t{1}, Num0BitsBelowLS1Bit_Nonzero64(2ull));
+ HWY_ASSERT_EQ(size_t{62},
Num0BitsBelowLS1Bit_Nonzero64(0xC000000000000000ull));
- HWY_ASSERT_EQ(size_t(63),
+ HWY_ASSERT_EQ(size_t{63},
Num0BitsBelowLS1Bit_Nonzero64(0x8000000000000000ull));
}
HWY_NOINLINE void TestAllPopCount() {
- HWY_ASSERT_EQ(size_t(0), PopCount(0u));
- HWY_ASSERT_EQ(size_t(1), PopCount(1u));
- HWY_ASSERT_EQ(size_t(1), PopCount(2u));
- HWY_ASSERT_EQ(size_t(2), PopCount(3u));
- HWY_ASSERT_EQ(size_t(1), PopCount(0x80000000u));
- HWY_ASSERT_EQ(size_t(31), PopCount(0x7FFFFFFFu));
- HWY_ASSERT_EQ(size_t(32), PopCount(0xFFFFFFFFu));
-
- HWY_ASSERT_EQ(size_t(1), PopCount(0x80000000ull));
- HWY_ASSERT_EQ(size_t(31), PopCount(0x7FFFFFFFull));
- HWY_ASSERT_EQ(size_t(32), PopCount(0xFFFFFFFFull));
- HWY_ASSERT_EQ(size_t(33), PopCount(0x10FFFFFFFFull));
- HWY_ASSERT_EQ(size_t(63), PopCount(0xFFFEFFFFFFFFFFFFull));
- HWY_ASSERT_EQ(size_t(64), PopCount(0xFFFFFFFFFFFFFFFFull));
+ HWY_ASSERT_EQ(size_t{0}, PopCount(0u));
+ HWY_ASSERT_EQ(size_t{1}, PopCount(1u));
+ HWY_ASSERT_EQ(size_t{1}, PopCount(2u));
+ HWY_ASSERT_EQ(size_t{2}, PopCount(3u));
+ HWY_ASSERT_EQ(size_t{1}, PopCount(0x80000000u));
+ HWY_ASSERT_EQ(size_t{31}, PopCount(0x7FFFFFFFu));
+ HWY_ASSERT_EQ(size_t{32}, PopCount(0xFFFFFFFFu));
+
+ HWY_ASSERT_EQ(size_t{1}, PopCount(0x80000000ull));
+ HWY_ASSERT_EQ(size_t{31}, PopCount(0x7FFFFFFFull));
+ HWY_ASSERT_EQ(size_t{32}, PopCount(0xFFFFFFFFull));
+ HWY_ASSERT_EQ(size_t{33}, PopCount(0x10FFFFFFFFull));
+ HWY_ASSERT_EQ(size_t{63}, PopCount(0xFFFEFFFFFFFFFFFFull));
+ HWY_ASSERT_EQ(size_t{64}, PopCount(0xFFFFFFFFFFFFFFFFull));
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
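
The brace-initialization changes above (`size_t{0}` instead of `size_t(0)`) also tighten type safety: braces reject narrowing conversions at compile time, whereas function-style casts truncate silently. A minimal illustration, assuming only a C++11 compiler and unrelated to the Highway sources themselves:

```
#include <cstdint>

int main() {
  uint8_t ok = uint8_t{0xFF};        // fine: 0xFF fits in 8 bits
  // uint8_t bad = uint8_t{0x1FF};   // error: narrowing conversion in braces
  uint8_t silent = uint8_t(0x1FF);   // compiles, silently truncates to 0xFF
  return (ok == silent) ? 0 : 1;     // returns 0: both hold 0xFF
}
```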
diff --git a/media/highway/src/hwy/cache_control.h b/media/highway/src/hwy/cache_control.h
index 222c0d4f77..b124e5707e 100644
--- a/media/highway/src/hwy/cache_control.h
+++ b/media/highway/src/hwy/cache_control.h
@@ -51,10 +51,10 @@ namespace hwy {
#define HWY_ATTR_CACHE
#endif
-// Delays subsequent loads until prior loads are visible. On Intel CPUs, also
-// serves as a full fence (waits for all prior instructions to complete).
-// No effect on non-x86.
-// DEPRECATED due to differing behavior across architectures AND vendors.
+// Delays subsequent loads until prior loads are visible. Beware of potentially
+// differing behavior across architectures and vendors: on Intel but not
+// AMD CPUs, also serves as a full fence (waits for all prior instructions to
+// complete).
HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
_mm_lfence();
@@ -77,7 +77,7 @@ template <typename T>
HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
_mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
-#elif HWY_COMPILER_GCC || HWY_COMPILER_CLANG
+#elif HWY_COMPILER_GCC // includes clang
// Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
// desirable, so use the default 3 (keep in caches).
__builtin_prefetch(p, /*write=*/0, /*hint=*/3);
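
`Prefetch` (like the other hints in cache_control.h) compiles to a best-effort hint or to nothing, depending on platform and compiler, so it is safe to use in portable code. A hedged usage sketch; the 64-byte look-ahead distance and the summation loop are illustrative assumptions, not from this patch:

```
#include <stddef.h>
#include <stdint.h>

#include "hwy/cache_control.h"

// Sums an array while prefetching roughly one cache line ahead.
uint64_t SumWithPrefetch(const uint32_t* data, size_t n) {
  const size_t kAhead = 64 / sizeof(uint32_t);  // assume 64-byte lines
  uint64_t sum = 0;
  for (size_t i = 0; i < n; ++i) {
    if (i + kAhead < n) hwy::Prefetch(data + i + kAhead);  // hint only
    sum += data[i];
  }
  return sum;
}
```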
diff --git a/media/highway/src/hwy/contrib/algo/copy-inl.h b/media/highway/src/hwy/contrib/algo/copy-inl.h
index 34e926a915..033cf8a626 100644
--- a/media/highway/src/hwy/contrib/algo/copy-inl.h
+++ b/media/highway/src/hwy/contrib/algo/copy-inl.h
@@ -22,8 +22,6 @@
#define HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
#endif
-#include <string.h> // memcpy
-
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
diff --git a/media/highway/src/hwy/contrib/algo/copy_test.cc b/media/highway/src/hwy/contrib/algo/copy_test.cc
index 8e55cd5bec..e2675a39d7 100644
--- a/media/highway/src/hwy/contrib/algo/copy_test.cc
+++ b/media/highway/src/hwy/contrib/algo/copy_test.cc
@@ -18,7 +18,7 @@
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/algo/copy_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/contrib/algo/copy-inl.h"
#include "hwy/tests/test_util-inl.h"
diff --git a/media/highway/src/hwy/contrib/algo/find_test.cc b/media/highway/src/hwy/contrib/algo/find_test.cc
index 8caf7e1512..da13c475d8 100644
--- a/media/highway/src/hwy/contrib/algo/find_test.cc
+++ b/media/highway/src/hwy/contrib/algo/find_test.cc
@@ -23,7 +23,7 @@
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/algo/find_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/contrib/algo/find-inl.h"
#include "hwy/tests/test_util-inl.h"
diff --git a/media/highway/src/hwy/contrib/algo/transform_test.cc b/media/highway/src/hwy/contrib/algo/transform_test.cc
index 52373cca6c..335607ccfb 100644
--- a/media/highway/src/hwy/contrib/algo/transform_test.cc
+++ b/media/highway/src/hwy/contrib/algo/transform_test.cc
@@ -13,14 +13,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include <string.h>
+#include <string.h> // memcpy
#include "hwy/aligned_allocator.h"
// clang-format off
#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/algo/transform_test.cc"
-#include "hwy/foreach_target.h"
+#define HWY_TARGET_INCLUDE "hwy/contrib/algo/transform_test.cc" //NOLINT
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/contrib/algo/transform-inl.h"
#include "hwy/tests/test_util-inl.h"
diff --git a/media/highway/src/hwy/contrib/dot/dot_test.cc b/media/highway/src/hwy/contrib/dot/dot_test.cc
index d9e1ac621d..12d7ab270d 100644
--- a/media/highway/src/hwy/contrib/dot/dot_test.cc
+++ b/media/highway/src/hwy/contrib/dot/dot_test.cc
@@ -22,7 +22,7 @@
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/dot/dot_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/contrib/dot/dot-inl.h"
#include "hwy/tests/test_util-inl.h"
diff --git a/media/highway/src/hwy/contrib/image/image.cc b/media/highway/src/hwy/contrib/image/image.cc
index 3f8f255bab..2bcdcd6c95 100644
--- a/media/highway/src/hwy/contrib/image/image.cc
+++ b/media/highway/src/hwy/contrib/image/image.cc
@@ -20,7 +20,7 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/image/image.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
diff --git a/media/highway/src/hwy/contrib/image/image.h b/media/highway/src/hwy/contrib/image/image.h
index bea6e654c4..231f3c51ae 100644
--- a/media/highway/src/hwy/contrib/image/image.h
+++ b/media/highway/src/hwy/contrib/image/image.h
@@ -18,7 +18,6 @@
// SIMD/multicore-friendly planar image representation with row accessors.
-#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
@@ -104,7 +103,7 @@ struct HWY_CONTRIB_DLLEXPORT ImageBase {
HWY_INLINE void* VoidRow(const size_t y) const {
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
if (y >= ysize_) {
- HWY_ABORT("Row(%" PRIu64 ") >= %u\n", static_cast<uint64_t>(y), ysize_);
+ HWY_ABORT("Row(%d) >= %u\n", static_cast<int>(y), ysize_);
}
#endif
@@ -223,14 +222,11 @@ class Image3 {
Image3(ImageT&& plane0, ImageT&& plane1, ImageT&& plane2) {
if (!SameSize(plane0, plane1) || !SameSize(plane0, plane2)) {
- HWY_ABORT("Not same size: %" PRIu64 " x %" PRIu64 ", %" PRIu64
- " x %" PRIu64 ", %" PRIu64 " x %" PRIu64 "\n",
- static_cast<uint64_t>(plane0.xsize()),
- static_cast<uint64_t>(plane0.ysize()),
- static_cast<uint64_t>(plane1.xsize()),
- static_cast<uint64_t>(plane1.ysize()),
- static_cast<uint64_t>(plane2.xsize()),
- static_cast<uint64_t>(plane2.ysize()));
+ HWY_ABORT(
+ "Not same size: %d x %d, %d x %d, %d x %d\n",
+ static_cast<int>(plane0.xsize()), static_cast<int>(plane0.ysize()),
+ static_cast<int>(plane1.xsize()), static_cast<int>(plane1.ysize()),
+ static_cast<int>(plane2.xsize()), static_cast<int>(plane2.ysize()));
}
planes_[0] = std::move(plane0);
planes_[1] = std::move(plane1);
@@ -294,9 +290,8 @@ class Image3 {
HWY_INLINE void* VoidPlaneRow(const size_t c, const size_t y) const {
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
if (c >= kNumPlanes || y >= ysize()) {
- HWY_ABORT("PlaneRow(%" PRIu64 ", %" PRIu64 ") >= %" PRIu64 "\n",
- static_cast<uint64_t>(c), static_cast<uint64_t>(y),
- static_cast<uint64_t>(ysize()));
+ HWY_ABORT("PlaneRow(%d, %d) >= %d\n", static_cast<int>(c),
+ static_cast<int>(y), static_cast<int>(ysize()));
}
#endif
// Use the first plane's stride because the compiler might not realize they
diff --git a/media/highway/src/hwy/contrib/image/image_test.cc b/media/highway/src/hwy/contrib/image/image_test.cc
index a23ec6ccc9..6886577a46 100644
--- a/media/highway/src/hwy/contrib/image/image_test.cc
+++ b/media/highway/src/hwy/contrib/image/image_test.cc
@@ -15,14 +15,7 @@
#include "hwy/contrib/image/image.h"
-#include <cstddef>
-
-#include "hwy/base.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/image/image_test.cc"
-#include "hwy/foreach_target.h"
-
+#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
@@ -30,6 +23,11 @@
#include <random>
#include <utility>
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/contrib/image/image_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target:
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
diff --git a/media/highway/src/hwy/contrib/math/math_test.cc b/media/highway/src/hwy/contrib/math/math_test.cc
index ec6032c6c1..246a081d6b 100644
--- a/media/highway/src/hwy/contrib/math/math_test.cc
+++ b/media/highway/src/hwy/contrib/math/math_test.cc
@@ -13,6 +13,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS // before inttypes.h
+#endif
+#include <inttypes.h>
#include <stdio.h>
#include <cfloat> // FLT_MAX
@@ -21,7 +25,7 @@
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/math/math_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/contrib/math/math-inl.h"
#include "hwy/tests/test_util-inl.h"
diff --git a/media/highway/src/hwy/contrib/sort/BUILD b/media/highway/src/hwy/contrib/sort/BUILD
index 2e1ddcc0cc..3f56d6d747 100644
--- a/media/highway/src/hwy/contrib/sort/BUILD
+++ b/media/highway/src/hwy/contrib/sort/BUILD
@@ -8,32 +8,89 @@ COMPAT = [
"//buildenv/target:non_prod", # includes mobile/vendor.
]
+# cc_library(
+# name = "vxsort",
+# srcs = [
+# "vxsort/isa_detection.cpp",
+# "vxsort/isa_detection_msvc.cpp",
+# "vxsort/isa_detection_sane.cpp",
+# "vxsort/machine_traits.avx2.cpp",
+# "vxsort/smallsort/avx2_load_mask_tables.cpp",
+# "vxsort/smallsort/bitonic_sort.AVX2.double.generated.cpp",
+# "vxsort/smallsort/bitonic_sort.AVX2.float.generated.cpp",
+# "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.cpp",
+# "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.cpp",
+# "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.cpp",
+# "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.cpp",
+# "vxsort/smallsort/bitonic_sort.AVX512.double.generated.cpp",
+# "vxsort/smallsort/bitonic_sort.AVX512.float.generated.cpp",
+# "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.cpp",
+# "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.cpp",
+# "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.cpp",
+# "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.cpp",
+# "vxsort/vxsort_stats.cpp",
+# ],
+# hdrs = [
+# "vxsort/alignment.h",
+# "vxsort/defs.h",
+# "vxsort/isa_detection.h",
+# "vxsort/machine_traits.avx2.h",
+# "vxsort/machine_traits.avx512.h",
+# "vxsort/machine_traits.h",
+# "vxsort/packer.h",
+# "vxsort/smallsort/bitonic_sort.AVX2.double.generated.h",
+# "vxsort/smallsort/bitonic_sort.AVX2.float.generated.h",
+# "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.h",
+# "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.h",
+# "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.h",
+# "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.h",
+# "vxsort/smallsort/bitonic_sort.AVX512.double.generated.h",
+# "vxsort/smallsort/bitonic_sort.AVX512.float.generated.h",
+# "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.h",
+# "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.h",
+# "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.h",
+# "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.h",
+# "vxsort/smallsort/bitonic_sort.h",
+# "vxsort/vxsort.h",
+# "vxsort/vxsort_stats.h",
+# ],
+# compatible_with = [],
+# textual_hdrs = [
+# "vxsort/vxsort_targets_disable.h",
+# "vxsort/vxsort_targets_enable_avx2.h",
+# "vxsort/vxsort_targets_enable_avx512.h",
+# ],
+# )
+
cc_library(
name = "vqsort",
srcs = [
# Split into separate files to reduce MSVC build time.
"vqsort.cc",
- "vqsort_i16a.cc",
- "vqsort_i16d.cc",
- "vqsort_u16a.cc",
- "vqsort_u16d.cc",
+ "vqsort_128a.cc",
+ "vqsort_128d.cc",
"vqsort_f32a.cc",
"vqsort_f32d.cc",
- "vqsort_i32a.cc",
- "vqsort_i32d.cc",
- "vqsort_u32a.cc",
- "vqsort_u32d.cc",
"vqsort_f64a.cc",
"vqsort_f64d.cc",
+ "vqsort_i16a.cc",
+ "vqsort_i16d.cc",
+ "vqsort_i32a.cc",
+ "vqsort_i32d.cc",
"vqsort_i64a.cc",
"vqsort_i64d.cc",
+ "vqsort_kv64a.cc",
+ "vqsort_kv64d.cc",
+ "vqsort_kv128a.cc",
+ "vqsort_kv128d.cc",
+ "vqsort_u16a.cc",
+ "vqsort_u16d.cc",
+ "vqsort_u32a.cc",
+ "vqsort_u32d.cc",
"vqsort_u64a.cc",
"vqsort_u64d.cc",
- "vqsort_128a.cc",
- "vqsort_128d.cc",
],
hdrs = [
- "disabled_targets.h",
"vqsort.h", # public interface
],
compatible_with = [],
@@ -44,11 +101,13 @@ cc_library(
"traits-inl.h",
"traits128-inl.h",
"vqsort-inl.h",
+ # Placeholder for internal instrumentation. Do not remove.
],
deps = [
# Only if VQSORT_SECURE_RNG is set.
# "//third_party/absl/random",
"//:hwy",
+ # ":vxsort", # required if HAVE_VXSORT
],
)
@@ -87,8 +146,7 @@ cc_test(
name = "sort_test",
size = "medium",
srcs = ["sort_test.cc"],
- features = ["fully_static_link"],
- linkstatic = True,
+ # Do not enable fully_static_link (pthread crash on bazel)
local_defines = ["HWY_IS_TEST"],
# for test_suite.
tags = ["hwy_ops_test"],
@@ -105,8 +163,7 @@ cc_binary(
name = "bench_sort",
testonly = 1,
srcs = ["bench_sort.cc"],
- features = ["fully_static_link"],
- linkstatic = True,
+ # Do not enable fully_static_link (pthread crash on bazel)
local_defines = ["HWY_IS_TEST"],
deps = [
":helpers",
@@ -121,8 +178,7 @@ cc_binary(
name = "bench_parallel",
testonly = 1,
srcs = ["bench_parallel.cc"],
- features = ["fully_static_link"],
- linkstatic = True,
+ # Do not enable fully_static_link (pthread crash on bazel)
local_defines = ["HWY_IS_TEST"],
deps = [
":helpers",
diff --git a/media/highway/src/hwy/contrib/sort/README.md b/media/highway/src/hwy/contrib/sort/README.md
new file mode 100644
index 0000000000..a0051414d3
--- /dev/null
+++ b/media/highway/src/hwy/contrib/sort/README.md
@@ -0,0 +1,87 @@
+# Vectorized and performance-portable Quicksort
+
+## Introduction
+
+As of 2022-06-07 this sorts large arrays of built-in types about ten times as
+fast as `std::sort`. See also our
+[blog post](https://opensource.googleblog.com/2022/06/Vectorized%20and%20performance%20portable%20Quicksort.html)
+and [paper](https://arxiv.org/abs/2205.05982).
+
+## Instructions
+
+Here are instructions for reproducing our results on Linux and AWS (SVE, NEON).
+
+### Linux
+
+Please first ensure golang, and Clang (tested with 13.0.1) are installed via
+your system's package manager.
+
+```
+go install github.com/bazelbuild/bazelisk@latest
+git clone https://github.com/google/highway
+cd highway
+CC=clang CXX=clang++ ~/go/bin/bazelisk build -c opt hwy/contrib/sort:all
+bazel-bin/hwy/contrib/sort/sort_test
+bazel-bin/hwy/contrib/sort/bench_sort
+```
+
+### AWS Graviton3
+
+Instance config: Amazon Linux 5.10 arm64, c7g.8xlarge (the largest allowed
+config is 32 vCPU). The initial launch will fail; wait a few minutes for an
+email saying the config is verified, then re-launch. The IPv4 hostname is
+shown in the list of instances.
+
+`ssh -i /path/key.pem ec2-user@hostname`
+
+Note that the AWS CMake package is too old for llvm, so we build it first:
+```
+wget https://cmake.org/files/v3.23/cmake-3.23.2.tar.gz
+tar -xvzf cmake-3.23.2.tar.gz && cd cmake-3.23.2/
+./bootstrap -- -DCMAKE_USE_OPENSSL=OFF
+make -j8 && sudo make install
+cd ..
+```
+
+AWS clang is at version 11.1, which generates unnecessary `AND` instructions
+that slow down the sort by 1.15x. We tested with clang trunk as of June 13
+(which reports Git hash 8f6512fea000c3a0d394864bb94e524bee375069). To build:
+
+```
+git clone --depth 1 https://github.com/llvm/llvm-project.git
+cd llvm-project
+mkdir -p build && cd build
+/usr/local/bin/cmake ../llvm -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" -DCMAKE_BUILD_TYPE=Release
+make -j32 && sudo make install
+```
+
+```
+sudo yum install go
+go install github.com/bazelbuild/bazelisk@latest
+git clone https://github.com/google/highway
+cd highway
+CC=/usr/local/bin/clang CXX=/usr/local/bin/clang++ ~/go/bin/bazelisk build -c opt --copt=-march=armv8.2-a+sve hwy/contrib/sort:all
+bazel-bin/hwy/contrib/sort/sort_test
+bazel-bin/hwy/contrib/sort/bench_sort
+```
+
+The above command line enables SVE, which is currently only available on
+Graviton 3. You can also test NEON on the same processor, or other Arm CPUs, by
+changing the `-march=` option to `--copt=-march=armv8.2-a+crypto`. Note that
+such flags will be unnecessary once Clang supports `#pragma target` for NEON and
+SVE intrinsics, as it does for x86.
+
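+For example, the NEON build and benchmark commands are the same as above with
+only the `-march=` flag changed:
+
+```
+CC=/usr/local/bin/clang CXX=/usr/local/bin/clang++ ~/go/bin/bazelisk build -c opt --copt=-march=armv8.2-a+crypto hwy/contrib/sort:all
+bazel-bin/hwy/contrib/sort/sort_test
+bazel-bin/hwy/contrib/sort/bench_sort
+```
+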
+## Results
+
+`bench_sort` outputs the instruction set (AVX3 refers to AVX-512), the sort
+algorithm (std for `std::sort`, vq for our vqsort), the type of keys being
+sorted (f32 is float), the distribution of keys (uniform32 for uniform random
+with range 0-2^32), the number of keys, and finally the throughput of sorted
+keys (i.e., the number of key bytes output per second).
+
+Example excerpt from Xeon 6154 (Skylake-X) CPU clocked at 3 GHz:
+
+```
+[ RUN ] BenchSortGroup/BenchSort.BenchAllSort/AVX3
+ AVX3: std: f32: uniform32: 1.00E+06 54 MB/s ( 1 threads)
+ AVX3: vq: f32: uniform32: 1.00E+06 1143 MB/s ( 1 threads)
+```
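+
+For scale: in this excerpt each sort processes 1.00E+06 f32 keys, i.e. 4 MB of
+key data, so 1143 MB/s corresponds to roughly 286 sorted arrays per second, or
+about 3.5 ms per one-million-key sort; 54 MB/s for `std::sort` works out to
+roughly 74 ms per sort.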
diff --git a/media/highway/src/hwy/contrib/sort/algo-inl.h b/media/highway/src/hwy/contrib/sort/algo-inl.h
index 6e85ca681a..4b01e2de33 100644
--- a/media/highway/src/hwy/contrib/sort/algo-inl.h
+++ b/media/highway/src/hwy/contrib/sort/algo-inl.h
@@ -34,10 +34,11 @@
#define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1)
#define HAVE_PDQSORT 0
#define HAVE_SORT512 0
+#define HAVE_VXSORT 0
#if HAVE_AVX2SORT
HWY_PUSH_ATTRIBUTES("avx2,avx")
-#include "avx2sort.h"
+#include "avx2sort.h" //NOLINT
HWY_POP_ATTRIBUTES
#endif
#if HAVE_IPS4O || HAVE_PARALLEL_IPS4O
@@ -48,18 +49,59 @@ HWY_POP_ATTRIBUTES
#include "third_party/boost/allowed/sort/sort.hpp"
#endif
#if HAVE_SORT512
-#include "sort512.h"
+#include "sort512.h" //NOLINT
#endif
+// vxsort is difficult to compile for multiple targets because it also uses
+// .cpp files, and we'd also have to #undef its include guards. Instead, compile
+// only for AVX2 or AVX3 depending on this macro.
+#define VXSORT_AVX3 1
+#if HAVE_VXSORT
+// inlined from vxsort_targets_enable_avx512 (must close before end of header)
+#ifdef __GNUC__
+#ifdef __clang__
+#if VXSORT_AVX3
+#pragma clang attribute push(__attribute__((target("avx512f,avx512dq"))), \
+ apply_to = any(function))
+#else
+#pragma clang attribute push(__attribute__((target("avx2"))), \
+ apply_to = any(function))
+#endif // VXSORT_AVX3
+
+#else
+#pragma GCC push_options
+#if VXSORT_AVX3
+#pragma GCC target("avx512f,avx512dq")
+#else
+#pragma GCC target("avx2")
+#endif // VXSORT_AVX3
+#endif
+#endif
+
+#if VXSORT_AVX3
+#include "vxsort/machine_traits.avx512.h"
+#else
+#include "vxsort/machine_traits.avx2.h"
+#endif // VXSORT_AVX3
+#include "vxsort/vxsort.h"
+#ifdef __GNUC__
+#ifdef __clang__
+#pragma clang attribute pop
+#else
+#pragma GCC pop_options
+#endif
+#endif
+#endif // HAVE_VXSORT
+
namespace hwy {
enum class Dist { kUniform8, kUniform16, kUniform32 };
-std::vector<Dist> AllDist() {
+static inline std::vector<Dist> AllDist() {
return {/*Dist::kUniform8, Dist::kUniform16,*/ Dist::kUniform32};
}
-const char* DistName(Dist dist) {
+static inline const char* DistName(Dist dist) {
switch (dist) {
case Dist::kUniform8:
return "uniform8";
@@ -82,7 +124,7 @@ class InputStats {
// bit representations as the checksum.
uint64_t bits = 0;
static_assert(sizeof(T) <= 8, "Expected a built-in type");
- CopyBytes<sizeof(T)>(&value, &bits);
+ CopyBytes<sizeof(T)>(&value, &bits); // not same size
sum_ += bits;
count_ += 1;
}
@@ -94,13 +136,16 @@ class InputStats {
}
if (min_ != other.min_ || max_ != other.max_) {
- HWY_ABORT("minmax %f/%f vs %f/%f\n", double(min_), double(max_),
- double(other.min_), double(other.max_));
+ HWY_ABORT("minmax %f/%f vs %f/%f\n", static_cast<double>(min_),
+ static_cast<double>(max_), static_cast<double>(other.min_),
+ static_cast<double>(other.max_));
}
// Sum helps detect duplicated/lost values
if (sum_ != other.sum_) {
- HWY_ABORT("Sum mismatch; min %g max %g\n", double(min_), double(max_));
+ HWY_ABORT("Sum mismatch %g %g; min %g max %g\n",
+ static_cast<double>(sum_), static_cast<double>(other.sum_),
+ static_cast<double>(min_), static_cast<double>(max_));
}
return true;
@@ -129,12 +174,15 @@ enum class Algo {
#if HAVE_SORT512
kSort512,
#endif
+#if HAVE_VXSORT
+ kVXSort,
+#endif
kStd,
kVQSort,
kHeap,
};
-const char* AlgoName(Algo algo) {
+static inline const char* AlgoName(Algo algo) {
switch (algo) {
#if HAVE_AVX2SORT
case Algo::kSEA:
@@ -156,6 +204,10 @@ const char* AlgoName(Algo algo) {
case Algo::kSort512:
return "sort512";
#endif
+#if HAVE_VXSORT
+ case Algo::kVXSort:
+ return "vxsort";
+#endif
case Algo::kStd:
return "std";
case Algo::kVQSort:
@@ -206,12 +258,11 @@ class Xorshift128Plus {
}
// Need to pass in the state because vector cannot be class members.
- template <class DU64>
- static Vec<DU64> RandomBits(DU64 /* tag */, Vec<DU64>& state0,
- Vec<DU64>& state1) {
- Vec<DU64> s1 = state0;
- Vec<DU64> s0 = state1;
- const Vec<DU64> bits = Add(s1, s0);
+ template <class VU64>
+ static VU64 RandomBits(VU64& state0, VU64& state1) {
+ VU64 s1 = state0;
+ VU64 s0 = state1;
+ const VU64 bits = Add(s1, s0);
state0 = s0;
s1 = Xor(s1, ShiftLeft<23>(s1));
state1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0))));
@@ -219,32 +270,34 @@ class Xorshift128Plus {
}
};
-template <typename T, class DU64, HWY_IF_NOT_FLOAT(T)>
-Vec<DU64> RandomValues(DU64 du64, Vec<DU64>& s0, Vec<DU64>& s1,
- const Vec<DU64> mask) {
- const Vec<DU64> bits = Xorshift128Plus::RandomBits(du64, s0, s1);
- return And(bits, mask);
+template <class D, class VU64, HWY_IF_NOT_FLOAT_D(D)>
+Vec<D> RandomValues(D d, VU64& s0, VU64& s1, const VU64 mask) {
+ const VU64 bits = Xorshift128Plus::RandomBits(s0, s1);
+ return BitCast(d, And(bits, mask));
}
-// Important to avoid denormals, which are flushed to zero by SIMD but not
+// It is important to avoid denormals, which are flushed to zero by SIMD but not
// scalar sorts, and NaN, which may be ordered differently in scalar vs. SIMD.
-template <typename T, class DU64, HWY_IF_FLOAT(T)>
-Vec<DU64> RandomValues(DU64 du64, Vec<DU64>& s0, Vec<DU64>& s1,
- const Vec<DU64> mask) {
- const Vec<DU64> bits = Xorshift128Plus::RandomBits(du64, s0, s1);
- const Vec<DU64> values = And(bits, mask);
-#if HWY_TARGET == HWY_SCALAR // Cannot repartition u64 to i32
- const RebindToSigned<DU64> di;
+template <class DF, class VU64, HWY_IF_FLOAT_D(DF)>
+Vec<DF> RandomValues(DF df, VU64& s0, VU64& s1, const VU64 mask) {
+ using TF = TFromD<DF>;
+ const RebindToUnsigned<decltype(df)> du;
+ using VU = Vec<decltype(du)>;
+
+ const VU64 bits64 = And(Xorshift128Plus::RandomBits(s0, s1), mask);
+
+#if HWY_TARGET == HWY_SCALAR // Cannot repartition u64 to smaller types
+ using TU = MakeUnsigned<TF>;
+ const VU bits = Set(du, static_cast<TU>(GetLane(bits64) & LimitsMax<TU>()));
#else
- const Repartition<MakeSigned<T>, DU64> di;
+ const VU bits = BitCast(du, bits64);
#endif
- const RebindToFloat<decltype(di)> df;
- const RebindToUnsigned<decltype(di)> du;
- const auto k1 = BitCast(du64, Set(df, T{1.0}));
- const auto mantissa = BitCast(du64, Set(du, MantissaMask<T>()));
- // Avoid NaN/denormal by converting from (range-limited) integer.
- const Vec<DU64> no_nan = OrAnd(k1, values, mantissa);
- return BitCast(du64, ConvertTo(df, BitCast(di, no_nan)));
+ // Avoid NaN/denormal by only generating values in [1, 2), i.e. random
+ // mantissas with the exponent taken from the representation of 1.0.
+ const VU k1 = BitCast(du, Set(df, TF{1.0}));
+ const VU mantissa_mask = Set(du, MantissaMask<TF>());
+ const VU representation = OrAnd(k1, bits, mantissa_mask);
+ return BitCast(df, representation);
}
template <class DU64>
@@ -272,29 +325,29 @@ InputStats<T> GenerateInput(const Dist dist, T* v, size_t num) {
SortTag<uint64_t> du64;
using VU64 = Vec<decltype(du64)>;
const size_t N64 = Lanes(du64);
- auto buf = hwy::AllocateAligned<uint64_t>(2 * N64);
- Xorshift128Plus::GenerateSeeds(du64, buf.get());
- auto s0 = Load(du64, buf.get());
- auto s1 = Load(du64, buf.get() + N64);
-
- const VU64 mask = MaskForDist(du64, dist, sizeof(T));
+ auto seeds = hwy::AllocateAligned<uint64_t>(2 * N64);
+ Xorshift128Plus::GenerateSeeds(du64, seeds.get());
+ VU64 s0 = Load(du64, seeds.get());
+ VU64 s1 = Load(du64, seeds.get() + N64);
+#if HWY_TARGET == HWY_SCALAR
+ const Sisd<T> d;
+#else
const Repartition<T, decltype(du64)> d;
+#endif
+ using V = Vec<decltype(d)>;
const size_t N = Lanes(d);
+ const VU64 mask = MaskForDist(du64, dist, sizeof(T));
+ auto buf = hwy::AllocateAligned<T>(N);
+
size_t i = 0;
for (; i + N <= num; i += N) {
- const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
-#if HWY_ARCH_RVV
- // v may not be 64-bit aligned
- StoreU(bits, du64, buf.get());
- memcpy(v + i, buf.get(), N64 * sizeof(uint64_t));
-#else
- StoreU(bits, du64, reinterpret_cast<uint64_t*>(v + i));
-#endif
+ const V values = RandomValues(d, s0, s1, mask);
+ StoreU(values, d, v + i);
}
if (i < num) {
- const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
- StoreU(bits, du64, buf.get());
+ const V values = RandomValues(d, s0, s1, mask);
+ StoreU(values, d, buf.get());
memcpy(v + i, buf.get(), (num - i) * sizeof(T));
}
@@ -318,12 +371,58 @@ struct SharedState {
std::vector<ThreadLocal> tls{1};
};
-template <class Order, typename T>
-void Run(Algo algo, T* HWY_RESTRICT inout, size_t num, SharedState& shared,
- size_t thread) {
- using detail::HeapSort;
+// Bridge from keys (passed to Run) to lanes as expected by HeapSort. For
+// non-128-bit keys they are the same:
+template <class Order, typename KeyType, HWY_IF_NOT_LANE_SIZE(KeyType, 16)>
+void CallHeapSort(KeyType* HWY_RESTRICT keys, const size_t num_keys) {
using detail::TraitsLane;
using detail::SharedTraits;
+ if (Order().IsAscending()) {
+ const SharedTraits<TraitsLane<detail::OrderAscending<KeyType>>> st;
+ return detail::HeapSort(st, keys, num_keys);
+ } else {
+ const SharedTraits<TraitsLane<detail::OrderDescending<KeyType>>> st;
+ return detail::HeapSort(st, keys, num_keys);
+ }
+}
+
+#if VQSORT_ENABLED
+template <class Order>
+void CallHeapSort(hwy::uint128_t* HWY_RESTRICT keys, const size_t num_keys) {
+ using detail::SharedTraits;
+ using detail::Traits128;
+ uint64_t* lanes = reinterpret_cast<uint64_t*>(keys);
+ const size_t num_lanes = num_keys * 2;
+ if (Order().IsAscending()) {
+ const SharedTraits<Traits128<detail::OrderAscending128>> st;
+ return detail::HeapSort(st, lanes, num_lanes);
+ } else {
+ const SharedTraits<Traits128<detail::OrderDescending128>> st;
+ return detail::HeapSort(st, lanes, num_lanes);
+ }
+}
+
+template <class Order>
+void CallHeapSort(K64V64* HWY_RESTRICT keys, const size_t num_keys) {
+ using detail::SharedTraits;
+ using detail::Traits128;
+ uint64_t* lanes = reinterpret_cast<uint64_t*>(keys);
+ const size_t num_lanes = num_keys * 2;
+ if (Order().IsAscending()) {
+ const SharedTraits<Traits128<detail::OrderAscendingKV128>> st;
+ return detail::HeapSort(st, lanes, num_lanes);
+ } else {
+ const SharedTraits<Traits128<detail::OrderDescendingKV128>> st;
+ return detail::HeapSort(st, lanes, num_lanes);
+ }
+}
+#endif // VQSORT_ENABLED
+
+template <class Order, typename KeyType>
+void Run(Algo algo, KeyType* HWY_RESTRICT inout, size_t num,
+ SharedState& shared, size_t thread) {
+ const std::less<KeyType> less;
+ const std::greater<KeyType> greater;
switch (algo) {
#if HAVE_AVX2SORT
@@ -334,20 +433,18 @@ void Run(Algo algo, T* HWY_RESTRICT inout, size_t num, SharedState& shared,
#if HAVE_IPS4O
case Algo::kIPS4O:
if (Order().IsAscending()) {
- return ips4o::sort(inout, inout + num, std::less<T>());
+ return ips4o::sort(inout, inout + num, less);
} else {
- return ips4o::sort(inout, inout + num, std::greater<T>());
+ return ips4o::sort(inout, inout + num, greater);
}
#endif
#if HAVE_PARALLEL_IPS4O
case Algo::kParallelIPS4O:
if (Order().IsAscending()) {
- return ips4o::parallel::sort(inout, inout + num, std::less<T>(),
- shared.pool);
+ return ips4o::parallel::sort(inout, inout + num, less, shared.pool);
} else {
- return ips4o::parallel::sort(inout, inout + num, std::greater<T>(),
- shared.pool);
+ return ips4o::parallel::sort(inout, inout + num, greater, shared.pool);
}
#endif
@@ -360,33 +457,47 @@ void Run(Algo algo, T* HWY_RESTRICT inout, size_t num, SharedState& shared,
#if HAVE_PDQSORT
case Algo::kPDQ:
if (Order().IsAscending()) {
- return boost::sort::pdqsort_branchless(inout, inout + num,
- std::less<T>());
+ return boost::sort::pdqsort_branchless(inout, inout + num, less);
} else {
- return boost::sort::pdqsort_branchless(inout, inout + num,
- std::greater<T>());
+ return boost::sort::pdqsort_branchless(inout, inout + num, greater);
}
#endif
+#if HAVE_VXSORT
+ case Algo::kVXSort: {
+#if (VXSORT_AVX3 && HWY_TARGET != HWY_AVX3) || \
+ (!VXSORT_AVX3 && HWY_TARGET != HWY_AVX2)
+ fprintf(stderr, "Do not call for target %s\n",
+ hwy::TargetName(HWY_TARGET));
+ return;
+#else
+#if VXSORT_AVX3
+ vxsort::vxsort<KeyType, vxsort::AVX512> vx;
+#else
+ vxsort::vxsort<KeyType, vxsort::AVX2> vx;
+#endif
+ if (Order().IsAscending()) {
+ return vx.sort(inout, inout + num - 1);
+ } else {
+ fprintf(stderr, "Skipping VX - does not support descending order\n");
+ return;
+ }
+#endif // enabled for this target
+ }
+#endif // HAVE_VXSORT
+
case Algo::kStd:
if (Order().IsAscending()) {
- return std::sort(inout, inout + num, std::less<T>());
+ return std::sort(inout, inout + num, less);
} else {
- return std::sort(inout, inout + num, std::greater<T>());
+ return std::sort(inout, inout + num, greater);
}
case Algo::kVQSort:
return shared.tls[thread].sorter(inout, num, Order());
case Algo::kHeap:
- HWY_ASSERT(sizeof(T) < 16);
- if (Order().IsAscending()) {
- const SharedTraits<TraitsLane<detail::OrderAscending>> st;
- return HeapSort(st, inout, num);
- } else {
- const SharedTraits<TraitsLane<detail::OrderDescending>> st;
- return HeapSort(st, inout, num);
- }
+ return CallHeapSort<Order>(inout, num);
default:
HWY_ABORT("Not implemented");
diff --git a/media/highway/src/hwy/contrib/sort/bench_parallel.cc b/media/highway/src/hwy/contrib/sort/bench_parallel.cc
index c0cb058dd2..1c8c928e21 100644
--- a/media/highway/src/hwy/contrib/sort/bench_parallel.cc
+++ b/media/highway/src/hwy/contrib/sort/bench_parallel.cc
@@ -28,10 +28,9 @@
#include <vector>
// clang-format off
-#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_parallel.cc" //NOLINT
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
@@ -46,8 +45,6 @@ namespace hwy {
namespace HWY_NAMESPACE {
namespace {
-#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_EMU128
-
class ThreadPool {
public:
// Starts the given number of worker threads and blocks until they are ready.
@@ -169,16 +166,21 @@ class ThreadPool {
const void* data_; // points to caller's Func
};
-template <class Order, typename T>
-void RunWithoutVerify(const Dist dist, const size_t num, const Algo algo,
- SharedState& shared, size_t thread) {
- auto aligned = hwy::AllocateAligned<T>(num);
+template <class Traits>
+void RunWithoutVerify(Traits st, const Dist dist, const size_t num_keys,
+ const Algo algo, SharedState& shared, size_t thread) {
+ using LaneType = typename Traits::LaneType;
+ using KeyType = typename Traits::KeyType;
+ using Order = typename Traits::Order;
+ const size_t num_lanes = num_keys * st.LanesPerKey();
+ auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
- (void)GenerateInput(dist, aligned.get(), num);
+ (void)GenerateInput(dist, aligned.get(), num_lanes);
const Timestamp t0;
- Run<Order>(algo, aligned.get(), num, shared, thread);
- HWY_ASSERT(aligned[0] < aligned[num - 1]);
+ Run<Order>(algo, reinterpret_cast<KeyType*>(aligned.get()), num_keys, shared,
+ thread);
+ HWY_ASSERT(aligned[0] < aligned[num_lanes - 1]);
}
void BenchParallel() {
@@ -190,17 +192,16 @@ void BenchParallel() {
ThreadPool pool;
const size_t NT = pool.NumThreads();
- using T = int64_t;
- detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
-
- size_t num = 100 * 1000 * 1000;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int64_t>>> st;
+ using KeyType = typename decltype(st)::KeyType;
+ const size_t num_keys = size_t{100} * 1000 * 1000;
#if HAVE_IPS4O
const Algo algo = Algo::kIPS4O;
#else
const Algo algo = Algo::kVQSort;
#endif
- const Dist dist = Dist::kUniform16;
+ const Dist dist = Dist::kUniform32;
SharedState shared;
shared.tls.resize(NT);
@@ -210,18 +211,15 @@ void BenchParallel() {
Timestamp t0;
// Default capture because MSVC wants algo/dist but clang does not.
pool.RunOnThreads(nt, [=, &shared](size_t thread) {
- RunWithoutVerify<SortAscending, T>(dist, num, algo, shared, thread);
+ RunWithoutVerify(st, dist, num_keys, algo, shared, thread);
});
const double sec = SecondsSince(t0);
- results.push_back(MakeResult<T>(algo, dist, st, num, nt, sec));
+ results.emplace_back(algo, dist, num_keys, nt, sec, sizeof(KeyType),
+ st.KeyString());
results.back().Print();
}
}
-#else
-void BenchParallel() {}
-#endif
-
} // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
diff --git a/media/highway/src/hwy/contrib/sort/bench_sort.cc b/media/highway/src/hwy/contrib/sort/bench_sort.cc
index 65fc3e18d9..a668fde907 100644
--- a/media/highway/src/hwy/contrib/sort/bench_sort.cc
+++ b/media/highway/src/hwy/contrib/sort/bench_sort.cc
@@ -13,31 +13,33 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include <stdint.h>
+#include <stdio.h>
+
+#include <vector>
+
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_sort.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/result-inl.h"
-#include "hwy/contrib/sort/vqsort.h"
#include "hwy/contrib/sort/sorting_networks-inl.h" // SharedTraits
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h> // memcpy
-
-#include <vector>
+// Mode for larger sorts because M1 is able to access more than the per-core
+// share of L2, so 1M elements might still be in cache.
+#define SORT_100M 0
HWY_BEFORE_NAMESPACE();
namespace hwy {
// Defined within HWY_ONCE, used by BenchAllSort.
-extern uint32_t first_sort_target;
+extern int64_t first_sort_target;
namespace HWY_NAMESPACE {
namespace {
@@ -46,41 +48,52 @@ using detail::OrderAscending;
using detail::OrderDescending;
using detail::SharedTraits;
-#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_EMU128
+#if VQSORT_ENABLED || HWY_IDE
using detail::OrderAscending128;
-using detail::OrderDescending128;
+using detail::OrderAscendingKV128;
using detail::Traits128;
-template <class Traits, typename T>
+template <class Traits>
HWY_NOINLINE void BenchPartition() {
- const SortTag<T> d;
+ using LaneType = typename Traits::LaneType;
+ using KeyType = typename Traits::KeyType;
+ const SortTag<LaneType> d;
detail::SharedTraits<Traits> st;
const Dist dist = Dist::kUniform8;
double sum = 0.0;
+ detail::Generator rng(&sum, 123); // for ChoosePivot
+
const size_t max_log2 = AdjustedLog2Reps(20);
for (size_t log2 = max_log2; log2 < max_log2 + 1; ++log2) {
- const size_t num = 1ull << log2;
- auto aligned = hwy::AllocateAligned<T>(num);
- auto buf =
- hwy::AllocateAligned<T>(hwy::SortConstants::PartitionBufNum(Lanes(d)));
+ const size_t num_lanes = 1ull << log2;
+ const size_t num_keys = num_lanes / st.LanesPerKey();
+ auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
+ auto buf = hwy::AllocateAligned<LaneType>(
+ HWY_MAX(hwy::SortConstants::PartitionBufNum(Lanes(d)),
+ hwy::SortConstants::PivotBufNum(sizeof(LaneType), Lanes(d))));
std::vector<double> seconds;
- const size_t num_reps = (1ull << (14 - log2 / 2)) * kReps;
+ const size_t num_reps = (1ull << (14 - log2 / 2)) * 30;
for (size_t rep = 0; rep < num_reps; ++rep) {
- (void)GenerateInput(dist, aligned.get(), num);
+ (void)GenerateInput(dist, aligned.get(), num_lanes);
- const Timestamp t0;
+ // The pivot value can influence performance. Do exactly what vqsort will
+ // do so that the performance (influenced by prefetching and branch
+ // prediction) is likely to predict the actual performance inside vqsort.
+ detail::DrawSamples(d, st, aligned.get(), num_lanes, buf.get(), rng);
+ detail::SortSamples(d, st, buf.get());
+ auto pivot = detail::ChoosePivotByRank(d, st, buf.get());
- detail::Partition(d, st, aligned.get(), 0, num - 1, Set(d, T(128)),
- buf.get());
+ const Timestamp t0;
+ detail::Partition(d, st, aligned.get(), num_lanes - 1, pivot, buf.get());
seconds.push_back(SecondsSince(t0));
// 'Use' the result to prevent optimizing out the partition.
- sum += static_cast<double>(aligned.get()[num / 2]);
+ sum += static_cast<double>(aligned.get()[num_lanes / 2]);
}
- MakeResult<T>(Algo::kVQSort, dist, st, num, 1,
- SummarizeMeasurements(seconds))
+ Result(Algo::kVQSort, dist, num_keys, 1, SummarizeMeasurements(seconds),
+ sizeof(KeyType), st.KeyString())
.Print();
}
HWY_ASSERT(sum != 999999); // Prevent optimizing out
@@ -88,51 +101,60 @@ HWY_NOINLINE void BenchPartition() {
HWY_NOINLINE void BenchAllPartition() {
// Not interested in benchmark results for these targets
- if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
+ if (HWY_TARGET == HWY_SSSE3) {
return;
}
- BenchPartition<TraitsLane<OrderDescending>, float>();
- BenchPartition<TraitsLane<OrderAscending>, int64_t>();
- BenchPartition<Traits128<OrderDescending128>, uint64_t>();
+ BenchPartition<TraitsLane<OrderDescending<float>>>();
+ BenchPartition<TraitsLane<OrderDescending<int32_t>>>();
+ BenchPartition<TraitsLane<OrderDescending<int64_t>>>();
+ BenchPartition<Traits128<OrderAscending128>>();
+ // BenchPartition<Traits128<OrderDescending128>>();
+ BenchPartition<Traits128<OrderAscendingKV128>>();
}
-template <class Traits, typename T>
+template <class Traits>
HWY_NOINLINE void BenchBase(std::vector<Result>& results) {
// Not interested in benchmark results for these targets
if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
return;
}
- const SortTag<T> d;
+ using LaneType = typename Traits::LaneType;
+ using KeyType = typename Traits::KeyType;
+ const SortTag<LaneType> d;
detail::SharedTraits<Traits> st;
const Dist dist = Dist::kUniform32;
const size_t N = Lanes(d);
- const size_t num = SortConstants::BaseCaseNum(N);
- auto keys = hwy::AllocateAligned<T>(num);
- auto buf = hwy::AllocateAligned<T>(num + N);
+ const size_t num_lanes = SortConstants::BaseCaseNum(N);
+ const size_t num_keys = num_lanes / st.LanesPerKey();
+ auto keys = hwy::AllocateAligned<LaneType>(num_lanes);
+ auto buf = hwy::AllocateAligned<LaneType>(num_lanes + N);
std::vector<double> seconds;
double sum = 0; // prevents elision
constexpr size_t kMul = AdjustedReps(600); // ensures long enough to measure
- for (size_t rep = 0; rep < kReps; ++rep) {
- InputStats<T> input_stats = GenerateInput(dist, keys.get(), num);
+ for (size_t rep = 0; rep < 30; ++rep) {
+ InputStats<LaneType> input_stats =
+ GenerateInput(dist, keys.get(), num_lanes);
const Timestamp t0;
for (size_t i = 0; i < kMul; ++i) {
- detail::BaseCase(d, st, keys.get(), num, buf.get());
+ detail::BaseCase(d, st, keys.get(), keys.get() + num_lanes, num_lanes,
+ buf.get());
sum += static_cast<double>(keys[0]);
}
seconds.push_back(SecondsSince(t0));
// printf("%f\n", seconds.back());
- HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num, "BenchBase"));
+ HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num_lanes, "BenchBase"));
}
HWY_ASSERT(sum < 1E99);
- results.push_back(MakeResult<T>(Algo::kVQSort, dist, st, num * kMul, 1,
- SummarizeMeasurements(seconds)));
+ results.emplace_back(Algo::kVQSort, dist, num_keys * kMul, 1,
+ SummarizeMeasurements(seconds), sizeof(KeyType),
+ st.KeyString());
}
HWY_NOINLINE void BenchAllBase() {
@@ -142,14 +164,19 @@ HWY_NOINLINE void BenchAllBase() {
}
std::vector<Result> results;
- BenchBase<TraitsLane<OrderAscending>, float>(results);
- BenchBase<TraitsLane<OrderDescending>, int64_t>(results);
- BenchBase<Traits128<OrderAscending128>, uint64_t>(results);
+ BenchBase<TraitsLane<OrderAscending<float>>>(results);
+ BenchBase<TraitsLane<OrderDescending<int64_t>>>(results);
+ BenchBase<Traits128<OrderAscending128>>(results);
for (const Result& r : results) {
r.Print();
}
}
+#else
+void BenchAllPartition() {}
+void BenchAllBase() {}
+#endif // VQSORT_ENABLED
+
std::vector<Algo> AlgoForBench() {
return {
#if HAVE_AVX2SORT
@@ -166,45 +193,64 @@ std::vector<Algo> AlgoForBench() {
#if HAVE_SORT512
Algo::kSort512,
#endif
+// Only include if we're compiling for the target it supports.
+#if HAVE_VXSORT && ((VXSORT_AVX3 && HWY_TARGET == HWY_AVX3) || \
+ (!VXSORT_AVX3 && HWY_TARGET == HWY_AVX2))
+ Algo::kVXSort,
+#endif
-// These are 10-20x slower, but that's OK for the default size when we are
-// not testing the parallel mode.
#if !HAVE_PARALLEL_IPS4O
+#if !SORT_100M
+ // These are 10-20x slower, but that's OK for the default size when we
+ // are not testing the parallel nor 100M modes.
Algo::kStd, Algo::kHeap,
+#endif
Algo::kVQSort, // only ~4x slower, but not required for Table 1a
#endif
-
};
}
-template <class Traits, typename T>
-HWY_NOINLINE void BenchSort(size_t num) {
+template <class Traits>
+HWY_NOINLINE void BenchSort(size_t num_keys) {
if (first_sort_target == 0) first_sort_target = HWY_TARGET;
SharedState shared;
detail::SharedTraits<Traits> st;
- auto aligned = hwy::AllocateAligned<T>(num);
+ using Order = typename Traits::Order;
+ using LaneType = typename Traits::LaneType;
+ using KeyType = typename Traits::KeyType;
+ const size_t num_lanes = num_keys * st.LanesPerKey();
+ auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
+
+ const size_t reps = num_keys > 1000 * 1000 ? 10 : 30;
+
for (Algo algo : AlgoForBench()) {
// Other algorithms don't depend on the vector instructions, so only run
// them for the first target.
- if (algo != Algo::kVQSort && HWY_TARGET != first_sort_target) continue;
+#if !HAVE_VXSORT
+ if (algo != Algo::kVQSort && HWY_TARGET != first_sort_target) {
+ continue;
+ }
+#endif
for (Dist dist : AllDist()) {
std::vector<double> seconds;
- for (size_t rep = 0; rep < kReps; ++rep) {
- InputStats<T> input_stats = GenerateInput(dist, aligned.get(), num);
+ for (size_t rep = 0; rep < reps; ++rep) {
+ InputStats<LaneType> input_stats =
+ GenerateInput(dist, aligned.get(), num_lanes);
const Timestamp t0;
- Run<typename Traits::Order>(algo, aligned.get(), num, shared,
- /*thread=*/0);
+ Run<Order>(algo, reinterpret_cast<KeyType*>(aligned.get()), num_keys,
+ shared, /*thread=*/0);
seconds.push_back(SecondsSince(t0));
// printf("%f\n", seconds.back());
HWY_ASSERT(
- VerifySort(st, input_stats, aligned.get(), num, "BenchSort"));
+ VerifySort(st, input_stats, aligned.get(), num_lanes, "BenchSort"));
}
- MakeResult<T>(algo, dist, st, num, 1, SummarizeMeasurements(seconds))
+ Result(algo, dist, num_keys, 1, SummarizeMeasurements(seconds),
+ sizeof(KeyType), st.KeyString())
.Print();
} // dist
} // algo
@@ -220,32 +266,29 @@ HWY_NOINLINE void BenchAllSort() {
constexpr size_t M = K * K;
(void)K;
(void)M;
- for (size_t num : {
-#if HAVE_PARALLEL_IPS4O
+ for (size_t num_keys : {
+#if HAVE_PARALLEL_IPS4O || SORT_100M
100 * M,
#else
- AdjustedReps(1 * M),
+ 1 * M,
#endif
}) {
- BenchSort<TraitsLane<OrderAscending>, float>(num);
- // BenchSort<TraitsLane<OrderDescending>, double>(num);
- // BenchSort<TraitsLane<OrderAscending>, int16_t>(num);
- BenchSort<TraitsLane<OrderDescending>, int32_t>(num);
- BenchSort<TraitsLane<OrderAscending>, int64_t>(num);
- // BenchSort<TraitsLane<OrderDescending>, uint16_t>(num);
- // BenchSort<TraitsLane<OrderDescending>, uint32_t>(num);
- // BenchSort<TraitsLane<OrderAscending>, uint64_t>(num);
-
- BenchSort<Traits128<OrderAscending128>, uint64_t>(num);
+ BenchSort<TraitsLane<OrderAscending<float>>>(num_keys);
+ // BenchSort<TraitsLane<OrderDescending<double>>>(num_keys);
+ // BenchSort<TraitsLane<OrderAscending<int16_t>>>(num_keys);
+ BenchSort<TraitsLane<OrderDescending<int32_t>>>(num_keys);
+ BenchSort<TraitsLane<OrderAscending<int64_t>>>(num_keys);
+ // BenchSort<TraitsLane<OrderDescending<uint16_t>>>(num_keys);
+ // BenchSort<TraitsLane<OrderDescending<uint32_t>>>(num_keys);
+ // BenchSort<TraitsLane<OrderAscending<uint64_t>>>(num_keys);
+
+#if !HAVE_VXSORT && VQSORT_ENABLED
+ BenchSort<Traits128<OrderAscending128>>(num_keys);
+ BenchSort<Traits128<OrderAscendingKV128>>(num_keys);
+#endif
}
}
-#else
-void BenchAllPartition() {}
-void BenchAllBase() {}
-void BenchAllSort() {}
-#endif
-
} // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
@@ -255,7 +298,7 @@ HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
-uint32_t first_sort_target = 0; // none run yet
+int64_t first_sort_target = 0; // none run yet
namespace {
HWY_BEFORE_TEST(BenchSort);
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllPartition);
diff --git a/media/highway/src/hwy/contrib/sort/disabled_targets.h b/media/highway/src/hwy/contrib/sort/disabled_targets.h
deleted file mode 100644
index 4c3f54b45b..0000000000
--- a/media/highway/src/hwy/contrib/sort/disabled_targets.h
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright 2022 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Speed up MSVC builds by building fewer targets. This header must be included
-// from all TUs that contain a HWY_DYNAMIC_DISPATCH to vqsort, i.e. vqsort_*.cc.
-// However, users of vqsort.h are unaffected.
-
-#ifndef HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_
-#define HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_
-
-#include "hwy/base.h"
-
-#if HWY_COMPILER_MSVC
-#undef HWY_DISABLED_TARGETS
-// Either HWY_SCALAR/HWY_EMU128 remains, so we still have a valid target.
-#define HWY_DISABLED_TARGETS (HWY_SSSE3 | HWY_SSE4)
-#endif // HWY_COMPILER_MSVC
-
-#endif // HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_
diff --git a/media/highway/src/hwy/contrib/sort/result-inl.h b/media/highway/src/hwy/contrib/sort/result-inl.h
index 402f639d5b..f3d842dfbd 100644
--- a/media/highway/src/hwy/contrib/sort/result-inl.h
+++ b/media/highway/src/hwy/contrib/sort/result-inl.h
@@ -34,20 +34,19 @@ struct Timestamp {
double t;
};
-double SecondsSince(const Timestamp& t0) {
+static inline double SecondsSince(const Timestamp& t0) {
const Timestamp t1;
return t1.t - t0.t;
}
-constexpr size_t kReps = 30;
-
// Returns trimmed mean (we don't want to run an out-of-L3-cache sort often
// enough for the mode to be reliable).
-double SummarizeMeasurements(std::vector<double>& seconds) {
+static inline double SummarizeMeasurements(std::vector<double>& seconds) {
std::sort(seconds.begin(), seconds.end());
double sum = 0;
int count = 0;
- for (size_t i = kReps / 4; i < seconds.size() - kReps / 2; ++i) {
+ const size_t num = seconds.size();
+ for (size_t i = num / 4; i < num / 2; ++i) {
sum += seconds[i];
count += 1;
}
@@ -72,72 +71,62 @@ namespace HWY_NAMESPACE {
struct Result {
Result() {}
- Result(const uint32_t target, const Algo algo, Dist dist, bool is128,
- size_t num, size_t num_threads, double sec, size_t sizeof_t,
- const char* type_name)
- : target(target),
+ Result(const Algo algo, Dist dist, size_t num_keys, size_t num_threads,
+ double sec, size_t sizeof_key, const std::string& key_name)
+ : target(HWY_TARGET),
algo(algo),
dist(dist),
- is128(is128),
- num(num),
+ num_keys(num_keys),
num_threads(num_threads),
sec(sec),
- sizeof_t(sizeof_t),
- type_name(type_name) {}
+ sizeof_key(sizeof_key),
+ key_name(key_name) {}
void Print() const {
- const double bytes = static_cast<double>(num) *
+ const double bytes = static_cast<double>(num_keys) *
static_cast<double>(num_threads) *
- static_cast<double>(sizeof_t);
+ static_cast<double>(sizeof_key);
printf("%10s: %12s: %7s: %9s: %.2E %4.0f MB/s (%2zu threads)\n",
- hwy::TargetName(target), AlgoName(algo),
- is128 ? "u128" : type_name.c_str(), DistName(dist),
- static_cast<double>(num), bytes * 1E-6 / sec, num_threads);
+ hwy::TargetName(target), AlgoName(algo), key_name.c_str(),
+ DistName(dist), static_cast<double>(num_keys), bytes * 1E-6 / sec,
+ num_threads);
}
- uint32_t target;
+ int64_t target;
Algo algo;
Dist dist;
- bool is128;
- size_t num = 0;
+ size_t num_keys = 0;
size_t num_threads = 0;
double sec = 0.0;
- size_t sizeof_t = 0;
- std::string type_name;
+ size_t sizeof_key = 0;
+ std::string key_name;
};
-template <typename T, class Traits>
-Result MakeResult(const Algo algo, Dist dist, Traits st, size_t num,
- size_t num_threads, double sec) {
- char string100[100];
- hwy::detail::TypeName(hwy::detail::MakeTypeInfo<T>(), 1, string100);
- return Result(HWY_TARGET, algo, dist, st.Is128(), num, num_threads, sec,
- sizeof(T), string100);
-}
-
-template <class Traits, typename T>
-bool VerifySort(Traits st, const InputStats<T>& input_stats, const T* out,
- size_t num, const char* caller) {
- constexpr size_t N1 = st.Is128() ? 2 : 1;
- HWY_ASSERT(num >= N1);
+template <class Traits, typename LaneType>
+bool VerifySort(Traits st, const InputStats<LaneType>& input_stats,
+ const LaneType* out, size_t num_lanes, const char* caller) {
+ constexpr size_t N1 = st.LanesPerKey();
+ HWY_ASSERT(num_lanes >= N1);
- InputStats<T> output_stats;
+ InputStats<LaneType> output_stats;
// Ensure it matches the sort order
- for (size_t i = 0; i < num - N1; i += N1) {
+ for (size_t i = 0; i < num_lanes - N1; i += N1) {
output_stats.Notify(out[i]);
if (N1 == 2) output_stats.Notify(out[i + 1]);
// Reverse order instead of checking !Compare1 so we accept equal keys.
if (st.Compare1(out + i + N1, out + i)) {
- printf("%s: i=%d of %d: N1=%d %5.0f %5.0f vs. %5.0f %5.0f\n\n", caller,
- static_cast<int>(i), static_cast<int>(num), static_cast<int>(N1),
- double(out[i + 1]), double(out[i + 0]), double(out[i + N1 + 1]),
- double(out[i + N1]));
+ printf("%s: i=%d of %d lanes: N1=%d %5.0f %5.0f vs. %5.0f %5.0f\n\n",
+ caller, static_cast<int>(i), static_cast<int>(num_lanes),
+ static_cast<int>(N1), static_cast<double>(out[i + 1]),
+ static_cast<double>(out[i + 0]),
+ static_cast<double>(out[i + N1 + 1]),
+ static_cast<double>(out[i + N1]));
HWY_ABORT("%d-bit sort is incorrect\n",
- static_cast<int>(sizeof(T) * 8 * N1));
+ static_cast<int>(sizeof(LaneType) * 8 * N1));
}
}
- output_stats.Notify(out[num - N1]);
- if (N1 == 2) output_stats.Notify(out[num - N1 + 1]);
+ output_stats.Notify(out[num_lanes - N1]);
+ if (N1 == 2) output_stats.Notify(out[num_lanes - N1 + 1]);
return input_stats == output_stats;
}
diff --git a/media/highway/src/hwy/contrib/sort/shared-inl.h b/media/highway/src/hwy/contrib/sort/shared-inl.h
index f98a3d5286..ea604ed914 100644
--- a/media/highway/src/hwy/contrib/sort/shared-inl.h
+++ b/media/highway/src/hwy/contrib/sort/shared-inl.h
@@ -28,8 +28,8 @@ namespace hwy {
struct SortConstants {
// SortingNetwork reshapes its input into a matrix. This is the maximum number
// of *keys* per vector.
-#if HWY_COMPILER_MSVC
- static constexpr size_t kMaxCols = 8; // avoids build timeout
+#if HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
+ static constexpr size_t kMaxCols = 8; // avoid build timeout/stack overflow
#else
static constexpr size_t kMaxCols = 16; // enough for u32 in 512-bit vector
#endif
@@ -102,6 +102,17 @@ struct SortConstants {
#include "hwy/highway.h"
+// vqsort isn't available on HWY_SCALAR, and builds time out on MSVC opt and
+// Arm v7 debug.
+#undef VQSORT_ENABLED
+#if (HWY_TARGET == HWY_SCALAR) || \
+ (HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD) || \
+ (HWY_ARCH_ARM_V7 && HWY_IS_DEBUG_BUILD)
+#define VQSORT_ENABLED 0
+#else
+#define VQSORT_ENABLED 1
+#endif
+
namespace hwy {
namespace HWY_NAMESPACE {
diff --git a/media/highway/src/hwy/contrib/sort/sort_test.cc b/media/highway/src/hwy/contrib/sort/sort_test.cc
index 2f44866a26..2d1f1d5169 100644
--- a/media/highway/src/hwy/contrib/sort/sort_test.cc
+++ b/media/highway/src/hwy/contrib/sort/sort_test.cc
@@ -13,98 +13,83 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS // before inttypes.h
+#endif
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h> // memcpy
+
+#include <unordered_map>
+#include <vector>
+
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/sort_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/contrib/sort/vqsort.h"
// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
+#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h" // BaseCase
#include "hwy/tests/test_util-inl.h"
// clang-format on
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h> // memcpy
-
-#include <algorithm> // std::max
-#include <vector>
-
-#undef VQSORT_TEST_IMPL
-#if (HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128) || \
- (defined(_MSC_VER) && !HWY_IS_DEBUG_BUILD)
-// Scalar does not implement these, and MSVC non-debug builds time out.
-#define VQSORT_TEST_IMPL 0
-#else
-#define VQSORT_TEST_IMPL 1
-#endif
-
-#undef VQSORT_TEST_SORT
-// MSVC non-debug builds time out.
-#if defined(_MSC_VER) && !HWY_IS_DEBUG_BUILD
-#define VQSORT_TEST_SORT 0
-#else
-#define VQSORT_TEST_SORT 1
-#endif
-
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace {
-#if VQSORT_TEST_IMPL || VQSORT_TEST_SORT
-using detail::TraitsLane;
using detail::OrderAscending;
-using detail::OrderAscending128;
using detail::OrderDescending;
-using detail::OrderDescending128;
using detail::SharedTraits;
+using detail::TraitsLane;
+#if VQSORT_ENABLED || HWY_IDE
+using detail::OrderAscending128;
+using detail::OrderAscendingKV128;
+using detail::OrderAscendingKV64;
+using detail::OrderDescending128;
+using detail::OrderDescendingKV128;
+using detail::OrderDescendingKV64;
using detail::Traits128;
-#endif
-
-#if !VQSORT_TEST_IMPL
-static void TestAllMedian() {}
-static void TestAllBaseCase() {}
-static void TestAllPartition() {}
-static void TestAllGenerator() {}
-#else
template <class Traits>
static HWY_NOINLINE void TestMedian3() {
- using T = uint64_t;
- using D = CappedTag<T, 1>;
+ using LaneType = typename Traits::LaneType;
+ using D = CappedTag<LaneType, 1>;
SharedTraits<Traits> st;
const D d;
using V = Vec<D>;
for (uint32_t bits = 0; bits < 8; ++bits) {
- const V v0 = Set(d, T{(bits & (1u << 0)) ? 1u : 0u});
- const V v1 = Set(d, T{(bits & (1u << 1)) ? 1u : 0u});
- const V v2 = Set(d, T{(bits & (1u << 2)) ? 1u : 0u});
- const T m = GetLane(detail::MedianOf3(st, v0, v1, v2));
+ const V v0 = Set(d, LaneType{(bits & (1u << 0)) ? 1u : 0u});
+ const V v1 = Set(d, LaneType{(bits & (1u << 1)) ? 1u : 0u});
+ const V v2 = Set(d, LaneType{(bits & (1u << 2)) ? 1u : 0u});
+ const LaneType m = GetLane(detail::MedianOf3(st, v0, v1, v2));
// If at least half(rounded up) of bits are 1, so is the median.
const size_t count = PopCount(bits);
- HWY_ASSERT_EQ((count >= 2) ? static_cast<T>(1) : 0, m);
+ HWY_ASSERT_EQ((count >= 2) ? static_cast<LaneType>(1) : 0, m);
}
}
HWY_NOINLINE void TestAllMedian() {
- TestMedian3<TraitsLane<OrderAscending> >();
+ TestMedian3<TraitsLane<OrderAscending<uint64_t> > >();
}
-template <class Traits, typename T>
+template <class Traits>
static HWY_NOINLINE void TestBaseCaseAscDesc() {
+ using LaneType = typename Traits::LaneType;
SharedTraits<Traits> st;
- const SortTag<T> d;
+ const SortTag<LaneType> d;
const size_t N = Lanes(d);
const size_t base_case_num = SortConstants::BaseCaseNum(N);
const size_t N1 = st.LanesPerKey();
constexpr int kDebug = 0;
- auto aligned_keys = hwy::AllocateAligned<T>(N + base_case_num + N);
- auto buf = hwy::AllocateAligned<T>(base_case_num + 2 * N);
+ auto aligned_lanes = hwy::AllocateAligned<LaneType>(N + base_case_num + N);
+ auto buf = hwy::AllocateAligned<LaneType>(base_case_num + 2 * N);
std::vector<size_t> lengths;
lengths.push_back(HWY_MAX(1, N1));
@@ -125,43 +110,45 @@ static HWY_NOINLINE void TestBaseCaseAscDesc() {
for (bool asc : {false, true}) {
for (size_t len : lengths) {
for (size_t misalign : misalignments) {
- T* HWY_RESTRICT keys = aligned_keys.get() + misalign;
+ LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign;
if (kDebug) {
printf("============%s asc %d N1 %d len %d misalign %d\n",
- hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(N1),
+ st.KeyString().c_str(), asc, static_cast<int>(N1),
static_cast<int>(len), static_cast<int>(misalign));
}
for (size_t i = 0; i < misalign; ++i) {
- aligned_keys[i] = hwy::LowestValue<T>();
+ aligned_lanes[i] = hwy::LowestValue<LaneType>();
}
- InputStats<T> input_stats;
+ InputStats<LaneType> input_stats;
for (size_t i = 0; i < len; ++i) {
- keys[i] =
- asc ? static_cast<T>(T(i) + 1) : static_cast<T>(T(len) - T(i));
- input_stats.Notify(keys[i]);
- if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
+ lanes[i] = asc ? static_cast<LaneType>(LaneType(i) + 1)
+ : static_cast<LaneType>(LaneType(len) - LaneType(i));
+ input_stats.Notify(lanes[i]);
+ if (kDebug >= 2) {
+ printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
+ }
}
for (size_t i = len; i < base_case_num + N; ++i) {
- keys[i] = hwy::LowestValue<T>();
+ lanes[i] = hwy::LowestValue<LaneType>();
}
- detail::BaseCase(d, st, keys, len, buf.get());
+ detail::BaseCase(d, st, lanes, lanes + len, len, buf.get());
if (kDebug >= 2) {
printf("out>>>>>>\n");
for (size_t i = 0; i < len; ++i) {
- printf("%3zu: %f\n", i, double(keys[i]));
+ printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
}
}
- HWY_ASSERT(VerifySort(st, input_stats, keys, len, "BaseAscDesc"));
+ HWY_ASSERT(VerifySort(st, input_stats, lanes, len, "BaseAscDesc"));
for (size_t i = 0; i < misalign; ++i) {
- if (aligned_keys[i] != hwy::LowestValue<T>())
+ if (aligned_lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
}
for (size_t i = len; i < base_case_num + N; ++i) {
- if (keys[i] != hwy::LowestValue<T>())
+ if (lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
}
} // misalign
@@ -169,17 +156,18 @@ static HWY_NOINLINE void TestBaseCaseAscDesc() {
} // asc
}
-template <class Traits, typename T>
+template <class Traits>
static HWY_NOINLINE void TestBaseCase01() {
+ using LaneType = typename Traits::LaneType;
SharedTraits<Traits> st;
- const SortTag<T> d;
+ const SortTag<LaneType> d;
const size_t N = Lanes(d);
const size_t base_case_num = SortConstants::BaseCaseNum(N);
const size_t N1 = st.LanesPerKey();
constexpr int kDebug = 0;
- auto keys = hwy::AllocateAligned<T>(base_case_num + N);
- auto buf = hwy::AllocateAligned<T>(base_case_num + 2 * N);
+ auto lanes = hwy::AllocateAligned<LaneType>(base_case_num + N);
+ auto buf = hwy::AllocateAligned<LaneType>(base_case_num + 2 * N);
std::vector<size_t> lengths;
lengths.push_back(HWY_MAX(1, N1));
@@ -191,65 +179,66 @@ static HWY_NOINLINE void TestBaseCase01() {
for (size_t len : lengths) {
if (kDebug) {
- printf("============%s 01 N1 %d len %d\n", hwy::TypeName(T(), 1).c_str(),
+ printf("============%s 01 N1 %d len %d\n", st.KeyString().c_str(),
static_cast<int>(N1), static_cast<int>(len));
}
const uint64_t kMaxBits = AdjustedLog2Reps(HWY_MIN(len, size_t{14}));
for (uint64_t bits = 0; bits < ((1ull << kMaxBits) - 1); ++bits) {
- InputStats<T> input_stats;
+ InputStats<LaneType> input_stats;
for (size_t i = 0; i < len; ++i) {
- keys[i] = (i < 64 && (bits & (1ull << i))) ? 1 : 0;
- input_stats.Notify(keys[i]);
- if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
+ lanes[i] = (i < 64 && (bits & (1ull << i))) ? 1 : 0;
+ input_stats.Notify(lanes[i]);
+ if (kDebug >= 2) {
+ printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
+ }
}
for (size_t i = len; i < base_case_num + N; ++i) {
- keys[i] = hwy::LowestValue<T>();
+ lanes[i] = hwy::LowestValue<LaneType>();
}
- detail::BaseCase(d, st, keys.get(), len, buf.get());
+ detail::BaseCase(d, st, lanes.get(), lanes.get() + len, len, buf.get());
if (kDebug >= 2) {
printf("out>>>>>>\n");
for (size_t i = 0; i < len; ++i) {
- printf("%3zu: %f\n", i, double(keys[i]));
+ printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
}
}
- HWY_ASSERT(VerifySort(st, input_stats, keys.get(), len, "Base01"));
+ HWY_ASSERT(VerifySort(st, input_stats, lanes.get(), len, "Base01"));
for (size_t i = len; i < base_case_num + N; ++i) {
- if (keys[i] != hwy::LowestValue<T>())
+ if (lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
}
} // bits
} // len
}
-template <class Traits, typename T>
+template <class Traits>
static HWY_NOINLINE void TestBaseCase() {
- TestBaseCaseAscDesc<Traits, T>();
- TestBaseCase01<Traits, T>();
+ TestBaseCaseAscDesc<Traits>();
+ TestBaseCase01<Traits>();
}
HWY_NOINLINE void TestAllBaseCase() {
// Workaround for stack overflow on MSVC debug.
-#if defined(_MSC_VER) && HWY_IS_DEBUG_BUILD && (HWY_TARGET == HWY_AVX3)
+#if defined(_MSC_VER)
return;
#endif
-
- TestBaseCase<TraitsLane<OrderAscending>, int32_t>();
- TestBaseCase<TraitsLane<OrderDescending>, int64_t>();
- TestBaseCase<Traits128<OrderAscending128>, uint64_t>();
- TestBaseCase<Traits128<OrderDescending128>, uint64_t>();
+ TestBaseCase<TraitsLane<OrderAscending<int32_t> > >();
+ TestBaseCase<TraitsLane<OrderDescending<int64_t> > >();
+ TestBaseCase<Traits128<OrderAscending128> >();
+ TestBaseCase<Traits128<OrderDescending128> >();
}
-template <class Traits, typename T>
-static HWY_NOINLINE void VerifyPartition(Traits st, T* HWY_RESTRICT keys,
- size_t left, size_t border,
- size_t right, const size_t N1,
- const T* pivot) {
+template <class Traits>
+static HWY_NOINLINE void VerifyPartition(
+ Traits st, typename Traits::LaneType* HWY_RESTRICT lanes, size_t left,
+ size_t border, size_t right, const size_t N1,
+ const typename Traits::LaneType* pivot) {
/* for (size_t i = left; i < right; ++i) {
if (i == border) printf("--\n");
- printf("%4zu: %3d\n", i, keys[i]);
+ printf("%4zu: %3d\n", i, lanes[i]);
}*/
HWY_ASSERT(left % N1 == 0);
@@ -257,30 +246,33 @@ static HWY_NOINLINE void VerifyPartition(Traits st, T* HWY_RESTRICT keys,
HWY_ASSERT(right % N1 == 0);
const bool asc = typename Traits::Order().IsAscending();
for (size_t i = left; i < border; i += N1) {
- if (st.Compare1(pivot, keys + i)) {
+ if (st.Compare1(pivot, lanes + i)) {
HWY_ABORT(
"%s: asc %d left[%d] piv %.0f %.0f compares before %.0f %.0f "
"border %d",
- hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(i),
- double(pivot[1]), double(pivot[0]), double(keys[i + 1]),
- double(keys[i + 0]), static_cast<int>(border));
+ st.KeyString().c_str(), asc, static_cast<int>(i),
+ static_cast<double>(pivot[1]), static_cast<double>(pivot[0]),
+ static_cast<double>(lanes[i + 1]), static_cast<double>(lanes[i + 0]),
+ static_cast<int>(border));
}
}
for (size_t i = border; i < right; i += N1) {
- if (!st.Compare1(pivot, keys + i)) {
+ if (!st.Compare1(pivot, lanes + i)) {
HWY_ABORT(
"%s: asc %d right[%d] piv %.0f %.0f compares after %.0f %.0f "
"border %d",
- hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(i),
- double(pivot[1]), double(pivot[0]), double(keys[i + 1]),
- double(keys[i]), static_cast<int>(border));
+ st.KeyString().c_str(), asc, static_cast<int>(i),
+ static_cast<double>(pivot[1]), static_cast<double>(pivot[0]),
+ static_cast<double>(lanes[i + 1]), static_cast<double>(lanes[i]),
+ static_cast<int>(border));
}
}
}
-template <class Traits, typename T>
+template <class Traits>
static HWY_NOINLINE void TestPartition() {
- const SortTag<T> d;
+ using LaneType = typename Traits::LaneType;
+ const SortTag<LaneType> d;
SharedTraits<Traits> st;
const bool asc = typename Traits::Order().IsAscending();
const size_t N = Lanes(d);
@@ -288,71 +280,87 @@ static HWY_NOINLINE void TestPartition() {
const size_t base_case_num = SortConstants::BaseCaseNum(N);
// left + len + align
const size_t total = 32 + (base_case_num + 4 * HWY_MAX(N, 4)) + 2 * N;
- auto aligned_keys = hwy::AllocateAligned<T>(total);
- auto buf = hwy::AllocateAligned<T>(SortConstants::PartitionBufNum(N));
+ auto aligned_lanes = hwy::AllocateAligned<LaneType>(total);
+ auto buf = hwy::AllocateAligned<LaneType>(SortConstants::PartitionBufNum(N));
const size_t N1 = st.LanesPerKey();
for (bool in_asc : {false, true}) {
- for (int left_i : {0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 22, 28, 29, 30, 31}) {
+ for (int left_i : {0, 1, 4, 6, 7, 8, 12, 15, 22, 28, 30, 31}) {
const size_t left = static_cast<size_t>(left_i) & ~(N1 - 1);
- for (size_t ofs : {N, N + 1, N + 2, N + 3, 2 * N, 2 * N + 1, 2 * N + 2,
- 2 * N + 3, 3 * N - 1, 4 * N - 3, 4 * N - 2}) {
+ for (size_t ofs : {N, N + 1, N + 3, 2 * N, 2 * N + 2, 2 * N + 3,
+ 3 * N - 1, 4 * N - 3, 4 * N - 2}) {
const size_t len = (base_case_num + ofs) & ~(N1 - 1);
- for (T pivot1 :
- {T(0), T(len / 3), T(len / 2), T(2 * len / 3), T(len)}) {
- const T pivot2[2] = {pivot1, 0};
+ for (LaneType pivot1 :
+ {LaneType(0), LaneType(len / 3), LaneType(len / 2),
+ LaneType(2 * len / 3), LaneType(len)}) {
+ const LaneType pivot2[2] = {pivot1, 0};
const auto pivot = st.SetKey(d, pivot2);
for (size_t misalign = 0; misalign < N;
misalign += st.LanesPerKey()) {
- T* HWY_RESTRICT keys = aligned_keys.get() + misalign;
+ LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign;
const size_t right = left + len;
if (kDebug) {
printf(
"=========%s asc %d left %d len %d right %d piv %.0f %.0f\n",
- hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(left),
+ st.KeyString().c_str(), asc, static_cast<int>(left),
static_cast<int>(len), static_cast<int>(right),
- double(pivot2[1]), double(pivot2[0]));
+ static_cast<double>(pivot2[1]),
+ static_cast<double>(pivot2[0]));
}
for (size_t i = 0; i < misalign; ++i) {
- aligned_keys[i] = hwy::LowestValue<T>();
+ aligned_lanes[i] = hwy::LowestValue<LaneType>();
}
for (size_t i = 0; i < left; ++i) {
- keys[i] = hwy::LowestValue<T>();
+ lanes[i] = hwy::LowestValue<LaneType>();
}
+ std::unordered_map<LaneType, int> counts;
for (size_t i = left; i < right; ++i) {
- keys[i] = static_cast<T>(in_asc ? T(i + 1) - static_cast<T>(left)
- : static_cast<T>(right) - T(i));
- if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
+ lanes[i] = static_cast<LaneType>(
+ in_asc ? LaneType(i + 1) - static_cast<LaneType>(left)
+ : static_cast<LaneType>(right) - LaneType(i));
+ ++counts[lanes[i]];
+ if (kDebug >= 2) {
+ printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
+ }
}
for (size_t i = right; i < total - misalign; ++i) {
- keys[i] = hwy::LowestValue<T>();
+ lanes[i] = hwy::LowestValue<LaneType>();
}
size_t border =
- detail::Partition(d, st, keys, left, right, pivot, buf.get());
+ left + detail::Partition(d, st, lanes + left, right - left,
+ pivot, buf.get());
if (kDebug >= 2) {
printf("out>>>>>>\n");
for (size_t i = left; i < right; ++i) {
- printf("%3zu: %f\n", i, double(keys[i]));
+ printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
}
for (size_t i = right; i < total - misalign; ++i) {
- printf("%3zu: sentinel %f\n", i, double(keys[i]));
+ printf("%3zu: sentinel %f\n", i, static_cast<double>(lanes[i]));
}
}
-
- VerifyPartition(st, keys, left, border, right, N1, pivot2);
+ for (size_t i = left; i < right; ++i) {
+ --counts[lanes[i]];
+ }
+ for (auto kv : counts) {
+ if (kv.second != 0) {
+ PrintValue(kv.first);
+ HWY_ABORT("Incorrect count %d\n", kv.second);
+ }
+ }
+ VerifyPartition(st, lanes, left, border, right, N1, pivot2);
for (size_t i = 0; i < misalign; ++i) {
- if (aligned_keys[i] != hwy::LowestValue<T>())
+ if (aligned_lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
}
for (size_t i = 0; i < left; ++i) {
- if (keys[i] != hwy::LowestValue<T>())
+ if (lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
}
for (size_t i = right; i < total - misalign; ++i) {
- if (keys[i] != hwy::LowestValue<T>())
+ if (lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
}
} // misalign
@@ -363,15 +371,18 @@ static HWY_NOINLINE void TestPartition() {
}
HWY_NOINLINE void TestAllPartition() {
- TestPartition<TraitsLane<OrderAscending>, int16_t>();
- TestPartition<TraitsLane<OrderDescending>, int32_t>();
- TestPartition<TraitsLane<OrderAscending>, int64_t>();
- TestPartition<TraitsLane<OrderDescending>, float>();
+ TestPartition<TraitsLane<OrderDescending<int32_t> > >();
+ TestPartition<Traits128<OrderAscending128> >();
+
+#if !HWY_IS_DEBUG_BUILD
+ TestPartition<TraitsLane<OrderAscending<int16_t> > >();
+ TestPartition<TraitsLane<OrderAscending<int64_t> > >();
+ TestPartition<TraitsLane<OrderDescending<float> > >();
#if HWY_HAVE_FLOAT64
- TestPartition<TraitsLane<OrderDescending>, double>();
+ TestPartition<TraitsLane<OrderDescending<double> > >();
+#endif
+ TestPartition<Traits128<OrderDescending128> >();
#endif
- TestPartition<Traits128<OrderAscending128>, uint64_t>();
- TestPartition<Traits128<OrderDescending128>, uint64_t>();
}
// (used for sample selection for choosing a pivot)
@@ -401,7 +412,7 @@ static HWY_NOINLINE void TestRandomGenerator() {
// Also ensure the mean is near the middle of the range
const double expected = (num_blocks - 1) / 2.0;
- const double actual = double(sum) / kReps;
+ const double actual = static_cast<double>(sum) / kReps;
HWY_ASSERT(0.9 * expected <= actual && actual <= 1.1 * expected);
}
}
@@ -411,22 +422,26 @@ HWY_NOINLINE void TestAllGenerator() {
TestRandomGenerator<uint64_t>();
}
-#endif // VQSORT_TEST_IMPL
-
-#if !VQSORT_TEST_SORT
-static void TestAllSort() {}
#else
+static void TestAllMedian() {}
+static void TestAllBaseCase() {}
+static void TestAllPartition() {}
+static void TestAllGenerator() {}
+#endif // VQSORT_ENABLED
// Remembers input, and compares results to that of a reference algorithm.
-template <class Traits, typename T>
+template <class Traits>
class CompareResults {
+ using LaneType = typename Traits::LaneType;
+ using KeyType = typename Traits::KeyType;
+
public:
- void SetInput(const T* in, size_t num) {
- copy_.resize(num);
- memcpy(copy_.data(), in, num * sizeof(T));
+ CompareResults(const LaneType* in, size_t num_lanes) {
+ copy_.resize(num_lanes);
+ memcpy(copy_.data(), in, num_lanes * sizeof(LaneType));
}
- bool Verify(const T* output) {
+ bool Verify(const LaneType* output) {
#if HAVE_PDQSORT
const Algo reference = Algo::kPDQ;
#else
@@ -434,13 +449,34 @@ class CompareResults {
#endif
SharedState shared;
using Order = typename Traits::Order;
- Run<Order>(reference, copy_.data(), copy_.size(), shared,
- /*thread=*/0);
-
+ const Traits st;
+ const size_t num_keys = copy_.size() / st.LanesPerKey();
+ Run<Order>(reference, reinterpret_cast<KeyType*>(copy_.data()), num_keys,
+ shared, /*thread=*/0);
+#if VQSORT_PRINT >= 3
+ fprintf(stderr, "\nExpected:\n");
+ for (size_t i = 0; i < copy_.size(); ++i) {
+ PrintValue(copy_[i]);
+ }
+ fprintf(stderr, "\n");
+#endif
for (size_t i = 0; i < copy_.size(); ++i) {
if (copy_[i] != output[i]) {
- fprintf(stderr, "Asc %d mismatch at %d: %A %A\n", Order().IsAscending(),
- static_cast<int>(i), double(copy_[i]), double(output[i]));
+ if (sizeof(KeyType) == 16) {
+ fprintf(stderr,
+ "%s Asc %d mismatch at %d of %d: %" PRIu64 " %" PRIu64 "\n",
+ st.KeyString().c_str(), Order().IsAscending(),
+ static_cast<int>(i), static_cast<int>(copy_.size()),
+ static_cast<uint64_t>(copy_[i]),
+ static_cast<uint64_t>(output[i]));
+ } else {
+ fprintf(stderr, "Type %s Asc %d mismatch at %d of %d: ",
+ st.KeyString().c_str(), Order().IsAscending(),
+ static_cast<int>(i), static_cast<int>(copy_.size()));
+ PrintValue(copy_[i]);
+ PrintValue(output[i]);
+ fprintf(stderr, "\n");
+ }
return false;
}
}
@@ -448,7 +484,7 @@ class CompareResults {
}
private:
- std::vector<T> copy_;
+ std::vector<LaneType> copy_;
};
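CompareResults above remembers the unsorted lanes, runs a reference algorithm (pdqsort if available, otherwise std::sort) over the copy, and compares lane by lane, reporting the first mismatch. A minimal standalone sketch of the same verification idea, using std::sort as the reference and plain int keys; the class name and types here are illustrative and not part of the patch:

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

// Remembers the unsorted input and later checks a result against std::sort.
class ReferenceChecker {
 public:
  ReferenceChecker(const int* in, std::size_t num) : copy_(in, in + num) {}

  bool Verify(const int* output) {
    std::sort(copy_.begin(), copy_.end());  // reference result
    for (std::size_t i = 0; i < copy_.size(); ++i) {
      if (copy_[i] != output[i]) {
        std::fprintf(stderr, "mismatch at %zu: %d %d\n", i, copy_[i],
                     output[i]);
        return false;
      }
    }
    return true;
  }

 private:
  std::vector<int> copy_;
};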
std::vector<Algo> AlgoForTest() {
@@ -469,65 +505,62 @@ std::vector<Algo> AlgoForTest() {
};
}
-template <class Traits, typename T>
-void TestSort(size_t num) {
- // TODO(janwas): fix
- if (HWY_TARGET == HWY_SSSE3) return;
+template <class Traits>
+void TestSort(size_t num_lanes) {
// Workaround for stack overflow on clang-cl (/F 8388608 does not help).
-#if defined(_MSC_VER) && HWY_IS_DEBUG_BUILD && (HWY_TARGET == HWY_AVX3)
+#if defined(_MSC_VER)
return;
#endif
-
+ using Order = typename Traits::Order;
+ using LaneType = typename Traits::LaneType;
+ using KeyType = typename Traits::KeyType;
SharedState shared;
SharedTraits<Traits> st;
// Round up to a whole number of keys.
- num += (st.Is128() && (num & 1));
+ num_lanes += (st.Is128() && (num_lanes & 1));
+ const size_t num_keys = num_lanes / st.LanesPerKey();
constexpr size_t kMaxMisalign = 16;
- auto aligned = hwy::AllocateAligned<T>(kMaxMisalign + num + kMaxMisalign);
+ auto aligned =
+ hwy::AllocateAligned<LaneType>(kMaxMisalign + num_lanes + kMaxMisalign);
for (Algo algo : AlgoForTest()) {
-#if HAVE_IPS4O
- if (st.Is128() && (algo == Algo::kIPS4O || algo == Algo::kParallelIPS4O)) {
- continue;
- }
-#endif
for (Dist dist : AllDist()) {
for (size_t misalign : {size_t{0}, size_t{st.LanesPerKey()},
size_t{3 * st.LanesPerKey()}, kMaxMisalign / 2}) {
- T* keys = aligned.get() + misalign;
+ LaneType* lanes = aligned.get() + misalign;
// Set up red zones before/after the keys to sort
for (size_t i = 0; i < misalign; ++i) {
- aligned[i] = hwy::LowestValue<T>();
+ aligned[i] = hwy::LowestValue<LaneType>();
}
for (size_t i = 0; i < kMaxMisalign; ++i) {
- keys[num + i] = hwy::HighestValue<T>();
+ lanes[num_lanes + i] = hwy::HighestValue<LaneType>();
}
#if HWY_IS_MSAN
- __msan_poison(aligned.get(), misalign * sizeof(T));
- __msan_poison(keys + num, kMaxMisalign * sizeof(T));
+ __msan_poison(aligned.get(), misalign * sizeof(LaneType));
+ __msan_poison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType));
#endif
- InputStats<T> input_stats = GenerateInput(dist, keys, num);
+ InputStats<LaneType> input_stats =
+ GenerateInput(dist, lanes, num_lanes);
- CompareResults<Traits, T> compare;
- compare.SetInput(keys, num);
-
- Run<typename Traits::Order>(algo, keys, num, shared, /*thread=*/0);
- HWY_ASSERT(compare.Verify(keys));
- HWY_ASSERT(VerifySort(st, input_stats, keys, num, "TestSort"));
+ CompareResults<Traits> compare(lanes, num_lanes);
+ Run<Order>(algo, reinterpret_cast<KeyType*>(lanes), num_keys, shared,
+ /*thread=*/0);
+ HWY_ASSERT(compare.Verify(lanes));
+ HWY_ASSERT(VerifySort(st, input_stats, lanes, num_lanes, "TestSort"));
// Check red zones
#if HWY_IS_MSAN
- __msan_unpoison(aligned.get(), misalign * sizeof(T));
- __msan_unpoison(keys + num, kMaxMisalign * sizeof(T));
+ __msan_unpoison(aligned.get(), misalign * sizeof(LaneType));
+ __msan_unpoison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType));
#endif
for (size_t i = 0; i < misalign; ++i) {
- if (aligned[i] != hwy::LowestValue<T>())
+ if (aligned[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
}
- for (size_t i = num; i < num + kMaxMisalign; ++i) {
- if (keys[i] != hwy::HighestValue<T>())
+ for (size_t i = num_lanes; i < num_lanes + kMaxMisalign; ++i) {
+ if (lanes[i] != hwy::HighestValue<LaneType>())
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
}
} // misalign
@@ -536,32 +569,40 @@ void TestSort(size_t num) {
}
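The red zones set up in TestSort above surround the keys with LowestValue/HighestValue sentinels (poisoned under MSAN) and are re-checked after sorting to catch out-of-bounds writes. A standalone sketch of the same technique, assuming only the standard library; the helper name is hypothetical:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <limits>
#include <vector>

// Sorts the keys inside a padded buffer and verifies that the sentinel
// "red zones" on both sides were left untouched, i.e. the sort under test
// did not write out of bounds.
void SortWithRedZones(std::vector<int>& keys) {
  const std::size_t kPad = 16;
  const int kLo = std::numeric_limits<int>::min();  // left sentinel
  const int kHi = std::numeric_limits<int>::max();  // right sentinel
  std::vector<int> buf(kPad, kLo);
  buf.insert(buf.end(), keys.begin(), keys.end());
  buf.insert(buf.end(), kPad, kHi);

  std::sort(buf.begin() + kPad, buf.end() - kPad);  // the sort under test

  for (std::size_t i = 0; i < kPad; ++i) {
    assert(buf[i] == kLo);                   // left red zone intact
    assert(buf[buf.size() - 1 - i] == kHi);  // right red zone intact
  }
  std::copy(buf.begin() + kPad, buf.end() - kPad, keys.begin());
}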
void TestAllSort() {
- const size_t num = AdjustedReps(20 * 1000);
-
- TestSort<TraitsLane<OrderAscending>, int16_t>(num);
- TestSort<TraitsLane<OrderDescending>, uint16_t>(num);
+ for (int num : {129, 504, 3 * 1000, 34567}) {
+ const size_t num_lanes = AdjustedReps(static_cast<size_t>(num));
+ TestSort<TraitsLane<OrderAscending<int16_t> > >(num_lanes);
+ TestSort<TraitsLane<OrderDescending<uint16_t> > >(num_lanes);
- TestSort<TraitsLane<OrderDescending>, int32_t>(num);
- TestSort<TraitsLane<OrderDescending>, uint32_t>(num);
+ TestSort<TraitsLane<OrderDescending<int32_t> > >(num_lanes);
+ TestSort<TraitsLane<OrderDescending<uint32_t> > >(num_lanes);
- TestSort<TraitsLane<OrderAscending>, int64_t>(num);
- TestSort<TraitsLane<OrderAscending>, uint64_t>(num);
+ TestSort<TraitsLane<OrderAscending<int64_t> > >(num_lanes);
+ TestSort<TraitsLane<OrderAscending<uint64_t> > >(num_lanes);
- // WARNING: for float types, SIMD comparisons will flush denormals to zero,
- // causing mismatches with scalar sorts. In this test, we avoid generating
- // denormal inputs.
- TestSort<TraitsLane<OrderAscending>, float>(num);
+ // WARNING: for float types, SIMD comparisons will flush denormals to
+ // zero, causing mismatches with scalar sorts. In this test, we avoid
+ // generating denormal inputs.
+ TestSort<TraitsLane<OrderAscending<float> > >(num_lanes);
#if HWY_HAVE_FLOAT64 // protects algo-inl's GenerateRandom
- if (Sorter::HaveFloat64()) {
- TestSort<TraitsLane<OrderDescending>, double>(num);
- }
+ if (Sorter::HaveFloat64()) {
+ TestSort<TraitsLane<OrderDescending<double> > >(num_lanes);
+ }
#endif
- TestSort<Traits128<OrderAscending128>, uint64_t>(num);
- TestSort<Traits128<OrderAscending128>, uint64_t>(num);
-}
+// Our HeapSort does not support 128-bit keys.
+#if VQSORT_ENABLED
+ TestSort<Traits128<OrderAscending128> >(num_lanes);
+ TestSort<Traits128<OrderDescending128> >(num_lanes);
-#endif // VQSORT_TEST_SORT
+ TestSort<TraitsLane<OrderAscendingKV64> >(num_lanes);
+ TestSort<TraitsLane<OrderDescendingKV64> >(num_lanes);
+
+ TestSort<Traits128<OrderAscendingKV128> >(num_lanes);
+ TestSort<Traits128<OrderDescendingKV128> >(num_lanes);
+#endif
+ }
+}
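Regarding the denormal warning inside TestAllSort: one way to honor it when producing float test data is to reject subnormal values during generation. This is only a sketch of that idea (the actual GenerateInput in algo-inl.h uses its own scheme); the function name is hypothetical:

#include <cmath>
#include <cstddef>
#include <random>
#include <vector>

// Draws uniform random floats but skips subnormals, which SIMD min/max and
// comparisons may flush to zero, causing mismatches with a scalar reference.
std::vector<float> RandomNonDenormalFloats(std::size_t num, std::mt19937& rng) {
  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
  std::vector<float> out;
  out.reserve(num);
  while (out.size() < num) {
    const float f = dist(rng);
    if (f == 0.0f || std::fpclassify(f) != FP_SUBNORMAL) out.push_back(f);
  }
  return out;
}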
} // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
diff --git a/media/highway/src/hwy/contrib/sort/sorting_networks-inl.h b/media/highway/src/hwy/contrib/sort/sorting_networks-inl.h
index ce72fe6b58..3cc545b7ab 100644
--- a/media/highway/src/hwy/contrib/sort/sorting_networks-inl.h
+++ b/media/highway/src/hwy/contrib/sort/sorting_networks-inl.h
@@ -22,7 +22,6 @@
#define HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
#endif
-#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/shared-inl.h" // SortConstants
#include "hwy/highway.h"
@@ -31,6 +30,8 @@ namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {
+#if VQSORT_ENABLED
+
using Constants = hwy::SortConstants;
// ------------------------------ SharedTraits
@@ -595,12 +596,14 @@ HWY_INLINE void Merge16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
// `buf` ensures full vectors are aligned, and enables loads/stores without
// bounds checks.
//
+// NOINLINE because this is large and called twice from vqsort-inl.h.
+//
// References:
// https://drops.dagstuhl.de/opus/volltexte/2021/13775/pdf/LIPIcs-SEA-2021-3.pdf
// https://github.com/simd-sorting/fast-and-robust/blob/master/avx2_sort_demo/avx2sort.h
// "Entwurf und Implementierung vektorisierter Sortieralgorithmen" (M. Blacher)
template <class Traits, typename T>
-HWY_INLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
+HWY_NOINLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
const CappedTag<T, Constants::kMaxCols> d;
using V = decltype(Zero(d));
@@ -647,8 +650,8 @@ HWY_INLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
Merge8(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
ve, vf);
- // Avoids build timeout
-#if !HWY_COMPILER_MSVC
+ // Avoids build timeout. Must match #if condition in kMaxCols.
+#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD
if (HWY_LIKELY(keys >= 16 && kMaxKeys >= 16)) {
Merge16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
ve, vf);
@@ -678,6 +681,11 @@ HWY_INLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
StoreU(vf, d, buf + 0xf * cols);
}
+#else
+template <class Base>
+struct SharedTraits : public Base {};
+#endif // VQSORT_ENABLED
+
} // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
diff --git a/media/highway/src/hwy/contrib/sort/traits-inl.h b/media/highway/src/hwy/contrib/sort/traits-inl.h
index e86a2a1eee..8b87c82629 100644
--- a/media/highway/src/hwy/contrib/sort/traits-inl.h
+++ b/media/highway/src/hwy/contrib/sort/traits-inl.h
@@ -22,37 +22,70 @@
#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
#endif
-#include "hwy/contrib/sort/disabled_targets.h"
+#include <string>
+
#include "hwy/contrib/sort/shared-inl.h" // SortConstants
#include "hwy/contrib/sort/vqsort.h" // SortDescending
#include "hwy/highway.h"
+#include "hwy/print.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {
+#if VQSORT_ENABLED || HWY_IDE
+
// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
// along with an abstraction layer for single-lane vs. lane-pair, which is
// independent of the order.
+template <typename T>
struct KeyLane {
+ static constexpr bool Is128() { return false; }
constexpr size_t LanesPerKey() const { return 1; }
+ // What type bench_sort should allocate for generating inputs.
+ using LaneType = T;
+ // What type to pass to Sorter::operator().
+ using KeyType = T;
+
+ std::string KeyString() const {
+ char string100[100];
+ hwy::detail::TypeName(hwy::detail::MakeTypeInfo<KeyType>(), 1, string100);
+ return string100;
+ }
+
// For HeapSort
- template <typename T>
HWY_INLINE void Swap(T* a, T* b) const {
const T temp = *a;
*a = *b;
*b = temp;
}
+ template <class V, class M>
+ HWY_INLINE V CompressKeys(V keys, M mask) const {
+ return CompressNot(keys, mask);
+ }
+
// Broadcasts one key into a vector
template <class D>
- HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
+ HWY_INLINE Vec<D> SetKey(D d, const T* key) const {
return Set(d, *key);
}
template <class D>
+ HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
+ return Eq(a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Mask<D> NotEqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
+ return Ne(a, b);
+ }
+
+ HWY_INLINE bool Equal1(const T* a, const T* b) { return *a == *b; }
+
+ template <class D>
HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
return Reverse(d, v);
}
@@ -102,7 +135,7 @@ struct KeyLane {
#if HWY_HAVE_FLOAT64 // in case D is float32
const RepartitionToWide<D> dw;
#else
- const RepartitionToWide<RebindToUnsigned<D>> dw;
+ const RepartitionToWide<RebindToUnsigned<D> > dw;
#endif
return BitCast(d, SwapAdjacentPairs(dw, BitCast(dw, v)));
}
@@ -118,7 +151,7 @@ struct KeyLane {
#if HWY_HAVE_FLOAT64 // in case D is float32
const RepartitionToWide<D> dw;
#else
- const RepartitionToWide<RebindToUnsigned<D>> dw;
+ const RepartitionToWide<RebindToUnsigned<D> > dw;
#endif
return BitCast(d, OddEven(BitCast(dw, odd), BitCast(dw, even)));
}
@@ -132,7 +165,7 @@ struct KeyLane {
#if HWY_HAVE_FLOAT64 // in case D is float32
const RepartitionToWide<D> dw;
#else
- const RepartitionToWide<RebindToUnsigned<D>> dw;
+ const RepartitionToWide<RebindToUnsigned<D> > dw;
#endif
return BitCast(d, OddEvenPairs(dw, BitCast(dw, odd), BitCast(dw, even)));
}
@@ -149,13 +182,11 @@ struct KeyLane {
// We avoid overloaded functions because we want all functions to be callable
// from a SortTraits without per-function wrappers. Specializing would work, but
// we are anyway going to specialize at a higher level.
-struct OrderAscending : public KeyLane {
+template <typename T>
+struct OrderAscending : public KeyLane<T> {
using Order = SortAscending;
- template <typename T>
- HWY_INLINE bool Compare1(const T* a, const T* b) {
- return *a < *b;
- }
+ HWY_INLINE bool Compare1(const T* a, const T* b) { return *a < *b; }
template <class D>
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
@@ -175,34 +206,37 @@ struct OrderAscending : public KeyLane {
template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
- TFromD<D>* HWY_RESTRICT /* buf */) const {
+ T* HWY_RESTRICT /* buf */) const {
return MinOfLanes(d, v);
}
template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
- TFromD<D>* HWY_RESTRICT /* buf */) const {
+ T* HWY_RESTRICT /* buf */) const {
return MaxOfLanes(d, v);
}
template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {
- return Set(d, hwy::LowestValue<TFromD<D>>());
+ return Set(d, hwy::LowestValue<T>());
}
template <class D>
HWY_INLINE Vec<D> LastValue(D d) const {
- return Set(d, hwy::HighestValue<TFromD<D>>());
+ return Set(d, hwy::HighestValue<T>());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+ return Sub(v, Set(d, hwy::Epsilon<T>()));
}
};
-struct OrderDescending : public KeyLane {
+template <typename T>
+struct OrderDescending : public KeyLane<T> {
using Order = SortDescending;
- template <typename T>
- HWY_INLINE bool Compare1(const T* a, const T* b) {
- return *b < *a;
- }
+ HWY_INLINE bool Compare1(const T* a, const T* b) { return *b < *a; }
template <class D>
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
@@ -221,32 +255,140 @@ struct OrderDescending : public KeyLane {
template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
- TFromD<D>* HWY_RESTRICT /* buf */) const {
+ T* HWY_RESTRICT /* buf */) const {
return MaxOfLanes(d, v);
}
template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
- TFromD<D>* HWY_RESTRICT /* buf */) const {
+ T* HWY_RESTRICT /* buf */) const {
return MinOfLanes(d, v);
}
template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {
- return Set(d, hwy::HighestValue<TFromD<D>>());
+ return Set(d, hwy::HighestValue<T>());
}
template <class D>
HWY_INLINE Vec<D> LastValue(D d) const {
- return Set(d, hwy::LowestValue<TFromD<D>>());
+ return Set(d, hwy::LowestValue<T>());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+ return Add(v, Set(d, hwy::Epsilon<T>()));
+ }
+};
+
+struct OrderAscendingKV64 : public KeyLane<uint64_t> {
+ using Order = SortAscending;
+
+ HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
+ return (*a >> 32) < (*b >> 32);
+ }
+
+ template <class D>
+ HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
+ return Lt(ShiftRight<32>(a), ShiftRight<32>(b));
+ }
+
+ // Not required to be stable (preserving the order of equivalent keys), so
+ // we can include the value in the comparison.
+ template <class D>
+ HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
+ return Min(a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
+ return Max(a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
+ uint64_t* HWY_RESTRICT /* buf */) const {
+ return MinOfLanes(d, v);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
+ uint64_t* HWY_RESTRICT /* buf */) const {
+ return MaxOfLanes(d, v);
+ }
+
+ // Same as for regular lanes.
+ template <class D>
+ HWY_INLINE Vec<D> FirstValue(D d) const {
+ return Set(d, hwy::LowestValue<TFromD<D> >());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> LastValue(D d) const {
+ return Set(d, hwy::HighestValue<TFromD<D> >());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+ return Sub(v, Set(d, 1));
+ }
+};
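OrderAscendingKV64 above treats each 64-bit lane as a key/value pair with the key in the upper 32 bits; only the key participates in the comparison, and because stability is not required, values attached to equal keys may be reordered. A scalar model of the packing and ordering, standalone and purely illustrative:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Pack (key, value) into one uint64_t with the key in the upper 32 bits.
  std::vector<uint64_t> kv = {
      (uint64_t{7} << 32) | 111u,
      (uint64_t{3} << 32) | 222u,
      (uint64_t{7} << 32) | 0u,
  };
  // Sort by key only, mirroring OrderAscendingKV64::Compare1; values of equal
  // keys may end up in any order because the sort need not be stable.
  std::sort(kv.begin(), kv.end(), [](uint64_t a, uint64_t b) {
    return (a >> 32) < (b >> 32);
  });
  assert((kv.front() >> 32) == 3);
  return 0;
}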
+
+struct OrderDescendingKV64 : public KeyLane<uint64_t> {
+ using Order = SortDescending;
+
+ HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
+ return (*b >> 32) < (*a >> 32);
+ }
+
+ template <class D>
+ HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
+ return Lt(ShiftRight<32>(b), ShiftRight<32>(a));
+ }
+
+ // Not required to be stable (preserving the order of equivalent keys), so
+ // we can include the value in the comparison.
+ template <class D>
+ HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
+ return Max(a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
+ return Min(a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
+ uint64_t* HWY_RESTRICT /* buf */) const {
+ return MaxOfLanes(d, v);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
+ uint64_t* HWY_RESTRICT /* buf */) const {
+ return MinOfLanes(d, v);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> FirstValue(D d) const {
+ return Set(d, hwy::HighestValue<TFromD<D> >());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> LastValue(D d) const {
+ return Set(d, hwy::LowestValue<TFromD<D> >());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+ return Add(v, Set(d, 1));
}
};
// Shared code that depends on Order.
template <class Base>
struct TraitsLane : public Base {
- constexpr bool Is128() const { return false; }
-
// For each lane i: replaces a[i] with the first and b[i] with the second
// according to Base.
// Corresponds to a conditional swap, which is one "node" of a sorting
@@ -316,6 +458,66 @@ struct TraitsLane : public Base {
}
};
+#else
+
+// Base class shared between OrderAscending, OrderDescending.
+template <typename T>
+struct KeyLane {
+ constexpr bool Is128() const { return false; }
+ constexpr size_t LanesPerKey() const { return 1; }
+
+ using LaneType = T;
+ using KeyType = T;
+
+ std::string KeyString() const {
+ char string100[100];
+ hwy::detail::TypeName(hwy::detail::MakeTypeInfo<KeyType>(), 1, string100);
+ return string100;
+ }
+};
+
+template <typename T>
+struct OrderAscending : public KeyLane<T> {
+ using Order = SortAscending;
+
+ HWY_INLINE bool Compare1(const T* a, const T* b) { return *a < *b; }
+
+ template <class D>
+ HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) {
+ return Lt(a, b);
+ }
+};
+
+template <typename T>
+struct OrderDescending : public KeyLane<T> {
+ using Order = SortDescending;
+
+ HWY_INLINE bool Compare1(const T* a, const T* b) { return *b < *a; }
+
+ template <class D>
+ HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) {
+ return Lt(b, a);
+ }
+};
+
+template <class Order>
+struct TraitsLane : public Order {
+ // For HeapSort
+ template <typename T> // MSVC doesn't find typename Order::LaneType.
+ HWY_INLINE void Swap(T* a, T* b) const {
+ const T temp = *a;
+ *a = *b;
+ *b = temp;
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
+ return Set(d, *key);
+ }
+};
+
+#endif // VQSORT_ENABLED
+
} // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
diff --git a/media/highway/src/hwy/contrib/sort/traits128-inl.h b/media/highway/src/hwy/contrib/sort/traits128-inl.h
index 02948d799c..c69206440f 100644
--- a/media/highway/src/hwy/contrib/sort/traits128-inl.h
+++ b/media/highway/src/hwy/contrib/sort/traits128-inl.h
@@ -22,6 +22,9 @@
#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
#endif
+#include <string>
+
+#include "hwy/contrib/sort/shared-inl.h"
#include "hwy/contrib/sort/vqsort.h" // SortDescending
#include "hwy/highway.h"
@@ -30,48 +33,31 @@ namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {
-#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
-
-struct OrderAscending128 {
- using Order = SortAscending;
-
- template <typename T>
- HWY_INLINE bool Compare1(const T* a, const T* b) {
- return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
- }
-};
-
-struct OrderDescending128 {
- using Order = SortDescending;
-
- template <typename T>
- HWY_INLINE bool Compare1(const T* a, const T* b) {
- return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1];
- }
-};
-
-template <class Order>
-struct Traits128 : public Order {
- constexpr bool Is128() const { return true; }
- constexpr size_t LanesPerKey() const { return 2; }
-};
-
-#else
+#if VQSORT_ENABLED || HWY_IDE
// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
// along with an abstraction layer for single-lane vs. lane-pair, which is
// independent of the order.
-struct Key128 {
+struct KeyAny128 {
+ static constexpr bool Is128() { return true; }
constexpr size_t LanesPerKey() const { return 2; }
- template <typename T>
- HWY_INLINE void Swap(T* a, T* b) const {
- const FixedTag<T, 2> d;
+ // What type bench_sort should allocate for generating inputs.
+ using LaneType = uint64_t;
+ // KeyType and KeyString are defined by derived classes.
+
+ HWY_INLINE void Swap(LaneType* a, LaneType* b) const {
+ const FixedTag<LaneType, 2> d;
const auto temp = LoadU(d, a);
StoreU(LoadU(d, b), d, a);
StoreU(temp, d, b);
}
+ template <class V, class M>
+ HWY_INLINE V CompressKeys(V keys, M mask) const {
+ return CompressBlocksNot(keys, mask);
+ }
+
template <class D>
HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
return LoadDup128(d, key);
@@ -136,6 +122,28 @@ struct Key128 {
}
};
+// Base class shared between OrderAscending128, OrderDescending128.
+struct Key128 : public KeyAny128 {
+ // What type to pass to Sorter::operator().
+ using KeyType = hwy::uint128_t;
+
+ std::string KeyString() const { return "U128"; }
+
+ template <class D>
+ HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const {
+ return Eq128(d, a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const {
+ return Ne128(d, a, b);
+ }
+
+ HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
+ return a[0] == b[0] && a[1] == b[1];
+ }
+};
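Key128 above represents one 128-bit key as two uint64_t lanes with the high half at index 1, so scalar equality checks both lanes while the vector path uses Eq128/Ne128. A standalone scalar sketch of the lane-pair layout and of the comparisons used by the Order classes below:

#include <cstdint>

// A 128-bit key stored as two u64 lanes: lane 0 = low half, lane 1 = high half.
inline bool Equal128(const uint64_t* a, const uint64_t* b) {
  return a[0] == b[0] && a[1] == b[1];  // mirrors Key128::Equal1
}

// Lexicographic "less than": the high lane decides unless equal, then the low
// lane decides. Mirrors OrderAscending128::Compare1.
inline bool Less128(const uint64_t* a, const uint64_t* b) {
  return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
}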
+
// Anything order-related depends on the key traits *and* the order (see
// FirstOfLanes). We cannot implement just one Compare function because Lt128
// only compiles if the lane type is u64. Thus we need either overloaded
@@ -146,8 +154,7 @@ struct Key128 {
struct OrderAscending128 : public Key128 {
using Order = SortAscending;
- template <typename T>
- HWY_INLINE bool Compare1(const T* a, const T* b) {
+ HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
}
@@ -172,30 +179,6 @@ struct OrderAscending128 : public Key128 {
return Max128(d, a, b);
}
- template <class D>
- HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
- TFromD<D>* HWY_RESTRICT buf) const {
- const size_t N = Lanes(d);
- Store(v, d, buf);
- v = SetKey(d, buf + 0); // result must be broadcasted
- for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
- v = First(d, v, SetKey(d, buf + i));
- }
- return v;
- }
-
- template <class D>
- HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
- TFromD<D>* HWY_RESTRICT buf) const {
- const size_t N = Lanes(d);
- Store(v, d, buf);
- v = SetKey(d, buf + 0); // result must be broadcasted
- for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
- v = Last(d, v, SetKey(d, buf + i));
- }
- return v;
- }
-
// Same as for regular lanes because 128-bit lanes are u64.
template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {
@@ -206,13 +189,22 @@ struct OrderAscending128 : public Key128 {
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D> >());
}
+
+ template <class D>
+ HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+ const Vec<D> k0 = Zero(d);
+ const Vec<D> k1 = OddEven(k0, Set(d, 1));
+ const Mask<D> borrow = Eq(v, k0); // don't-care, lo == 0
+ // lo == 0? 1 : 0, 0
+ const Vec<D> adjust = ShiftLeftLanes<1>(IfThenElseZero(borrow, k1));
+ return Sub(Sub(v, k1), adjust);
+ }
};
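PrevValue above computes the 128-bit predecessor: subtract one from the low lane and borrow from the high lane only if the low lane was zero (the vector code builds the borrow as a mask and shifts it into the high-lane slot). A scalar equivalent under the same two-lane layout, purely illustrative:

#include <cstdint>

// Returns the 128-bit predecessor of the key in (lo, hi) = (lanes[0], lanes[1]).
inline void PrevValue128(uint64_t* lanes) {
  const uint64_t borrow = (lanes[0] == 0) ? 1u : 0u;  // low lane will wrap?
  lanes[0] -= 1;       // wraps to UINT64_MAX when it was 0
  lanes[1] -= borrow;  // propagate the borrow into the high lane
}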
struct OrderDescending128 : public Key128 {
using Order = SortDescending;
- template <typename T>
- HWY_INLINE bool Compare1(const T* a, const T* b) {
+ HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1];
}
@@ -237,28 +229,122 @@ struct OrderDescending128 : public Key128 {
return Min128(d, a, b);
}
+ // Same as for regular lanes because 128-bit lanes are u64.
template <class D>
- HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
- TFromD<D>* HWY_RESTRICT buf) const {
- const size_t N = Lanes(d);
- Store(v, d, buf);
- v = SetKey(d, buf + 0); // result must be broadcasted
- for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
- v = First(d, v, SetKey(d, buf + i));
- }
- return v;
+ HWY_INLINE Vec<D> FirstValue(D d) const {
+ return Set(d, hwy::HighestValue<TFromD<D> >());
}
template <class D>
- HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
- TFromD<D>* HWY_RESTRICT buf) const {
- const size_t N = Lanes(d);
- Store(v, d, buf);
- v = SetKey(d, buf + 0); // result must be broadcasted
- for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
- v = Last(d, v, SetKey(d, buf + i));
- }
- return v;
+ HWY_INLINE Vec<D> LastValue(D d) const {
+ return Set(d, hwy::LowestValue<TFromD<D> >());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+ const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
+ const Vec<D> added = Add(v, k1);
+ const Mask<D> overflowed = Lt(added, v); // false, overflowed
+ // overflowed? 1 : 0, 0
+ const Vec<D> adjust = ShiftLeftLanes<1>(IfThenElseZero(overflowed, k1));
+ return Add(added, adjust);
+ }
+};
+
+// Base class shared between OrderAscendingKV128, OrderDescendingKV128.
+struct KeyValue128 : public KeyAny128 {
+ // What type to pass to Sorter::operator().
+ using KeyType = K64V64;
+
+ std::string KeyString() const { return "KV128"; }
+
+ template <class D>
+ HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const {
+ return Eq128Upper(d, a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const {
+ return Ne128Upper(d, a, b);
+ }
+
+ HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
+ return a[1] == b[1];
+ }
+};
+
+struct OrderAscendingKV128 : public KeyValue128 {
+ using Order = SortAscending;
+
+ HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
+ return a[1] < b[1];
+ }
+
+ template <class D>
+ HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
+ return Lt128Upper(d, a, b);
+ }
+
+ // Used by CompareTop
+ template <class V>
+ HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
+ return Lt(a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
+ return Min128Upper(d, a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
+ return Max128Upper(d, a, b);
+ }
+
+ // Same as for regular lanes because 128-bit lanes are u64.
+ template <class D>
+ HWY_INLINE Vec<D> FirstValue(D d) const {
+ return Set(d, hwy::LowestValue<TFromD<D> >());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> LastValue(D d) const {
+ return Set(d, hwy::HighestValue<TFromD<D> >());
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+ const Vec<D> k1 = OddEven(Set(d, 1), Zero(d));
+ return Sub(v, k1);
+ }
+};
+
+struct OrderDescendingKV128 : public KeyValue128 {
+ using Order = SortDescending;
+
+ HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
+ return b[1] < a[1];
+ }
+
+ template <class D>
+ HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
+ return Lt128Upper(d, b, a);
+ }
+
+ // Used by CompareTop
+ template <class V>
+ HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
+ return Lt(b, a);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
+ return Max128Upper(d, a, b);
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
+ return Min128Upper(d, a, b);
}
// Same as for regular lanes because 128-bit lanes are u64.
@@ -271,21 +357,32 @@ struct OrderDescending128 : public Key128 {
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D> >());
}
+
+ template <class D>
+ HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
+ const Vec<D> k1 = OddEven(Set(d, 1), Zero(d));
+ return Add(v, k1);
+ }
};
// Shared code that depends on Order.
template <class Base>
class Traits128 : public Base {
-#if HWY_TARGET <= HWY_AVX2
+ // Special case for >= 256 bit vectors
+#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SVE_256
// Returns vector with only the top u64 lane valid. Useful when the next step
// is to replicate the mask anyway.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED Vec<D> CompareTop(D d, Vec<D> a, Vec<D> b) const {
const Base* base = static_cast<const Base*>(this);
- const Vec<D> eqHL = VecFromMask(d, Eq(a, b));
+ const Mask<D> eqHL = Eq(a, b);
const Vec<D> ltHL = VecFromMask(d, base->CompareLanes(a, b));
+#if HWY_TARGET == HWY_SVE_256
+ return IfThenElse(eqHL, DupEven(ltHL), ltHL);
+#else
const Vec<D> ltLX = ShiftLeftLanes<1>(ltHL);
- return OrAnd(ltHL, eqHL, ltLX);
+ return OrAnd(ltHL, VecFromMask(d, eqHL), ltLX);
+#endif
}
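CompareTop above forms a per-key comparison result in the top u64 lane: the high-lane comparison wins unless the high lanes are equal, in which case the low-lane comparison (shifted into the high-lane slot) decides; SVE_256 expresses this as IfThenElse over DupEven, other targets as OrAnd. A scalar model of the same formula for the ascending case (CompareLanes = Lt), standalone and illustrative:

#include <cstdint>

// Scalar model of CompareTop for one 128-bit key (lanes: lo, hi). Returns
// all-ones if a < b under the 128-bit order, else zero; as in the vector code,
// only the "top lane" result is meaningful.
inline uint64_t CompareTopScalar(const uint64_t a[2], const uint64_t b[2]) {
  const uint64_t eqHL = (a[1] == b[1]) ? ~uint64_t{0} : 0;  // high lanes equal
  const uint64_t ltHL = (a[1] < b[1]) ? ~uint64_t{0} : 0;   // high lane less
  const uint64_t ltLX = (a[0] < b[0]) ? ~uint64_t{0} : 0;   // low-lane compare
  return ltHL | (eqHL & ltLX);  // same shape as OrAnd(ltHL, eqHL, ltLX)
}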
// We want to swap 2 u128, i.e. 4 u64 lanes, based on the 0 or FF..FF mask in
@@ -293,16 +390,42 @@ class Traits128 : public Base {
// replicate it 4x. Only called for >= 256-bit vectors.
template <class V>
HWY_INLINE V ReplicateTop4x(V v) const {
-#if HWY_TARGET <= HWY_AVX3
+#if HWY_TARGET == HWY_SVE_256
+ return svdup_lane_u64(v, 3);
+#elif HWY_TARGET <= HWY_AVX3
return V{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
#else // AVX2
return V{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
#endif
}
-#endif
+#endif // HWY_TARGET
public:
- constexpr bool Is128() const { return true; }
+ template <class D>
+ HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
+ TFromD<D>* HWY_RESTRICT buf) const {
+ const Base* base = static_cast<const Base*>(this);
+ const size_t N = Lanes(d);
+ Store(v, d, buf);
+ v = base->SetKey(d, buf + 0); // result must be broadcasted
+ for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) {
+ v = base->First(d, v, base->SetKey(d, buf + i));
+ }
+ return v;
+ }
+
+ template <class D>
+ HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
+ TFromD<D>* HWY_RESTRICT buf) const {
+ const Base* base = static_cast<const Base*>(this);
+ const size_t N = Lanes(d);
+ Store(v, d, buf);
+ v = base->SetKey(d, buf + 0); // result must be broadcasted
+ for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) {
+ v = base->Last(d, v, base->SetKey(d, buf + i));
+ }
+ return v;
+ }
template <class D>
HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {
@@ -320,7 +443,7 @@ class Traits128 : public Base {
const Base* base = static_cast<const Base*>(this);
Vec<D> swapped = base->ReverseKeys2(d, v);
-#if HWY_TARGET <= HWY_AVX2
+#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SVE_256
const Vec<D> select = ReplicateTop4x(CompareTop(d, v, swapped));
return IfVecThenElse(select, swapped, v);
#else
@@ -358,7 +481,7 @@ class Traits128 : public Base {
}
};
-#endif // HWY_TARGET
+#endif // VQSORT_ENABLED
} // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
diff --git a/media/highway/src/hwy/contrib/sort/vqsort-inl.h b/media/highway/src/hwy/contrib/sort/vqsort-inl.h
index 50b4d16f0c..10584d2465 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort-inl.h
+++ b/media/highway/src/hwy/contrib/sort/vqsort-inl.h
@@ -17,6 +17,10 @@
#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
+#ifndef VQSORT_PRINT
+#define VQSORT_PRINT 0
+#endif
+
// Makes it harder for adversaries to predict our sampling locations, at the
// cost of 1-2% increased runtime.
#ifndef VQSORT_SECURE_RNG
@@ -27,10 +31,10 @@
#include "third_party/absl/random/random.h"
#endif
+#include <stdio.h> // unconditional #include so we can use if(VQSORT_PRINT).
#include <string.h> // memcpy
-#include "hwy/cache_control.h" // Prefetch
-#include "hwy/contrib/sort/disabled_targets.h"
+#include "hwy/cache_control.h" // Prefetch
#include "hwy/contrib/sort/vqsort.h" // Fill24Bytes
#if HWY_IS_MSAN
@@ -48,8 +52,13 @@
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
#endif
+#if VQSORT_PRINT
+#include "hwy/print-inl.h"
+#endif
+
#include "hwy/contrib/sort/shared-inl.h"
#include "hwy/contrib/sort/sorting_networks-inl.h"
+// Placeholder for internal instrumentation. Do not remove.
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
@@ -57,117 +66,91 @@ namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {
-#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
+using Constants = hwy::SortConstants;
-template <typename T>
-void Swap(T* a, T* b) {
- T t = *a;
- *a = *b;
- *b = t;
+// Wrappers to avoid #if in user code (interferes with code folding)
+
+HWY_INLINE void UnpoisonIfMemorySanitizer(void* p, size_t bytes) {
+#if HWY_IS_MSAN
+ __msan_unpoison(p, bytes);
+#else
+ (void)p;
+ (void)bytes;
+#endif
}
-// Scalar version of HeapSort (see below)
-template <class Traits, typename T>
-void HeapSort(Traits st, T* HWY_RESTRICT keys, const size_t num) {
- if (num < 2) return;
+template <class D>
+HWY_INLINE void MaybePrintVector(D d, const char* label, Vec<D> v,
+ size_t start = 0, size_t max_lanes = 16) {
+#if VQSORT_PRINT >= 2 // Print is only defined #if
+ Print(d, label, v, start, max_lanes);
+#else
+ (void)d;
+ (void)label;
+ (void)v;
+ (void)start;
+ (void)max_lanes;
+#endif
+}
- // Build heap.
- for (size_t i = 1; i < num; i += 1) {
- size_t j = i;
- while (j != 0) {
- const size_t idx_parent = ((j - 1) / 1 / 2);
- if (!st.Compare1(keys + idx_parent, keys + j)) {
- break;
- }
- Swap(keys + j, keys + idx_parent);
- j = idx_parent;
- }
- }
+// ------------------------------ HeapSort
- for (size_t i = num - 1; i != 0; i -= 1) {
- // Swap root with last
- Swap(keys + 0, keys + i);
+template <class Traits, typename T>
+void SiftDown(Traits st, T* HWY_RESTRICT lanes, const size_t num_lanes,
+ size_t start) {
+ constexpr size_t N1 = st.LanesPerKey();
+ const FixedTag<T, N1> d;
- // Sift down the new root.
- size_t j = 0;
- while (j < i) {
- const size_t left = 2 * j + 1;
- const size_t right = 2 * j + 2;
- if (left >= i) break;
- size_t idx_larger = j;
- if (st.Compare1(keys + j, keys + left)) {
- idx_larger = left;
- }
- if (right < i && st.Compare1(keys + idx_larger, keys + right)) {
- idx_larger = right;
- }
- if (idx_larger == j) break;
- Swap(keys + j, keys + idx_larger);
- j = idx_larger;
+ while (start < num_lanes) {
+ const size_t left = 2 * start + N1;
+ const size_t right = 2 * start + 2 * N1;
+ if (left >= num_lanes) break;
+ size_t idx_larger = start;
+ const auto key_j = st.SetKey(d, lanes + start);
+ if (AllTrue(d, st.Compare(d, key_j, st.SetKey(d, lanes + left)))) {
+ idx_larger = left;
+ }
+ if (right < num_lanes &&
+ AllTrue(d, st.Compare(d, st.SetKey(d, lanes + idx_larger),
+ st.SetKey(d, lanes + right)))) {
+ idx_larger = right;
}
+ if (idx_larger == start) break;
+ st.Swap(lanes + start, lanes + idx_larger);
+ start = idx_larger;
}
}
-#else
-
-using Constants = hwy::SortConstants;
-
-// ------------------------------ HeapSort
-
// Heapsort: O(1) space, O(N*logN) worst-case comparisons.
// Based on LLVM sanitizer_common.h, licensed under Apache-2.0.
template <class Traits, typename T>
-void HeapSort(Traits st, T* HWY_RESTRICT keys, const size_t num) {
+void HeapSort(Traits st, T* HWY_RESTRICT lanes, const size_t num_lanes) {
constexpr size_t N1 = st.LanesPerKey();
- const FixedTag<T, N1> d;
- if (num < 2 * N1) return;
+ if (num_lanes < 2 * N1) return;
// Build heap.
- for (size_t i = N1; i < num; i += N1) {
- size_t j = i;
- while (j != 0) {
- const size_t idx_parent = ((j - N1) / N1 / 2) * N1;
- if (AllFalse(d, st.Compare(d, st.SetKey(d, keys + idx_parent),
- st.SetKey(d, keys + j)))) {
- break;
- }
- st.Swap(keys + j, keys + idx_parent);
- j = idx_parent;
- }
+ for (size_t i = ((num_lanes - N1) / N1 / 2) * N1; i != (~N1 + 1); i -= N1) {
+ SiftDown(st, lanes, num_lanes, i);
}
- for (size_t i = num - N1; i != 0; i -= N1) {
+ for (size_t i = num_lanes - N1; i != 0; i -= N1) {
// Swap root with last
- st.Swap(keys + 0, keys + i);
+ st.Swap(lanes + 0, lanes + i);
// Sift down the new root.
- size_t j = 0;
- while (j < i) {
- const size_t left = 2 * j + N1;
- const size_t right = 2 * j + 2 * N1;
- if (left >= i) break;
- size_t idx_larger = j;
- const auto key_j = st.SetKey(d, keys + j);
- if (AllTrue(d, st.Compare(d, key_j, st.SetKey(d, keys + left)))) {
- idx_larger = left;
- }
- if (right < i && AllTrue(d, st.Compare(d, st.SetKey(d, keys + idx_larger),
- st.SetKey(d, keys + right)))) {
- idx_larger = right;
- }
- if (idx_larger == j) break;
- st.Swap(keys + j, keys + idx_larger);
- j = idx_larger;
- }
+ SiftDown(st, lanes, i, 0);
}
}
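SiftDown and HeapSort above form a generic heapsort: heapify by sifting down every internal node, stepping by N1 = LanesPerKey so multi-lane keys stay whole (the `i != (~N1 + 1)` test stops once the unsigned index wraps below zero), then repeatedly swap the root with the last key and sift it down again. A plain scalar version for single-lane int keys in ascending order, standalone and illustrative:

#include <cstddef>
#include <utility>

// Scalar sift-down: push lanes[start] down until the max-heap property holds.
static void SiftDownScalar(int* lanes, std::size_t num, std::size_t start) {
  while (start < num) {
    const std::size_t left = 2 * start + 1;
    const std::size_t right = 2 * start + 2;
    if (left >= num) break;
    std::size_t largest = start;
    if (lanes[largest] < lanes[left]) largest = left;
    if (right < num && lanes[largest] < lanes[right]) largest = right;
    if (largest == start) break;
    std::swap(lanes[start], lanes[largest]);
    start = largest;
  }
}

// Ascending heapsort: heapify from the last internal node, then pop the max.
static void HeapSortScalar(int* lanes, std::size_t num) {
  if (num < 2) return;
  for (std::size_t i = (num - 1) / 2 + 1; i-- > 0;) {  // no unsigned wraparound
    SiftDownScalar(lanes, num, i);
  }
  for (std::size_t i = num - 1; i != 0; --i) {
    std::swap(lanes[0], lanes[i]);  // move current maximum to the end
    SiftDownScalar(lanes, i, 0);
  }
}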
+#if VQSORT_ENABLED || HWY_IDE
+
// ------------------------------ BaseCase
// Sorts `keys` within the range [0, num) via sorting network.
template <class D, class Traits, typename T>
-HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
+HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys,
+ T* HWY_RESTRICT keys_end, size_t num,
T* HWY_RESTRICT buf) {
const size_t N = Lanes(d);
using V = decltype(Zero(d));
@@ -185,6 +168,18 @@ HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
HWY_MAX(st.LanesPerKey(), num_pow2 >> Constants::kMaxRowsLog2);
HWY_DASSERT(cols <= N);
+ // We can avoid padding and load/store directly to `keys` after checking the
+ // original input array has enough space. Except at the right border, it's OK
+ // to sort more than the current sub-array. Even if we sort across a previous
+ // partition point, we know that keys will not migrate across it. However, we
+ // must use the maximum size of the sorting network, because the StoreU of its
+ // last vector would otherwise write invalid data starting at kMaxRows * cols.
+ const size_t N_sn = Lanes(CappedTag<T, Constants::kMaxCols>());
+ if (HWY_LIKELY(keys + N_sn * Constants::kMaxRows <= keys_end)) {
+ SortingNetwork(st, keys, N_sn);
+ return;
+ }
+
// Copy `keys` to `buf`.
size_t i;
for (i = 0; i + N <= num; i += N) {
@@ -211,19 +206,19 @@ HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
// ------------------------------ Partition
-// Consumes from `left` until a multiple of kUnroll*N remains.
-// Temporarily stores the right side into `buf`, then moves behind `right`.
+// Consumes from `keys` until a multiple of kUnroll*N remains.
+// Temporarily stores the right side into `buf`, then copies it back to
+// `keys + num` after shrinking `num`. Returns the number of keys consumed
+// from the left side.
template <class D, class Traits, class T>
-HWY_NOINLINE void PartitionToMultipleOfUnroll(D d, Traits st,
- T* HWY_RESTRICT keys,
- size_t& left, size_t& right,
- const Vec<D> pivot,
- T* HWY_RESTRICT buf) {
+HWY_NOINLINE size_t PartitionToMultipleOfUnroll(D d, Traits st,
+ T* HWY_RESTRICT keys,
+ size_t& num, const Vec<D> pivot,
+ T* HWY_RESTRICT buf) {
constexpr size_t kUnroll = Constants::kPartitionUnroll;
const size_t N = Lanes(d);
- size_t readL = left;
+ size_t readL = 0;
+ T* HWY_RESTRICT posL = keys;
size_t bufR = 0;
- const size_t num = right - left;
// Partition requires both a multiple of kUnroll*N and at least
// 2*kUnroll*N for the initial loads. If less, consume all here.
const size_t num_rem =
@@ -234,7 +229,7 @@ HWY_NOINLINE void PartitionToMultipleOfUnroll(D d, Traits st,
readL += N;
const auto comp = st.Compare(d, pivot, vL);
- left += CompressBlendedStore(vL, Not(comp), d, keys + left);
+ posL += CompressBlendedStore(vL, Not(comp), d, posL);
bufR += CompressStore(vL, comp, d, buf + bufR);
}
// Last iteration: only use valid lanes.
@@ -243,58 +238,64 @@ HWY_NOINLINE void PartitionToMultipleOfUnroll(D d, Traits st,
const Vec<D> vL = LoadU(d, keys + readL);
const auto comp = st.Compare(d, pivot, vL);
- left += CompressBlendedStore(vL, AndNot(comp, mask), d, keys + left);
+ posL += CompressBlendedStore(vL, AndNot(comp, mask), d, posL);
bufR += CompressStore(vL, And(comp, mask), d, buf + bufR);
}
// MSAN seems not to understand CompressStore. buf[0, bufR) are valid.
-#if HWY_IS_MSAN
- __msan_unpoison(buf, bufR * sizeof(T));
-#endif
+ UnpoisonIfMemorySanitizer(buf, bufR * sizeof(T));
- // Everything we loaded was put into buf, or behind the new `left`, after
- // which there is space for bufR items. First move items from `right` to
- // `left` to free up space, then copy `buf` into the vacated `right`.
+ // Everything we loaded was put into buf, or behind the current `posL`, after
+ // which there is space for bufR items. First move items from `keys + num` to
+ // `posL` to free up space, then copy `buf` into the vacated `keys + num`.
// A loop with masked loads from `buf` is insufficient - we would also need to
- // mask from `right`. Combining a loop with memcpy for the remainders is
+ // mask from `keys + num`. Combining a loop with memcpy for the remainders is
// slower than just memcpy, so we use that for simplicity.
- right -= bufR;
- memcpy(keys + left, keys + right, bufR * sizeof(T));
- memcpy(keys + right, buf, bufR * sizeof(T));
+ num -= bufR;
+ memcpy(posL, keys + num, bufR * sizeof(T));
+ memcpy(keys + num, buf, bufR * sizeof(T));
+ return static_cast<size_t>(posL - keys); // caller will shrink num by this.
+}
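PartitionToMultipleOfUnroll above compacts the "<= pivot" keys of the leftover vectors at posL, parks the "> pivot" keys in buf, then frees exactly bufR slots by moving that many keys from the end of the array into the gap and copying buf into the vacated tail. A scalar sketch of that remainder-handling trick; it assumes the remainder is small relative to num (the real code guarantees num >= 2 * kUnroll * N) and the names are illustrative:

#include <cstddef>
#include <cstring>

// Partitions the first `rem` keys of keys[0, num) so that only a convenient
// multiple remains for the main loop. Right-side keys are parked in buf and
// then swapped to the end of the (shrunk) array. Returns how many keys ended
// up on the left. Requires rem + bufR <= num, which holds when num >= 2 * rem.
static std::size_t ConsumeRemainder(int* keys, std::size_t& num,
                                    std::size_t rem, int pivot, int* buf) {
  std::size_t posL = 0, bufR = 0;
  for (std::size_t i = 0; i < rem; ++i) {
    const int k = keys[i];
    if (k <= pivot) {
      keys[posL++] = k;  // left side: compact in place (posL never passes i)
    } else {
      buf[bufR++] = k;   // right side: park in scratch space
    }
  }
  num -= bufR;                                          // shrink the range
  std::memcpy(keys + posL, keys + num, bufR * sizeof(int));  // fill the gap
  std::memcpy(keys + num, buf, bufR * sizeof(int));          // append parked keys
  return posL;  // the caller also advances its base pointer by this amount
}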
+
+template <class V>
+V OrXor(const V o, const V x1, const V x2) {
+ // TODO(janwas): add op so we can benefit from AVX-512 ternlog?
+ return Or(o, Xor(x1, x2));
}
+// Note: we could track the OrXor of v and pivot to see if the entire left
+// partition is equal, but that happens rarely and thus is a net loss.
template <class D, class Traits, typename T>
HWY_INLINE void StoreLeftRight(D d, Traits st, const Vec<D> v,
const Vec<D> pivot, T* HWY_RESTRICT keys,
- size_t& writeL, size_t& writeR) {
+ size_t& writeL, size_t& remaining) {
const size_t N = Lanes(d);
const auto comp = st.Compare(d, pivot, v);
- if (hwy::HWY_NAMESPACE::CompressIsPartition<T>::value) {
+ remaining -= N;
+ if (hwy::HWY_NAMESPACE::CompressIsPartition<T>::value ||
+ (HWY_MAX_BYTES == 16 && st.Is128())) {
// Non-native Compress (e.g. AVX2): we are able to partition a vector using
// a single Compress+two StoreU instead of two Compress[Blended]Store. The
// latter are more expensive. Because we store entire vectors, the contents
// between the updated writeL and writeR are ignored and will be overwritten
// by subsequent calls. This works because writeL and writeR are at least
// two vectors apart.
- const auto mask = Not(comp);
- const auto lr = Compress(v, mask);
- const size_t num_left = CountTrue(d, mask);
+ const auto lr = st.CompressKeys(v, comp);
+ const size_t num_left = N - CountTrue(d, comp);
StoreU(lr, d, keys + writeL);
- writeL += num_left;
// Now write the right-side elements (if any), such that the previous writeR
// is one past the end of the newly written right elements, then advance.
- StoreU(lr, d, keys + writeR - N);
- writeR -= (N - num_left);
+ StoreU(lr, d, keys + remaining + writeL);
+ writeL += num_left;
} else {
// Native Compress[Store] (e.g. AVX3), which only keep the left or right
// side, not both, hence we require two calls.
const size_t num_left = CompressStore(v, Not(comp), d, keys + writeL);
writeL += num_left;
- writeR -= (N - num_left);
- (void)CompressBlendedStore(v, comp, d, keys + writeR);
+ (void)CompressBlendedStore(v, comp, d, keys + remaining + writeL);
}
}
@@ -303,11 +304,11 @@ HWY_INLINE void StoreLeftRight4(D d, Traits st, const Vec<D> v0,
const Vec<D> v1, const Vec<D> v2,
const Vec<D> v3, const Vec<D> pivot,
T* HWY_RESTRICT keys, size_t& writeL,
- size_t& writeR) {
- StoreLeftRight(d, st, v0, pivot, keys, writeL, writeR);
- StoreLeftRight(d, st, v1, pivot, keys, writeL, writeR);
- StoreLeftRight(d, st, v2, pivot, keys, writeL, writeR);
- StoreLeftRight(d, st, v3, pivot, keys, writeL, writeR);
+ size_t& remaining) {
+ StoreLeftRight(d, st, v0, pivot, keys, writeL, remaining);
+ StoreLeftRight(d, st, v1, pivot, keys, writeL, remaining);
+ StoreLeftRight(d, st, v2, pivot, keys, writeL, remaining);
+ StoreLeftRight(d, st, v3, pivot, keys, writeL, remaining);
}
// Moves "<= pivot" keys to the front, and others to the back. pivot is
@@ -315,80 +316,136 @@ HWY_INLINE void StoreLeftRight4(D d, Traits st, const Vec<D> v0,
//
// Aligned loads do not seem to be worthwhile (not bottlenecked by load ports).
template <class D, class Traits, typename T>
-HWY_NOINLINE size_t Partition(D d, Traits st, T* HWY_RESTRICT keys, size_t left,
- size_t right, const Vec<D> pivot,
- T* HWY_RESTRICT buf) {
+HWY_NOINLINE size_t Partition(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
+ const Vec<D> pivot, T* HWY_RESTRICT buf) {
using V = decltype(Zero(d));
const size_t N = Lanes(d);
// StoreLeftRight will CompressBlendedStore ending at `writeR`. Unless all
// lanes happen to be in the right-side partition, this will overrun `keys`,
// which triggers asan errors. Avoid by special-casing the last vector.
- HWY_DASSERT(right - left > 2 * N); // ensured by HandleSpecialCases
- right -= N;
- const size_t last = right;
+ HWY_DASSERT(num > 2 * N); // ensured by HandleSpecialCases
+ num -= N;
+ size_t last = num;
const V vlast = LoadU(d, keys + last);
- PartitionToMultipleOfUnroll(d, st, keys, left, right, pivot, buf);
+ const size_t consumedL =
+ PartitionToMultipleOfUnroll(d, st, keys, num, pivot, buf);
+ keys += consumedL;
+ last -= consumedL;
+ num -= consumedL;
constexpr size_t kUnroll = Constants::kPartitionUnroll;
- // Invariant: [left, writeL) and [writeR, right) are already partitioned.
- size_t writeL = left;
- size_t writeR = right;
-
- const size_t num = right - left;
+  // Partition splits the array into 3 sections, left to right: elements
+  // smaller than or equal to the pivot, unpartitioned elements, and elements
+  // larger than the pivot. To write elements unconditionally in the loop body
+  // without overwriting existing data, we maintain two regions of the array
+  // whose elements have already been copied elsewhere (e.g. into vector
+  // registers). We call these bufferL and bufferR, for left and right
+  // respectively.
+ //
+ // These regions are tracked by the indices (writeL, writeR, left, right) as
+ // presented in the diagram below.
+ //
+ // writeL writeR
+ // \/ \/
+ // | <= pivot | bufferL | unpartitioned | bufferR | > pivot |
+ // \/ \/
+ // left right
+ //
+  // In the main loop body below, we choose a side, load some elements from
+  // that side of the array, and advance either `left` or `right`. We then call
+  // StoreLeftRight to partition the data; the partitioned elements are written
+  // at either writeL or writeR, and the corresponding index is advanced
+  // accordingly.
+ //
+ // Note that writeR is not explicitly tracked as an optimization for platforms
+ // with conditional operations. Instead we track writeL and the number of
+ // elements left to process (`remaining`). From the diagram above we can see
+ // that:
+ // writeR - writeL = remaining => writeR = remaining + writeL
+ //
+ // Tracking `remaining` is advantageous because each iteration reduces the
+ // number of unpartitioned elements by a fixed amount, so we can compute
+ // `remaining` without data dependencies.
+ //
+ size_t writeL = 0;
+ size_t remaining = num;
+
+ const T* HWY_RESTRICT readL = keys;
+ const T* HWY_RESTRICT readR = keys + num;
// Cannot load if there were fewer than 2 * kUnroll * N.
if (HWY_LIKELY(num != 0)) {
HWY_DASSERT(num >= 2 * kUnroll * N);
HWY_DASSERT((num & (kUnroll * N - 1)) == 0);
- // Make space for writing in-place by reading from left and right.
- const V vL0 = LoadU(d, keys + left + 0 * N);
- const V vL1 = LoadU(d, keys + left + 1 * N);
- const V vL2 = LoadU(d, keys + left + 2 * N);
- const V vL3 = LoadU(d, keys + left + 3 * N);
- left += kUnroll * N;
- right -= kUnroll * N;
- const V vR0 = LoadU(d, keys + right + 0 * N);
- const V vR1 = LoadU(d, keys + right + 1 * N);
- const V vR2 = LoadU(d, keys + right + 2 * N);
- const V vR3 = LoadU(d, keys + right + 3 * N);
-
- // The left/right updates may consume all inputs, so check before the loop.
- while (left != right) {
+ // Make space for writing in-place by reading from readL/readR.
+ const V vL0 = LoadU(d, readL + 0 * N);
+ const V vL1 = LoadU(d, readL + 1 * N);
+ const V vL2 = LoadU(d, readL + 2 * N);
+ const V vL3 = LoadU(d, readL + 3 * N);
+ readL += kUnroll * N;
+ readR -= kUnroll * N;
+ const V vR0 = LoadU(d, readR + 0 * N);
+ const V vR1 = LoadU(d, readR + 1 * N);
+ const V vR2 = LoadU(d, readR + 2 * N);
+ const V vR3 = LoadU(d, readR + 3 * N);
+
+ // readL/readR changed above, so check again before the loop.
+ while (readL != readR) {
V v0, v1, v2, v3;
- // Free up capacity for writing by loading from the side that has less.
// Data-dependent but branching is faster than forcing branch-free.
- const size_t capacityL = left - writeL;
- const size_t capacityR = writeR - right;
- HWY_DASSERT(capacityL <= num && capacityR <= num); // >= 0
- if (capacityR < capacityL) {
- right -= kUnroll * N;
- v0 = LoadU(d, keys + right + 0 * N);
- v1 = LoadU(d, keys + right + 1 * N);
- v2 = LoadU(d, keys + right + 2 * N);
- v3 = LoadU(d, keys + right + 3 * N);
- hwy::Prefetch(keys + right - 3 * kUnroll * N);
+ const size_t capacityL =
+ static_cast<size_t>((readL - keys) - static_cast<ptrdiff_t>(writeL));
+ HWY_DASSERT(capacityL <= num); // >= 0
+ // Load data from the end of the vector with less data (front or back).
+ // The next paragraphs explain how this works.
+ //
+ // let block_size = (kUnroll * N)
+ // On the loop prelude we load block_size elements from the front of the
+ // vector and an additional block_size elements from the back. On each
+ // iteration k elements are written to the front of the vector and
+ // (block_size - k) to the back.
+ //
+ // This creates a loop invariant where the capacity on the front
+ // (capacityL) and on the back (capacityR) always add to 2 * block_size.
+ // In other words:
+ // capacityL + capacityR = 2 * block_size
+ // capacityR = 2 * block_size - capacityL
+ //
+ // This means that:
+ // capacityL < capacityR <=>
+ // capacityL < 2 * block_size - capacityL <=>
+ // 2 * capacityL < 2 * block_size <=>
+ // capacityL < block_size
+ //
+ // Thus the check on the next line is equivalent to capacityL > capacityR.
+ //
+ if (kUnroll * N < capacityL) {
+ readR -= kUnroll * N;
+ v0 = LoadU(d, readR + 0 * N);
+ v1 = LoadU(d, readR + 1 * N);
+ v2 = LoadU(d, readR + 2 * N);
+ v3 = LoadU(d, readR + 3 * N);
+ hwy::Prefetch(readR - 3 * kUnroll * N);
} else {
- v0 = LoadU(d, keys + left + 0 * N);
- v1 = LoadU(d, keys + left + 1 * N);
- v2 = LoadU(d, keys + left + 2 * N);
- v3 = LoadU(d, keys + left + 3 * N);
- left += kUnroll * N;
- hwy::Prefetch(keys + left + 3 * kUnroll * N);
+ v0 = LoadU(d, readL + 0 * N);
+ v1 = LoadU(d, readL + 1 * N);
+ v2 = LoadU(d, readL + 2 * N);
+ v3 = LoadU(d, readL + 3 * N);
+ readL += kUnroll * N;
+ hwy::Prefetch(readL + 3 * kUnroll * N);
}
- StoreLeftRight4(d, st, v0, v1, v2, v3, pivot, keys, writeL, writeR);
+ StoreLeftRight4(d, st, v0, v1, v2, v3, pivot, keys, writeL, remaining);
}
- // Now finish writing the initial left/right to the middle.
- StoreLeftRight4(d, st, vL0, vL1, vL2, vL3, pivot, keys, writeL, writeR);
- StoreLeftRight4(d, st, vR0, vR1, vR2, vR3, pivot, keys, writeL, writeR);
+ // Now finish writing the saved vectors to the middle.
+ StoreLeftRight4(d, st, vL0, vL1, vL2, vL3, pivot, keys, writeL, remaining);
+ StoreLeftRight4(d, st, vR0, vR1, vR2, vR3, pivot, keys, writeL, remaining);
}
// We have partitioned [left, right) such that writeL is the boundary.
- HWY_DASSERT(writeL == writeR);
+ HWY_DASSERT(remaining == 0);
// Make space for inserting vlast: move up to N of the first right-side keys
// into the unused space starting at last. If we have fewer, ensure they are
// the last items in that vector by subtracting from the *load* address,
@@ -402,10 +459,250 @@ HWY_NOINLINE size_t Partition(D d, Traits st, T* HWY_RESTRICT keys, size_t left,
writeL += CompressBlendedStore(vlast, Not(comp), d, keys + writeL);
(void)CompressBlendedStore(vlast, comp, d, keys + writeL);
- return writeL;
+ return consumedL + writeL;
}
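The comment block inside Partition above establishes writeR = writeL + remaining, so the loop tracks only writeL plus the count of still-unpartitioned keys, and it always reads next from the side with less spare capacity so that unconditional stores never clobber unread keys. A scalar model of the same scheme with a block size of one key, standalone and illustrative:

#include <cstddef>

// In-place partition around `pivot` mirroring the scheme described above:
// writeR is never stored; right-side keys land at keys[writeL + remaining].
// Two keys are kept "in flight" so that every store hits a slot whose original
// content has already been read. Returns the partition boundary.
static std::size_t PartitionScalar(int* keys, std::size_t num, int pivot) {
  if (num < 2) {  // trivial sizes; the real code handles these elsewhere
    return (num == 1 && keys[0] > pivot) ? 0 : num;
  }
  std::size_t writeL = 0, remaining = num;
  std::size_t readL = 0, readR = num;
  const int heldL = keys[readL++];  // prime one slot of capacity on each side
  const int heldR = keys[--readR];

  auto emit = [&](int v) {
    --remaining;
    if (v <= pivot) {
      keys[writeL++] = v;            // grows the left partition
    } else {
      keys[writeL + remaining] = v;  // equals the old writeR - 1
    }
  };

  while (readL != readR) {
    const std::size_t capacityL = readL - writeL;  // free slots on the left
    // Read from whichever side has less capacity so neither side can fill up.
    emit(capacityL > 1 ? keys[--readR] : keys[readL++]);
  }
  emit(heldL);  // finally place the two keys read at the start
  emit(heldR);
  return writeL;  // keys[0, writeL) <= pivot < keys[writeL, num)
}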
-// ------------------------------ Pivot
+// Returns true and partitions if [keys, keys + num) contains only {valueL,
+// valueR}. Otherwise, sets third to the first differing value; keys may have
+// been reordered and a regular Partition is still necessary.
+template <class D, class Traits, typename T>
+HWY_NOINLINE bool MaybePartitionTwoValue(D d, Traits st, T* HWY_RESTRICT keys,
+ size_t num, const Vec<D> valueL,
+ const Vec<D> valueR, Vec<D>& third,
+ T* HWY_RESTRICT buf) {
+ const size_t N = Lanes(d);
+
+ size_t i = 0;
+ size_t writeL = 0;
+
+ // As long as all lanes are equal to L or R, we can overwrite with valueL.
+ // This is faster than first counting, then backtracking to fill L and R.
+ for (; i + N <= num; i += N) {
+ const Vec<D> v = LoadU(d, keys + i);
+ // It is not clear how to apply OrXor here - that can check if *both*
+ // comparisons are true, but here we want *either*. Comparing the unsigned
+ // min of differences to zero works, but is expensive for u64 prior to AVX3.
+ const Mask<D> eqL = st.EqualKeys(d, v, valueL);
+ const Mask<D> eqR = st.EqualKeys(d, v, valueR);
+ // At least one other value present; will require a regular partition.
+ // On AVX-512, Or + AllTrue are folded into a single kortest if we are
+ // careful with the FindKnownFirstTrue argument, see below.
+ if (HWY_UNLIKELY(!AllTrue(d, Or(eqL, eqR)))) {
+ // If we repeat Or(eqL, eqR) here, the compiler will hoist it into the
+ // loop, which is a pessimization because this if-true branch is cold.
+ // We can defeat this via Not(Xor), which is equivalent because eqL and
+ // eqR cannot be true at the same time. Can we elide the additional Not?
+ // FindFirstFalse instructions are generally unavailable, but we can
+ // fuse Not and Xor/Or into one ExclusiveNeither.
+ const size_t lane = FindKnownFirstTrue(d, ExclusiveNeither(eqL, eqR));
+ third = st.SetKey(d, keys + i + lane);
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "found 3rd value at vec %zu; writeL %zu\n", i, writeL);
+ }
+ // 'Undo' what we did by filling the remainder of what we read with R.
+ for (; writeL + N <= i; writeL += N) {
+ StoreU(valueR, d, keys + writeL);
+ }
+ BlendedStore(valueR, FirstN(d, i - writeL), d, keys + writeL);
+ return false;
+ }
+ StoreU(valueL, d, keys + writeL);
+ writeL += CountTrue(d, eqL);
+ }
+
+ // Final vector, masked comparison (no effect if i == num)
+ const size_t remaining = num - i;
+ SafeCopyN(remaining, d, keys + i, buf);
+ const Vec<D> v = Load(d, buf);
+ const Mask<D> valid = FirstN(d, remaining);
+ const Mask<D> eqL = And(st.EqualKeys(d, v, valueL), valid);
+ const Mask<D> eqR = st.EqualKeys(d, v, valueR);
+ // Invalid lanes are considered equal.
+ const Mask<D> eq = Or(Or(eqL, eqR), Not(valid));
+ // At least one other value present; will require a regular partition.
+ if (HWY_UNLIKELY(!AllTrue(d, eq))) {
+ const size_t lane = FindKnownFirstTrue(d, Not(eq));
+ third = st.SetKey(d, keys + i + lane);
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "found 3rd value at partial vec %zu; writeL %zu\n", i,
+ writeL);
+ }
+ // 'Undo' what we did by filling the remainder of what we read with R.
+ for (; writeL + N <= i; writeL += N) {
+ StoreU(valueR, d, keys + writeL);
+ }
+ BlendedStore(valueR, FirstN(d, i - writeL), d, keys + writeL);
+ return false;
+ }
+ BlendedStore(valueL, valid, d, keys + writeL);
+ writeL += CountTrue(d, eqL);
+
+ // Fill right side
+ i = writeL;
+ for (; i + N <= num; i += N) {
+ StoreU(valueR, d, keys + i);
+ }
+ BlendedStore(valueR, FirstN(d, num - i), d, keys + i);
+
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "Successful MaybePartitionTwoValue\n");
+ }
+ return true;
+}
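MaybePartitionTwoValue above speculatively overwrites each scanned vector with valueL while counting matching lanes; if a third value shows up it backtracks by refilling the scanned-but-unaccounted positions with valueR, otherwise it finishes by writing valueR over the tail. A scalar sketch of the same two-distinct-values fast path; unlike the vector code it counts first and rewrites afterwards, and the names are illustrative:

#include <cstddef>

// If keys[0, num) holds only valueL and valueR, rewrites it as a partition
// (all valueL first, then all valueR) and returns true. Otherwise reports the
// first other value via `third` and returns false; here the keys are left
// unchanged, whereas the vectorized version may have reordered them.
static bool PartitionTwoValueScalar(int* keys, std::size_t num, int valueL,
                                    int valueR, int* third) {
  std::size_t countL = 0;
  for (std::size_t i = 0; i < num; ++i) {
    if (keys[i] == valueL) {
      ++countL;
    } else if (keys[i] != valueR) {
      *third = keys[i];  // a regular partition is required after all
      return false;
    }
  }
  for (std::size_t i = 0; i < countL; ++i) keys[i] = valueL;
  for (std::size_t i = countL; i < num; ++i) keys[i] = valueR;
  return true;
}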
+
+// Same as above, except that the pivot equals valueR, so scan right to left.
+template <class D, class Traits, typename T>
+HWY_NOINLINE bool MaybePartitionTwoValueR(D d, Traits st, T* HWY_RESTRICT keys,
+ size_t num, const Vec<D> valueL,
+ const Vec<D> valueR, Vec<D>& third,
+ T* HWY_RESTRICT buf) {
+ const size_t N = Lanes(d);
+
+ HWY_DASSERT(num >= N);
+ size_t pos = num - N; // current read/write position
+ size_t countR = 0; // number of valueR found
+
+ // For whole vectors, in descending address order: as long as all lanes are
+ // equal to L or R, overwrite with valueR. This is faster than counting, then
+ // filling both L and R. Loop terminates after unsigned wraparound.
+ for (; pos < num; pos -= N) {
+ const Vec<D> v = LoadU(d, keys + pos);
+ // It is not clear how to apply OrXor here - that can check if *both*
+ // comparisons are true, but here we want *either*. Comparing the unsigned
+ // min of differences to zero works, but is expensive for u64 prior to AVX3.
+ const Mask<D> eqL = st.EqualKeys(d, v, valueL);
+ const Mask<D> eqR = st.EqualKeys(d, v, valueR);
+ // If there is a third value, stop and undo what we've done. On AVX-512,
+ // Or + AllTrue are folded into a single kortest, but only if we are
+ // careful with the FindKnownFirstTrue argument - see prior comment on that.
+ if (HWY_UNLIKELY(!AllTrue(d, Or(eqL, eqR)))) {
+ const size_t lane = FindKnownFirstTrue(d, ExclusiveNeither(eqL, eqR));
+ third = st.SetKey(d, keys + pos + lane);
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "found 3rd value at vec %zu; countR %zu\n", pos,
+ countR);
+ MaybePrintVector(d, "third", third, 0, st.LanesPerKey());
+ }
+ pos += N; // rewind: we haven't yet committed changes in this iteration.
+ // We have filled [pos, num) with R, but only countR of them should have
+ // been written. Rewrite [pos, num - countR) to L.
+ HWY_DASSERT(countR <= num - pos);
+ const size_t endL = num - countR;
+ for (; pos + N <= endL; pos += N) {
+ StoreU(valueL, d, keys + pos);
+ }
+ BlendedStore(valueL, FirstN(d, endL - pos), d, keys + pos);
+ return false;
+ }
+ StoreU(valueR, d, keys + pos);
+ countR += CountTrue(d, eqR);
+ }
+
+ // Final partial (or empty) vector, masked comparison.
+ const size_t remaining = pos + N;
+ HWY_DASSERT(remaining <= N);
+ const Vec<D> v = LoadU(d, keys); // Safe because num >= N.
+ const Mask<D> valid = FirstN(d, remaining);
+ const Mask<D> eqL = st.EqualKeys(d, v, valueL);
+ const Mask<D> eqR = And(st.EqualKeys(d, v, valueR), valid);
+ // Invalid lanes are considered equal.
+ const Mask<D> eq = Or(Or(eqL, eqR), Not(valid));
+ // At least one other value present; will require a regular partition.
+ if (HWY_UNLIKELY(!AllTrue(d, eq))) {
+ const size_t lane = FindKnownFirstTrue(d, Not(eq));
+ third = st.SetKey(d, keys + lane);
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "found 3rd value at partial vec %zu; writeR %zu\n", pos,
+ countR);
+ MaybePrintVector(d, "third", third, 0, st.LanesPerKey());
+ }
+ pos += N; // rewind: we haven't yet committed changes in this iteration.
+ // We have filled [pos, num) with R, but only countR of them should have
+ // been written. Rewrite [pos, num - countR) to L.
+ HWY_DASSERT(countR <= num - pos);
+ const size_t endL = num - countR;
+ for (; pos + N <= endL; pos += N) {
+ StoreU(valueL, d, keys + pos);
+ }
+ BlendedStore(valueL, FirstN(d, endL - pos), d, keys + pos);
+ return false;
+ }
+ const size_t lastR = CountTrue(d, eqR);
+ countR += lastR;
+
+ // First finish writing valueR - [0, N) lanes were not yet written.
+ StoreU(valueR, d, keys); // Safe because num >= N.
+
+ // Fill left side (ascending order for clarity)
+ const size_t endL = num - countR;
+ size_t i = 0;
+ for (; i + N <= endL; i += N) {
+ StoreU(valueL, d, keys + i);
+ }
+ Store(valueL, d, buf);
+ SafeCopyN(endL - i, d, buf, keys + i); // avoids asan overrun
+
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr,
+ "MaybePartitionTwoValueR countR %zu pos %zu i %zu endL %zu\n",
+ countR, pos, i, endL);
+ }
+
+ return true;
+}
+
+// `idx_second` is `first_mismatch` from `AllEqual` and thus the index of the
+// second key. This is the first path into `MaybePartitionTwoValue`, called
+// when all samples are equal. Returns false and sets `third` if there is at
+// least a third distinct value. Otherwise, partitions the array and returns
+// true.
+template <class D, class Traits, typename T>
+HWY_INLINE bool PartitionIfTwoKeys(D d, Traits st, const Vec<D> pivot,
+ T* HWY_RESTRICT keys, size_t num,
+ const size_t idx_second, const Vec<D> second,
+ Vec<D>& third, T* HWY_RESTRICT buf) {
+ // True if second comes before pivot.
+ const bool is_pivotR = AllFalse(d, st.Compare(d, pivot, second));
+ if (VQSORT_PRINT >= 1) {
+ fprintf(stderr, "Samples all equal, diff at %zu, isPivotR %d\n", idx_second,
+ is_pivotR);
+ }
+ HWY_DASSERT(AllFalse(d, st.EqualKeys(d, second, pivot)));
+
+ // If pivot is R, we scan backwards over the entire array. Otherwise,
+ // we already scanned up to idx_second and can leave those in place.
+ return is_pivotR ? MaybePartitionTwoValueR(d, st, keys, num, second, pivot,
+ third, buf)
+ : MaybePartitionTwoValue(d, st, keys + idx_second,
+ num - idx_second, pivot, second,
+ third, buf);
+}
+
+// Second path into `MaybePartitionTwoValue`, called when not all samples are
+// equal. `samples` is sorted.
+template <class D, class Traits, typename T>
+HWY_INLINE bool PartitionIfTwoSamples(D d, Traits st, T* HWY_RESTRICT keys,
+ size_t num, T* HWY_RESTRICT samples) {
+ constexpr size_t kSampleLanes = 3 * 64 / sizeof(T);
+ constexpr size_t N1 = st.LanesPerKey();
+ const Vec<D> valueL = st.SetKey(d, samples);
+ const Vec<D> valueR = st.SetKey(d, samples + kSampleLanes - N1);
+ HWY_DASSERT(AllTrue(d, st.Compare(d, valueL, valueR)));
+ HWY_DASSERT(AllFalse(d, st.EqualKeys(d, valueL, valueR)));
+ const Vec<D> prev = st.PrevValue(d, valueR);
+ // If the sample has more than two values, then the keys have at least that
+ // many, and thus this special case is inapplicable.
+ if (HWY_UNLIKELY(!AllTrue(d, st.EqualKeys(d, valueL, prev)))) {
+ return false;
+ }
+
+  // Must not overwrite samples: if this returns false, the caller wants to
+  // read the original samples again.
+ T* HWY_RESTRICT buf = samples + kSampleLanes;
+ Vec<D> third; // unused
+ return MaybePartitionTwoValue(d, st, keys, num, valueL, valueR, third, buf);
+}
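
The early exit above hinges on an adjacency test: if PrevValue of the largest sample equals the smallest, no key can lie strictly between them. A scalar illustration for unsigned keys (the helper name is a placeholder, not from this patch):

#include <cstddef>
#include <cstdint>

// True if the sorted samples can contain at most two distinct values, i.e.
// the largest is at most one step above the smallest.
bool SamplesHaveAtMostTwoValues(const uint64_t* sorted_samples, size_t n) {
  const uint64_t smallest = sorted_samples[0];
  const uint64_t largest = sorted_samples[n - 1];
  return largest - smallest <= 1;
}
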
+
+// ------------------------------ Pivot sampling
template <class Traits, class V>
HWY_INLINE V MedianOf3(Traits st, V v0, V v1, V v2) {
@@ -425,41 +722,6 @@ HWY_INLINE V MedianOf3(Traits st, V v0, V v1, V v2) {
return v1;
}
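
A scalar equivalent of the per-key median computed above, stated via min/max for readers who prefer that formulation (the helper name is a placeholder, not from this patch):

#include <algorithm>
#include <cstdint>

// Median of three = the larger of the two smaller values.
uint64_t MedianOf3Scalar(uint64_t a, uint64_t b, uint64_t c) {
  return std::max(std::min(a, b), std::min(std::max(a, b), c));
}
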
-// Replaces triplets with their median and recurses until less than 3 keys
-// remain. Ignores leftover values (non-whole triplets)!
-template <class D, class Traits, typename T>
-Vec<D> RecursiveMedianOf3(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
- T* HWY_RESTRICT buf) {
- const size_t N = Lanes(d);
- constexpr size_t N1 = st.LanesPerKey();
-
- if (num < 3 * N1) return st.SetKey(d, keys);
-
- size_t read = 0;
- size_t written = 0;
-
- // Triplets of vectors
- for (; read + 3 * N <= num; read += 3 * N) {
- const auto v0 = Load(d, keys + read + 0 * N);
- const auto v1 = Load(d, keys + read + 1 * N);
- const auto v2 = Load(d, keys + read + 2 * N);
- Store(MedianOf3(st, v0, v1, v2), d, buf + written);
- written += N;
- }
-
- // Triplets of keys
- for (; read + 3 * N1 <= num; read += 3 * N1) {
- const auto v0 = st.SetKey(d, keys + read + 0 * N1);
- const auto v1 = st.SetKey(d, keys + read + 1 * N1);
- const auto v2 = st.SetKey(d, keys + read + 2 * N1);
- StoreU(MedianOf3(st, v0, v1, v2), d, buf + written);
- written += N1;
- }
-
- // Tail recursion; swap buffers
- return RecursiveMedianOf3(d, st, buf, written, keys);
-}
-
#if VQSORT_SECURE_RNG
using Generator = absl::BitGen;
#else
@@ -505,19 +767,16 @@ HWY_INLINE size_t RandomChunkIndex(const uint32_t num_chunks, uint32_t bits) {
return static_cast<size_t>(chunk_index);
}
+// Writes samples from `keys[0, num)` into `buf`.
template <class D, class Traits, typename T>
-HWY_NOINLINE Vec<D> ChoosePivot(D d, Traits st, T* HWY_RESTRICT keys,
- const size_t begin, const size_t end,
- T* HWY_RESTRICT buf, Generator& rng) {
+HWY_INLINE void DrawSamples(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
+ T* HWY_RESTRICT buf, Generator& rng) {
using V = decltype(Zero(d));
const size_t N = Lanes(d);
// Power of two
const size_t lanes_per_chunk = Constants::LanesPerChunk(sizeof(T), N);
- keys += begin;
- size_t num = end - begin;
-
// Align start of keys to chunks. We always have at least 2 chunks because the
// base case would have handled anything up to 16 vectors, i.e. >= 4 chunks.
HWY_DASSERT(num >= 2 * lanes_per_chunk);
@@ -572,105 +831,530 @@ HWY_NOINLINE Vec<D> ChoosePivot(D d, Traits st, T* HWY_RESTRICT keys,
const V medians2 = MedianOf3(st, v6, v7, v8);
Store(medians2, d, buf + i + lanes_per_chunk * 2);
}
+}
+
+// For detecting inputs where (almost) all keys are equal.
+template <class D, class Traits>
+HWY_INLINE bool UnsortedSampleEqual(D d, Traits st,
+ const TFromD<D>* HWY_RESTRICT samples) {
+ constexpr size_t kSampleLanes = 3 * 64 / sizeof(TFromD<D>);
+ const size_t N = Lanes(d);
+ using V = Vec<D>;
+
+ const V first = st.SetKey(d, samples);
+ // OR of XOR-difference may be faster than comparison.
+ V diff = Zero(d);
+ size_t i = 0;
+ for (; i + N <= kSampleLanes; i += N) {
+ const V v = Load(d, samples + i);
+ diff = OrXor(diff, first, v);
+ }
+ // Remainder, if any.
+ const V v = Load(d, samples + i);
+ const auto valid = FirstN(d, kSampleLanes - i);
+ diff = IfThenElse(valid, OrXor(diff, first, v), diff);
+
+ // Must avoid floating-point comparisons (for -0)
+ const RebindToUnsigned<D> du;
+ return AllTrue(du, Eq(BitCast(du, diff), Zero(du)));
+}
+
+template <class D, class Traits, typename T>
+HWY_INLINE void SortSamples(D d, Traits st, T* HWY_RESTRICT buf) {
+ // buf contains 192 bytes, so 16 128-bit vectors are necessary and sufficient.
+ constexpr size_t kSampleLanes = 3 * 64 / sizeof(T);
+ const CappedTag<T, 16 / sizeof(T)> d128;
+ const size_t N128 = Lanes(d128);
+ constexpr size_t kCols = HWY_MIN(16 / sizeof(T), Constants::kMaxCols);
+ constexpr size_t kBytes = kCols * Constants::kMaxRows * sizeof(T);
+ static_assert(192 <= kBytes, "");
+ // Fill with padding - last in sort order.
+ const auto kPadding = st.LastValue(d128);
+ // Initialize an extra vector because SortingNetwork loads full vectors,
+ // which may exceed cols*kMaxRows.
+ for (size_t i = kSampleLanes; i <= kBytes / sizeof(T); i += N128) {
+ StoreU(kPadding, d128, buf + i);
+ }
+
+ SortingNetwork(st, buf, kCols);
+
+ if (VQSORT_PRINT >= 2) {
+ const size_t N = Lanes(d);
+ fprintf(stderr, "Samples:\n");
+ for (size_t i = 0; i < kSampleLanes; i += N) {
+ MaybePrintVector(d, "", Load(d, buf + i), 0, N);
+ }
+ }
+}
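
The padding trick above can be stated in scalar form: fill the unused tail of the buffer with a sentinel that sorts last, then sort the full fixed-size buffer; the real samples end up in the leading positions. A sketch, with std::sort standing in for the sorting network and all names as placeholders:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <limits>

void SortSamplesScalar(uint64_t* buf, size_t num_samples, size_t capacity) {
  // Pad with the ascending-order sentinel so that sorting `capacity` elements
  // leaves the `num_samples` real keys at the front.
  std::fill(buf + num_samples, buf + capacity,
            std::numeric_limits<uint64_t>::max());
  std::sort(buf, buf + capacity);
}
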
+
+// ------------------------------ Pivot selection
+
+enum class PivotResult {
+ kDone, // stop without partitioning (all equal, or two-value partition)
+ kNormal, // partition and recurse left and right
+ kIsFirst, // partition but skip left recursion
+ kWasLast, // partition but skip right recursion
+};
+
+HWY_INLINE const char* PivotResultString(PivotResult result) {
+ switch (result) {
+ case PivotResult::kDone:
+ return "done";
+ case PivotResult::kNormal:
+ return "normal";
+ case PivotResult::kIsFirst:
+ return "first";
+ case PivotResult::kWasLast:
+ return "last";
+ }
+ return "unknown";
+}
+
+template <class Traits, typename T>
+HWY_INLINE size_t PivotRank(Traits st, const T* HWY_RESTRICT samples) {
+ constexpr size_t kSampleLanes = 3 * 64 / sizeof(T);
+ constexpr size_t N1 = st.LanesPerKey();
+
+ constexpr size_t kRankMid = kSampleLanes / 2;
+ static_assert(kRankMid % N1 == 0, "Mid is not an aligned key");
+
+ // Find the previous value not equal to the median.
+ size_t rank_prev = kRankMid - N1;
+ for (; st.Equal1(samples + rank_prev, samples + kRankMid); rank_prev -= N1) {
+ // All previous samples are equal to the median.
+ if (rank_prev == 0) return 0;
+ }
+
+ size_t rank_next = rank_prev + N1;
+ for (; st.Equal1(samples + rank_next, samples + kRankMid); rank_next += N1) {
+ // The median is also the largest sample. If it is also the largest key,
+ // we'd end up with an empty right partition, so choose the previous key.
+ if (rank_next == kSampleLanes - N1) return rank_prev;
+ }
+
+ // If we choose the median as pivot, the ratio of keys ending in the left
+ // partition will likely be rank_next/kSampleLanes (if the sample is
+ // representative). This is because equal-to-pivot values also land in the
+ // left - it's infeasible to do an in-place vectorized 3-way partition.
+ // Check whether prev would lead to a more balanced partition.
+ const size_t excess_if_median = rank_next - kRankMid;
+ const size_t excess_if_prev = kRankMid - rank_prev;
+ return excess_if_median < excess_if_prev ? kRankMid : rank_prev;
+}
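
A scalar rendering of the rank selection above may make the balancing argument easier to follow; it mirrors the code with LanesPerKey = 1 (the function name is a placeholder and n is assumed to be at least 3):

#include <cstddef>
#include <cstdint>

size_t PivotRankScalar(const uint64_t* sorted, size_t n) {
  const size_t mid = n / 2;  // candidate pivot: the median sample
  // Last sample before the run of median-equal values.
  size_t rank_prev = mid - 1;
  while (sorted[rank_prev] == sorted[mid]) {
    if (rank_prev == 0) return 0;  // everything before mid equals the median
    --rank_prev;
  }
  // First sample after the run of median-equal values.
  size_t rank_next = rank_prev + 1;
  while (sorted[rank_next] == sorted[mid]) {
    // Median is also the largest sample; avoid an empty right partition.
    if (rank_next == n - 1) return rank_prev;
    ++rank_next;
  }
  // Keys equal to the pivot land in the left partition, so choosing the median
  // puts roughly rank_next/n of the keys on the left. Pick whichever of the
  // median or rank_prev keeps the split closer to the middle.
  const size_t excess_if_median = rank_next - mid;
  const size_t excess_if_prev = mid - rank_prev;
  return excess_if_median < excess_if_prev ? mid : rank_prev;
}
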
+
+// Returns pivot chosen from `samples`. It will never be the largest key
+// (thus the right partition will never be empty).
+template <class D, class Traits, typename T>
+HWY_INLINE Vec<D> ChoosePivotByRank(D d, Traits st,
+ const T* HWY_RESTRICT samples) {
+ const size_t pivot_rank = PivotRank(st, samples);
+ const Vec<D> pivot = st.SetKey(d, samples + pivot_rank);
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, " Pivot rank %zu = %f\n", pivot_rank,
+ static_cast<double>(GetLane(pivot)));
+ }
+ return pivot;
+}
+
+// Returns true if all keys equal `pivot`; otherwise returns false and sets
+// `*first_mismatch` to the index of the first differing key.
+template <class D, class Traits, typename T>
+HWY_NOINLINE bool AllEqual(D d, Traits st, const Vec<D> pivot,
+ const T* HWY_RESTRICT keys, size_t num,
+ size_t* HWY_RESTRICT first_mismatch) {
+ const size_t N = Lanes(d);
+ // Ensures we can use overlapping loads for the tail; see HandleSpecialCases.
+ HWY_DASSERT(num >= N);
+ const Vec<D> zero = Zero(d);
+
+ // Vector-align keys + i.
+ const size_t misalign =
+ (reinterpret_cast<uintptr_t>(keys) / sizeof(T)) & (N - 1);
+ HWY_DASSERT(misalign % st.LanesPerKey() == 0);
+ const size_t consume = N - misalign;
+ {
+ const Vec<D> v = LoadU(d, keys);
+ // Only check masked lanes; consider others to be equal.
+ const Mask<D> diff = And(FirstN(d, consume), st.NotEqualKeys(d, v, pivot));
+ if (HWY_UNLIKELY(!AllFalse(d, diff))) {
+ const size_t lane = FindKnownFirstTrue(d, diff);
+ *first_mismatch = lane;
+ return false;
+ }
+ }
+ size_t i = consume;
+ HWY_DASSERT(((reinterpret_cast<uintptr_t>(keys + i) / sizeof(T)) & (N - 1)) ==
+ 0);
+
+  // Sticky bits registering any difference between `keys` and `pivot`.
+ // We use vector XOR because it may be cheaper than comparisons, especially
+ // for 128-bit. 2x unrolled for more ILP.
+ Vec<D> diff0 = zero;
+ Vec<D> diff1 = zero;
+
+ // We want to stop once a difference has been found, but without slowing
+ // down the loop by comparing during each iteration. The compromise is to
+ // compare after a 'group', which consists of kLoops times two vectors.
+ constexpr size_t kLoops = 8;
+ const size_t lanes_per_group = kLoops * 2 * N;
+
+ for (; i + lanes_per_group <= num; i += lanes_per_group) {
+ HWY_DEFAULT_UNROLL
+ for (size_t loop = 0; loop < kLoops; ++loop) {
+ const Vec<D> v0 = Load(d, keys + i + loop * 2 * N);
+ const Vec<D> v1 = Load(d, keys + i + loop * 2 * N + N);
+ diff0 = OrXor(diff0, v0, pivot);
+ diff1 = OrXor(diff1, v1, pivot);
+ }
+ diff0 = Or(diff0, diff1);
+
+ // If there was a difference in the entire group: (use du because we must
+ // avoid floating-point comparisons for -0)
+ const RebindToUnsigned<D> du;
+ if (HWY_UNLIKELY(!AllTrue(du, Eq(BitCast(du, diff0), Zero(du))))) {
+ // .. then loop until the first one, with termination guarantee.
+ for (;; i += N) {
+ const Vec<D> v = Load(d, keys + i);
+ const Mask<D> diff = st.NotEqualKeys(d, v, pivot);
+ if (HWY_UNLIKELY(!AllFalse(d, diff))) {
+ const size_t lane = FindKnownFirstTrue(d, diff);
+ *first_mismatch = i + lane;
+ return false;
+ }
+ }
+ }
+ }
+
+ // Whole vectors, no unrolling, compare directly
+ for (; i + N <= num; i += N) {
+ const Vec<D> v = Load(d, keys + i);
+ const Mask<D> diff = st.NotEqualKeys(d, v, pivot);
+ if (HWY_UNLIKELY(!AllFalse(d, diff))) {
+ const size_t lane = FindKnownFirstTrue(d, diff);
+ *first_mismatch = i + lane;
+ return false;
+ }
+ }
+ // Always re-check the last (unaligned) vector to reduce branching.
+ i = num - N;
+ const Vec<D> v = LoadU(d, keys + i);
+ const Mask<D> diff = st.NotEqualKeys(d, v, pivot);
+ if (HWY_UNLIKELY(!AllFalse(d, diff))) {
+ const size_t lane = FindKnownFirstTrue(d, diff);
+ *first_mismatch = i + lane;
+ return false;
+ }
- return RecursiveMedianOf3(d, st, buf, 3 * lanes_per_chunk,
- buf + 3 * lanes_per_chunk);
+ if (VQSORT_PRINT >= 1) {
+ fprintf(stderr, "All keys equal\n");
+ }
+ return true; // all equal
}
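
The grouped early exit above has a simple scalar counterpart: OR together XOR differences for a whole group and branch only once per group, then rescan from the offending group to locate the first mismatch (the constant and name are placeholders, not from this patch):

#include <cstddef>
#include <cstdint>

bool AllEqualScalar(const uint64_t* keys, size_t num, uint64_t pivot,
                    size_t* first_mismatch) {
  constexpr size_t kGroup = 16;
  size_t i = 0;
  for (; i + kGroup <= num; i += kGroup) {
    uint64_t diff = 0;  // sticky bits: nonzero iff any key in the group differs
    for (size_t j = 0; j < kGroup; ++j) diff |= keys[i + j] ^ pivot;
    if (diff != 0) break;  // locate the exact index below
  }
  for (; i < num; ++i) {
    if (keys[i] != pivot) {
      *first_mismatch = i;
      return false;
    }
  }
  return true;
}
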
-// Compute exact min/max to detect all-equal partitions. Only called after a
-// degenerate Partition (none in the right partition).
template <class D, class Traits, typename T>
-HWY_NOINLINE void ScanMinMax(D d, Traits st, const T* HWY_RESTRICT keys,
- size_t num, T* HWY_RESTRICT buf, Vec<D>& first,
- Vec<D>& last) {
+HWY_NOINLINE bool ExistsAnyBefore(D d, Traits st, const T* HWY_RESTRICT keys,
+ size_t num, const Vec<D> pivot) {
const size_t N = Lanes(d);
+ HWY_DASSERT(num >= N); // See HandleSpecialCases
- first = st.LastValue(d);
- last = st.FirstValue(d);
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "Scanning for before\n");
+ }
size_t i = 0;
+
+ constexpr size_t kLoops = 16;
+ const size_t lanes_per_group = kLoops * N;
+
+ Vec<D> first = pivot;
+
+ // Whole group, unrolled
+ for (; i + lanes_per_group <= num; i += lanes_per_group) {
+ HWY_DEFAULT_UNROLL
+ for (size_t loop = 0; loop < kLoops; ++loop) {
+ const Vec<D> curr = LoadU(d, keys + i + loop * N);
+ first = st.First(d, first, curr);
+ }
+
+ if (HWY_UNLIKELY(!AllFalse(d, st.Compare(d, first, pivot)))) {
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "Stopped scanning at end of group %zu\n",
+ i + lanes_per_group);
+ }
+ return true;
+ }
+ }
+ // Whole vectors, no unrolling
for (; i + N <= num; i += N) {
- const Vec<D> v = LoadU(d, keys + i);
- first = st.First(d, v, first);
- last = st.Last(d, v, last);
+ const Vec<D> curr = LoadU(d, keys + i);
+ if (HWY_UNLIKELY(!AllFalse(d, st.Compare(d, curr, pivot)))) {
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "Stopped scanning at %zu\n", i);
+ }
+ return true;
+ }
}
+ // If there are remainders, re-check the last whole vector.
if (HWY_LIKELY(i != num)) {
- HWY_DASSERT(num >= N); // See HandleSpecialCases
- const Vec<D> v = LoadU(d, keys + num - N);
- first = st.First(d, v, first);
- last = st.Last(d, v, last);
+ const Vec<D> curr = LoadU(d, keys + num - N);
+ if (HWY_UNLIKELY(!AllFalse(d, st.Compare(d, curr, pivot)))) {
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "Stopped scanning at last %zu\n", num - N);
+ }
+ return true;
+ }
}
- first = st.FirstOfLanes(d, first, buf);
- last = st.LastOfLanes(d, last, buf);
+ return false; // pivot is the first
}
template <class D, class Traits, typename T>
-void Recurse(D d, Traits st, T* HWY_RESTRICT keys, const size_t begin,
- const size_t end, const Vec<D> pivot, T* HWY_RESTRICT buf,
- Generator& rng, size_t remaining_levels) {
- HWY_DASSERT(begin + 1 < end);
- const size_t num = end - begin; // >= 2
-
- // Too many degenerate partitions. This is extremely unlikely to happen
- // because we select pivots from large (though still O(1)) samples.
- if (HWY_UNLIKELY(remaining_levels == 0)) {
- HeapSort(st, keys + begin, num); // Slow but N*logN.
- return;
+HWY_NOINLINE bool ExistsAnyAfter(D d, Traits st, const T* HWY_RESTRICT keys,
+ size_t num, const Vec<D> pivot) {
+ const size_t N = Lanes(d);
+ HWY_DASSERT(num >= N); // See HandleSpecialCases
+
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "Scanning for after\n");
}
- const ptrdiff_t base_case_num =
- static_cast<ptrdiff_t>(Constants::BaseCaseNum(Lanes(d)));
- const size_t bound = Partition(d, st, keys, begin, end, pivot, buf);
-
- const ptrdiff_t num_left =
- static_cast<ptrdiff_t>(bound) - static_cast<ptrdiff_t>(begin);
- const ptrdiff_t num_right =
- static_cast<ptrdiff_t>(end) - static_cast<ptrdiff_t>(bound);
-
- // Check for degenerate partitions (i.e. Partition did not move any keys):
- if (HWY_UNLIKELY(num_right == 0)) {
- // Because the pivot is one of the keys, it must have been equal to the
- // first or last key in sort order. Scan for the actual min/max:
- // passing the current pivot as the new bound is insufficient because one of
- // the partitions might not actually include that key.
- Vec<D> first, last;
- ScanMinMax(d, st, keys + begin, num, buf, first, last);
- if (AllTrue(d, Eq(first, last))) return;
-
- // Separate recursion to make sure that we don't pick `last` as the
- // pivot - that would again lead to a degenerate partition.
- Recurse(d, st, keys, begin, end, first, buf, rng, remaining_levels - 1);
+ size_t i = 0;
+
+ constexpr size_t kLoops = 16;
+ const size_t lanes_per_group = kLoops * N;
+
+ Vec<D> last = pivot;
+
+ // Whole group, unrolled
+ for (; i + lanes_per_group <= num; i += lanes_per_group) {
+ HWY_DEFAULT_UNROLL
+ for (size_t loop = 0; loop < kLoops; ++loop) {
+ const Vec<D> curr = LoadU(d, keys + i + loop * N);
+ last = st.Last(d, last, curr);
+ }
+
+ if (HWY_UNLIKELY(!AllFalse(d, st.Compare(d, pivot, last)))) {
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "Stopped scanning at end of group %zu\n",
+ i + lanes_per_group);
+ }
+ return true;
+ }
+ }
+ // Whole vectors, no unrolling
+ for (; i + N <= num; i += N) {
+ const Vec<D> curr = LoadU(d, keys + i);
+ if (HWY_UNLIKELY(!AllFalse(d, st.Compare(d, pivot, curr)))) {
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "Stopped scanning at %zu\n", i);
+ }
+ return true;
+ }
+ }
+ // If there are remainders, re-check the last whole vector.
+ if (HWY_LIKELY(i != num)) {
+ const Vec<D> curr = LoadU(d, keys + num - N);
+ if (HWY_UNLIKELY(!AllFalse(d, st.Compare(d, pivot, curr)))) {
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "Stopped scanning at last %zu\n", num - N);
+ }
+ return true;
+ }
+ }
+
+ return false; // pivot is the last
+}
+
+// Returns pivot chosen from `keys[0, num)`. It will never be the largest key
+// (thus the right partition will never be empty).
+template <class D, class Traits, typename T>
+HWY_INLINE Vec<D> ChoosePivotForEqualSamples(D d, Traits st,
+ T* HWY_RESTRICT keys, size_t num,
+ T* HWY_RESTRICT samples,
+ Vec<D> second, Vec<D> third,
+ PivotResult& result) {
+ const Vec<D> pivot = st.SetKey(d, samples); // the single unique sample
+
+ // Early out for mostly-0 arrays, where pivot is often FirstValue.
+ if (HWY_UNLIKELY(AllTrue(d, st.EqualKeys(d, pivot, st.FirstValue(d))))) {
+ result = PivotResult::kIsFirst;
+ return pivot;
+ }
+ if (HWY_UNLIKELY(AllTrue(d, st.EqualKeys(d, pivot, st.LastValue(d))))) {
+ result = PivotResult::kWasLast;
+ return st.PrevValue(d, pivot);
+ }
+
+  // Check if pivot is between two known values. If so, it is neither the
+  // first nor the last and we can avoid scanning.
+ st.Sort2(d, second, third);
+ HWY_DASSERT(AllTrue(d, st.Compare(d, second, third)));
+ const bool before = !AllFalse(d, st.Compare(d, second, pivot));
+ const bool after = !AllFalse(d, st.Compare(d, pivot, third));
+  // Only reached if there are at least three distinct keys, which means pivot
+  // is either first, last, or in between. Thus there is another key that comes
+  // before or after.
+ HWY_DASSERT(before || after);
+ if (HWY_UNLIKELY(before)) {
+ // Neither first nor last.
+ if (HWY_UNLIKELY(after || ExistsAnyAfter(d, st, keys, num, pivot))) {
+ result = PivotResult::kNormal;
+ return pivot;
+ }
+
+ // We didn't find anything after pivot, so it is the last. Because keys
+ // equal to the pivot go to the left partition, the right partition would be
+ // empty and Partition will not have changed anything. Instead use the
+ // previous value in sort order, which is not necessarily an actual key.
+ result = PivotResult::kWasLast;
+ return st.PrevValue(d, pivot);
+ }
+
+ // Has after, and we found one before: in the middle.
+ if (HWY_UNLIKELY(ExistsAnyBefore(d, st, keys, num, pivot))) {
+ result = PivotResult::kNormal;
+ return pivot;
+ }
+
+ // Pivot is first. We could consider a special partition mode that only
+ // reads from and writes to the right side, and later fills in the left
+ // side, which we know is equal to the pivot. However, that leads to more
+ // cache misses if the array is large, and doesn't save much, hence is a
+ // net loss.
+ result = PivotResult::kIsFirst;
+ return pivot;
+}
+
+// ------------------------------ Quicksort recursion
+
+template <class D, class Traits, typename T>
+HWY_NOINLINE void PrintMinMax(D d, Traits st, const T* HWY_RESTRICT keys,
+ size_t num, T* HWY_RESTRICT buf) {
+ if (VQSORT_PRINT >= 2) {
+ const size_t N = Lanes(d);
+ if (num < N) return;
+
+ Vec<D> first = st.LastValue(d);
+ Vec<D> last = st.FirstValue(d);
+
+ size_t i = 0;
+ for (; i + N <= num; i += N) {
+ const Vec<D> v = LoadU(d, keys + i);
+ first = st.First(d, v, first);
+ last = st.Last(d, v, last);
+ }
+ if (HWY_LIKELY(i != num)) {
+ HWY_DASSERT(num >= N); // See HandleSpecialCases
+ const Vec<D> v = LoadU(d, keys + num - N);
+ first = st.First(d, v, first);
+ last = st.Last(d, v, last);
+ }
+
+ first = st.FirstOfLanes(d, first, buf);
+ last = st.LastOfLanes(d, last, buf);
+ MaybePrintVector(d, "first", first, 0, st.LanesPerKey());
+ MaybePrintVector(d, "last", last, 0, st.LanesPerKey());
+ }
+}
+
+// keys_end is the end of the entire user input, not just the current subarray
+// [keys, keys + num).
+template <class D, class Traits, typename T>
+HWY_NOINLINE void Recurse(D d, Traits st, T* HWY_RESTRICT keys,
+ T* HWY_RESTRICT keys_end, const size_t num,
+ T* HWY_RESTRICT buf, Generator& rng,
+ size_t remaining_levels) {
+ HWY_DASSERT(num != 0);
+
+ if (HWY_UNLIKELY(num <= Constants::BaseCaseNum(Lanes(d)))) {
+ BaseCase(d, st, keys, keys_end, num, buf);
return;
}
- if (HWY_UNLIKELY(num_left <= base_case_num)) {
- BaseCase(d, st, keys + begin, static_cast<size_t>(num_left), buf);
- } else {
- const Vec<D> next_pivot = ChoosePivot(d, st, keys, begin, bound, buf, rng);
- Recurse(d, st, keys, begin, bound, next_pivot, buf, rng,
- remaining_levels - 1);
+  // Placed after the BaseCase check so that printing is skipped for small
+  // subarrays.
+ if (VQSORT_PRINT >= 1) {
+ fprintf(stderr, "\n\n=== Recurse depth=%zu len=%zu\n", remaining_levels,
+ num);
+ PrintMinMax(d, st, keys, num, buf);
}
- if (HWY_UNLIKELY(num_right <= base_case_num)) {
- BaseCase(d, st, keys + bound, static_cast<size_t>(num_right), buf);
+
+ DrawSamples(d, st, keys, num, buf, rng);
+
+ Vec<D> pivot;
+ PivotResult result = PivotResult::kNormal;
+ if (HWY_UNLIKELY(UnsortedSampleEqual(d, st, buf))) {
+ pivot = st.SetKey(d, buf);
+ size_t idx_second = 0;
+ if (HWY_UNLIKELY(AllEqual(d, st, pivot, keys, num, &idx_second))) {
+ return;
+ }
+ HWY_DASSERT(idx_second % st.LanesPerKey() == 0);
+    // Must capture the value now, before PartitionIfTwoKeys potentially
+    // overwrites it.
+ const Vec<D> second = st.SetKey(d, keys + idx_second);
+ MaybePrintVector(d, "pivot", pivot, 0, st.LanesPerKey());
+ MaybePrintVector(d, "second", second, 0, st.LanesPerKey());
+
+ Vec<D> third;
+ if (HWY_UNLIKELY(PartitionIfTwoKeys(d, st, pivot, keys, num, idx_second,
+ second, third, buf))) {
+ return; // Done, skip recursion because each side has all-equal keys.
+ }
+
+ // We can no longer start scanning from idx_second because
+ // PartitionIfTwoKeys may have reordered keys.
+ pivot = ChoosePivotForEqualSamples(d, st, keys, num, buf, second, third,
+ result);
+ // If kNormal, `pivot` is very common but not the first/last. It is
+ // tempting to do a 3-way partition (to avoid moving the =pivot keys a
+ // second time), but that is a net loss due to the extra comparisons.
} else {
- const Vec<D> next_pivot = ChoosePivot(d, st, keys, bound, end, buf, rng);
- Recurse(d, st, keys, bound, end, next_pivot, buf, rng,
+ SortSamples(d, st, buf);
+
+ if (HWY_UNLIKELY(PartitionIfTwoSamples(d, st, keys, num, buf))) {
+ return;
+ }
+
+ pivot = ChoosePivotByRank(d, st, buf);
+ }
+
+ // Too many recursions. This is unlikely to happen because we select pivots
+ // from large (though still O(1)) samples.
+ if (HWY_UNLIKELY(remaining_levels == 0)) {
+ if (VQSORT_PRINT >= 1) {
+ fprintf(stderr, "HeapSort reached, size=%zu\n", num);
+ }
+ HeapSort(st, keys, num); // Slow but N*logN.
+ return;
+ }
+
+ const size_t bound = Partition(d, st, keys, num, pivot, buf);
+ if (VQSORT_PRINT >= 2) {
+ fprintf(stderr, "bound %zu num %zu result %s\n", bound, num,
+ PivotResultString(result));
+ }
+ if (HWY_LIKELY(result != PivotResult::kIsFirst)) {
+ // The left partition is not empty because the pivot is one of the keys.
+ HWY_DASSERT(0 != bound && bound != num);
+ Recurse(d, st, keys, keys_end, bound, buf, rng, remaining_levels - 1);
+ }
+ if (HWY_LIKELY(result != PivotResult::kWasLast)) {
+ // ChoosePivot* ensure pivot != last, so the right partition is never empty.
+ HWY_DASSERT(bound != num);
+ Recurse(d, st, keys + bound, keys_end, num - bound, buf, rng,
remaining_levels - 1);
}
}
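
Stripped of the vector machinery, the control flow above reduces to the skeleton below: a sorting-network-sized base case, a depth budget that falls back to heapsort, and a pivot adjustment that lets one side of the recursion be skipped when the pivot is the largest key. All names and the std:: calls are placeholders standing in for this patch's helpers.

#include <algorithm>
#include <cstddef>
#include <cstdint>

void RecurseScalar(uint64_t* keys, size_t num, size_t remaining_levels) {
  constexpr size_t kBaseCase = 16;  // stands in for Constants::BaseCaseNum
  if (num <= kBaseCase) {
    std::sort(keys, keys + num);  // stands in for the sorting-network BaseCase
    return;
  }
  if (remaining_levels == 0) {
    std::make_heap(keys, keys + num);  // stands in for HeapSort
    std::sort_heap(keys, keys + num);
    return;
  }
  uint64_t pivot = keys[num / 2];  // stands in for sample-based selection
  bool skip_right = false;
  // Mirror PivotResult::kWasLast: if the pivot is the largest key, all keys
  // equal to it would land on the left and the right side would be empty.
  // Partition by the previous value instead and skip the right recursion.
  if (std::none_of(keys, keys + num,
                   [pivot](uint64_t k) { return k > pivot; })) {
    if (std::all_of(keys, keys + num,
                    [pivot](uint64_t k) { return k == pivot; })) {
      return;  // mirrors the AllEqual early-out
    }
    pivot -= 1;         // stands in for st.PrevValue; need not be an actual key
    skip_right = true;  // the right side will consist only of the old pivot
  }
  // Keys <= pivot go left, as in the vector Partition.
  uint64_t* mid = std::partition(
      keys, keys + num, [pivot](uint64_t k) { return k <= pivot; });
  const size_t bound = static_cast<size_t>(mid - keys);
  RecurseScalar(keys, bound, remaining_levels - 1);
  if (!skip_right) {
    RecurseScalar(keys + bound, num - bound, remaining_levels - 1);
  }
}

The depth budget passed at the top level corresponds to the `2 * hwy::CeilLog2(num) + 4` introspection limit computed in Sort below.
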
// Returns true if sorting is finished.
template <class D, class Traits, typename T>
-bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
- T* HWY_RESTRICT buf) {
+HWY_INLINE bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys,
+ size_t num) {
const size_t N = Lanes(d);
const size_t base_case_num = Constants::BaseCaseNum(N);
// 128-bit keys require vectors with at least two u64 lanes, which is always
// the case unless `d` requests partial vectors (e.g. fraction = 1/2) AND the
// hardware vector width is less than 128bit / fraction.
- const bool partial_128 = N < 2 && st.Is128();
+ const bool partial_128 = !IsFull(d) && N < 2 && st.Is128();
// Partition assumes its input is at least two vectors. If vectors are huge,
 // base_case_num may actually be smaller. If so (which is only possible on
 // RVV), pass a capped or partial d (LMUL < 1). Use HWY_MAX_BYTES instead of
@@ -679,16 +1363,15 @@ bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
HWY_MAX_BYTES / sizeof(T) > Constants::kMaxRows * Constants::kMaxCols;
const bool huge_vec = kPotentiallyHuge && (2 * N > base_case_num);
if (partial_128 || huge_vec) {
- // PERFORMANCE WARNING: falling back to HeapSort.
+ if (VQSORT_PRINT >= 1) {
+ fprintf(stderr, "WARNING: using slow HeapSort: partial %d huge %d\n",
+ partial_128, huge_vec);
+ }
HeapSort(st, keys, num);
return true;
}
- // Small arrays: use sorting network, no need for other checks.
- if (HWY_UNLIKELY(num <= base_case_num)) {
- BaseCase(d, st, keys, num, buf);
- return true;
- }
+ // Small arrays are already handled by Recurse.
// We could also check for already sorted/reverse/equal, but that's probably
// counterproductive if vqsort is used as a base case.
@@ -696,7 +1379,7 @@ bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
return false; // not finished sorting
}
-#endif // HWY_TARGET
+#endif // VQSORT_ENABLED
} // namespace detail
// Sorts `keys[0..num-1]` according to the order defined by `st.Compare`.
@@ -713,12 +1396,11 @@ bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
template <class D, class Traits, typename T>
void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
T* HWY_RESTRICT buf) {
-#if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128
- (void)d;
- (void)buf;
- // PERFORMANCE WARNING: vqsort is not enabled for the non-SIMD target
- return detail::HeapSort(st, keys, num);
-#else
+ if (VQSORT_PRINT >= 1) {
+ fprintf(stderr, "=============== Sort num %zu\n", num);
+ }
+
+#if VQSORT_ENABLED || HWY_IDE
#if !HWY_HAVE_SCALABLE
// On targets with fixed-size vectors, avoid _using_ the allocated memory.
// We avoid (potentially expensive for small input sizes) allocations on
@@ -729,24 +1411,28 @@ void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
buf = storage;
#endif // !HWY_HAVE_SCALABLE
- if (detail::HandleSpecialCases(d, st, keys, num, buf)) return;
+ if (detail::HandleSpecialCases(d, st, keys, num)) return;
#if HWY_MAX_BYTES > 64
// sorting_networks-inl and traits assume no more than 512 bit vectors.
- if (Lanes(d) > 64 / sizeof(T)) {
+ if (HWY_UNLIKELY(Lanes(d) > 64 / sizeof(T))) {
return Sort(CappedTag<T, 64 / sizeof(T)>(), st, keys, num, buf);
}
#endif // HWY_MAX_BYTES > 64
- // Pulled out of the recursion so we can special-case degenerate partitions.
detail::Generator rng(keys, num);
- const Vec<D> pivot = detail::ChoosePivot(d, st, keys, 0, num, buf, rng);
// Introspection: switch to worst-case N*logN heapsort after this many.
const size_t max_levels = 2 * hwy::CeilLog2(num) + 4;
-
- detail::Recurse(d, st, keys, 0, num, pivot, buf, rng, max_levels);
-#endif // HWY_TARGET
+ detail::Recurse(d, st, keys, keys + num, num, buf, rng, max_levels);
+#else
+ (void)d;
+ (void)buf;
+ if (VQSORT_PRINT >= 1) {
+ fprintf(stderr, "WARNING: using slow HeapSort because vqsort disabled\n");
+ }
+ return detail::HeapSort(st, keys, num);
+#endif // VQSORT_ENABLED
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
diff --git a/media/highway/src/hwy/contrib/sort/vqsort.cc b/media/highway/src/hwy/contrib/sort/vqsort.cc
index 95117d8a58..b3bac0720a 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort.cc
+++ b/media/highway/src/hwy/contrib/sort/vqsort.cc
@@ -17,11 +17,9 @@
#include <string.h> // memset
-#include "hwy/aligned_allocator.h"
-
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/shared-inl.h"
@@ -36,13 +34,17 @@
#endif
#endif // VQSORT_STACK
+#if !VQSORT_STACK
+#include "hwy/aligned_allocator.h"
+#endif
+
// Check if we have sys/random.h. First skip some systems on which the check
// itself (features.h) might be problematic.
#if defined(ANDROID) || defined(__ANDROID__) || HWY_ARCH_RVV
#define VQSORT_GETRANDOM 0
#endif
-#if !defined(VQSORT_GETRANDOM) && (defined(linux) || defined(__linux__))
+#if !defined(VQSORT_GETRANDOM) && HWY_OS_LINUX
#include <features.h>
// ---- which libc
diff --git a/media/highway/src/hwy/contrib/sort/vqsort.h b/media/highway/src/hwy/contrib/sort/vqsort.h
index df1afb07db..88d78ac7f9 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort.h
+++ b/media/highway/src/hwy/contrib/sort/vqsort.h
@@ -13,8 +13,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// Interface to vectorized quicksort with dynamic dispatch. Measurements and
-// detailed description: https://arxiv.org/abs/2205.05982 .
+// Interface to vectorized quicksort with dynamic dispatch.
+// Blog post: https://tinyurl.com/vqsort-blog
+// Paper with measurements: https://arxiv.org/abs/2205.05982
+//
+// To ensure the overhead of using wide vectors (e.g. AVX2 or AVX-512) is
+// worthwhile, we recommend using this code for sorting arrays whose size is at
+// least 512 KiB.
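
A sketch of typical usage via the dynamic-dispatch interface declared below; it assumes the uint64_t overload of Sorter::operator() from the full header, and data generation is omitted:

#include <cstdint>
#include <vector>

#include "hwy/contrib/sort/vqsort.h"

void SortExample() {
  std::vector<uint64_t> keys(1u << 20);  // 8 MiB, above the 512 KiB guideline
  // ... fill keys ...
  hwy::Sorter sorter;  // reusable: construct once, sort many arrays
  sorter(keys.data(), keys.size(), hwy::SortAscending());
}
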
#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
@@ -23,15 +28,6 @@
namespace hwy {
-// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
-// https://reviews.llvm.org/D86310
-#pragma pack(push, 1)
-struct alignas(16) uint128_t {
- uint64_t lo; // little-endian layout
- uint64_t hi;
-};
-#pragma pack(pop)
-
// Tag arguments that determine the sort order.
struct SortAscending {
constexpr bool IsAscending() const { return true; }
@@ -86,6 +82,12 @@ class HWY_CONTRIB_DLLEXPORT Sorter {
void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
+ void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortAscending) const;
+ void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortDescending) const;
+
+ void operator()(K32V32* HWY_RESTRICT keys, size_t n, SortAscending) const;
+ void operator()(K32V32* HWY_RESTRICT keys, size_t n, SortDescending) const;
+
// For internal use only
static void Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes);
static bool HaveFloat64();
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_128a.cc b/media/highway/src/hwy/contrib/sort/vqsort_128a.cc
index 5ce2057f70..40daea85c7 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort_128a.cc
+++ b/media/highway/src/hwy/contrib/sort/vqsort_128a.cc
@@ -13,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128a.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"
@@ -30,9 +29,16 @@ namespace HWY_NAMESPACE {
void Sort128Asc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
+#if VQSORT_ENABLED
SortTag<uint64_t> d;
detail::SharedTraits<detail::Traits128<detail::OrderAscending128>> st;
Sort(d, st, keys, num, buf);
+#else
+ (void) keys;
+ (void) num;
+ (void) buf;
+ HWY_ASSERT(0);
+#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_128d.cc b/media/highway/src/hwy/contrib/sort/vqsort_128d.cc
index 7218e1c4d2..357da840c1 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort_128d.cc
+++ b/media/highway/src/hwy/contrib/sort/vqsort_128d.cc
@@ -13,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128d.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"
@@ -30,9 +29,16 @@ namespace HWY_NAMESPACE {
void Sort128Desc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
+#if VQSORT_ENABLED
SortTag<uint64_t> d;
detail::SharedTraits<detail::Traits128<detail::OrderDescending128>> st;
Sort(d, st, keys, num, buf);
+#else
+ (void) keys;
+ (void) num;
+ (void) buf;
+ HWY_ASSERT(0);
+#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_f32a.cc b/media/highway/src/hwy/contrib/sort/vqsort_f32a.cc
index 5934f8a496..3856eea5dd 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort_f32a.cc
+++ b/media/highway/src/hwy/contrib/sort/vqsort_f32a.cc
@@ -13,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32a.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@@ -30,7 +29,7 @@ namespace HWY_NAMESPACE {
void SortF32Asc(float* HWY_RESTRICT keys, size_t num, float* HWY_RESTRICT buf) {
SortTag<float> d;
- detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<float>>> st;
Sort(d, st, keys, num, buf);
}
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_f32d.cc b/media/highway/src/hwy/contrib/sort/vqsort_f32d.cc
index ec0469a744..7f5f97cdf2 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort_f32d.cc
+++ b/media/highway/src/hwy/contrib/sort/vqsort_f32d.cc
@@ -13,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32d.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@@ -31,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortF32Desc(float* HWY_RESTRICT keys, size_t num,
float* HWY_RESTRICT buf) {
SortTag<float> d;
- detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<float>>> st;
Sort(d, st, keys, num, buf);
}
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_f64a.cc b/media/highway/src/hwy/contrib/sort/vqsort_f64a.cc
index b701c9f3de..287d5214e5 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort_f64a.cc
+++ b/media/highway/src/hwy/contrib/sort/vqsort_f64a.cc
@@ -13,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64a.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@@ -32,7 +31,7 @@ void SortF64Asc(double* HWY_RESTRICT keys, size_t num,
double* HWY_RESTRICT buf) {
#if HWY_HAVE_FLOAT64
SortTag<double> d;
- detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<double>>> st;
Sort(d, st, keys, num, buf);
#else
(void)keys;
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_f64d.cc b/media/highway/src/hwy/contrib/sort/vqsort_f64d.cc
index 87ae9ca191..74d40c1ed3 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort_f64d.cc
+++ b/media/highway/src/hwy/contrib/sort/vqsort_f64d.cc
@@ -13,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64d.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@@ -32,7 +31,7 @@ void SortF64Desc(double* HWY_RESTRICT keys, size_t num,
double* HWY_RESTRICT buf) {
#if HWY_HAVE_FLOAT64
SortTag<double> d;
- detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<double>>> st;
Sort(d, st, keys, num, buf);
#else
(void)keys;
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_i16a.cc b/media/highway/src/hwy/contrib/sort/vqsort_i16a.cc
index 6e64eeb0e6..ef4bb75bc4 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort_i16a.cc
+++ b/media/highway/src/hwy/contrib/sort/vqsort_i16a.cc
@@ -13,20 +13,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16a.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
-// Workaround for build timeout
-#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
-
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
@@ -34,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortI16Asc(int16_t* HWY_RESTRICT keys, size_t num,
int16_t* HWY_RESTRICT buf) {
SortTag<int16_t> d;
- detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int16_t>>> st;
Sort(d, st, keys, num, buf);
}
@@ -56,5 +52,3 @@ void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,
} // namespace hwy
#endif // HWY_ONCE
-
-#endif // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_i16d.cc b/media/highway/src/hwy/contrib/sort/vqsort_i16d.cc
index 922cee3775..6507ed6080 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort_i16d.cc
+++ b/media/highway/src/hwy/contrib/sort/vqsort_i16d.cc
@@ -13,20 +13,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16d.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
-// Workaround for build timeout
-#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
-
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
@@ -34,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortI16Desc(int16_t* HWY_RESTRICT keys, size_t num,
int16_t* HWY_RESTRICT buf) {
SortTag<int16_t> d;
- detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int16_t>>> st;
Sort(d, st, keys, num, buf);
}
@@ -56,5 +52,3 @@ void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,
} // namespace hwy
#endif // HWY_ONCE
-
-#endif // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_i32a.cc b/media/highway/src/hwy/contrib/sort/vqsort_i32a.cc
index 12204fbaee..ae65be997e 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort_i32a.cc
+++ b/media/highway/src/hwy/contrib/sort/vqsort_i32a.cc
@@ -13,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32a.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@@ -31,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortI32Asc(int32_t* HWY_RESTRICT keys, size_t num,
int32_t* HWY_RESTRICT buf) {
SortTag<int32_t> d;
- detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int32_t>>> st;
Sort(d, st, keys, num, buf);
}
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_i32d.cc b/media/highway/src/hwy/contrib/sort/vqsort_i32d.cc
index fd2a4ff9fa..3ce276ee9c 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort_i32d.cc
+++ b/media/highway/src/hwy/contrib/sort/vqsort_i32d.cc
@@ -13,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32d.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@@ -31,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortI32Desc(int32_t* HWY_RESTRICT keys, size_t num,
int32_t* HWY_RESTRICT buf) {
SortTag<int32_t> d;
- detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int32_t>>> st;
Sort(d, st, keys, num, buf);
}
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_i64a.cc b/media/highway/src/hwy/contrib/sort/vqsort_i64a.cc
index 6b9d225165..901b8ead8a 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort_i64a.cc
+++ b/media/highway/src/hwy/contrib/sort/vqsort_i64a.cc
@@ -13,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64a.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@@ -31,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortI64Asc(int64_t* HWY_RESTRICT keys, size_t num,
int64_t* HWY_RESTRICT buf) {
SortTag<int64_t> d;
- detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int64_t>>> st;
Sort(d, st, keys, num, buf);
}
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_i64d.cc b/media/highway/src/hwy/contrib/sort/vqsort_i64d.cc
index ef3ac097b8..7713f2eb89 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort_i64d.cc
+++ b/media/highway/src/hwy/contrib/sort/vqsort_i64d.cc
@@ -13,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64d.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@@ -31,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortI64Desc(int64_t* HWY_RESTRICT keys, size_t num,
int64_t* HWY_RESTRICT buf) {
SortTag<int64_t> d;
- detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int64_t>>> st;
Sort(d, st, keys, num, buf);
}
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_kv128a.cc b/media/highway/src/hwy/contrib/sort/vqsort_kv128a.cc
new file mode 100644
index 0000000000..1e02742ef1
--- /dev/null
+++ b/media/highway/src/hwy/contrib/sort/vqsort_kv128a.cc
@@ -0,0 +1,65 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+// clang-format off
+// (avoid line break, which would prevent Copybara rules from matching)
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128a.cc" //NOLINT
+// clang-format on
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits128-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortKV128Asc(uint64_t* HWY_RESTRICT keys, size_t num,
+ uint64_t* HWY_RESTRICT buf) {
+#if VQSORT_ENABLED
+ SortTag<uint64_t> d;
+ detail::SharedTraits<detail::Traits128<detail::OrderAscendingKV128>> st;
+ Sort(d, st, keys, num, buf);
+#else
+ (void) keys;
+ (void) num;
+ (void) buf;
+ HWY_ASSERT(0);
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortKV128Asc);
+} // namespace
+
+void Sorter::operator()(K64V64* HWY_RESTRICT keys, size_t n,
+ SortAscending) const {
+ HWY_DYNAMIC_DISPATCH(SortKV128Asc)
+ (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_kv128d.cc b/media/highway/src/hwy/contrib/sort/vqsort_kv128d.cc
new file mode 100644
index 0000000000..3dd53b5da3
--- /dev/null
+++ b/media/highway/src/hwy/contrib/sort/vqsort_kv128d.cc
@@ -0,0 +1,65 @@
+// Copyright 2021 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+// clang-format off
+// (avoid line break, which would prevent Copybara rules from matching)
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128d.cc" //NOLINT
+// clang-format on
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits128-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortKV128Desc(uint64_t* HWY_RESTRICT keys, size_t num,
+ uint64_t* HWY_RESTRICT buf) {
+#if VQSORT_ENABLED
+ SortTag<uint64_t> d;
+ detail::SharedTraits<detail::Traits128<detail::OrderDescendingKV128>> st;
+ Sort(d, st, keys, num, buf);
+#else
+ (void) keys;
+ (void) num;
+ (void) buf;
+ HWY_ASSERT(0);
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortKV128Desc);
+} // namespace
+
+void Sorter::operator()(K64V64* HWY_RESTRICT keys, size_t n,
+ SortDescending) const {
+ HWY_DYNAMIC_DISPATCH(SortKV128Desc)
+ (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_kv64a.cc b/media/highway/src/hwy/contrib/sort/vqsort_kv64a.cc
new file mode 100644
index 0000000000..c513e3c4ce
--- /dev/null
+++ b/media/highway/src/hwy/contrib/sort/vqsort_kv64a.cc
@@ -0,0 +1,65 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+// clang-format off
+// (avoid line break, which would prevent Copybara rules from matching)
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv64a.cc" //NOLINT
+// clang-format on
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortKV64Asc(uint64_t* HWY_RESTRICT keys, size_t num,
+ uint64_t* HWY_RESTRICT buf) {
+#if VQSORT_ENABLED
+ SortTag<uint64_t> d;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderAscendingKV64>> st;
+ Sort(d, st, keys, num, buf);
+#else
+ (void) keys;
+ (void) num;
+ (void) buf;
+ HWY_ASSERT(0);
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortKV64Asc);
+} // namespace
+
+void Sorter::operator()(K32V32* HWY_RESTRICT keys, size_t n,
+ SortAscending) const {
+ HWY_DYNAMIC_DISPATCH(SortKV64Asc)
+ (reinterpret_cast<uint64_t*>(keys), n, Get<uint64_t>());
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_kv64d.cc b/media/highway/src/hwy/contrib/sort/vqsort_kv64d.cc
new file mode 100644
index 0000000000..c6c5fdcf74
--- /dev/null
+++ b/media/highway/src/hwy/contrib/sort/vqsort_kv64d.cc
@@ -0,0 +1,65 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/contrib/sort/vqsort.h"
+
+#undef HWY_TARGET_INCLUDE
+// clang-format off
+// (avoid line break, which would prevent Copybara rules from matching)
+#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv64d.cc" //NOLINT
+// clang-format on
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// After foreach_target
+#include "hwy/contrib/sort/traits-inl.h"
+#include "hwy/contrib/sort/vqsort-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+void SortKV64Desc(uint64_t* HWY_RESTRICT keys, size_t num,
+ uint64_t* HWY_RESTRICT buf) {
+#if VQSORT_ENABLED
+ SortTag<uint64_t> d;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderDescendingKV64>> st;
+ Sort(d, st, keys, num, buf);
+#else
+ (void) keys;
+ (void) num;
+ (void) buf;
+ HWY_ASSERT(0);
+#endif
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(SortKV64Desc);
+} // namespace
+
+void Sorter::operator()(K32V32* HWY_RESTRICT keys, size_t n,
+ SortDescending) const {
+ HWY_DYNAMIC_DISPATCH(SortKV64Desc)
+ (reinterpret_cast<uint64_t*>(keys), n, Get<uint64_t>());
+}
+
+} // namespace hwy
+#endif // HWY_ONCE
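
The two new translation units above instantiate vqsort for 64-bit key-value lanes: the caller's K32V32 pairs are reinterpreted as uint64_t, and the OrderAscendingKV64 / OrderDescendingKV64 traits order them by the key half (the upper 32 bits). A minimal standalone sketch of that packing idea follows — plain C++ with std::sort, not the vqsort implementation, and note std::sort also breaks ties by value whereas the KV64 traits compare only the key; the PackKV helper is illustrative only.

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative packing: key in the upper 32 bits, value in the lower 32 bits,
// so sorting the packed uint64_t ascending orders primarily by key.
uint64_t PackKV(uint32_t key, uint32_t value) {
  return (static_cast<uint64_t>(key) << 32) | value;
}

int main() {
  std::vector<uint64_t> packed = {PackKV(3, 30), PackKV(1, 10), PackKV(2, 20)};
  // The new vqsort_kv64a.cc sorts such packed lanes with SIMD; std::sort here
  // only demonstrates the ordering that the packing produces.
  std::sort(packed.begin(), packed.end());
  for (uint64_t kv : packed) {
    printf("key=%u value=%u\n", static_cast<uint32_t>(kv >> 32),
           static_cast<uint32_t>(kv & 0xFFFFFFFFu));
  }
  return 0;
}
```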
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_u16a.cc b/media/highway/src/hwy/contrib/sort/vqsort_u16a.cc
index 8bef7fba32..0a97ffa923 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort_u16a.cc
+++ b/media/highway/src/hwy/contrib/sort/vqsort_u16a.cc
@@ -13,20 +13,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16a.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
-// Workaround for build timeout
-#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
-
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
@@ -34,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortU16Asc(uint16_t* HWY_RESTRICT keys, size_t num,
uint16_t* HWY_RESTRICT buf) {
SortTag<uint16_t> d;
- detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint16_t>>> st;
Sort(d, st, keys, num, buf);
}
@@ -56,5 +52,3 @@ void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,
} // namespace hwy
#endif // HWY_ONCE
-
-#endif // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_u16d.cc b/media/highway/src/hwy/contrib/sort/vqsort_u16d.cc
index 4120873b9f..286ebbba65 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort_u16d.cc
+++ b/media/highway/src/hwy/contrib/sort/vqsort_u16d.cc
@@ -13,20 +13,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16d.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
-// Workaround for build timeout
-#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
-
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
@@ -34,7 +30,8 @@ namespace HWY_NAMESPACE {
void SortU16Desc(uint16_t* HWY_RESTRICT keys, size_t num,
uint16_t* HWY_RESTRICT buf) {
SortTag<uint16_t> d;
- detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint16_t>>>
+ st;
Sort(d, st, keys, num, buf);
}
@@ -56,5 +53,3 @@ void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,
} // namespace hwy
#endif // HWY_ONCE
-
-#endif // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_u32a.cc b/media/highway/src/hwy/contrib/sort/vqsort_u32a.cc
index 4d33705b14..b6a69e6e28 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort_u32a.cc
+++ b/media/highway/src/hwy/contrib/sort/vqsort_u32a.cc
@@ -13,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32a.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@@ -31,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortU32Asc(uint32_t* HWY_RESTRICT keys, size_t num,
uint32_t* HWY_RESTRICT buf) {
SortTag<uint32_t> d;
- detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint32_t>>> st;
Sort(d, st, keys, num, buf);
}
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_u32d.cc b/media/highway/src/hwy/contrib/sort/vqsort_u32d.cc
index e73fb82b6d..38fc1e1bfe 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort_u32d.cc
+++ b/media/highway/src/hwy/contrib/sort/vqsort_u32d.cc
@@ -13,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32d.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@@ -31,7 +30,8 @@ namespace HWY_NAMESPACE {
void SortU32Desc(uint32_t* HWY_RESTRICT keys, size_t num,
uint32_t* HWY_RESTRICT buf) {
SortTag<uint32_t> d;
- detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint32_t>>>
+ st;
Sort(d, st, keys, num, buf);
}
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_u64a.cc b/media/highway/src/hwy/contrib/sort/vqsort_u64a.cc
index b1dc3f78b3..a29824a6f9 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort_u64a.cc
+++ b/media/highway/src/hwy/contrib/sort/vqsort_u64a.cc
@@ -13,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64a.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@@ -31,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortU64Asc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
SortTag<uint64_t> d;
- detail::SharedTraits<detail::TraitsLane<detail::OrderAscending>> st;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint64_t>>> st;
Sort(d, st, keys, num, buf);
}
diff --git a/media/highway/src/hwy/contrib/sort/vqsort_u64d.cc b/media/highway/src/hwy/contrib/sort/vqsort_u64d.cc
index 43acfd238f..d692458623 100644
--- a/media/highway/src/hwy/contrib/sort/vqsort_u64d.cc
+++ b/media/highway/src/hwy/contrib/sort/vqsort_u64d.cc
@@ -13,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64d.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@@ -31,7 +30,8 @@ namespace HWY_NAMESPACE {
void SortU64Desc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
SortTag<uint64_t> d;
- detail::SharedTraits<detail::TraitsLane<detail::OrderDescending>> st;
+ detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint64_t>>>
+ st;
Sort(d, st, keys, num, buf);
}
diff --git a/media/highway/src/hwy/detect_compiler_arch.h b/media/highway/src/hwy/detect_compiler_arch.h
index ffdc3552f8..98c6a55b04 100644
--- a/media/highway/src/hwy/detect_compiler_arch.h
+++ b/media/highway/src/hwy/detect_compiler_arch.h
@@ -30,9 +30,8 @@
//------------------------------------------------------------------------------
// Compiler
-// clang-cl defines _MSC_VER but doesn't behave like MSVC in other aspects like
-// used in HWY_DIAGNOSTICS(). We include a check that we are not clang for that
-// purpose.
+// Actual MSVC, not clang-cl, which defines _MSC_VER but doesn't behave like
+// MSVC in other aspects (e.g. HWY_DIAGNOSTICS).
#if defined(_MSC_VER) && !defined(__clang__)
#define HWY_COMPILER_MSVC _MSC_VER
#else
@@ -51,19 +50,31 @@
#define HWY_COMPILER_ICC 0
#endif
+#ifdef __INTEL_LLVM_COMPILER
+#define HWY_COMPILER_ICX __INTEL_LLVM_COMPILER
+#else
+#define HWY_COMPILER_ICX 0
+#endif
+
+// HWY_COMPILER_GCC is a generic macro for all compilers implementing the GNU
+// compiler extensions (e.g. Clang, Intel...)
#ifdef __GNUC__
#define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
#else
#define HWY_COMPILER_GCC 0
#endif
-// Clang can masquerade as MSVC/GCC, in which case both are set.
+// Clang or clang-cl, not GCC.
#ifdef __clang__
-#ifdef __APPLE__
-// Apple LLVM version is unrelated to the actual Clang version, which we need
-// for enabling workarounds. Use the presence of warning flags to deduce it.
+// In case of Apple LLVM (whose version number is unrelated to that of LLVM) or
+// an invalid version number, deduce it from the presence of warnings.
// Adapted from https://github.com/simd-everywhere/simde/ simde-detect-clang.h.
-#if __has_warning("-Wformat-insufficient-args")
+#if defined(__apple_build_version__) || __clang_major__ >= 999
+#if __has_warning("-Wbitwise-instead-of-logical")
+#define HWY_COMPILER_CLANG 1400
+#elif __has_warning("-Wreserved-identifier")
+#define HWY_COMPILER_CLANG 1300
+#elif __has_warning("-Wformat-insufficient-args")
#define HWY_COMPILER_CLANG 1200
#elif __has_warning("-Wimplicit-const-int-float-conversion")
#define HWY_COMPILER_CLANG 1100
@@ -74,24 +85,42 @@
#elif __has_warning("-Wextra-semi-stmt") || \
__has_builtin(__builtin_rotateleft32)
#define HWY_COMPILER_CLANG 800
-#elif __has_warning("-Wc++98-compat-extra-semi")
+// For reasons unknown, XCode 10.3 (Apple LLVM version 10.0.1) is apparently
+// based on Clang 7, but does not support the warning we test.
+// See https://en.wikipedia.org/wiki/Xcode#Toolchain_versions and
+// https://trac.macports.org/wiki/XcodeVersionInfo.
+#elif __has_warning("-Wc++98-compat-extra-semi") || \
+ (defined(__apple_build_version__) && __apple_build_version__ >= 10010000)
#define HWY_COMPILER_CLANG 700
#else // Anything older than 7.0 is not recommended for Highway.
#define HWY_COMPILER_CLANG 600
#endif // __has_warning chain
-#else // Non-Apple: normal version
+#else // use normal version
#define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
#endif
#else // Not clang
#define HWY_COMPILER_CLANG 0
#endif
+#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
+#define HWY_COMPILER_GCC_ACTUAL HWY_COMPILER_GCC
+#else
+#define HWY_COMPILER_GCC_ACTUAL 0
+#endif
+
// More than one may be nonzero, but we want at least one.
-#if !HWY_COMPILER_MSVC && !HWY_COMPILER_CLANGCL && !HWY_COMPILER_ICC && \
- !HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
+#if 0 == (HWY_COMPILER_MSVC + HWY_COMPILER_CLANGCL + HWY_COMPILER_ICC + \
+ HWY_COMPILER_GCC + HWY_COMPILER_CLANG)
#error "Unsupported compiler"
#endif
+// We should only detect one of these (only clang/clangcl overlap)
+#if 1 < \
+ (!!HWY_COMPILER_MSVC + !!HWY_COMPILER_ICC + !!HWY_COMPILER_GCC_ACTUAL + \
+ !!(HWY_COMPILER_CLANGCL | HWY_COMPILER_CLANG))
+#error "Detected multiple compilers"
+#endif
+
#ifdef __has_builtin
#define HWY_HAS_BUILTIN(name) __has_builtin(name)
#else
@@ -147,7 +176,7 @@
#define HWY_ARCH_ARM_A64 0
#endif
-#if defined(__arm__) || defined(_M_ARM)
+#if (defined(__ARM_ARCH) && __ARM_ARCH == 7) || (defined(_M_ARM) && _M_ARM == 7)
#define HWY_ARCH_ARM_V7 1
#else
#define HWY_ARCH_ARM_V7 0
@@ -157,12 +186,20 @@
#error "Cannot have both A64 and V7"
#endif
+// Any *supported* version of Arm, i.e. 7 or later
#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
#define HWY_ARCH_ARM 1
#else
#define HWY_ARCH_ARM 0
#endif
+// Older than v7 (e.g. armel aka Arm v5), in which case we do not support SIMD.
+#if (defined(__arm__) || defined(_M_ARM)) && !HWY_ARCH_ARM
+#define HWY_ARCH_ARM_OLD 1
+#else
+#define HWY_ARCH_ARM_OLD 0
+#endif
+
#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
#define HWY_ARCH_WASM 1
#else
@@ -177,9 +214,21 @@
// It is an error to detect multiple architectures at the same time, but OK to
// detect none of the above.
-#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \
- HWY_ARCH_RVV) > 1
+#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_ARM_OLD + \
+ HWY_ARCH_WASM + HWY_ARCH_RVV) > 1
#error "Must not detect more than one architecture"
#endif
+#if defined(_WIN32) || defined(_WIN64)
+#define HWY_OS_WIN 1
+#else
+#define HWY_OS_WIN 0
+#endif
+
+#if defined(linux) || defined(__linux__)
+#define HWY_OS_LINUX 1
+#else
+#define HWY_OS_LINUX 0
+#endif
+
#endif // HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
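
HWY_COMPILER_GCC_ACTUAL introduced above is nonzero only for genuine GCC, since Clang also defines __GNUC__ and therefore HWY_COMPILER_GCC. A minimal sketch of how a caller might use it to guard a GCC-only diagnostic, mirroring the later arm_neon-inl.h change; the pragma chosen here is only an example.

```cpp
#include "hwy/detect_compiler_arch.h"

// -Wmaybe-uninitialized is a GCC warning; keying the pragma off
// HWY_COMPILER_GCC_ACTUAL keeps it from being emitted under clang/clang-cl,
// which would otherwise warn about the unknown option.
#if HWY_COMPILER_GCC_ACTUAL
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif

int ReadOrZero(const int* p) { return p ? *p : 0; }

#if HWY_COMPILER_GCC_ACTUAL
#pragma GCC diagnostic pop
#endif
```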
diff --git a/media/highway/src/hwy/detect_targets.h b/media/highway/src/hwy/detect_targets.h
index 9c453c6f9b..7f7e179b31 100644
--- a/media/highway/src/hwy/detect_targets.h
+++ b/media/highway/src/hwy/detect_targets.h
@@ -23,7 +23,7 @@
//------------------------------------------------------------------------------
// Optional configuration
-// See ../quick_reference.md for documentation of these macros.
+// See g3doc/quick_reference.md for documentation of these macros.
// Uncomment to override the default baseline determined from predefined macros:
// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
@@ -51,60 +51,71 @@
// All values are unconditionally defined so we can test HWY_TARGETS without
// first checking the HWY_ARCH_*.
//
-// The C99 preprocessor evaluates #if expressions using intmax_t types, so we
-// can use 32-bit literals.
-
-// 1,2: reserved
-
+// The C99 preprocessor evaluates #if expressions using intmax_t types. This
+// holds at least 64 bits in practice (verified 2022-07-18 via Godbolt on
+// 32-bit clang/GCC/MSVC compilers for x86/Arm7/AArch32/RISC-V/WASM). We now
+// avoid overflow when computing HWY_TARGETS (subtracting one instead of
+// left-shifting 2^62), but still do not use bit 63 because it is the sign bit.
+
+// --------------------------- x86: 15 targets (+ one fallback)
+// Bits 0..6 reserved (7 targets)
// Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VPOPCNTDQ, VBMI, VBMI2,
// VAES, BITALG). Later to be added: BF16 (Cooper Lake). VP2INTERSECT is only in
// Tiger Lake? We do not yet have uses for GFNI.
-#define HWY_AVX3_DL 4 // see HWY_WANT_AVX3_DL below
-#define HWY_AVX3 8
-#define HWY_AVX2 16
-// 32: reserved for AVX
-#define HWY_SSE4 64
-#define HWY_SSSE3 128
-// 0x100, 0x200: reserved for SSE3, SSE2
-
+#define HWY_AVX3_DL (1LL << 7) // see HWY_WANT_AVX3_DL below
+#define HWY_AVX3 (1LL << 8)
+#define HWY_AVX2 (1LL << 9)
+// Bit 10: reserved for AVX
+#define HWY_SSE4 (1LL << 11)
+#define HWY_SSSE3 (1LL << 12)
+// Bits 13..14 reserved for SSE3 or SSE2 (2 targets)
// The highest bit in the HWY_TARGETS mask that a x86 target can have. Used for
// dynamic dispatch. All x86 target bits must be lower or equal to
// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
// HWY_MAX_DYNAMIC_TARGETS in total.
-#define HWY_HIGHEST_TARGET_BIT_X86 9
-
-#define HWY_SVE2 0x400
-#define HWY_SVE 0x800
-// 0x1000 reserved for Helium
-#define HWY_NEON 0x2000
-
-#define HWY_HIGHEST_TARGET_BIT_ARM 13
-
-// 0x4000, 0x8000 reserved
-#define HWY_PPC8 0x10000 // v2.07 or 3
-// 0x20000, 0x40000 reserved for prior VSX/AltiVec
-
-#define HWY_HIGHEST_TARGET_BIT_PPC 18
-
-#define HWY_WASM2 0x80000 // Experimental
-#define HWY_WASM 0x100000
-
-#define HWY_HIGHEST_TARGET_BIT_WASM 20
-
-// 0x200000, 0x400000, 0x800000 reserved
-
-#define HWY_RVV 0x1000000
-
-#define HWY_HIGHEST_TARGET_BIT_RVV 24
-
-// 0x2000000, 0x4000000, 0x8000000 reserved
-
-#define HWY_EMU128 0x10000000
-#define HWY_SCALAR 0x20000000
-
-#define HWY_HIGHEST_TARGET_BIT_SCALAR 29
-
-// Cannot use higher values, otherwise HWY_TARGETS computation might overflow.
+#define HWY_HIGHEST_TARGET_BIT_X86 14
+
+// --------------------------- Arm: 15 targets (+ one fallback)
+// Bits 15..23 reserved (9 targets)
+#define HWY_SVE2_128 (1LL << 24) // specialized target (e.g. Arm N2)
+#define HWY_SVE_256 (1LL << 25) // specialized target (e.g. Arm V1)
+#define HWY_SVE2 (1LL << 26)
+#define HWY_SVE (1LL << 27)
+#define HWY_NEON (1LL << 28) // On A64, includes/requires AES
+// Bit 29 reserved (Helium?)
+#define HWY_HIGHEST_TARGET_BIT_ARM 29
+
+// --------------------------- RISC-V: 9 targets (+ one fallback)
+// Bits 30..36 reserved (7 targets)
+#define HWY_RVV (1LL << 37)
+// Bit 38 reserved
+#define HWY_HIGHEST_TARGET_BIT_RVV 38
+
+// --------------------------- Future expansion: 4 targets
+// Bits 39..42 reserved
+
+
+// --------------------------- IBM Power: 9 targets (+ one fallback)
+// Bits 43..48 reserved (6 targets)
+#define HWY_PPC8 (1LL << 49) // v2.07 or 3
+// Bits 50..51 reserved for prior VSX/AltiVec (2 targets)
+#define HWY_HIGHEST_TARGET_BIT_PPC 51
+
+// --------------------------- WebAssembly: 9 targets (+ one fallback)
+// Bits 52..57 reserved (6 targets)
+#define HWY_WASM_EMU256 (1LL << 58) // Experimental
+#define HWY_WASM (1LL << 59)
+// Bit 60 reserved
+#define HWY_HIGHEST_TARGET_BIT_WASM 60
+
+// --------------------------- Emulation: 2 targets
+
+#define HWY_EMU128 (1LL << 61)
+// We do not add/left-shift, so this will not overflow to a negative number.
+#define HWY_SCALAR (1LL << 62)
+#define HWY_HIGHEST_TARGET_BIT_SCALAR 62
+
+// Do not use bit 63 - would be confusing to have negative numbers.
//------------------------------------------------------------------------------
// Set default blocklists
@@ -144,9 +155,9 @@
#define HWY_BROKEN_TARGETS (HWY_NEON)
// SVE[2] require recent clang or gcc versions.
-#elif (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) ||\
-(!HWY_COMPILER_CLANG && HWY_COMPILER_GCC && HWY_COMPILER_GCC < 1000)
-#define HWY_BROKEN_TARGETS (HWY_SVE | HWY_SVE2)
+#elif (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \
+ (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000)
+#define HWY_BROKEN_TARGETS (HWY_SVE | HWY_SVE2 | HWY_SVE_256 | HWY_SVE2_128)
#else
#define HWY_BROKEN_TARGETS 0
@@ -158,6 +169,19 @@
#define HWY_ENABLED(targets) \
((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
+// Opt-out for EMU128 (affected by a GCC bug on multiple arches, fixed in 12.3:
+// see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322). This is separate
+// from HWY_BROKEN_TARGETS because it affects the fallback target, which must
+// always be enabled. If 1, we instead choose HWY_SCALAR even without
+// HWY_COMPILE_ONLY_SCALAR being set.
+#if !defined(HWY_BROKEN_EMU128) // allow overriding
+#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1203
+#define HWY_BROKEN_EMU128 1
+#else
+#define HWY_BROKEN_EMU128 0
+#endif
+#endif // HWY_BROKEN_EMU128
+
//------------------------------------------------------------------------------
// Detect baseline targets using predefined macros
@@ -165,7 +189,7 @@
// instructions, implying the target CPU would have to support them. This does
// not take the blocklist into account.
-#if defined(HWY_COMPILE_ONLY_SCALAR)
+#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
#define HWY_BASELINE_SCALAR HWY_SCALAR
#else
#define HWY_BASELINE_SCALAR HWY_EMU128
@@ -176,7 +200,7 @@
#if HWY_ARCH_WASM && defined(__wasm_simd128__)
#if defined(HWY_WANT_WASM2)
-#define HWY_BASELINE_WASM HWY_WASM2
+#define HWY_BASELINE_WASM HWY_WASM_EMU256
#else
#define HWY_BASELINE_WASM HWY_WASM
#endif // HWY_WANT_WASM2
@@ -191,27 +215,47 @@
#define HWY_BASELINE_PPC8 0
#endif
-#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2)
-#define HWY_BASELINE_SVE2 HWY_SVE2
-#else
#define HWY_BASELINE_SVE2 0
-#endif
+#define HWY_BASELINE_SVE 0
+#define HWY_BASELINE_NEON 0
-#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE)
-#define HWY_BASELINE_SVE HWY_SVE
+#if HWY_ARCH_ARM
+
+#if defined(__ARM_FEATURE_SVE2)
+#undef HWY_BASELINE_SVE2 // was 0, will be re-defined
+// If user specified -msve-vector-bits=128, they assert the vector length is
+// 128 bits and we should use the HWY_SVE2_128 (more efficient for some ops).
+#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 128
+#define HWY_BASELINE_SVE2 HWY_SVE2_128
+// Otherwise we're not sure what the vector length will be. The baseline must be
+// unconditionally valid, so we can only assume HWY_SVE2. However, when running
+// on a CPU with 128-bit vectors, user code that supports dynamic dispatch will
+// still benefit from HWY_SVE2_128 because we add it to HWY_ATTAINABLE_TARGETS.
#else
-#define HWY_BASELINE_SVE 0
-#endif
+#define HWY_BASELINE_SVE2 HWY_SVE2
+#endif // __ARM_FEATURE_SVE_BITS
+#endif // __ARM_FEATURE_SVE2
+
+#if defined(__ARM_FEATURE_SVE)
+#undef HWY_BASELINE_SVE // was 0, will be re-defined
+// See above. If user-specified vector length matches our optimization, use it.
+#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
+#define HWY_BASELINE_SVE HWY_SVE_256
+#else
+#define HWY_BASELINE_SVE HWY_SVE
+#endif // __ARM_FEATURE_SVE_BITS
+#endif // __ARM_FEATURE_SVE
// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
-#if HWY_ARCH_ARM && (defined(__ARM_NEON__) || defined(__ARM_NEON))
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#undef HWY_BASELINE_NEON
#define HWY_BASELINE_NEON HWY_NEON
-#else
-#define HWY_BASELINE_NEON 0
#endif
+#endif // HWY_ARCH_ARM
+
// Special handling for MSVC because it has fewer predefined macros:
-#if HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG
+#if HWY_COMPILER_MSVC
// 1) We can only be sure SSSE3/SSE4 are enabled if AVX is:
// https://stackoverflow.com/questions/18563978/.
@@ -337,10 +381,22 @@
//------------------------------------------------------------------------------
// Choose targets for dynamic dispatch according to one of four policies
-#if defined(HWY_COMPILE_ONLY_SCALAR) && defined(HWY_COMPILE_ONLY_STATIC)
-#error "Defined both HWY_COMPILE_ONLY_{SCALAR|STATIC} - bug?"
+#if 1 < (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_EMU128) + \
+ defined(HWY_COMPILE_ONLY_STATIC))
+#error "Can only define one of HWY_COMPILE_ONLY_{SCALAR|EMU128|STATIC} - bug?"
+#endif
+// Defining one of HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.
+
+// Clang, GCC and MSVC allow runtime dispatch on x86.
+#if HWY_ARCH_X86
+#define HWY_HAVE_RUNTIME_DISPATCH 1
+// On Arm, currently only GCC does, and we require Linux to detect CPU
+// capabilities.
+#elif HWY_ARCH_ARM && HWY_COMPILER_GCC_ACTUAL && HWY_OS_LINUX
+#define HWY_HAVE_RUNTIME_DISPATCH 1
+#else
+#define HWY_HAVE_RUNTIME_DISPATCH 0
#endif
-// Defining either HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.
// AVX3_DL is not widely available yet. To reduce code size and compile time,
// only include it in the set of attainable targets (for dynamic dispatch) if
@@ -351,19 +407,45 @@
#define HWY_ATTAINABLE_AVX3_DL 0
#endif
+#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
+ (HWY_ENABLED_BASELINE & (HWY_SVE | HWY_SVE_256)))
+#define HWY_ATTAINABLE_SVE HWY_ENABLED(HWY_SVE | HWY_SVE_256)
+#else
+#define HWY_ATTAINABLE_SVE 0
+#endif
+
+#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
+ (HWY_ENABLED_BASELINE & (HWY_SVE2 | HWY_SVE2_128)))
+#define HWY_ATTAINABLE_SVE2 HWY_ENABLED(HWY_SVE2 | HWY_SVE2_128)
+#else
+#define HWY_ATTAINABLE_SVE2 0
+#endif
+
// Attainable means enabled and the compiler allows intrinsics (even when not
// allowed to autovectorize). Used in 3 and 4.
#if HWY_ARCH_X86
#define HWY_ATTAINABLE_TARGETS \
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | \
HWY_AVX3 | HWY_ATTAINABLE_AVX3_DL)
+#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
+#define HWY_ATTAINABLE_TARGETS \
+ HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_NEON | HWY_ATTAINABLE_SVE | \
+ HWY_ATTAINABLE_SVE2)
#else
-#define HWY_ATTAINABLE_TARGETS HWY_ENABLED_BASELINE
+#define HWY_ATTAINABLE_TARGETS \
+ (HWY_ENABLED_BASELINE | HWY_ATTAINABLE_SVE | HWY_ATTAINABLE_SVE2)
#endif
-// 1) For older compilers: disable all SIMD (could also set HWY_DISABLED_TARGETS
-// to ~HWY_SCALAR, but this is more explicit).
-#if defined(HWY_COMPILE_ONLY_SCALAR)
+// 1) For older compilers: avoid SIMD intrinsics, but still support all ops.
+#if defined(HWY_COMPILE_ONLY_EMU128) && !HWY_BROKEN_EMU128
+#undef HWY_STATIC_TARGET
+#define HWY_STATIC_TARGET HWY_EMU128 // override baseline
+#define HWY_TARGETS HWY_EMU128
+
+// 1b) HWY_SCALAR is less capable than HWY_EMU128 (which supports all ops), but
+// we currently still support it for backwards compatibility.
+#elif defined(HWY_COMPILE_ONLY_SCALAR) || \
+ (defined(HWY_COMPILE_ONLY_EMU128) && HWY_BROKEN_EMU128)
#undef HWY_STATIC_TARGET
#define HWY_STATIC_TARGET HWY_SCALAR // override baseline
#define HWY_TARGETS HWY_SCALAR
@@ -377,9 +459,12 @@
#define HWY_TARGETS HWY_ATTAINABLE_TARGETS
// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
-// excluding superseded targets, in particular scalar.
+// excluding superseded targets, in particular scalar. Note: HWY_STATIC_TARGET
+// may be 2^62 (HWY_SCALAR), so we must not left-shift/add it. Subtracting one
+// sets all lower bits (better targets), then we also include the static target.
#else
-#define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1))
+#define HWY_TARGETS \
+ (HWY_ATTAINABLE_TARGETS & ((HWY_STATIC_TARGET - 1LL) | HWY_STATIC_TARGET))
#endif // target policy
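
The default dispatch policy now avoids overflow by ORing the static target with (static target - 1), which yields the static target plus every better (lower-bit) target. A small standalone sketch with mock bit values mirroring the layout defined above — these constants are stand-ins, not the Highway macros themselves.

```cpp
#include <cstdint>
#include <cstdio>

// Mock target bits matching the layout above: lower bit = better target.
constexpr int64_t kAVX3 = 1LL << 8;
constexpr int64_t kAVX2 = 1LL << 9;
constexpr int64_t kSSE4 = 1LL << 11;
constexpr int64_t kSSSE3 = 1LL << 12;

// Equivalent of (HWY_STATIC_TARGET - 1LL) | HWY_STATIC_TARGET: the static
// target plus all lower (better) bits, safe even when the static target is
// bit 62 (HWY_SCALAR), where 2 * HWY_STATIC_TARGET would overflow.
constexpr int64_t StaticAndBetter(int64_t static_target) {
  return (static_target - 1LL) | static_target;
}

int main() {
  const int64_t attainable = kAVX3 | kAVX2 | kSSE4 | kSSSE3;
  const int64_t targets = attainable & StaticAndBetter(kSSE4);
  // Keeps AVX3, AVX2 and SSE4, but drops SSSE3 (worse than the SSE4 baseline).
  printf("targets mask = 0x%llx\n", static_cast<unsigned long long>(targets));
  return 0;
}
```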
diff --git a/media/highway/src/hwy/examples/benchmark.cc b/media/highway/src/hwy/examples/benchmark.cc
index 136c829afa..8ab8108949 100644
--- a/media/highway/src/hwy/examples/benchmark.cc
+++ b/media/highway/src/hwy/examples/benchmark.cc
@@ -13,10 +13,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
-#include "hwy/foreach_target.h"
-
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS // before inttypes.h
+#endif
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
@@ -25,8 +24,12 @@
#include <memory>
#include <numeric> // iota
-#include "hwy/aligned_allocator.h"
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
// Must come after foreach_target.h to avoid redefinition errors.
+#include "hwy/aligned_allocator.h"
#include "hwy/highway.h"
#include "hwy/nanobenchmark.h"
@@ -82,7 +85,8 @@ void RunBenchmark(const char* caption) {
benchmark.Verify(num_items);
for (size_t i = 0; i < num_results; ++i) {
- const double cycles_per_item = results[i].ticks / double(results[i].input);
+ const double cycles_per_item =
+ results[i].ticks / static_cast<double>(results[i].input);
const double mad = results[i].variability * cycles_per_item;
printf("%6" PRIu64 ": %6.3f (+/- %5.3f)\n",
static_cast<uint64_t>(results[i].input), cycles_per_item, mad);
@@ -234,7 +238,7 @@ namespace hwy {
HWY_EXPORT(RunBenchmarks);
void Run() {
- for (uint32_t target : SupportedAndGeneratedTargets()) {
+ for (int64_t target : SupportedAndGeneratedTargets()) {
SetSupportedTargetsForTest(target);
HWY_DYNAMIC_DISPATCH(RunBenchmarks)();
}
diff --git a/media/highway/src/hwy/examples/skeleton-inl.h b/media/highway/src/hwy/examples/skeleton-inl.h
index aa51885bfc..8aec33e666 100644
--- a/media/highway/src/hwy/examples/skeleton-inl.h
+++ b/media/highway/src/hwy/examples/skeleton-inl.h
@@ -19,7 +19,9 @@
// splitting code into different files while still inlining instead of requiring
// calling through function pointers.
-// Include guard (still compiled once per target)
+// Per-target include guard. This is only required when using dynamic dispatch,
+// i.e. including foreach_target.h. For static dispatch, a normal include
+// guard would be fine because the header is only compiled once.
#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
@@ -36,7 +38,8 @@ HWY_BEFORE_NAMESPACE();
namespace skeleton {
namespace HWY_NAMESPACE {
-using namespace hwy::HWY_NAMESPACE;
+// Highway ops reside here; ADL does not find templates nor builtins.
+namespace hn = hwy::HWY_NAMESPACE;
// Example of a type-agnostic (caller-specified lane type) and width-agnostic
// (uses best available instruction set) function in a header.
@@ -46,12 +49,12 @@ template <class D, typename T>
HWY_MAYBE_UNUSED void MulAddLoop(const D d, const T* HWY_RESTRICT mul_array,
const T* HWY_RESTRICT add_array,
const size_t size, T* HWY_RESTRICT x_array) {
- for (size_t i = 0; i < size; i += Lanes(d)) {
- const auto mul = Load(d, mul_array + i);
- const auto add = Load(d, add_array + i);
- auto x = Load(d, x_array + i);
- x = MulAdd(mul, x, add);
- Store(x, d, x_array + i);
+ for (size_t i = 0; i < size; i += hn::Lanes(d)) {
+ const auto mul = hn::Load(d, mul_array + i);
+ const auto add = hn::Load(d, add_array + i);
+ auto x = hn::Load(d, x_array + i);
+ x = hn::MulAdd(mul, x, add);
+ hn::Store(x, d, x_array + i);
}
}
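
The clarified comment applies to any project header that is re-included via foreach_target.h: a conventional include guard would compile the body only once, whereas the toggle idiom compiles it once per target. A sketch of the same idiom in a hypothetical user header; the guard and namespace names are illustrative.

```cpp
// my_project/algo-inl.h (hypothetical). HWY_TARGET_TOGGLE is flipped by
// foreach_target.h between passes, so this block is entered once per target.
#if defined(MY_PROJECT_ALGO_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef MY_PROJECT_ALGO_INL_H_
#undef MY_PROJECT_ALGO_INL_H_
#else
#define MY_PROJECT_ALGO_INL_H_
#endif

#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace my_project {
namespace HWY_NAMESPACE {
namespace hn = hwy::HWY_NAMESPACE;

// Target-specific code goes here, e.g. loops using hn::Load / hn::Store.

}  // namespace HWY_NAMESPACE
}  // namespace my_project
HWY_AFTER_NAMESPACE();

#endif  // include guard
```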
diff --git a/media/highway/src/hwy/examples/skeleton.cc b/media/highway/src/hwy/examples/skeleton.cc
index a3e3ea5ed8..2e820b6a92 100644
--- a/media/highway/src/hwy/examples/skeleton.cc
+++ b/media/highway/src/hwy/examples/skeleton.cc
@@ -17,26 +17,32 @@
#include <stdio.h>
+// >>>> for dynamic dispatch only, skip if you want static dispatch
+
// First undef to prevent error when re-included.
#undef HWY_TARGET_INCLUDE
-// For runtime dispatch, specify the name of the current file (unfortunately
+// For dynamic dispatch, specify the name of the current file (unfortunately
// __FILE__ is not reliable) so that foreach_target.h can re-include it.
#define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc"
// Generates code for each enabled target by re-including this source file.
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+
+// <<<< end of dynamic dispatch
// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/highway.h"
// Optional, can instead add HWY_ATTR to all functions.
HWY_BEFORE_NAMESPACE();
+
namespace skeleton {
// This namespace name is unique per target, which allows code for multiple
-// targets to co-exist in the same translation unit.
+// targets to co-exist in the same translation unit. Required when using dynamic
+// dispatch, otherwise optional.
namespace HWY_NAMESPACE {
// Highway ops reside here; ADL does not find templates nor builtins.
-using namespace hwy::HWY_NAMESPACE;
+namespace hn = hwy::HWY_NAMESPACE;
// Computes log2 by converting to a vector of floats. Compiled once per target.
template <class DF>
@@ -44,13 +50,13 @@ HWY_ATTR_NO_MSAN void OneFloorLog2(const DF df,
const uint8_t* HWY_RESTRICT values,
uint8_t* HWY_RESTRICT log2) {
// Type tags for converting to other element types (Rebind = same count).
- const RebindToSigned<DF> d32;
- const Rebind<uint8_t, DF> d8;
+ const hn::RebindToSigned<DF> d32;
+ const hn::Rebind<uint8_t, DF> d8;
- const auto u8 = Load(d8, values);
- const auto bits = BitCast(d32, ConvertTo(df, PromoteTo(d32, u8)));
- const auto exponent = Sub(ShiftRight<23>(bits), Set(d32, 127));
- Store(DemoteTo(d8, exponent), d8, log2);
+ const auto u8 = hn::Load(d8, values);
+ const auto bits = hn::BitCast(d32, hn::ConvertTo(df, hn::PromoteTo(d32, u8)));
+ const auto exponent = hn::Sub(hn::ShiftRight<23>(bits), hn::Set(d32, 127));
+ hn::Store(hn::DemoteTo(d8, exponent), d8, log2);
}
void CodepathDemo() {
@@ -68,14 +74,14 @@ void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
uint8_t* HWY_RESTRICT log2) {
CodepathDemo();
- const ScalableTag<float> df;
- const size_t N = Lanes(df);
+ const hn::ScalableTag<float> df;
+ const size_t N = hn::Lanes(df);
size_t i = 0;
for (; i + N <= count; i += N) {
OneFloorLog2(df, values + i, log2 + i);
}
for (; i < count; ++i) {
- CappedTag<float, 1> d1;
+ hn::CappedTag<float, 1> d1;
OneFloorLog2(d1, values + i, log2 + i);
}
}
@@ -104,6 +110,7 @@ HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in,
uint8_t* HWY_RESTRICT out) {
// This must reside outside of HWY_NAMESPACE because it references (calls the
// appropriate one from) the per-target implementations there.
+ // For static dispatch, use HWY_STATIC_DISPATCH.
return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);
}
diff --git a/media/highway/src/hwy/examples/skeleton_test.cc b/media/highway/src/hwy/examples/skeleton_test.cc
index 65282a2da6..c7c26bf5b4 100644
--- a/media/highway/src/hwy/examples/skeleton_test.cc
+++ b/media/highway/src/hwy/examples/skeleton_test.cc
@@ -21,7 +21,7 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/highway.h"
@@ -35,13 +35,13 @@ HWY_BEFORE_NAMESPACE();
namespace skeleton {
namespace HWY_NAMESPACE {
-using namespace hwy::HWY_NAMESPACE;
+namespace hn = hwy::HWY_NAMESPACE;
// Calls function defined in skeleton.cc.
struct TestFloorLog2 {
template <class T, class DF>
HWY_NOINLINE void operator()(T /*unused*/, DF df) {
- const size_t count = 5 * Lanes(df);
+ const size_t count = 5 * hn::Lanes(df);
auto in = hwy::AllocateAligned<uint8_t>(count);
auto expected = hwy::AllocateAligned<uint8_t>(count);
@@ -62,7 +62,7 @@ struct TestFloorLog2 {
};
HWY_NOINLINE void TestAllFloorLog2() {
- ForPartialVectors<TestFloorLog2>()(float());
+ hn::ForPartialVectors<TestFloorLog2>()(float());
}
// Calls function defined in skeleton-inl.h.
@@ -71,7 +71,7 @@ struct TestSumMulAdd {
HWY_NOINLINE void operator()(T /*unused*/, D d) {
hwy::RandomState rng;
const size_t count = 4096;
- EXPECT_TRUE(count % Lanes(d) == 0);
+ EXPECT_EQ(0, count % hn::Lanes(d));
auto mul = hwy::AllocateAligned<T>(count);
auto x = hwy::AllocateAligned<T>(count);
auto add = hwy::AllocateAligned<T>(count);
@@ -91,7 +91,7 @@ struct TestSumMulAdd {
};
HWY_NOINLINE void TestAllSumMulAdd() {
- ForFloatTypes(ForPartialVectors<TestSumMulAdd>());
+ hn::ForFloatTypes(hn::ForPartialVectors<TestSumMulAdd>());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
diff --git a/media/highway/src/hwy/foreach_target.h b/media/highway/src/hwy/foreach_target.h
index 136aed3682..3929905ca2 100644
--- a/media/highway/src/hwy/foreach_target.h
+++ b/media/highway/src/hwy/foreach_target.h
@@ -119,6 +119,28 @@
#endif
#endif
+#if (HWY_TARGETS & HWY_SVE_256) && (HWY_STATIC_TARGET != HWY_SVE_256)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_SVE_256
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_SVE2_128) && (HWY_STATIC_TARGET != HWY_SVE2_128)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_SVE2_128
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
#if (HWY_TARGETS & HWY_SSSE3) && (HWY_STATIC_TARGET != HWY_SSSE3)
#undef HWY_TARGET
#define HWY_TARGET HWY_SSSE3
@@ -174,9 +196,9 @@
#endif
#endif
-#if (HWY_TARGETS & HWY_WASM2) && (HWY_STATIC_TARGET != HWY_WASM2)
+#if (HWY_TARGETS & HWY_WASM_EMU256) && (HWY_STATIC_TARGET != HWY_WASM_EMU256)
#undef HWY_TARGET
-#define HWY_TARGET HWY_WASM2
+#define HWY_TARGET HWY_WASM_EMU256
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
diff --git a/media/highway/src/hwy/highway.h b/media/highway/src/hwy/highway.h
index 63cd4e2b91..4640f31e8d 100644
--- a/media/highway/src/hwy/highway.h
+++ b/media/highway/src/hwy/highway.h
@@ -27,9 +27,9 @@
namespace hwy {
// API version (https://semver.org/); keep in sync with CMakeLists.txt.
-#define HWY_MAJOR 0
-#define HWY_MINOR 17
-#define HWY_PATCH 0
+#define HWY_MAJOR 1
+#define HWY_MINOR 0
+#define HWY_PATCH 2
//------------------------------------------------------------------------------
// Shorthand for tags (defined in shared-inl.h) used to select overloads.
@@ -40,7 +40,7 @@ namespace hwy {
// registers in the group, and is ignored on targets that do not support groups.
#define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T>
#define HWY_FULL2(T, LMUL) \
- hwy::HWY_NAMESPACE::ScalableTag<T, CeilLog2(HWY_MAX(0, LMUL))>
+ hwy::HWY_NAMESPACE::ScalableTag<T, hwy::CeilLog2(HWY_MAX(0, LMUL))>
#define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
// Workaround for MSVC grouping __VA_ARGS__ into a single argument
#define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren
@@ -72,8 +72,8 @@ namespace hwy {
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_EMU128::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_RVV
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
-#elif HWY_STATIC_TARGET == HWY_WASM2
-#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM2::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_WASM_EMU256
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM_EMU256::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_NEON
@@ -82,6 +82,10 @@ namespace hwy {
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_SVE_256
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE_256::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_SVE2_128
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2_128::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC8
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSSE3
@@ -96,36 +100,6 @@ namespace hwy {
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_DL::FUNC_NAME
#endif
-// Dynamic dispatch declarations.
-
-template <typename RetType, typename... Args>
-struct FunctionCache {
- public:
- typedef RetType(FunctionType)(Args...);
-
- // A template function that when instantiated has the same signature as the
- // function being called. This function initializes the global cache of the
- // current supported targets mask used for dynamic dispatch and calls the
- // appropriate function. Since this mask used for dynamic dispatch is a
- // global cache, all the highway exported functions, even those exposed by
- // different modules, will be initialized after this function runs for any one
- // of those exported functions.
- template <FunctionType* const table[]>
- static RetType ChooseAndCall(Args... args) {
- // If we are running here it means we need to update the chosen target.
- ChosenTarget& chosen_target = GetChosenTarget();
- chosen_target.Update();
- return (table[chosen_target.GetIndex()])(args...);
- }
-};
-
-// Factory function only used to infer the template parameters RetType and Args
-// from a function passed to the factory.
-template <typename RetType, typename... Args>
-FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
- return FunctionCache<RetType, Args...>();
-}
-
// HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or
// nullptr if that target was not compiled.
#if HWY_TARGETS & HWY_EMU128
@@ -138,10 +112,10 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
#endif
-#if HWY_TARGETS & HWY_WASM2
-#define HWY_CHOOSE_WASM2(FUNC_NAME) &N_WASM2::FUNC_NAME
+#if HWY_TARGETS & HWY_WASM_EMU256
+#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) &N_WASM_EMU256::FUNC_NAME
#else
-#define HWY_CHOOSE_WASM2(FUNC_NAME) nullptr
+#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_WASM
@@ -174,6 +148,18 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr
#endif
+#if HWY_TARGETS & HWY_SVE_256
+#define HWY_CHOOSE_SVE_256(FUNC_NAME) &N_SVE_256::FUNC_NAME
+#else
+#define HWY_CHOOSE_SVE_256(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_SVE2_128
+#define HWY_CHOOSE_SVE2_128(FUNC_NAME) &N_SVE2_128::FUNC_NAME
+#else
+#define HWY_CHOOSE_SVE2_128(FUNC_NAME) nullptr
+#endif
+
#if HWY_TARGETS & HWY_PPC8
#define HWY_CHOOSE_PCC8(FUNC_NAME) &N_PPC8::FUNC_NAME
#else
@@ -210,6 +196,53 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) nullptr
#endif
+// MSVC 2017 workaround: the non-type template parameter to ChooseAndCall
+// apparently cannot be an array. Use a function pointer instead, which has the
+// disadvantage that we call the static (not best) target on the first call to
+// any HWY_DYNAMIC_DISPATCH.
+#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1915
+#define HWY_DISPATCH_WORKAROUND 1
+#else
+#define HWY_DISPATCH_WORKAROUND 0
+#endif
+
+// Provides a static member function which is what is called during the first
+// HWY_DYNAMIC_DISPATCH, where GetIndex is still zero, and instantiations of
+// this function are the first entry in the tables created by HWY_EXPORT.
+template <typename RetType, typename... Args>
+struct FunctionCache {
+ public:
+ typedef RetType(FunctionType)(Args...);
+
+#if HWY_DISPATCH_WORKAROUND
+ template <FunctionType* const func>
+ static RetType ChooseAndCall(Args... args) {
+ ChosenTarget& chosen_target = GetChosenTarget();
+ chosen_target.Update(SupportedTargets());
+ return (*func)(args...);
+ }
+#else
+ // A template function that when instantiated has the same signature as the
+ // function being called. This function initializes the bit array of targets
+ // supported by the current CPU and then calls the appropriate entry within
+ // the HWY_EXPORT table. Subsequent calls via HWY_DYNAMIC_DISPATCH to any
+ // exported functions, even those defined by different translation units,
+ // will dispatch directly to the best available target.
+ template <FunctionType* const table[]>
+ static RetType ChooseAndCall(Args... args) {
+ ChosenTarget& chosen_target = GetChosenTarget();
+ chosen_target.Update(SupportedTargets());
+ return (table[chosen_target.GetIndex()])(args...);
+ }
+#endif // HWY_DISPATCH_WORKAROUND
+};
+
+// Used to deduce the template parameters RetType and Args from a function.
+template <typename RetType, typename... Args>
+FunctionCache<RetType, Args...> DeduceFunctionCache(RetType (*)(Args...)) {
+ return FunctionCache<RetType, Args...>();
+}
+
#define HWY_DISPATCH_TABLE(FUNC_NAME) \
HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)
@@ -218,7 +251,7 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
// static array must be defined at the same namespace level as the function
// it is exporting.
// After being exported, it can be called from other parts of the same source
-// file using HWY_DYNAMIC_DISTPATCH(), in particular from a function wrapper
+// file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper
// like in the following example:
//
// #include "hwy/highway.h"
@@ -248,14 +281,29 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
// This case still uses a table, although of a single element, to provide the
// same compile error conditions as with the dynamic dispatch case when multiple
// targets are being compiled.
-#define HWY_EXPORT(FUNC_NAME) \
- HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) \
- const HWY_DISPATCH_TABLE(FUNC_NAME)[1] = { \
- &HWY_STATIC_DISPATCH(FUNC_NAME)}
+#define HWY_EXPORT(FUNC_NAME) \
+ HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const \
+ HWY_DISPATCH_TABLE(FUNC_NAME)[1] = {&HWY_STATIC_DISPATCH(FUNC_NAME)}
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME)
#else
+// Simplified version for MSVC 2017: function pointer instead of table.
+#if HWY_DISPATCH_WORKAROUND
+
+#define HWY_EXPORT(FUNC_NAME) \
+ static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
+ FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
+ /* The first entry in the table initializes the global cache and \
+ * calls the function from HWY_STATIC_TARGET. */ \
+ &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH( \
+ FUNC_NAME)))::ChooseAndCall<&HWY_STATIC_DISPATCH(FUNC_NAME)>, \
+ HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
+ HWY_CHOOSE_FALLBACK(FUNC_NAME), \
+ }
+
+#else
+
// Dynamic dispatch case with one entry per dynamic target plus the fallback
// target and the initialization wrapper.
#define HWY_EXPORT(FUNC_NAME) \
@@ -263,11 +311,14 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
/* The first entry in the table initializes the global cache and \
* calls the appropriate function. */ \
- &decltype(hwy::FunctionCacheFactory(&HWY_STATIC_DISPATCH( \
+ &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH( \
FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>, \
HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
HWY_CHOOSE_FALLBACK(FUNC_NAME), \
}
+
+#endif // HWY_DISPATCH_WORKAROUND
+
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
(*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()]))
@@ -305,9 +356,10 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#error "PPC is not yet supported"
#elif HWY_TARGET == HWY_NEON
#include "hwy/ops/arm_neon-inl.h"
-#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
+#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || \
+ HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
#include "hwy/ops/arm_sve-inl.h"
-#elif HWY_TARGET == HWY_WASM2
+#elif HWY_TARGET == HWY_WASM_EMU256
#include "hwy/ops/wasm_256-inl.h"
#elif HWY_TARGET == HWY_WASM
#include "hwy/ops/wasm_128-inl.h"
diff --git a/media/highway/src/hwy/highway_test.cc b/media/highway/src/hwy/highway_test.cc
index f505726fb5..4838e72f4f 100644
--- a/media/highway/src/hwy/highway_test.cc
+++ b/media/highway/src/hwy/highway_test.cc
@@ -15,7 +15,6 @@
#include <stddef.h>
#include <stdint.h>
-#include <string.h>
#include <bitset>
@@ -23,7 +22,7 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "highway_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/nanobenchmark.h" // Unpredictable1
#include "hwy/tests/test_util-inl.h"
@@ -224,7 +223,7 @@ HWY_INLINE void AssertNaN(D d, VecArg<V> v, const char* file, int line) {
// avoid truncating doubles.
uint8_t bytes[HWY_MAX(sizeof(T), 8)] = {0};
const T lane = GetLane(v);
- memcpy(bytes, &lane, sizeof(T));
+ CopyBytes<sizeof(T)>(&lane, bytes);
Abort(file, line,
"Expected %s NaN, got %E (bytes %02x %02x %02x %02x %02x %02x %02x "
"%02x)",
diff --git a/media/highway/src/hwy/nanobenchmark.cc b/media/highway/src/hwy/nanobenchmark.cc
index 0f20653b79..e03ed4cf66 100644
--- a/media/highway/src/hwy/nanobenchmark.cc
+++ b/media/highway/src/hwy/nanobenchmark.cc
@@ -15,17 +15,19 @@
#include "hwy/nanobenchmark.h"
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS // before inttypes.h
+#endif
#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>
-#include <stdlib.h> // abort
-#include <string.h> // memcpy
+#include <stdlib.h>
#include <time.h> // clock_gettime
#include <algorithm> // sort
#include <array>
#include <atomic>
-#include <chrono>
+#include <chrono> //NOLINT
#include <limits>
#include <numeric> // iota
#include <random>
@@ -148,7 +150,7 @@ inline Ticks Start() {
// "cc" = flags modified by SHL.
: "rdx", "memory", "cc");
#elif HWY_ARCH_RVV
- asm volatile("rdcycle %0" : "=r"(t));
+ asm volatile("rdtime %0" : "=r"(t));
#elif defined(_WIN32) || defined(_WIN64)
LARGE_INTEGER counter;
(void)QueryPerformanceCounter(&counter);
@@ -311,7 +313,8 @@ T MedianAbsoluteDeviation(const T* values, const size_t num_values,
std::vector<T> abs_deviations;
abs_deviations.reserve(num_values);
for (size_t i = 0; i < num_values; ++i) {
- const int64_t abs = std::abs(int64_t(values[i]) - int64_t(median));
+ const int64_t abs = std::abs(static_cast<int64_t>(values[i]) -
+ static_cast<int64_t>(median));
abs_deviations.push_back(static_cast<T>(abs));
}
return Median(abs_deviations.data(), num_values);
@@ -413,7 +416,7 @@ std::string BrandString() {
for (size_t i = 0; i < 3; ++i) {
Cpuid(static_cast<uint32_t>(0x80000002U + i), 0, abcd.data());
- memcpy(brand_string + i * 16, abcd.data(), sizeof(abcd));
+ CopyBytes<sizeof(abcd)>(&abcd[0], brand_string + i * 16); // not same size
}
brand_string[48] = 0;
return brand_string;
@@ -425,7 +428,7 @@ std::string BrandString() {
HWY_DLLEXPORT double InvariantTicksPerSecond() {
#if HWY_ARCH_PPC && defined(__GLIBC__)
- return double(__ppc_get_timebase_freq());
+ return static_cast<double>(__ppc_get_timebase_freq());
#elif HWY_ARCH_X86 || HWY_ARCH_RVV || (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC)
// We assume the x86 TSC is invariant; it is on all recent Intel/AMD CPUs.
static const double freq = MeasureNominalClockRate();
@@ -433,12 +436,12 @@ HWY_DLLEXPORT double InvariantTicksPerSecond() {
#elif defined(_WIN32) || defined(_WIN64)
LARGE_INTEGER freq;
(void)QueryPerformanceFrequency(&freq);
- return double(freq.QuadPart);
+ return static_cast<double>(freq.QuadPart);
#elif defined(__APPLE__)
// https://developer.apple.com/library/mac/qa/qa1398/_index.html
mach_timebase_info_data_t timebase;
(void)mach_timebase_info(&timebase);
- return double(timebase.denom) / timebase.numer * 1E9;
+ return static_cast<double>(timebase.denom) / timebase.numer * 1E9;
#else
return 1E9; // Haiku and clock_gettime return nanoseconds.
#endif
diff --git a/media/highway/src/hwy/nanobenchmark_test.cc b/media/highway/src/hwy/nanobenchmark_test.cc
index 5fc011aea4..0d153a14c5 100644
--- a/media/highway/src/hwy/nanobenchmark_test.cc
+++ b/media/highway/src/hwy/nanobenchmark_test.cc
@@ -15,6 +15,9 @@
#include "hwy/nanobenchmark.h"
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS // before inttypes.h
+#endif
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
@@ -58,7 +61,7 @@ std::mt19937 rng;
// A function whose runtime depends on rng.
FuncOutput Random(const void* /*arg*/, FuncInput in) {
const size_t r = rng() & 0xF;
- uint32_t ret = in;
+ FuncOutput ret = static_cast<FuncOutput>(in);
for (size_t i = 0; i < r; ++i) {
ret /= ((rng() & 1) + 2);
}
diff --git a/media/highway/src/hwy/ops/arm_neon-inl.h b/media/highway/src/hwy/ops/arm_neon-inl.h
index 89e23da90a..f85fcf8f5a 100644
--- a/media/highway/src/hwy/ops/arm_neon-inl.h
+++ b/media/highway/src/hwy/ops/arm_neon-inl.h
@@ -19,14 +19,21 @@
// ARM NEON intrinsics are documented at:
// https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon]
-#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>
-#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"
HWY_BEFORE_NAMESPACE();
+
+// Must come after HWY_BEFORE_NAMESPACE so that the intrinsics are compiled with
+// the same target attribute as our code, see #834.
+HWY_DIAGNOSTICS(push)
+HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
+#include <arm_neon.h>
+HWY_DIAGNOSTICS(pop)
+
+// Must come after arm_neon.h.
namespace hwy {
namespace HWY_NAMESPACE {
@@ -814,6 +821,9 @@ class Mask128 {
Raw raw;
};
+template <typename T>
+using Mask64 = Mask128<T, 8 / sizeof(T)>;
+
namespace detail {
// Deduce Simd<T, N, 0> from Vec128<T, N>
@@ -1017,16 +1027,21 @@ HWY_API Vec128<bfloat16_t, N> Zero(Simd<bfloat16_t, N, 0> /* tag */) {
template <class D>
using VFromD = decltype(Zero(D()));
+HWY_DIAGNOSTICS(push)
+HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
+#if HWY_COMPILER_GCC_ACTUAL
+ HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
+#endif
+
// Returns a vector with uninitialized elements.
template <typename T, size_t N>
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /*d*/) {
- HWY_DIAGNOSTICS(push)
- HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
typename detail::Raw128<T, N>::type a;
return Vec128<T, N>(a);
- HWY_DIAGNOSTICS(pop)
}
+HWY_DIAGNOSTICS(pop)
+
// Returns a vector with lane i=[0, N) set to "first" + i.
template <typename T, size_t N, typename T2>
Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
@@ -2266,6 +2281,12 @@ HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
+template <typename T, size_t N>
+HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
+ const Simd<T, N, 0> d;
+ return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
+}
+
// ================================================== COMPARE
// Comparisons fill a lane with 1-bits if the condition is true, else 0.
@@ -2625,61 +2646,49 @@ HWY_API Vec64<double> LoadU(Full64<double> /* tag */,
return Vec64<double>(vld1_f64(p));
}
#endif
-
// ------------------------------ Load 32
-HWY_API Vec32<uint8_t> LoadU(Full32<uint8_t> /*tag*/,
- const uint8_t* HWY_RESTRICT p) {
- uint32x2_t a = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
- return Vec32<uint8_t>(vreinterpret_u8_u32(a));
-}
-HWY_API Vec32<uint16_t> LoadU(Full32<uint16_t> /*tag*/,
- const uint16_t* HWY_RESTRICT p) {
- uint32x2_t a = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p));
- return Vec32<uint16_t>(vreinterpret_u16_u32(a));
-}
+// Actual 32-bit broadcast load - used to implement the other lane types
+// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
HWY_API Vec32<uint32_t> LoadU(Full32<uint32_t> /*tag*/,
const uint32_t* HWY_RESTRICT p) {
- return Vec32<uint32_t>(vld1_dup_u32(reinterpret_cast<const uint32_t*>(p)));
-}
-HWY_API Vec32<int8_t> LoadU(Full32<int8_t> /*tag*/,
- const int8_t* HWY_RESTRICT p) {
- int32x2_t a = vld1_dup_s32(reinterpret_cast<const int32_t*>(p));
- return Vec32<int8_t>(vreinterpret_s8_s32(a));
-}
-HWY_API Vec32<int16_t> LoadU(Full32<int16_t> /*tag*/,
- const int16_t* HWY_RESTRICT p) {
- int32x2_t a = vld1_dup_s32(reinterpret_cast<const int32_t*>(p));
- return Vec32<int16_t>(vreinterpret_s16_s32(a));
+ return Vec32<uint32_t>(vld1_dup_u32(p));
}
HWY_API Vec32<int32_t> LoadU(Full32<int32_t> /*tag*/,
const int32_t* HWY_RESTRICT p) {
- return Vec32<int32_t>(vld1_dup_s32(reinterpret_cast<const int32_t*>(p)));
+ return Vec32<int32_t>(vld1_dup_s32(p));
}
HWY_API Vec32<float> LoadU(Full32<float> /*tag*/, const float* HWY_RESTRICT p) {
return Vec32<float>(vld1_dup_f32(p));
}
+template <typename T, HWY_IF_LANE_SIZE_LT(T, 4)>
+HWY_API Vec32<T> LoadU(Full32<T> d, const T* HWY_RESTRICT p) {
+ const Repartition<uint32_t, decltype(d)> d32;
+ uint32_t buf;
+ CopyBytes<4>(p, &buf);
+ return BitCast(d, LoadU(d32, &buf));
+}
+
// ------------------------------ Load 16
-HWY_API Vec128<uint8_t, 2> LoadU(Simd<uint8_t, 2, 0> /*tag*/,
- const uint8_t* HWY_RESTRICT p) {
- uint16x4_t a = vld1_dup_u16(reinterpret_cast<const uint16_t*>(p));
- return Vec128<uint8_t, 2>(vreinterpret_u8_u16(a));
-}
+// Actual 16-bit broadcast load - used to implement the other lane types
+// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
HWY_API Vec128<uint16_t, 1> LoadU(Simd<uint16_t, 1, 0> /*tag*/,
const uint16_t* HWY_RESTRICT p) {
- return Vec128<uint16_t, 1>(
- vld1_dup_u16(reinterpret_cast<const uint16_t*>(p)));
-}
-HWY_API Vec128<int8_t, 2> LoadU(Simd<int8_t, 2, 0> /*tag*/,
- const int8_t* HWY_RESTRICT p) {
- int16x4_t a = vld1_dup_s16(reinterpret_cast<const int16_t*>(p));
- return Vec128<int8_t, 2>(vreinterpret_s8_s16(a));
+ return Vec128<uint16_t, 1>(vld1_dup_u16(p));
}
HWY_API Vec128<int16_t, 1> LoadU(Simd<int16_t, 1, 0> /*tag*/,
const int16_t* HWY_RESTRICT p) {
- return Vec128<int16_t, 1>(vld1_dup_s16(reinterpret_cast<const int16_t*>(p)));
+ return Vec128<int16_t, 1>(vld1_dup_s16(p));
+}
+
+template <typename T, HWY_IF_LANE_SIZE_LT(T, 2)>
+HWY_API Vec128<T, 2> LoadU(Simd<T, 2, 0> d, const T* HWY_RESTRICT p) {
+ const Repartition<uint16_t, decltype(d)> d16;
+ uint16_t buf;
+ CopyBytes<2>(p, &buf);
+ return BitCast(d, LoadU(d16, &buf));
}
// ------------------------------ Load 8
@@ -2821,30 +2830,10 @@ HWY_API void StoreU(const Vec64<double> v, Full64<double> /* tag */,
// ------------------------------ Store 32
-HWY_API void StoreU(const Vec32<uint8_t> v, Full32<uint8_t>,
- uint8_t* HWY_RESTRICT p) {
- uint32x2_t a = vreinterpret_u32_u8(v.raw);
- vst1_lane_u32(reinterpret_cast<uint32_t*>(p), a, 0);
-}
-HWY_API void StoreU(const Vec32<uint16_t> v, Full32<uint16_t>,
- uint16_t* HWY_RESTRICT p) {
- uint32x2_t a = vreinterpret_u32_u16(v.raw);
- vst1_lane_u32(reinterpret_cast<uint32_t*>(p), a, 0);
-}
HWY_API void StoreU(const Vec32<uint32_t> v, Full32<uint32_t>,
uint32_t* HWY_RESTRICT p) {
vst1_lane_u32(p, v.raw, 0);
}
-HWY_API void StoreU(const Vec32<int8_t> v, Full32<int8_t>,
- int8_t* HWY_RESTRICT p) {
- int32x2_t a = vreinterpret_s32_s8(v.raw);
- vst1_lane_s32(reinterpret_cast<int32_t*>(p), a, 0);
-}
-HWY_API void StoreU(const Vec32<int16_t> v, Full32<int16_t>,
- int16_t* HWY_RESTRICT p) {
- int32x2_t a = vreinterpret_s32_s16(v.raw);
- vst1_lane_s32(reinterpret_cast<int32_t*>(p), a, 0);
-}
HWY_API void StoreU(const Vec32<int32_t> v, Full32<int32_t>,
int32_t* HWY_RESTRICT p) {
vst1_lane_s32(p, v.raw, 0);
@@ -2854,27 +2843,31 @@ HWY_API void StoreU(const Vec32<float> v, Full32<float>,
vst1_lane_f32(p, v.raw, 0);
}
+template <typename T, HWY_IF_LANE_SIZE_LT(T, 4)>
+HWY_API void StoreU(const Vec32<T> v, Full32<T> d, T* HWY_RESTRICT p) {
+ const Repartition<uint32_t, decltype(d)> d32;
+ const uint32_t buf = GetLane(BitCast(d32, v));
+ CopyBytes<4>(&buf, p);
+}
+
// ------------------------------ Store 16
-HWY_API void StoreU(const Vec128<uint8_t, 2> v, Simd<uint8_t, 2, 0>,
- uint8_t* HWY_RESTRICT p) {
- uint16x4_t a = vreinterpret_u16_u8(v.raw);
- vst1_lane_u16(reinterpret_cast<uint16_t*>(p), a, 0);
-}
HWY_API void StoreU(const Vec128<uint16_t, 1> v, Simd<uint16_t, 1, 0>,
uint16_t* HWY_RESTRICT p) {
vst1_lane_u16(p, v.raw, 0);
}
-HWY_API void StoreU(const Vec128<int8_t, 2> v, Simd<int8_t, 2, 0>,
- int8_t* HWY_RESTRICT p) {
- int16x4_t a = vreinterpret_s16_s8(v.raw);
- vst1_lane_s16(reinterpret_cast<int16_t*>(p), a, 0);
-}
HWY_API void StoreU(const Vec128<int16_t, 1> v, Simd<int16_t, 1, 0>,
int16_t* HWY_RESTRICT p) {
vst1_lane_s16(p, v.raw, 0);
}
+template <typename T, HWY_IF_LANE_SIZE_LT(T, 2)>
+HWY_API void StoreU(const Vec128<T, 2> v, Simd<T, 2, 0> d, T* HWY_RESTRICT p) {
+ const Repartition<uint16_t, decltype(d)> d16;
+ const uint16_t buf = GetLane(BitCast(d16, v));
+ CopyBytes<2>(&buf, p);
+}
+
// ------------------------------ Store 8
HWY_API void StoreU(const Vec128<uint8_t, 1> v, Simd<uint8_t, 1, 0>,
@@ -2902,12 +2895,19 @@ HWY_API void StoreU(Vec128<bfloat16_t, N> v, Simd<bfloat16_t, N, 0> d,
return StoreU(Vec128<uint16_t, N>(v.raw), du16, pu16);
}
+HWY_DIAGNOSTICS(push)
+#if HWY_COMPILER_GCC_ACTUAL
+ HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
+#endif
+
// On ARM, Store is the same as StoreU.
template <typename T, size_t N>
HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT aligned) {
StoreU(v, d, aligned);
}
+HWY_DIAGNOSTICS(pop)
+
template <typename T, size_t N>
HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
T* HWY_RESTRICT p) {
@@ -3305,6 +3305,16 @@ HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
return Vec128<float, N>(vcvt_f32_s32(v.raw));
}
+HWY_API Vec128<float> ConvertTo(Full128<float> /* tag */,
+ const Vec128<uint32_t> v) {
+ return Vec128<float>(vcvtq_f32_u32(v.raw));
+}
+template <size_t N, HWY_IF_LE64(uint32_t, N)>
+HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
+ const Vec128<uint32_t, N> v) {
+ return Vec128<float, N>(vcvt_f32_u32(v.raw));
+}
+
// Truncates (rounds toward zero).
HWY_API Vec128<int32_t> ConvertTo(Full128<int32_t> /* tag */,
const Vec128<float> v) {
@@ -3327,6 +3337,15 @@ HWY_API Vec64<double> ConvertTo(Full64<double> /* tag */,
return Vec64<double>(vcvt_f64_s64(v.raw));
}
+HWY_API Vec128<double> ConvertTo(Full128<double> /* tag */,
+ const Vec128<uint64_t> v) {
+ return Vec128<double>(vcvtq_f64_u64(v.raw));
+}
+HWY_API Vec64<double> ConvertTo(Full64<double> /* tag */,
+ const Vec64<uint64_t> v) {
+ return Vec64<double>(vcvt_f64_u64(v.raw));
+}
+
// Truncates (rounds toward zero).
HWY_API Vec128<int64_t> ConvertTo(Full128<int64_t> /* tag */,
const Vec128<double> v) {
@@ -3525,6 +3544,11 @@ HWY_API Vec64<double> LowerHalf(const Vec128<double> v) {
return Vec64<double>(vget_low_f64(v.raw));
}
#endif
+HWY_API Vec64<bfloat16_t> LowerHalf(const Vec128<bfloat16_t> v) {
+ const Full128<uint16_t> du;
+ const Full64<bfloat16_t> dbh;
+ return BitCast(dbh, LowerHalf(BitCast(du, v)));
+}
template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
@@ -3725,6 +3749,13 @@ HWY_API Vec64<double> UpperHalf(Full64<double> /* tag */,
}
#endif
+HWY_API Vec64<bfloat16_t> UpperHalf(Full64<bfloat16_t> dbh,
+ const Vec128<bfloat16_t> v) {
+ const RebindToUnsigned<decltype(dbh)> duh;
+ const Twice<decltype(duh)> du;
+ return BitCast(dbh, UpperHalf(duh, BitCast(du, v)));
+}
+
// Partial
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
@@ -4241,6 +4272,48 @@ HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
+HWY_API Vec128<int32_t> ReorderWidenMulAccumulate(Full128<int32_t> /*d32*/,
+ Vec128<int16_t> a,
+ Vec128<int16_t> b,
+ const Vec128<int32_t> sum0,
+ Vec128<int32_t>& sum1) {
+#if HWY_ARCH_ARM_A64
+ sum1 = Vec128<int32_t>(vmlal_high_s16(sum1.raw, a.raw, b.raw));
+#else
+ const Full64<int16_t> dh;
+ sum1 = Vec128<int32_t>(
+ vmlal_s16(sum1.raw, UpperHalf(dh, a).raw, UpperHalf(dh, b).raw));
+#endif
+ return Vec128<int32_t>(
+ vmlal_s16(sum0.raw, LowerHalf(a).raw, LowerHalf(b).raw));
+}
+
+HWY_API Vec64<int32_t> ReorderWidenMulAccumulate(Full64<int32_t> d32,
+ Vec64<int16_t> a,
+ Vec64<int16_t> b,
+ const Vec64<int32_t> sum0,
+ Vec64<int32_t>& sum1) {
+ // vmlal writes into the upper half, which the caller cannot use, so
+ // split into two halves.
+ const Vec128<int32_t> mul_3210(vmull_s16(a.raw, b.raw));
+ const Vec64<int32_t> mul_32 = UpperHalf(d32, mul_3210);
+ sum1 += mul_32;
+ return sum0 + LowerHalf(mul_3210);
+}
+
+HWY_API Vec32<int32_t> ReorderWidenMulAccumulate(Full32<int32_t> d32,
+ Vec32<int16_t> a,
+ Vec32<int16_t> b,
+ const Vec32<int32_t> sum0,
+ Vec32<int32_t>& sum1) {
+ const Vec128<int32_t> mul_xx10(vmull_s16(a.raw, b.raw));
+ const Vec64<int32_t> mul_10(LowerHalf(mul_xx10));
+ const Vec32<int32_t> mul0 = LowerHalf(d32, mul_10);
+ const Vec32<int32_t> mul1 = UpperHalf(d32, mul_10);
+ sum1 += mul1;
+ return sum0 + mul0;
+}
+
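As a reading aid for the three ReorderWidenMulAccumulate overloads above: the per-lane split between sum0 and sum1 differs between the 128/64/32-bit versions, so the sketch below models only the combined total (an assumption based on these overloads; callers should not rely on which accumulator receives a given product):

#include <cstddef>
#include <cstdint>

// Scalar model: widen each i16 product to i32 and spread the products across
// two accumulators; only the combined total is meaningful in this model.
// int64_t is used solely to keep the illustration free of signed overflow.
int64_t WidenMulAccumulateTotal(const int16_t* a, const int16_t* b, size_t n,
                                int64_t sum0, int64_t sum1) {
  for (size_t i = 0; i < n; ++i) {
    const int32_t product = int32_t(a[i]) * int32_t(b[i]);
    (i & 1 ? sum1 : sum0) += product;  // which accumulator gets it is arbitrary
  }
  return sum0 + sum1;
}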
// ================================================== COMBINE
// ------------------------------ Combine (InterleaveLower)
@@ -4585,9 +4658,36 @@ HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
+HWY_API Vec128<int16_t> ReorderDemote2To(Full128<int16_t> d16,
+ Vec128<int32_t> a, Vec128<int32_t> b) {
+ const Vec64<int16_t> a16(vqmovn_s32(a.raw));
+#if HWY_ARCH_ARM_A64
+ (void)d16;
+ return Vec128<int16_t>(vqmovn_high_s32(a16.raw, b.raw));
+#else
+ const Vec64<int16_t> b16(vqmovn_s32(b.raw));
+ return Combine(d16, a16, b16);
+#endif
+}
+
+HWY_API Vec64<int16_t> ReorderDemote2To(Full64<int16_t> /*d16*/,
+ Vec64<int32_t> a, Vec64<int32_t> b) {
+ const Full128<int32_t> d32;
+ const Vec128<int32_t> ab = Combine(d32, a, b);
+ return Vec64<int16_t>(vqmovn_s32(ab.raw));
+}
+
+HWY_API Vec32<int16_t> ReorderDemote2To(Full32<int16_t> /*d16*/,
+ Vec32<int32_t> a, Vec32<int32_t> b) {
+ const Full128<int32_t> d32;
+ const Vec64<int32_t> ab(vzip1_s32(a.raw, b.raw));
+ return Vec32<int16_t>(vqmovn_s32(Combine(d32, ab, ab).raw));
+}
+
// ================================================== CRYPTO
-#if defined(__ARM_FEATURE_AES)
+#if defined(__ARM_FEATURE_AES) || \
+ (HWY_HAVE_RUNTIME_DISPATCH && HWY_ARCH_ARM_A64)
// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
#ifdef HWY_NATIVE_AES
@@ -4632,6 +4732,73 @@ HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
}
+// ------------------------------ Truncations
+
+template <typename From, typename To, HWY_IF_UNSIGNED(From),
+ HWY_IF_UNSIGNED(To),
+ hwy::EnableIf<(sizeof(To) < sizeof(From))>* = nullptr>
+HWY_API Vec128<To, 1> TruncateTo(Simd<To, 1, 0> /* tag */,
+ const Vec128<From, 1> v) {
+ const Repartition<To, DFromV<decltype(v)>> d;
+ const auto v1 = BitCast(d, v);
+ return Vec128<To, 1>{v1.raw};
+}
+
+HWY_API Vec128<uint8_t, 2> TruncateTo(Simd<uint8_t, 2, 0> /* tag */,
+ const Vec128<uint64_t, 2> v) {
+ const Repartition<uint8_t, DFromV<decltype(v)>> d;
+ const auto v1 = BitCast(d, v);
+ const auto v2 = detail::ConcatEven(v1, v1);
+ const auto v3 = detail::ConcatEven(v2, v2);
+ const auto v4 = detail::ConcatEven(v3, v3);
+ return LowerHalf(LowerHalf(LowerHalf(v4)));
+}
+
+HWY_API Vec32<uint16_t> TruncateTo(Simd<uint16_t, 2, 0> /* tag */,
+ const Vec128<uint64_t, 2> v) {
+ const Repartition<uint16_t, DFromV<decltype(v)>> d;
+ const auto v1 = BitCast(d, v);
+ const auto v2 = detail::ConcatEven(v1, v1);
+ const auto v3 = detail::ConcatEven(v2, v2);
+ return LowerHalf(LowerHalf(v3));
+}
+
+HWY_API Vec64<uint32_t> TruncateTo(Simd<uint32_t, 2, 0> /* tag */,
+ const Vec128<uint64_t, 2> v) {
+ const Repartition<uint32_t, DFromV<decltype(v)>> d;
+ const auto v1 = BitCast(d, v);
+ const auto v2 = detail::ConcatEven(v1, v1);
+ return LowerHalf(v2);
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
+ const Vec128<uint32_t, N> v) {
+ const Repartition<uint8_t, DFromV<decltype(v)>> d;
+ const auto v1 = BitCast(d, v);
+ const auto v2 = detail::ConcatEven(v1, v1);
+ const auto v3 = detail::ConcatEven(v2, v2);
+ return LowerHalf(LowerHalf(v3));
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint16_t, N> TruncateTo(Simd<uint16_t, N, 0> /* tag */,
+ const Vec128<uint32_t, N> v) {
+ const Repartition<uint16_t, DFromV<decltype(v)>> d;
+ const auto v1 = BitCast(d, v);
+ const auto v2 = detail::ConcatEven(v1, v1);
+ return LowerHalf(v2);
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
+ const Vec128<uint16_t, N> v) {
+ const Repartition<uint8_t, DFromV<decltype(v)>> d;
+ const auto v1 = BitCast(d, v);
+ const auto v2 = detail::ConcatEven(v1, v1);
+ return LowerHalf(v2);
+}
+
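The NEON TruncateTo overloads above bitcast to the narrow lane type and repeatedly take the even lanes, which on little-endian keeps the least-significant bytes of each wide lane. A scalar model of the intended result, assuming TruncateTo keeps the low bits of each unsigned lane as these implementations do (illustrative only):

#include <cstddef>
#include <cstdint>

// Per-lane unsigned truncation: keep only the low byte of each 32-bit input.
void TruncateU32ToU8(const uint32_t* from, uint8_t* to, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    to[i] = static_cast<uint8_t>(from[i]);  // value modulo 256
  }
}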
// ------------------------------ MulEven (ConcatEven)
// Multiplies even lanes (0, 2 ..) and places the double-wide result into
@@ -4822,7 +4989,8 @@ namespace detail {
// N=1 for any T: no-op
template <typename T>
-HWY_INLINE Vec128<T, 1> SumOfLanes(const Vec128<T, 1> v) {
+HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
+ const Vec128<T, 1> v) {
return v;
}
template <typename T>
@@ -4838,7 +5006,8 @@ HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
// u32/i32/f32: N=2
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE Vec128<T, 2> SumOfLanes(const Vec128<T, 2> v10) {
+HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<T, 2> v10) {
return v10 + Shuffle2301(v10);
}
template <typename T>
@@ -4854,48 +5023,59 @@ HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
// full vectors
#if HWY_ARCH_ARM_A64
-HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
+HWY_INLINE Vec128<uint32_t> SumOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<uint32_t> v) {
return Vec128<uint32_t>(vdupq_n_u32(vaddvq_u32(v.raw)));
}
-HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) {
+HWY_INLINE Vec128<int32_t> SumOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<int32_t> v) {
return Vec128<int32_t>(vdupq_n_s32(vaddvq_s32(v.raw)));
}
-HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) {
+HWY_INLINE Vec128<float> SumOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<float> v) {
return Vec128<float>(vdupq_n_f32(vaddvq_f32(v.raw)));
}
-HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) {
+HWY_INLINE Vec128<uint64_t> SumOfLanes(hwy::SizeTag<8> /* tag */,
+ const Vec128<uint64_t> v) {
return Vec128<uint64_t>(vdupq_n_u64(vaddvq_u64(v.raw)));
}
-HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) {
+HWY_INLINE Vec128<int64_t> SumOfLanes(hwy::SizeTag<8> /* tag */,
+ const Vec128<int64_t> v) {
return Vec128<int64_t>(vdupq_n_s64(vaddvq_s64(v.raw)));
}
-HWY_INLINE Vec128<double> SumOfLanes(const Vec128<double> v) {
+HWY_INLINE Vec128<double> SumOfLanes(hwy::SizeTag<8> /* tag */,
+ const Vec128<double> v) {
return Vec128<double>(vdupq_n_f64(vaddvq_f64(v.raw)));
}
#else
// ARMv7 version for everything except doubles.
-HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
+HWY_INLINE Vec128<uint32_t> SumOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<uint32_t> v) {
uint32x4x2_t v0 = vuzpq_u32(v.raw, v.raw);
uint32x4_t c0 = vaddq_u32(v0.val[0], v0.val[1]);
uint32x4x2_t v1 = vuzpq_u32(c0, c0);
return Vec128<uint32_t>(vaddq_u32(v1.val[0], v1.val[1]));
}
-HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) {
+HWY_INLINE Vec128<int32_t> SumOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<int32_t> v) {
int32x4x2_t v0 = vuzpq_s32(v.raw, v.raw);
int32x4_t c0 = vaddq_s32(v0.val[0], v0.val[1]);
int32x4x2_t v1 = vuzpq_s32(c0, c0);
return Vec128<int32_t>(vaddq_s32(v1.val[0], v1.val[1]));
}
-HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) {
+HWY_INLINE Vec128<float> SumOfLanes(hwy::SizeTag<4> /* tag */,
+ const Vec128<float> v) {
float32x4x2_t v0 = vuzpq_f32(v.raw, v.raw);
float32x4_t c0 = vaddq_f32(v0.val[0], v0.val[1]);
float32x4x2_t v1 = vuzpq_f32(c0, c0);
return Vec128<float>(vaddq_f32(v1.val[0], v1.val[1]));
}
-HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) {
+HWY_INLINE Vec128<uint64_t> SumOfLanes(hwy::SizeTag<8> /* tag */,
+ const Vec128<uint64_t> v) {
return v + Shuffle01(v);
}
-HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) {
+HWY_INLINE Vec128<int64_t> SumOfLanes(hwy::SizeTag<8> /* tag */,
+ const Vec128<int64_t> v) {
return v + Shuffle01(v);
}
#endif
@@ -4931,31 +5111,83 @@ HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
return Max(v10, v01);
}
-// u16/i16
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
- const Repartition<int32_t, Simd<T, N, 0>> d32;
+template <size_t N, HWY_IF_GE32(uint16_t, N)>
+HWY_API Vec128<uint16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<uint16_t, N> v) {
+ const Simd<uint16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto min = MinOfLanes(d32, Min(even, odd));
+ const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
// Also broadcast into odd lanes.
- return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
+ return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
+}
+template <size_t N, HWY_IF_GE32(int16_t, N)>
+HWY_API Vec128<int16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<int16_t, N> v) {
+ const Simd<int16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
+ // Sign-extend
+ const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
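The u16/i16 SumOfLanes overloads above widen into 32-bit lanes so the existing SizeTag<4> reduction can be reused; the 32-bit intermediate sums avoid 16-bit wraparound, and the final BitCast/OddEven truncates back to 16-bit lanes. A scalar sketch of that trick for the unsigned case (illustrative only):

#include <cstddef>
#include <cstdint>

// Treat each pair of u16 lanes as one u32 lane: add the zero-extended even
// and odd halves, reduce in 32 bits, then wrap the result back to 16 bits.
uint16_t SumOfU16Lanes(const uint16_t* lanes, size_t n) {  // n is even
  uint32_t sum = 0;
  for (size_t i = 0; i < n; i += 2) {
    sum += uint32_t(lanes[i]) + uint32_t(lanes[i + 1]);
  }
  return static_cast<uint16_t>(sum);
}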
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
- const Repartition<int32_t, Simd<T, N, 0>> d32;
+
+template <size_t N, HWY_IF_GE32(uint16_t, N)>
+HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<uint16_t, N> v) {
+ const Simd<uint16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto min = MaxOfLanes(d32, Max(even, odd));
+ const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+}
+template <size_t N, HWY_IF_GE32(int16_t, N)>
+HWY_API Vec128<int16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<int16_t, N> v) {
+ const Simd<int16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
+ // Sign-extend
+ const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
// Also broadcast into odd lanes.
- return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+}
+
+template <size_t N, HWY_IF_GE32(uint16_t, N)>
+HWY_API Vec128<uint16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<uint16_t, N> v) {
+ const Simd<uint16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
+ const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+  const auto max = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
+  // Also broadcast into odd lanes.
+  return OddEven(BitCast(d, ShiftLeft<16>(max)), BitCast(d, max));
+}
+template <size_t N, HWY_IF_GE32(int16_t, N)>
+HWY_API Vec128<int16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<int16_t, N> v) {
+ const Simd<int16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
+ // Sign-extend
+ const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+  const auto max = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
+  // Also broadcast into odd lanes.
+  return OddEven(BitCast(d, ShiftLeft<16>(max)), BitCast(d, max));
}
} // namespace detail
template <typename T, size_t N>
HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
- return detail::SumOfLanes(v);
+ return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
@@ -5038,6 +5270,34 @@ HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d,
namespace detail {
+// Returns mask[i]? 0xF : 0 in each nibble. This is more efficient than
+// BitsFromMask for use in (partial) CountTrue, FindFirstTrue and AllFalse.
+template <typename T>
+HWY_INLINE uint64_t NibblesFromMask(const Full128<T> d, Mask128<T> mask) {
+ const Full128<uint16_t> du16;
+ const Vec128<uint16_t> vu16 = BitCast(du16, VecFromMask(d, mask));
+ const Vec64<uint8_t> nib(vshrn_n_u16(vu16.raw, 4));
+ return GetLane(BitCast(Full64<uint64_t>(), nib));
+}
+
+template <typename T>
+HWY_INLINE uint64_t NibblesFromMask(const Full64<T> d, Mask64<T> mask) {
+ // There is no vshrn_n_u16 for uint16x4, so zero-extend.
+ const Twice<decltype(d)> d2;
+ const Vec128<T> v128 = ZeroExtendVector(d2, VecFromMask(d, mask));
+ // No need to mask, upper half is zero thanks to ZeroExtendVector.
+ return NibblesFromMask(d2, MaskFromVec(v128));
+}
+
+template <typename T, size_t N, HWY_IF_LE32(T, N)>
+HWY_INLINE uint64_t NibblesFromMask(Simd<T, N, 0> /*d*/, Mask128<T, N> mask) {
+ const Mask64<T> mask64(mask.raw);
+ const uint64_t nib = NibblesFromMask(Full64<T>(), mask64);
+ // Clear nibbles from upper half of 64-bits
+ constexpr size_t kBytes = sizeof(T) * N;
+ return nib & ((1ull << (kBytes * 4)) - 1);
+}
+
template <typename T>
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
const Mask128<T> mask) {
@@ -5195,6 +5455,10 @@ HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
// Masks are either FF..FF or 0. Unfortunately there is no reduce-sub op
// ("vsubv"). ANDing with 1 would work but requires a constant. Negating also
// changes each lane to 1 (if mask set) or 0.
+// NOTE: PopCount also operates on vectors, so we still have to do horizontal
+// sums separately. We specialize CountTrue for full vectors (negating instead
+// of PopCount because it avoids an extra shift), and use PopCount of
+// NibblesFromMask for partial vectors.
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> mask) {
@@ -5265,15 +5529,26 @@ HWY_API size_t CountTrue(Full128<T> /* tag */, const Mask128<T> mask) {
// Partial
template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API size_t CountTrue(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
- return PopCount(detail::BitsFromMask(mask));
+HWY_API size_t CountTrue(Simd<T, N, 0> d, const Mask128<T, N> mask) {
+ constexpr int kDiv = 4 * sizeof(T);
+ return PopCount(detail::NibblesFromMask(d, mask)) / kDiv;
+}
+
+template <typename T, size_t N>
+HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> d,
+ const Mask128<T, N> mask) {
+ const uint64_t nib = detail::NibblesFromMask(d, mask);
+ constexpr size_t kDiv = 4 * sizeof(T);
+ return Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv;
}
template <typename T, size_t N>
-HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
+HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> d,
const Mask128<T, N> mask) {
- const uint64_t bits = detail::BitsFromMask(mask);
- return bits ? static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(bits)) : -1;
+ const uint64_t nib = detail::NibblesFromMask(d, mask);
+ if (nib == 0) return -1;
+ constexpr int kDiv = 4 * sizeof(T);
+ return static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv);
}
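A short worked example of the kDiv arithmetic above: NibblesFromMask sets one nibble per vector byte, so a true lane of T contributes 4 * sizeof(T) set bits, and dividing the popcount by kDiv recovers the lane count (sketch, not part of the patch):

#include <bitset>
#include <cstdint>

// For T = uint32_t, a single true lane yields 0xFFFF in the nibble mask:
// 16 set bits / (4 * sizeof(uint32_t)) = 1 true lane.
int ModelCountTrueU32(uint64_t nibbles) {
  constexpr int kDiv = 4 * sizeof(uint32_t);
  return static_cast<int>(std::bitset<64>(nibbles).count()) / kDiv;
}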
// `p` points to at least 8 writable bytes.
@@ -5286,29 +5561,21 @@ HWY_API size_t StoreMaskBits(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask,
return kNumBytes;
}
+template <typename T, size_t N>
+HWY_API bool AllFalse(const Simd<T, N, 0> d, const Mask128<T, N> m) {
+ return detail::NibblesFromMask(d, m) == 0;
+}
+
// Full
template <typename T>
-HWY_API bool AllFalse(const Full128<T> d, const Mask128<T> m) {
-#if HWY_ARCH_ARM_A64
- const Full128<uint32_t> d32;
- const auto m32 = MaskFromVec(BitCast(d32, VecFromMask(d, m)));
- return (vmaxvq_u32(m32.raw) == 0);
-#else
- const auto v64 = BitCast(Full128<uint64_t>(), VecFromMask(d, m));
- uint32x2_t a = vqmovn_u64(v64.raw);
- return vget_lane_u64(vreinterpret_u64_u32(a), 0) == 0;
-#endif
+HWY_API bool AllTrue(const Full128<T> d, const Mask128<T> m) {
+ return detail::NibblesFromMask(d, m) == ~0ull;
}
-
// Partial
template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API bool AllFalse(const Simd<T, N, 0> /* tag */, const Mask128<T, N> m) {
- return detail::BitsFromMask(m) == 0;
-}
-
-template <typename T, size_t N>
HWY_API bool AllTrue(const Simd<T, N, 0> d, const Mask128<T, N> m) {
- return AllFalse(d, VecFromMask(d, m) == Zero(d));
+ constexpr size_t kBytes = sizeof(T) * N;
+ return detail::NibblesFromMask(d, m) == (1ull << (kBytes * 4)) - 1;
}
// ------------------------------ Compress
@@ -5351,6 +5618,7 @@ HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<2> /*tag*/,
// Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
// is likely more costly than the higher cache footprint from storing bytes.
alignas(16) constexpr uint8_t table[256 * 8] = {
+ // PrintCompress16x8Tables
0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, //
@@ -5486,12 +5754,165 @@ HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<2> /*tag*/,
}
template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<2> /*tag*/,
+ const uint64_t mask_bits) {
+ HWY_DASSERT(mask_bits < 256);
+ const Simd<T, N, 0> d;
+ const Repartition<uint8_t, decltype(d)> d8;
+ const Simd<uint16_t, N, 0> du;
+
+ // ARM does not provide an equivalent of AVX2 permutevar, so we need byte
+ // indices for VTBL (one vector's worth for each of 256 combinations of
+ // 8 mask bits). Loading them directly would require 4 KiB. We can instead
+ // store lane indices and convert to byte indices (2*lane + 0..1), with the
+ // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
+ // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
+ // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
+ // is likely more costly than the higher cache footprint from storing bytes.
+ alignas(16) constexpr uint8_t table[256 * 8] = {
+ // PrintCompressNot16x8Tables
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, //
+ 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, //
+ 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, //
+ 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, //
+ 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, //
+ 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, //
+ 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, //
+ 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, //
+ 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, //
+ 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, //
+ 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, //
+ 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, //
+ 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, //
+ 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, //
+ 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, //
+ 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, //
+ 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, //
+ 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, //
+ 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, //
+ 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, //
+ 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, //
+ 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, //
+ 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, //
+ 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, //
+ 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, //
+ 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, //
+ 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, //
+ 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, //
+ 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, //
+ 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, //
+ 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, //
+ 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, //
+ 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, //
+ 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, //
+ 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, //
+ 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, //
+ 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, //
+ 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, //
+ 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, //
+ 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, //
+ 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, //
+ 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, //
+ 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, //
+ 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, //
+ 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, //
+ 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, //
+ 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, //
+ 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, //
+ 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, //
+ 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, //
+ 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, //
+ 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, //
+ 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, //
+ 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, //
+ 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, //
+ 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, //
+ 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, //
+ 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, //
+ 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, //
+ 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, //
+ 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, //
+ 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, //
+ 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, //
+ 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, //
+ 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, //
+ 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, //
+ 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, //
+ 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, //
+ 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, //
+ 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, //
+ 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, //
+ 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, //
+ 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, //
+ 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, //
+ 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, //
+ 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, //
+ 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, //
+ 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, //
+ 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, //
+ 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, //
+ 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, //
+ 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, //
+ 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, //
+ 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, //
+ 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, //
+ 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, //
+ 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, //
+ 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, //
+ 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, //
+ 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, //
+ 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, //
+ 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, //
+ 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, //
+ 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, //
+ 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, //
+ 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, //
+ 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, //
+ 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, //
+ 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, //
+ 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, //
+ 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, //
+ 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, //
+ 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, //
+ 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, //
+ 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, //
+ 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, //
+ 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, //
+ 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, //
+ 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, //
+ 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, //
+ 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, //
+ 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, //
+ 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, //
+ 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, //
+ 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, //
+ 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, //
+ 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, //
+ 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, //
+ 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, //
+ 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, //
+ 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14};
+
+ const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
+ const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
+ return BitCast(d, pairs + Set(du, 0x0100));
+}
+
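The last three lines of IdxFromNotBits above turn each (already doubled) lane index k from the table into the byte pair {k, k+1} that TableLookupBytes needs. A scalar check of that arithmetic, assuming little-endian byte order as on AArch64 (illustrative only):

#include <cassert>
#include <cstdint>

// Zipping k with itself produces the 16-bit value k * 0x0101; adding 0x0100
// leaves k in the low byte and k + 1 in the high byte.
void CheckPairIndices(uint8_t k) {
  const uint16_t zipped = static_cast<uint16_t>(k * 0x0101);
  const uint16_t pair = static_cast<uint16_t>(zipped + 0x0100);
  assert((pair & 0xFF) == k);
  assert((pair >> 8) == k + 1);
}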
+template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<4> /*tag*/,
const uint64_t mask_bits) {
HWY_DASSERT(mask_bits < 16);
// There are only 4 lanes, so we can afford to load the index vector directly.
- alignas(16) constexpr uint8_t packed_array[16 * 16] = {
+ alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
+ // PrintCompress32x4Tables
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
@@ -5510,7 +5931,35 @@ HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<4> /*tag*/,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
const Simd<T, N, 0> d;
const Repartition<uint8_t, decltype(d)> d8;
- return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
+ return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
+}
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<4> /*tag*/,
+ const uint64_t mask_bits) {
+ HWY_DASSERT(mask_bits < 16);
+
+ // There are only 4 lanes, so we can afford to load the index vector directly.
+ alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
+ // PrintCompressNot32x4Tables
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
+ 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+ 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
+ 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
+ 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
+ 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
+ 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
+ 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15};
+ const Simd<T, N, 0> d;
+ const Repartition<uint8_t, decltype(d)> d8;
+ return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
@@ -5521,7 +5970,8 @@ HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<8> /*tag*/,
HWY_DASSERT(mask_bits < 4);
// There are only 2 lanes, so we can afford to load the index vector directly.
- alignas(16) constexpr uint8_t packed_array[4 * 16] = {
+ alignas(16) constexpr uint8_t u8_indices[64] = {
+ // PrintCompress64x2Tables
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
@@ -5529,7 +5979,25 @@ HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<8> /*tag*/,
const Simd<T, N, 0> d;
const Repartition<uint8_t, decltype(d)> d8;
- return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
+ return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
+}
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<8> /*tag*/,
+ const uint64_t mask_bits) {
+ HWY_DASSERT(mask_bits < 4);
+
+ // There are only 2 lanes, so we can afford to load the index vector directly.
+ alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
+ // PrintCompressNot64x2Tables
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+ const Simd<T, N, 0> d;
+ const Repartition<uint8_t, decltype(d)> d8;
+ return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
#endif
@@ -5545,13 +6013,76 @@ HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) {
return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
}
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, const uint64_t mask_bits) {
+ const auto idx =
+ detail::IdxFromNotBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
+ using D = Simd<T, N, 0>;
+ const RebindToSigned<D> di;
+ return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
+}
+
} // namespace detail
-template <typename T, size_t N>
+// Single lane: no-op
+template <typename T>
+HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
+ return v;
+}
+
+// Two lanes: conditional swap
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
+ // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
+ const Simd<T, N, 0> d;
+ const Vec128<T, N> m = VecFromMask(d, mask);
+ const Vec128<T, N> maskL = DupEven(m);
+ const Vec128<T, N> maskH = DupOdd(m);
+ const Vec128<T, N> swap = AndNot(maskL, maskH);
+ return IfVecThenElse(swap, Shuffle01(v), v);
+}
+
+// General case
+template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
return detail::Compress(v, detail::BitsFromMask(mask));
}
+// Single lane: no-op
+template <typename T>
+HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
+ return v;
+}
+
+// Two lanes: conditional swap
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
+ // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
+ const Full128<T> d;
+ const Vec128<T> m = VecFromMask(d, mask);
+ const Vec128<T> maskL = DupEven(m);
+ const Vec128<T> maskH = DupOdd(m);
+ const Vec128<T> swap = AndNot(maskH, maskL);
+ return IfVecThenElse(swap, Shuffle01(v), v);
+}
+
+// General case
+template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
+HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
+ // For partial vectors, we cannot pull the Not() into the table because
+ // BitsFromMask clears the upper bits.
+ if (N < 16 / sizeof(T)) {
+ return detail::Compress(v, detail::BitsFromMask(Not(mask)));
+ }
+ return detail::CompressNot(v, detail::BitsFromMask(mask));
+}
+
+// ------------------------------ CompressBlocksNot
+HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
+ Mask128<uint64_t> /* m */) {
+ return v;
+}
+
// ------------------------------ CompressBits
template <typename T, size_t N>
@@ -5943,19 +6474,10 @@ HWY_API void StoreInterleaved4(const Vec128<T> v0, const Vec128<T> v1,
// ------------------------------ Lt128
-namespace detail {
-
-template <size_t kLanes, typename T, size_t N>
-Mask128<T, N> ShiftMaskLeft(Mask128<T, N> m) {
- return MaskFromVec(ShiftLeftLanes<kLanes>(VecFromMask(Simd<T, N, 0>(), m)));
-}
-
-} // namespace detail
-
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
Vec128<T, N> b) {
- static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
+ static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
// Truth table of Eq and Lt for Hi and Lo u64.
// (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
// =H =L cH cL | out = cH | (=H & cL)
@@ -5970,85 +6492,78 @@ HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
// 1 0 0 1 | 1
// 1 1 0 0 | 0
const Mask128<T, N> eqHL = Eq(a, b);
- const Mask128<T, N> ltHL = Lt(a, b);
+ const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
// We need to bring cL to the upper lane/bit corresponding to cH. Comparing
// the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
- // comparison result leftwards requires only 4.
- const Mask128<T, N> ltLx = detail::ShiftMaskLeft<1>(ltHL);
- const Mask128<T, N> outHx = Or(ltHL, And(eqHL, ltLx));
- const Vec128<T, N> vecHx = VecFromMask(d, outHx);
- return MaskFromVec(InterleaveUpper(d, vecHx, vecHx));
+ // comparison result leftwards requires only 4. IfThenElse compiles to the
+ // same code as OrAnd().
+ const Vec128<T, N> ltLx = DupEven(ltHL);
+ const Vec128<T, N> outHx = IfThenElse(eqHL, ltLx, ltHL);
+ return MaskFromVec(DupOdd(outHx));
}
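A scalar restatement of the truth table above, showing the identity the vector code implements, out = cH | (=H & cL) (illustrative only, not part of the patch):

#include <cstdint>

// 128-bit unsigned less-than built from two 64-bit halves.
bool Lt128Scalar(uint64_t aH, uint64_t aL, uint64_t bH, uint64_t bL) {
  const bool eqH = (aH == bH);
  const bool ltH = (aH < bH);
  const bool ltL = (aL < bL);
  return ltH | (eqH & ltL);  // corresponds to outHx before DupOdd
}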
-// ------------------------------ Min128, Max128 (Lt128)
-
-// Without a native OddEven, it seems infeasible to go faster than Lt128.
-template <class D>
-HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) {
- return IfThenElse(Lt128(d, a, b), a, b);
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_INLINE Mask128<T, N> Lt128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
+ Vec128<T, N> b) {
+ const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
+ return MaskFromVec(InterleaveUpper(d, ltHL, ltHL));
}
-template <class D>
-HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
- return IfThenElse(Lt128(d, a, b), b, a);
+// ------------------------------ Eq128
+
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_INLINE Mask128<T, N> Eq128(Simd<T, N, 0> d, Vec128<T, N> a,
+ Vec128<T, N> b) {
+ static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
+ const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
+ return MaskFromVec(And(Reverse2(d, eqHL), eqHL));
}
-// ================================================== Operator wrapper
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_INLINE Mask128<T, N> Eq128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
+ Vec128<T, N> b) {
+ const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
+ return MaskFromVec(InterleaveUpper(d, eqHL, eqHL));
+}
-// These apply to all x86_*-inl.h because there are no restrictions on V.
+// ------------------------------ Ne128
-template <class V>
-HWY_API V Add(V a, V b) {
- return a + b;
-}
-template <class V>
-HWY_API V Sub(V a, V b) {
- return a - b;
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_INLINE Mask128<T, N> Ne128(Simd<T, N, 0> d, Vec128<T, N> a,
+ Vec128<T, N> b) {
+ static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
+ const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
+ return MaskFromVec(Or(Reverse2(d, neHL), neHL));
}
-template <class V>
-HWY_API V Mul(V a, V b) {
- return a * b;
-}
-template <class V>
-HWY_API V Div(V a, V b) {
- return a / b;
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_INLINE Mask128<T, N> Ne128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
+ Vec128<T, N> b) {
+ const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
+ return MaskFromVec(InterleaveUpper(d, neHL, neHL));
}
-template <class V>
-V Shl(V a, V b) {
- return a << b;
-}
-template <class V>
-V Shr(V a, V b) {
- return a >> b;
-}
+// ------------------------------ Min128, Max128 (Lt128)
-template <class V>
-HWY_API auto Eq(V a, V b) -> decltype(a == b) {
- return a == b;
-}
-template <class V>
-HWY_API auto Ne(V a, V b) -> decltype(a == b) {
- return a != b;
-}
-template <class V>
-HWY_API auto Lt(V a, V b) -> decltype(a == b) {
- return a < b;
+// Without a native OddEven, it seems infeasible to go faster than Lt128.
+template <class D>
+HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) {
+ return IfThenElse(Lt128(d, a, b), a, b);
}
-template <class V>
-HWY_API auto Gt(V a, V b) -> decltype(a == b) {
- return a > b;
+template <class D>
+HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
+ return IfThenElse(Lt128(d, b, a), a, b);
}
-template <class V>
-HWY_API auto Ge(V a, V b) -> decltype(a == b) {
- return a >= b;
+
+template <class D>
+HWY_INLINE VFromD<D> Min128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
+ return IfThenElse(Lt128Upper(d, a, b), a, b);
}
-template <class V>
-HWY_API auto Le(V a, V b) -> decltype(a == b) {
- return a <= b;
+template <class D>
+HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
+ return IfThenElse(Lt128Upper(d, b, a), a, b);
}
namespace detail { // for code folding
diff --git a/media/highway/src/hwy/ops/arm_sve-inl.h b/media/highway/src/hwy/ops/arm_sve-inl.h
index 9c86c0f7c5..1ccac9e6eb 100644
--- a/media/highway/src/hwy/ops/arm_sve-inl.h
+++ b/media/highway/src/hwy/ops/arm_sve-inl.h
@@ -24,11 +24,13 @@
#include "hwy/ops/shared-inl.h"
// If running on hardware whose vector length is known to be a power of two, we
-// can skip fixups for non-power of two sizes. This may be 1 on future
-// fixed-size SVE targets.
-#ifndef HWY_SVE_IS_POW2
+// can skip fixups for non-power of two sizes.
+#undef HWY_SVE_IS_POW2
+#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
+#define HWY_SVE_IS_POW2 1
+#else
#define HWY_SVE_IS_POW2 0
-#endif // HWY_SVE_IS_POW2
+#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
@@ -42,11 +44,6 @@ using DFromV = typename DFromV_t<RemoveConst<V>>::type;
template <class V>
using TFromV = TFromD<DFromV<V>>;
-#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
-#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
-#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
-#define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)
-
// ================================================== MACROS
// Generate specializations and function definitions using X macros. Although
@@ -202,31 +199,57 @@ HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<8> /* tag */) {
return svcntd_pat(SV_ALL);
}
+// All-true mask from a macro
+#define HWY_SVE_ALL_PTRUE(BITS) svptrue_pat_b##BITS(SV_ALL)
+
+#if HWY_SVE_IS_POW2
+#define HWY_SVE_PTRUE(BITS) HWY_SVE_ALL_PTRUE(BITS)
+#else
+#define HWY_SVE_PTRUE(BITS) svptrue_pat_b##BITS(SV_POW2)
+
// Returns actual lanes of a hardware vector, rounded down to a power of two.
-HWY_INLINE size_t HardwareLanes(hwy::SizeTag<1> /* tag */) {
+template <typename T, HWY_IF_LANE_SIZE(T, 1)>
+HWY_INLINE size_t HardwareLanes() {
return svcntb_pat(SV_POW2);
}
-HWY_INLINE size_t HardwareLanes(hwy::SizeTag<2> /* tag */) {
+template <typename T, HWY_IF_LANE_SIZE(T, 2)>
+HWY_INLINE size_t HardwareLanes() {
return svcnth_pat(SV_POW2);
}
-HWY_INLINE size_t HardwareLanes(hwy::SizeTag<4> /* tag */) {
+template <typename T, HWY_IF_LANE_SIZE(T, 4)>
+HWY_INLINE size_t HardwareLanes() {
return svcntw_pat(SV_POW2);
}
-HWY_INLINE size_t HardwareLanes(hwy::SizeTag<8> /* tag */) {
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_INLINE size_t HardwareLanes() {
return svcntd_pat(SV_POW2);
}
+#endif // HWY_SVE_IS_POW2
+
} // namespace detail
// Returns actual number of lanes after capping by N and shifting. May return 0
// (e.g. for "1/8th" of a u32x4 - would be 1 for 1/8th of u32x8).
+#if HWY_TARGET == HWY_SVE_256
+template <typename T, size_t N, int kPow2>
+HWY_API constexpr size_t Lanes(Simd<T, N, kPow2> /* d */) {
+ return HWY_MIN(detail::ScaleByPower(32 / sizeof(T), kPow2), N);
+}
+#elif HWY_TARGET == HWY_SVE2_128
+template <typename T, size_t N, int kPow2>
+HWY_API constexpr size_t Lanes(Simd<T, N, kPow2> /* d */) {
+ return HWY_MIN(detail::ScaleByPower(16 / sizeof(T), kPow2), N);
+}
+#else
template <typename T, size_t N, int kPow2>
HWY_API size_t Lanes(Simd<T, N, kPow2> d) {
- const size_t actual = detail::HardwareLanes(hwy::SizeTag<sizeof(T)>());
+ const size_t actual = detail::HardwareLanes<T>();
// Common case of full vectors: avoid any extra instructions.
if (detail::IsFull(d)) return actual;
return HWY_MIN(detail::ScaleByPower(actual, kPow2), N);
}
+#endif // HWY_TARGET
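For the new constexpr Lanes() specializations above, a scalar model of the HWY_SVE_256 branch, assuming ScaleByPower shifts the full-vector lane count left or right by kPow2 (a sketch; the hypothetical ModelLanesSve256 is not part of the API):

#include <cstddef>

// 32-byte vectors: e.g. uint32_t with kPow2 = 0 gives 8 lanes, kPow2 = -1
// gives 4, and the result is capped by the descriptor's N.
size_t ModelLanesSve256(size_t lane_bytes, int pow2, size_t cap) {
  size_t lanes = 32 / lane_bytes;
  lanes = (pow2 >= 0) ? (lanes << pow2) : (lanes >> -pow2);
  return lanes < cap ? lanes : cap;
}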
// ================================================== MASK INIT
@@ -242,15 +265,19 @@ HWY_API size_t Lanes(Simd<T, N, kPow2> d) {
HWY_SVE_FOREACH(HWY_SVE_FIRSTN, FirstN, whilelt)
#undef HWY_SVE_FIRSTN
-namespace detail {
+template <class D>
+using MFromD = decltype(FirstN(D(), 0));
-// All-true mask from a macro
-#define HWY_SVE_PTRUE(BITS) svptrue_pat_b##BITS(SV_POW2)
+namespace detail {
-#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP) \
- template <size_t N, int kPow2> \
- HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
- return HWY_SVE_PTRUE(BITS); \
+#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP) \
+ template <size_t N, int kPow2> \
+ HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
+ return HWY_SVE_PTRUE(BITS); \
+ } \
+ template <size_t N, int kPow2> \
+ HWY_API svbool_t All##NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
+ return HWY_SVE_ALL_PTRUE(BITS); \
}
HWY_SVE_FOREACH(HWY_SVE_WRAP_PTRUE, PTrue, ptrue) // return all-true
@@ -296,7 +323,9 @@ using VFromD = decltype(Set(D(), TFromD<D>()));
template <class D>
VFromD<D> Zero(D d) {
- return Set(d, 0);
+ // Cast to support bfloat16_t.
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d, Set(du, 0));
}
// ------------------------------ Undefined
@@ -605,12 +634,18 @@ HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MaxN, max_n)
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, Mul, mul)
HWY_SVE_FOREACH_UIF3264(HWY_SVE_RETV_ARGPVV, Mul, mul)
+// Per-target flag to prevent generic_ops-inl.h from defining i64 operator*.
+#ifdef HWY_NATIVE_I64MULLO
+#undef HWY_NATIVE_I64MULLO
+#else
+#define HWY_NATIVE_I64MULLO
+#endif
+
// ------------------------------ MulHigh
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
-namespace detail {
+// Not part of API, used internally:
HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
HWY_SVE_FOREACH_U64(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
-} // namespace detail
// ------------------------------ MulFixedPoint15
HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) {
@@ -701,6 +736,10 @@ HWY_API svbool_t Xor(svbool_t a, svbool_t b) {
return svsel_b(a, svnand_b_z(a, a, b), b); // a ? !(a & b) : b.
}
+HWY_API svbool_t ExclusiveNeither(svbool_t a, svbool_t b) {
+ return svnor_b_z(HWY_SVE_PTRUE(8), a, b); // !a && !b, undefined if a && b.
+}
+
// ------------------------------ CountTrue
#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP) \
@@ -746,6 +785,12 @@ HWY_API intptr_t FindFirstTrue(D d, svbool_t m) {
CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m)));
}
+// ------------------------------ FindKnownFirstTrue
+template <class D>
+HWY_API size_t FindKnownFirstTrue(D d, svbool_t m) {
+ return CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m));
+}
+
// ------------------------------ IfThenElse
#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP) \
HWY_API HWY_SVE_V(BASE, BITS) \
@@ -757,14 +802,14 @@ HWY_SVE_FOREACH(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel)
#undef HWY_SVE_IF_THEN_ELSE
// ------------------------------ IfThenElseZero
-template <class M, class V>
-HWY_API V IfThenElseZero(const M mask, const V yes) {
+template <class V>
+HWY_API V IfThenElseZero(const svbool_t mask, const V yes) {
return IfThenElse(mask, yes, Zero(DFromV<V>()));
}
// ------------------------------ IfThenZeroElse
-template <class M, class V>
-HWY_API V IfThenZeroElse(const M mask, const V no) {
+template <class V>
+HWY_API V IfThenZeroElse(const svbool_t mask, const V no) {
return IfThenElse(mask, Zero(DFromV<V>()), no);
}
@@ -827,26 +872,45 @@ HWY_API svbool_t MaskFromVec(const V v) {
}
// ------------------------------ VecFromMask
-
-template <class D, HWY_IF_NOT_FLOAT_D(D)>
+template <class D>
HWY_API VFromD<D> VecFromMask(const D d, svbool_t mask) {
- const auto v0 = Zero(RebindToSigned<decltype(d)>());
- return BitCast(d, detail::SubN(mask, v0, 1));
+ const RebindToSigned<D> di;
+ // This generates MOV imm, whereas svdup_n_s8_z generates MOV scalar, which
+ // requires an extra instruction plus M0 pipeline.
+ return BitCast(d, IfThenElseZero(mask, Set(di, -1)));
}
-template <class D, HWY_IF_FLOAT_D(D)>
-HWY_API VFromD<D> VecFromMask(const D d, svbool_t mask) {
- return BitCast(d, VecFromMask(RebindToUnsigned<D>(), mask));
+// ------------------------------ IfVecThenElse (MaskFromVec, IfThenElse)
+
+#if HWY_TARGET == HWY_SVE2
+
+#define HWY_SVE_IF_VEC(BASE, CHAR, BITS, HALF, NAME, OP) \
+ HWY_API HWY_SVE_V(BASE, BITS) \
+ NAME(HWY_SVE_V(BASE, BITS) mask, HWY_SVE_V(BASE, BITS) yes, \
+ HWY_SVE_V(BASE, BITS) no) { \
+ return sv##OP##_##CHAR##BITS(yes, no, mask); \
+ }
+
+HWY_SVE_FOREACH_UI(HWY_SVE_IF_VEC, IfVecThenElse, bsl)
+#undef HWY_SVE_IF_VEC
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
+ const DFromV<V> d;
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(
+ d, IfVecThenElse(BitCast(du, mask), BitCast(du, yes), BitCast(du, no)));
}
-// ------------------------------ IfVecThenElse (MaskFromVec, IfThenElse)
+#else
template <class V>
HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
- // TODO(janwas): use svbsl for SVE2
- return IfThenElse(MaskFromVec(mask), yes, no);
+ return Or(And(mask, yes), AndNot(mask, no));
}
+#endif // HWY_TARGET == HWY_SVE2
+
// ------------------------------ Floating-point classification (Ne)
template <class V>
@@ -1171,32 +1235,31 @@ HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svuint8_t vfrom) {
// ------------------------------ PromoteTo F
-// svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so
-// first replicate each lane once.
+// Unlike Highway's ZipLower, this returns the same type.
namespace detail {
-HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipLower, zip1)
-// Do not use zip2 to implement PromoteUpperTo or similar because vectors may be
-// non-powers of two, so getting the actual "upper half" requires MaskUpperHalf.
+HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipLowerSame, zip1)
} // namespace detail
template <size_t N, int kPow2>
HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> /* d */,
const svfloat16_t v) {
- const svfloat16_t vv = detail::ZipLower(v, v);
+ // svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so
+ // first replicate each lane once.
+ const svfloat16_t vv = detail::ZipLowerSame(v, v);
return svcvt_f32_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()), vv);
}
template <size_t N, int kPow2>
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
const svfloat32_t v) {
- const svfloat32_t vv = detail::ZipLower(v, v);
+ const svfloat32_t vv = detail::ZipLowerSame(v, v);
return svcvt_f64_f32_x(detail::PTrue(Simd<float32_t, N, kPow2>()), vv);
}
template <size_t N, int kPow2>
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
const svint32_t v) {
- const svint32_t vv = detail::ZipLower(v, v);
+ const svint32_t vv = detail::ZipLowerSame(v, v);
return svcvt_f64_s32_x(detail::PTrue(Simd<int32_t, N, kPow2>()), vv);
}
@@ -1281,6 +1344,60 @@ HWY_API svuint8_t U8FromU32(const svuint32_t v) {
return svuzp1_u8(cast8, cast8);
}
+// ------------------------------ Truncations
+
+template <size_t N, int kPow2>
+HWY_API svuint8_t TruncateTo(Simd<uint8_t, N, kPow2> /* tag */,
+ const svuint64_t v) {
+ const DFromV<svuint8_t> d;
+ const svuint8_t v1 = BitCast(d, v);
+ const svuint8_t v2 = svuzp1_u8(v1, v1);
+ const svuint8_t v3 = svuzp1_u8(v2, v2);
+ return svuzp1_u8(v3, v3);
+}
+
+template <size_t N, int kPow2>
+HWY_API svuint16_t TruncateTo(Simd<uint16_t, N, kPow2> /* tag */,
+ const svuint64_t v) {
+ const DFromV<svuint16_t> d;
+ const svuint16_t v1 = BitCast(d, v);
+ const svuint16_t v2 = svuzp1_u16(v1, v1);
+ return svuzp1_u16(v2, v2);
+}
+
+template <size_t N, int kPow2>
+HWY_API svuint32_t TruncateTo(Simd<uint32_t, N, kPow2> /* tag */,
+ const svuint64_t v) {
+ const DFromV<svuint32_t> d;
+ const svuint32_t v1 = BitCast(d, v);
+ return svuzp1_u32(v1, v1);
+}
+
+template <size_t N, int kPow2>
+HWY_API svuint8_t TruncateTo(Simd<uint8_t, N, kPow2> /* tag */,
+ const svuint32_t v) {
+ const DFromV<svuint8_t> d;
+ const svuint8_t v1 = BitCast(d, v);
+ const svuint8_t v2 = svuzp1_u8(v1, v1);
+ return svuzp1_u8(v2, v2);
+}
+
+template <size_t N, int kPow2>
+HWY_API svuint16_t TruncateTo(Simd<uint16_t, N, kPow2> /* tag */,
+ const svuint32_t v) {
+ const DFromV<svuint16_t> d;
+ const svuint16_t v1 = BitCast(d, v);
+ return svuzp1_u16(v1, v1);
+}
+
+template <size_t N, int kPow2>
+HWY_API svuint8_t TruncateTo(Simd<uint8_t, N, kPow2> /* tag */,
+ const svuint16_t v) {
+ const DFromV<svuint8_t> d;
+ const svuint8_t v1 = BitCast(d, v);
+ return svuzp1_u8(v1, v1);
+}
+
// ------------------------------ DemoteTo I
template <size_t N, int kPow2>
@@ -1329,8 +1446,12 @@ namespace detail {
NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
return sv##OP##_##CHAR##BITS(lo, hi); \
}
-HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEven, uzp1)
-HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOdd, uzp2)
+HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1)
+HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2)
+#if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
+HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
+HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
+#endif
#undef HWY_SVE_CONCAT_EVERY_SECOND
// Used to slide up / shift whole register left; mask indicates which range
@@ -1349,10 +1470,10 @@ template <class D>
HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
#if HWY_SVE_IS_POW2
(void)d;
- return detail::ConcatOdd(hi, lo);
+ return detail::ConcatOddFull(hi, lo);
#else
- const VFromD<D> hi_odd = detail::ConcatOdd(hi, hi);
- const VFromD<D> lo_odd = detail::ConcatOdd(lo, lo);
+ const VFromD<D> hi_odd = detail::ConcatOddFull(hi, hi);
+ const VFromD<D> lo_odd = detail::ConcatOddFull(lo, lo);
return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
#endif
}
@@ -1361,10 +1482,10 @@ template <class D>
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
#if HWY_SVE_IS_POW2
(void)d;
- return detail::ConcatEven(hi, lo);
+ return detail::ConcatEvenFull(hi, lo);
#else
- const VFromD<D> hi_odd = detail::ConcatEven(hi, hi);
- const VFromD<D> lo_odd = detail::ConcatEven(lo, lo);
+ const VFromD<D> hi_odd = detail::ConcatEvenFull(hi, hi);
+ const VFromD<D> lo_odd = detail::ConcatEvenFull(lo, lo);
return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
#endif
}
@@ -1374,35 +1495,45 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
template <size_t N, int kPow2>
HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat32_t v) {
const svfloat16_t in_even = svcvt_f16_f32_x(detail::PTrue(d), v);
- return detail::ConcatEven(in_even, in_even); // only low 1/2 of result valid
+ return detail::ConcatEvenFull(in_even,
+ in_even); // lower half
}
template <size_t N, int kPow2>
HWY_API svuint16_t DemoteTo(Simd<bfloat16_t, N, kPow2> /* d */, svfloat32_t v) {
const svuint16_t in_even = BitCast(ScalableTag<uint16_t>(), v);
- return detail::ConcatOdd(in_even, in_even); // can ignore upper half of vec
+ return detail::ConcatOddFull(in_even, in_even); // lower half
}
template <size_t N, int kPow2>
HWY_API svfloat32_t DemoteTo(Simd<float32_t, N, kPow2> d, const svfloat64_t v) {
const svfloat32_t in_even = svcvt_f32_f64_x(detail::PTrue(d), v);
- return detail::ConcatEven(in_even, in_even); // only low 1/2 of result valid
+ return detail::ConcatEvenFull(in_even,
+ in_even); // lower half
}
template <size_t N, int kPow2>
HWY_API svint32_t DemoteTo(Simd<int32_t, N, kPow2> d, const svfloat64_t v) {
const svint32_t in_even = svcvt_s32_f64_x(detail::PTrue(d), v);
- return detail::ConcatEven(in_even, in_even); // only low 1/2 of result valid
+ return detail::ConcatEvenFull(in_even,
+ in_even); // lower half
}
// ------------------------------ ConvertTo F
#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP) \
+ /* signed integers */ \
template <size_t N, int kPow2> \
HWY_API HWY_SVE_V(BASE, BITS) \
NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(int, BITS) v) { \
return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
} \
+ /* unsigned integers */ \
+ template <size_t N, int kPow2> \
+ HWY_API HWY_SVE_V(BASE, BITS) \
+ NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(uint, BITS) v) { \
+ return sv##OP##_##CHAR##BITS##_u##BITS##_x(HWY_SVE_PTRUE(BITS), v); \
+ } \
/* Truncates (rounds toward zero). */ \
template <size_t N, int kPow2> \
HWY_API HWY_SVE_V(int, BITS) \
@@ -1439,16 +1570,177 @@ HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
return detail::AddN(ConvertTo(d, Iota(di, 0)), first);
}
+// ------------------------------ InterleaveLower
+
+template <class D, class V>
+HWY_API V InterleaveLower(D d, const V a, const V b) {
+ static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
+#if HWY_TARGET == HWY_SVE2_128
+ (void)d;
+ return detail::ZipLowerSame(a, b);
+#else
+ // Move lower halves of blocks to lower half of vector.
+ const Repartition<uint64_t, decltype(d)> d64;
+ const auto a64 = BitCast(d64, a);
+ const auto b64 = BitCast(d64, b);
+ const auto a_blocks = detail::ConcatEvenFull(a64, a64); // lower half
+ const auto b_blocks = detail::ConcatEvenFull(b64, b64);
+ return detail::ZipLowerSame(BitCast(d, a_blocks), BitCast(d, b_blocks));
+#endif
+}
+
+template <class V>
+HWY_API V InterleaveLower(const V a, const V b) {
+ return InterleaveLower(DFromV<V>(), a, b);
+}
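
A scalar sketch of the generic path's result, per 16-byte block (illustrative helper, not part of the patch): ConcatEvenFull on the u64 view gathers each block's lower half, and ZipLowerSame then interleaves the two gathered halves.

#include <cstddef>

// a and b each point to one 16-byte block; the output block interleaves their
// lower halves: out[2*i] = a[i], out[2*i+1] = b[i].
template <typename T>
void InterleaveLowerBlock(const T* a, const T* b, T* out) {
  const size_t half = (16 / sizeof(T)) / 2;
  for (size_t i = 0; i < half; ++i) {
    out[2 * i] = a[i];
    out[2 * i + 1] = b[i];
  }
}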
+
+// ------------------------------ InterleaveUpper
+
+// Only use zip2 if vectors are a power of two; otherwise getting the actual
+// "upper half" requires MaskUpperHalf.
+#if HWY_TARGET == HWY_SVE2_128
+namespace detail {
+// Unlike Highway's ZipUpper, this returns the same type.
+HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipUpperSame, zip2)
+} // namespace detail
+#endif
+
+// Full vector: guaranteed to have at least one block
+template <class D, class V = VFromD<D>,
+ hwy::EnableIf<detail::IsFull(D())>* = nullptr>
+HWY_API V InterleaveUpper(D d, const V a, const V b) {
+#if HWY_TARGET == HWY_SVE2_128
+ (void)d;
+ return detail::ZipUpperSame(a, b);
+#else
+ // Move upper halves of blocks to lower half of vector.
+ const Repartition<uint64_t, decltype(d)> d64;
+ const auto a64 = BitCast(d64, a);
+ const auto b64 = BitCast(d64, b);
+ const auto a_blocks = detail::ConcatOddFull(a64, a64); // lower half
+ const auto b_blocks = detail::ConcatOddFull(b64, b64);
+ return detail::ZipLowerSame(BitCast(d, a_blocks), BitCast(d, b_blocks));
+#endif
+}
+
+// Capped/fraction: need runtime check
+template <class D, class V = VFromD<D>,
+ hwy::EnableIf<!detail::IsFull(D())>* = nullptr>
+HWY_API V InterleaveUpper(D d, const V a, const V b) {
+ // Less than one block: treat as capped
+ if (Lanes(d) * sizeof(TFromD<D>) < 16) {
+ const Half<decltype(d)> d2;
+ return InterleaveLower(d, UpperHalf(d2, a), UpperHalf(d2, b));
+ }
+ return InterleaveUpper(DFromV<V>(), a, b);
+}
+
// ================================================== COMBINE
namespace detail {
+#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
+template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
+svbool_t MaskLowerHalf(D d) {
+ switch (Lanes(d)) {
+ case 32:
+ return svptrue_pat_b8(SV_VL16);
+ case 16:
+ return svptrue_pat_b8(SV_VL8);
+ case 8:
+ return svptrue_pat_b8(SV_VL4);
+ case 4:
+ return svptrue_pat_b8(SV_VL2);
+ default:
+ return svptrue_pat_b8(SV_VL1);
+ }
+}
+template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
+svbool_t MaskLowerHalf(D d) {
+ switch (Lanes(d)) {
+ case 16:
+ return svptrue_pat_b16(SV_VL8);
+ case 8:
+ return svptrue_pat_b16(SV_VL4);
+ case 4:
+ return svptrue_pat_b16(SV_VL2);
+ default:
+ return svptrue_pat_b16(SV_VL1);
+ }
+}
+template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
+svbool_t MaskLowerHalf(D d) {
+ switch (Lanes(d)) {
+ case 8:
+ return svptrue_pat_b32(SV_VL4);
+ case 4:
+ return svptrue_pat_b32(SV_VL2);
+ default:
+ return svptrue_pat_b32(SV_VL1);
+ }
+}
+template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
+svbool_t MaskLowerHalf(D d) {
+ switch (Lanes(d)) {
+ case 4:
+ return svptrue_pat_b64(SV_VL2);
+ default:
+ return svptrue_pat_b64(SV_VL1);
+ }
+}
+#endif
+#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
+template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
+svbool_t MaskLowerHalf(D d) {
+ switch (Lanes(d)) {
+ case 16:
+ return svptrue_pat_b8(SV_VL8);
+ case 8:
+ return svptrue_pat_b8(SV_VL4);
+ case 4:
+ return svptrue_pat_b8(SV_VL2);
+ case 2:
+ case 1:
+ default:
+ return svptrue_pat_b8(SV_VL1);
+ }
+}
+template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
+svbool_t MaskLowerHalf(D d) {
+ switch (Lanes(d)) {
+ case 8:
+ return svptrue_pat_b16(SV_VL4);
+ case 4:
+ return svptrue_pat_b16(SV_VL2);
+ case 2:
+ case 1:
+ default:
+ return svptrue_pat_b16(SV_VL1);
+ }
+}
+template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
+svbool_t MaskLowerHalf(D d) {
+ return svptrue_pat_b32(Lanes(d) == 4 ? SV_VL2 : SV_VL1);
+}
+template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
+svbool_t MaskLowerHalf(D /*d*/) {
+ return svptrue_pat_b64(SV_VL1);
+}
+#endif // HWY_TARGET == HWY_SVE2_128
+#if HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128
template <class D>
svbool_t MaskLowerHalf(D d) {
return FirstN(d, Lanes(d) / 2);
}
+#endif
+
template <class D>
svbool_t MaskUpperHalf(D d) {
+ // TODO(janwas): WHILEGE on pow2 SVE2
+ if (HWY_SVE_IS_POW2 && IsFull(d)) {
+ return Not(MaskLowerHalf(d));
+ }
+
// For Splice to work as intended, make sure bits above Lanes(d) are zero.
return AndNot(MaskLowerHalf(d), detail::MakeMask(d));
}
@@ -1475,18 +1767,43 @@ HWY_API V ConcatUpperLower(const D d, const V hi, const V lo) {
// ------------------------------ ConcatLowerLower
template <class D, class V>
HWY_API V ConcatLowerLower(const D d, const V hi, const V lo) {
+ if (detail::IsFull(d)) {
+#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256
+ return detail::ConcatEvenBlocks(hi, lo);
+#endif
+#if HWY_TARGET == HWY_SVE2_128
+ const Repartition<uint64_t, D> du64;
+ const auto lo64 = BitCast(du64, lo);
+ return BitCast(d, InterleaveLower(du64, lo64, BitCast(du64, hi)));
+#endif
+ }
return detail::Splice(hi, lo, detail::MaskLowerHalf(d));
}
// ------------------------------ ConcatLowerUpper
template <class D, class V>
HWY_API V ConcatLowerUpper(const D d, const V hi, const V lo) {
+#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 // constexpr Lanes
+ if (detail::IsFull(d)) {
+ return detail::Ext<Lanes(d) / 2>(hi, lo);
+ }
+#endif
return detail::Splice(hi, lo, detail::MaskUpperHalf(d));
}
// ------------------------------ ConcatUpperUpper
template <class D, class V>
HWY_API V ConcatUpperUpper(const D d, const V hi, const V lo) {
+ if (detail::IsFull(d)) {
+#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256
+ return detail::ConcatOddBlocks(hi, lo);
+#endif
+#if HWY_TARGET == HWY_SVE2_128
+ const Repartition<uint64_t, D> du64;
+ const auto lo64 = BitCast(du64, lo);
+ return BitCast(d, InterleaveUpper(du64, lo64, BitCast(du64, hi)));
+#endif
+ }
const svbool_t mask_upper = detail::MaskUpperHalf(d);
const V lo_upper = detail::Splice(lo, lo, mask_upper);
return IfThenElse(mask_upper, hi, lo_upper);
@@ -1516,11 +1833,68 @@ HWY_API V LowerHalf(const V v) {
return v;
}
-template <class D2, class V>
-HWY_API V UpperHalf(const D2 /* d2 */, const V v) {
- return detail::Splice(v, v, detail::MaskUpperHalf(Twice<D2>()));
+template <class DH, class V>
+HWY_API V UpperHalf(const DH dh, const V v) {
+ const Twice<decltype(dh)> d;
+ // Cast so that we support bfloat16_t.
+ const RebindToUnsigned<decltype(d)> du;
+ const VFromD<decltype(du)> vu = BitCast(du, v);
+#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 // constexpr Lanes
+ return BitCast(d, detail::Ext<Lanes(dh)>(vu, vu));
+#else
+ const MFromD<decltype(du)> mask = detail::MaskUpperHalf(du);
+ return BitCast(d, detail::Splice(vu, vu, mask));
+#endif
+}
+
+// ================================================== REDUCE
+
+// These return T, whereas the Highway op returns a broadcasted vector.
+namespace detail {
+#define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP) \
+ HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
+ /* The intrinsic returns [u]int64_t; truncate to T so we can broadcast. */ \
+ using T = HWY_SVE_T(BASE, BITS); \
+ using TU = MakeUnsigned<T>; \
+ constexpr uint64_t kMask = LimitsMax<TU>(); \
+ return static_cast<T>(static_cast<TU>( \
+ static_cast<uint64_t>(sv##OP##_##CHAR##BITS(pg, v)) & kMask)); \
+ }
+
+#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP) \
+ HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
+ return sv##OP##_##CHAR##BITS(pg, v); \
+ }
+
+HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanesM, addv)
+HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanesM, addv)
+
+HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanesM, minv)
+HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanesM, maxv)
+// NaN if all are
+HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanesM, minnmv)
+HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanesM, maxnmv)
+
+#undef HWY_SVE_REDUCE
+#undef HWY_SVE_REDUCE_ADD
+} // namespace detail
+
+template <class D, class V>
+V SumOfLanes(D d, V v) {
+ return Set(d, detail::SumOfLanesM(detail::MakeMask(d), v));
}
+template <class D, class V>
+V MinOfLanes(D d, V v) {
+ return Set(d, detail::MinOfLanesM(detail::MakeMask(d), v));
+}
+
+template <class D, class V>
+V MaxOfLanes(D d, V v) {
+ return Set(d, detail::MaxOfLanesM(detail::MakeMask(d), v));
+}
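
Scalar sketch of the broadcasting contract (illustrative only): the detail::*OfLanesM helpers return a scalar, and the public ops Set() it into every lane.

#include <cstddef>
#include <cstdint>

// Every output lane receives the sum of all input lanes.
inline void SumOfLanesScalar(const uint32_t* in, uint32_t* out, size_t lanes) {
  uint32_t sum = 0;
  for (size_t i = 0; i < lanes; ++i) sum += in[i];
  for (size_t i = 0; i < lanes; ++i) out[i] = sum;
}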
+
// ================================================== SWIZZLE
// ------------------------------ GetLane
@@ -1532,19 +1906,19 @@ namespace detail {
return sv##OP##_##CHAR##BITS(mask, v); \
}
-HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLane, lasta)
+HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLaneM, lasta)
#undef HWY_SVE_GET_LANE
} // namespace detail
template <class V>
HWY_API TFromV<V> GetLane(V v) {
- return detail::GetLane(v, detail::PFalse());
+ return detail::GetLaneM(v, detail::PFalse());
}
// ------------------------------ ExtractLane
template <class V>
HWY_API TFromV<V> ExtractLane(V v, size_t i) {
- return detail::GetLane(v, FirstN(DFromV<V>(), i));
+ return detail::GetLaneM(v, FirstN(DFromV<V>(), i));
}
// ------------------------------ InsertLane (IfThenElse)
@@ -1579,26 +1953,53 @@ HWY_API V DupOdd(const V v) {
// ------------------------------ OddEven
-namespace detail {
-HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVN, Insert, insr_n)
-} // namespace detail
+#if HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_SVE2
+
+#define HWY_SVE_ODD_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \
+ HWY_API HWY_SVE_V(BASE, BITS) \
+ NAME(HWY_SVE_V(BASE, BITS) odd, HWY_SVE_V(BASE, BITS) even) { \
+ return sv##OP##_##CHAR##BITS(even, odd, /*xor=*/0); \
+ }
+
+HWY_SVE_FOREACH_UI(HWY_SVE_ODD_EVEN, OddEven, eortb_n)
+#undef HWY_SVE_ODD_EVEN
+
+template <class V, HWY_IF_FLOAT_V(V)>
+HWY_API V OddEven(const V odd, const V even) {
+ const DFromV<V> d;
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d, OddEven(BitCast(du, odd), BitCast(du, even)));
+}
+
+#else
template <class V>
HWY_API V OddEven(const V odd, const V even) {
- const auto even_in_odd = detail::Insert(even, 0);
- return detail::InterleaveOdd(even_in_odd, odd);
+ const auto odd_in_even = detail::Ext<1>(odd, odd);
+ return detail::InterleaveEven(even, odd_in_even);
}
+#endif // HWY_TARGET
+
// ------------------------------ OddEvenBlocks
template <class V>
HWY_API V OddEvenBlocks(const V odd, const V even) {
- const RebindToUnsigned<DFromV<V>> du;
+ const DFromV<V> d;
+#if HWY_TARGET == HWY_SVE_256
+ return ConcatUpperLower(d, odd, even);
+#elif HWY_TARGET == HWY_SVE2_128
+ (void)odd;
+ (void)d;
+ return even;
+#else
+ const RebindToUnsigned<decltype(d)> du;
using TU = TFromD<decltype(du)>;
constexpr size_t kShift = CeilLog2(16 / sizeof(TU));
const auto idx_block = ShiftRight<kShift>(Iota(du, 0));
const auto lsb = detail::AndN(idx_block, static_cast<TU>(1));
const svbool_t is_even = detail::EqN(lsb, static_cast<TU>(0));
return IfThenElse(is_even, even, odd);
+#endif
}
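
Scalar sketch of the generic path's selection rule (illustrative helper): a lane is taken from `even` exactly when its 128-bit block index is even.

#include <cstddef>

inline bool TakeFromEven(size_t lane_index, size_t lane_size_bytes) {
  const size_t lanes_per_block = 16 / lane_size_bytes;
  return ((lane_index / lanes_per_block) & 1) == 0;
}
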
// ------------------------------ TableLookupLanes
@@ -1648,11 +2049,18 @@ constexpr size_t LanesPerBlock(Simd<T, N, kPow2> /* tag */) {
template <class V>
HWY_API V SwapAdjacentBlocks(const V v) {
const DFromV<V> d;
+#if HWY_TARGET == HWY_SVE_256
+ return ConcatLowerUpper(d, v, v);
+#elif HWY_TARGET == HWY_SVE2_128
+ (void)d;
+ return v;
+#else
const RebindToUnsigned<decltype(d)> du;
constexpr auto kLanesPerBlock =
- static_cast<TFromV<V>>(detail::LanesPerBlock(d));
+ static_cast<TFromD<decltype(du)>>(detail::LanesPerBlock(d));
const VFromD<decltype(du)> idx = detail::XorN(Iota(du, 0), kLanesPerBlock);
return TableLookupLanes(v, idx);
+#endif
}
// ------------------------------ Reverse
@@ -1675,11 +2083,14 @@ HWY_API V Reverse(D d, V v) {
const auto reversed = detail::ReverseFull(v);
if (HWY_SVE_IS_POW2 && detail::IsFull(d)) return reversed;
// Shift right to remove extra (non-pow2 and remainder) lanes.
- // TODO(janwas): on SVE2, use whilege.
- const size_t all_lanes = detail::AllHardwareLanes(hwy::SizeTag<sizeof(T)>());
- // Avoids FirstN truncating to the return vector size.
+ // TODO(janwas): on SVE2, use WHILEGE.
+ // Avoids FirstN truncating to the return vector size. Must also avoid Not
+ // because that is limited to SV_POW2.
const ScalableTag<T> dfull;
- const svbool_t mask = Not(FirstN(dfull, all_lanes - Lanes(d)));
+ const svbool_t all_true = detail::AllPTrue(dfull);
+ const size_t all_lanes = detail::AllHardwareLanes(hwy::SizeTag<sizeof(T)>());
+ const svbool_t mask =
+ svnot_b_z(all_true, FirstN(dfull, all_lanes - Lanes(d)));
return detail::Splice(reversed, reversed, mask);
}
@@ -1700,14 +2111,23 @@ HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
}
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
-HWY_API VFromD<D> Reverse2(D /* tag */, const VFromD<D> v) { // 3210
- const auto even_in_odd = detail::Insert(v, 0); // 210z
- return detail::InterleaveOdd(v, even_in_odd); // 2301
+HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) { // 3210
+#if HWY_TARGET == HWY_SVE2_128
+ if (detail::IsFull(d)) {
+ return detail::Ext<1>(v, v);
+ }
+#endif
+ (void)d;
+ const auto odd_in_even = detail::Ext<1>(v, v); // x321
+ return detail::InterleaveEven(odd_in_even, v); // 2301
}
-
// ------------------------------ Reverse4 (TableLookupLanes)
template <class D>
HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
+ if (HWY_TARGET == HWY_SVE_256 && sizeof(TFromD<D>) == 8 &&
+ detail::IsFull(d)) {
+ return detail::ReverseFull(v);
+ }
// TODO(janwas): is this approach faster than Shuffle0123?
const RebindToUnsigned<decltype(d)> du;
const auto idx = detail::XorN(Iota(du, 0), 3);
@@ -1726,7 +2146,13 @@ HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
template <typename T>
struct CompressIsPartition {
+#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
+ // Optimization for 64-bit lanes (could also be applied to 32-bit, but that
+ // requires a larger table).
+ enum { value = (sizeof(T) == 8) };
+#else
enum { value = 0 };
+#endif  // HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
};
#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP) \
@@ -1734,9 +2160,48 @@ struct CompressIsPartition {
return sv##OP##_##CHAR##BITS(mask, v); \
}
+#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
+HWY_SVE_FOREACH_UI32(HWY_SVE_COMPRESS, Compress, compact)
+HWY_SVE_FOREACH_F32(HWY_SVE_COMPRESS, Compress, compact)
+#else
HWY_SVE_FOREACH_UIF3264(HWY_SVE_COMPRESS, Compress, compact)
+#endif
#undef HWY_SVE_COMPRESS
+#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
+template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
+HWY_API V Compress(V v, svbool_t mask) {
+ const DFromV<V> d;
+ const RebindToUnsigned<decltype(d)> du64;
+
+ // Convert mask into bitfield via horizontal sum (faster than ORV) of masked
+ // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
+ // SetTableIndices.
+ const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
+ const size_t offset = detail::SumOfLanesM(mask, bits);
+
+ // See CompressIsPartition.
+ alignas(16) static constexpr uint64_t table[4 * 16] = {
+ // PrintCompress64x4Tables
+ 0, 1, 2, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 2, 0, 1, 3, 0, 2,
+ 1, 3, 1, 2, 0, 3, 0, 1, 2, 3, 3, 0, 1, 2, 0, 3, 1, 2, 1, 3, 0, 2,
+ 0, 1, 3, 2, 2, 3, 0, 1, 0, 2, 3, 1, 1, 2, 3, 0, 0, 1, 2, 3};
+ return TableLookupLanes(v, SetTableIndices(d, table + offset));
+}
+#endif // HWY_TARGET == HWY_SVE_256
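
Scalar sketch of the offset computation above (illustrative only): summing the masked per-lane constants 4, 8, 16, 32 packs the four mask bits into an integer that is already scaled by the table row width of 4 lanes.

#include <cstddef>

// Returns 4 * packed_mask, i.e. the element offset of the chosen table row.
inline size_t CompressTableOffset(const bool mask[4]) {
  size_t offset = 0;
  for (int i = 0; i < 4; ++i) {
    if (mask[i]) offset += static_cast<size_t>(4) << i;  // lane i adds 4 * 2^i
  }
  return offset;  // in [0, 60]
}
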
+#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
+template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
+HWY_API V Compress(V v, svbool_t mask) {
+ // If mask == 10: swap via splice. A mask of 00 or 11 leaves v unchanged, 10
+ // swaps upper/lower (the lower half is set to the upper half, and the
+ // remaining upper half is filled from the lower half of the second v), and
+ // 01 is invalid because it would ConcatLowerLower. zip1 and AndNot keep 10
+ // unchanged and map everything else to 00.
+ const svbool_t maskLL = svzip1_b64(mask, mask); // broadcast lower lane
+ return detail::Splice(v, v, AndNot(maskLL, mask));
+}
+#endif  // HWY_TARGET == HWY_SVE2_128
+
template <class V, HWY_IF_LANE_SIZE_V(V, 2)>
HWY_API V Compress(V v, svbool_t mask16) {
static_assert(!IsSame<V, svfloat16_t>(), "Must use overload");
@@ -1755,8 +2220,8 @@ HWY_API V Compress(V v, svbool_t mask16) {
// Demote to 16-bit (already in range) - separately so we can splice
const V evenL = BitCast(d16, compressedL);
const V evenH = BitCast(d16, compressedH);
- const V v16L = detail::ConcatEven(evenL, evenL); // only lower half needed
- const V v16H = detail::ConcatEven(evenH, evenH);
+ const V v16L = detail::ConcatEvenFull(evenL, evenL); // lower half
+ const V v16H = detail::ConcatEvenFull(evenH, evenH);
// We need to combine two vectors of non-constexpr length, so the only option
// is Splice, which requires us to synthesize a mask. NOTE: this function uses
@@ -1773,17 +2238,78 @@ HWY_API svfloat16_t Compress(svfloat16_t v, svbool_t mask16) {
return BitCast(df, Compress(BitCast(di, v), mask16));
}
+// ------------------------------ CompressNot
+
+template <class V, HWY_IF_NOT_LANE_SIZE_V(V, 8)>
+HWY_API V CompressNot(V v, const svbool_t mask) {
+ return Compress(v, Not(mask));
+}
+
+template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
+HWY_API V CompressNot(V v, svbool_t mask) {
+#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
+ // If mask == 01: swap via splice. A mask of 00 or 11 leaves v unchanged, 10
+ // swaps upper/lower (the lower half is set to the upper half, and the
+ // remaining upper half is filled from the lower half of the second v), and
+ // 01 is invalid because it would ConcatLowerLower. zip1 and AndNot map
+ // 01 to 10, and everything else to 00.
+ const svbool_t maskLL = svzip1_b64(mask, mask); // broadcast lower lane
+ return detail::Splice(v, v, AndNot(mask, maskLL));
+#endif
+#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
+ const DFromV<V> d;
+ const RebindToUnsigned<decltype(d)> du64;
+
+ // Convert mask into bitfield via horizontal sum (faster than ORV) of masked
+ // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
+ // SetTableIndices.
+ const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
+ const size_t offset = detail::SumOfLanesM(mask, bits);
+
+ // See CompressIsPartition.
+ alignas(16) static constexpr uint64_t table[4 * 16] = {
+ // PrintCompressNot64x4Tables
+ 0, 1, 2, 3, 1, 2, 3, 0, 0, 2, 3, 1, 2, 3, 0, 1, 0, 1, 3, 2, 1, 3,
+ 0, 2, 0, 3, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 1, 2, 0, 3, 0, 2, 1, 3,
+ 2, 0, 1, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
+ return TableLookupLanes(v, SetTableIndices(d, table + offset));
+#endif // HWY_TARGET == HWY_SVE_256
+
+ return Compress(v, Not(mask));
+}
+
+// ------------------------------ CompressBlocksNot
+HWY_API svuint64_t CompressBlocksNot(svuint64_t v, svbool_t mask) {
+#if HWY_TARGET == HWY_SVE2_128
+ (void)mask;
+ return v;
+#endif
+#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
+ uint64_t bits = 0; // predicate reg is 32-bit
+ CopyBytes<4>(&mask, &bits); // not same size - 64-bit more efficient
+ // Concatenate LSB for upper and lower blocks, pre-scale by 4 for table idx.
+ const size_t offset = ((bits & 1) ? 4u : 0u) + ((bits & 0x10000) ? 8u : 0u);
+ // See CompressIsPartition. Manually generated; flip halves if mask = [0, 1].
+ alignas(16) static constexpr uint64_t table[4 * 4] = {0, 1, 2, 3, 2, 3, 0, 1,
+ 0, 1, 2, 3, 0, 1, 2, 3};
+ const ScalableTag<uint64_t> d;
+ return TableLookupLanes(v, SetTableIndices(d, table + offset));
+#endif
+
+ return CompressNot(v, mask);
+}
+
// ------------------------------ CompressStore
-template <class V, class M, class D>
-HWY_API size_t CompressStore(const V v, const M mask, const D d,
+template <class V, class D>
+HWY_API size_t CompressStore(const V v, const svbool_t mask, const D d,
TFromD<D>* HWY_RESTRICT unaligned) {
StoreU(Compress(v, mask), d, unaligned);
return CountTrue(d, mask);
}
// ------------------------------ CompressBlendedStore
-template <class V, class M, class D>
-HWY_API size_t CompressBlendedStore(const V v, const M mask, const D d,
+template <class V, class D>
+HWY_API size_t CompressBlendedStore(const V v, const svbool_t mask, const D d,
TFromD<D>* HWY_RESTRICT unaligned) {
const size_t count = CountTrue(d, mask);
const svbool_t store_mask = FirstN(d, count);
@@ -1795,6 +2321,9 @@ HWY_API size_t CompressBlendedStore(const V v, const M mask, const D d,
// ------------------------------ CombineShiftRightBytes
+// Prevent accidentally using these for 128-bit vectors - should not be
+// necessary.
+#if HWY_TARGET != HWY_SVE2_128
namespace detail {
// For x86-compatible behaviour mandated by Highway API: TableLookupBytes
@@ -1847,16 +2376,21 @@ svbool_t FirstNPerBlock(D d) {
}
} // namespace detail
+#endif // HWY_TARGET != HWY_SVE2_128
template <size_t kBytes, class D, class V = VFromD<D>>
HWY_API V CombineShiftRightBytes(const D d, const V hi, const V lo) {
const Repartition<uint8_t, decltype(d)> d8;
const auto hi8 = BitCast(d8, hi);
const auto lo8 = BitCast(d8, lo);
+#if HWY_TARGET == HWY_SVE2_128
+ return BitCast(d, detail::Ext<kBytes>(hi8, lo8));
+#else
const auto hi_up = detail::Splice(hi8, hi8, FirstN(d8, 16 - kBytes));
const auto lo_down = detail::Ext<kBytes>(lo8, lo8);
const svbool_t is_lo = detail::FirstNPerBlock<16 - kBytes>(d8);
return BitCast(d, IfThenElse(is_lo, lo_down, hi_up));
+#endif
}
// ------------------------------ Shuffle2301
@@ -1916,6 +2450,16 @@ HWY_API V Shuffle0123(const V v) {
// ------------------------------ ReverseBlocks (Reverse, Shuffle01)
template <class D, class V = VFromD<D>>
HWY_API V ReverseBlocks(D d, V v) {
+#if HWY_TARGET == HWY_SVE_256
+ if (detail::IsFull(d)) {
+ return SwapAdjacentBlocks(v);
+ } else if (detail::IsFull(Twice<D>())) {
+ return v;
+ }
+#elif HWY_TARGET == HWY_SVE2_128
+ (void)d;
+ return v;
+#endif
const Repartition<uint64_t, D> du64;
return BitCast(d, Shuffle01(Reverse(du64, BitCast(du64, v))));
}
@@ -1926,9 +2470,13 @@ template <class V, class VI>
HWY_API VI TableLookupBytes(const V v, const VI idx) {
const DFromV<VI> d;
const Repartition<uint8_t, decltype(d)> du8;
+#if HWY_TARGET == HWY_SVE2_128
+ return BitCast(d, TableLookupLanes(BitCast(du8, v), BitCast(du8, idx)));
+#else
const auto offsets128 = detail::OffsetsOf128BitBlocks(du8, Iota(du8, 0));
const auto idx8 = Add(BitCast(du8, idx), offsets128);
return BitCast(d, TableLookupLanes(BitCast(du8, v), idx8));
+#endif
}
template <class V, class VI>
@@ -1945,17 +2493,35 @@ HWY_API VI TableLookupBytesOr0(const V v, const VI idx) {
}
// ------------------------------ Broadcast
+
+#if HWY_TARGET == HWY_SVE2_128
+namespace detail {
+#define HWY_SVE_BROADCAST(BASE, CHAR, BITS, HALF, NAME, OP) \
+ template <int kLane> \
+ HWY_INLINE HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
+ return sv##OP##_##CHAR##BITS(v, kLane); \
+ }
+
+HWY_SVE_FOREACH(HWY_SVE_BROADCAST, BroadcastLane, dup_lane)
+#undef HWY_SVE_BROADCAST
+} // namespace detail
+#endif
+
template <int kLane, class V>
HWY_API V Broadcast(const V v) {
const DFromV<V> d;
const RebindToUnsigned<decltype(d)> du;
constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
static_assert(0 <= kLane && kLane < kLanesPerBlock, "Invalid lane");
+#if HWY_TARGET == HWY_SVE2_128
+ return detail::BroadcastLane<kLane>(v);
+#else
auto idx = detail::OffsetsOf128BitBlocks(du, Iota(du, 0));
if (kLane != 0) {
idx = detail::AddN(idx, kLane);
}
return TableLookupLanes(v, idx);
+#endif
}
// ------------------------------ ShiftLeftLanes
@@ -1964,8 +2530,12 @@ template <size_t kLanes, class D, class V = VFromD<D>>
HWY_API V ShiftLeftLanes(D d, const V v) {
const auto zero = Zero(d);
const auto shifted = detail::Splice(v, zero, FirstN(d, kLanes));
+#if HWY_TARGET == HWY_SVE2_128
+ return shifted;
+#else
// Match x86 semantics by zeroing lower lanes in 128-bit blocks
return IfThenElse(detail::FirstNPerBlock<kLanes>(d), zero, shifted);
+#endif
}
template <size_t kLanes, class V>
@@ -1981,11 +2551,15 @@ HWY_API V ShiftRightLanes(D d, V v) {
v = IfThenElseZero(detail::MakeMask(d), v);
}
+#if HWY_TARGET == HWY_SVE2_128
+ return detail::Ext<kLanes>(Zero(d), v);
+#else
const auto shifted = detail::Ext<kLanes>(v, v);
// Match x86 semantics by zeroing upper lanes in 128-bit blocks
constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
const svbool_t mask = detail::FirstNPerBlock<kLanesPerBlock - kLanes>(d);
return IfThenElseZero(mask, shifted);
+#endif
}
// ------------------------------ ShiftLeftBytes
@@ -2008,53 +2582,6 @@ HWY_API V ShiftRightBytes(const D d, const V v) {
return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
}
-// ------------------------------ InterleaveLower
-
-template <class D, class V>
-HWY_API V InterleaveLower(D d, const V a, const V b) {
- static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
- // Move lower halves of blocks to lower half of vector.
- const Repartition<uint64_t, decltype(d)> d64;
- const auto a64 = BitCast(d64, a);
- const auto b64 = BitCast(d64, b);
- const auto a_blocks = detail::ConcatEven(a64, a64); // only lower half needed
- const auto b_blocks = detail::ConcatEven(b64, b64);
-
- return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
-}
-
-template <class V>
-HWY_API V InterleaveLower(const V a, const V b) {
- return InterleaveLower(DFromV<V>(), a, b);
-}
-
-// ------------------------------ InterleaveUpper
-
-// Full vector: guaranteed to have at least one block
-template <class D, class V = VFromD<D>,
- hwy::EnableIf<detail::IsFull(D())>* = nullptr>
-HWY_API V InterleaveUpper(D d, const V a, const V b) {
- // Move upper halves of blocks to lower half of vector.
- const Repartition<uint64_t, decltype(d)> d64;
- const auto a64 = BitCast(d64, a);
- const auto b64 = BitCast(d64, b);
- const auto a_blocks = detail::ConcatOdd(a64, a64); // only lower half needed
- const auto b_blocks = detail::ConcatOdd(b64, b64);
- return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
-}
-
-// Capped/fraction: need runtime check
-template <class D, class V = VFromD<D>,
- hwy::EnableIf<!detail::IsFull(D())>* = nullptr>
-HWY_API V InterleaveUpper(D d, const V a, const V b) {
- // Less than one block: treat as capped
- if (Lanes(d) * sizeof(TFromD<D>) < 16) {
- const Half<decltype(d)> d2;
- return InterleaveLower(d, UpperHalf(d2, a), UpperHalf(d2, b));
- }
- return InterleaveUpper(DFromV<V>(), a, b);
-}
-
// ------------------------------ ZipLower
template <class V, class DW = RepartitionToWide<DFromV<V>>>
@@ -2076,35 +2603,17 @@ HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
return BitCast(dw, InterleaveUpper(dn, a, b));
}
-// ================================================== REDUCE
-
-#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP) \
- template <size_t N, int kPow2> \
- HWY_API HWY_SVE_V(BASE, BITS) \
- NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, HWY_SVE_V(BASE, BITS) v) { \
- return Set(d, static_cast<HWY_SVE_T(BASE, BITS)>( \
- sv##OP##_##CHAR##BITS(detail::MakeMask(d), v))); \
- }
-
-HWY_SVE_FOREACH(HWY_SVE_REDUCE, SumOfLanes, addv)
-HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanes, minv)
-HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanes, maxv)
-// NaN if all are
-HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanes, minnmv)
-HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanes, maxnmv)
-
-#undef HWY_SVE_REDUCE
-
// ================================================== Ops with dependencies
// ------------------------------ PromoteTo bfloat16 (ZipLower)
template <size_t N, int kPow2>
HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> df32,
const svuint16_t v) {
- return BitCast(df32, detail::ZipLower(svdup_n_u16(0), v));
+ return BitCast(df32, detail::ZipLowerSame(svdup_n_u16(0), v));
}
// ------------------------------ ReorderDemote2To (OddEven)
+
template <size_t N, int kPow2>
HWY_API svuint16_t ReorderDemote2To(Simd<bfloat16_t, N, kPow2> dbf16,
svfloat32_t a, svfloat32_t b) {
@@ -2114,6 +2623,21 @@ HWY_API svuint16_t ReorderDemote2To(Simd<bfloat16_t, N, kPow2> dbf16,
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
+template <size_t N, int kPow2>
+HWY_API svint16_t ReorderDemote2To(Simd<int16_t, N, kPow2> d16, svint32_t a,
+ svint32_t b) {
+#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
+ (void)d16;
+ const svint16_t a_in_even = svqxtnb_s32(a);
+ return svqxtnt_s32(a_in_even, b);
+#else
+ const Half<decltype(d16)> dh;
+ const svint16_t a16 = BitCast(dh, detail::SaturateI<int16_t>(a));
+ const svint16_t b16 = BitCast(dh, detail::SaturateI<int16_t>(b));
+ return detail::InterleaveEven(a16, b16);
+#endif
+}
+
// ------------------------------ ZeroIfNegative (Lt, IfThenElse)
template <class V>
HWY_API V ZeroIfNegative(const V v) {
@@ -2210,7 +2734,7 @@ HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
// Max 2048 bits = 32 lanes = 32 input bits; replicate those into each lane.
// The "at least 8 byte" guarantee in quick_reference ensures this is safe.
uint32_t mask_bits;
- CopyBytes<4>(bits, &mask_bits);
+ CopyBytes<4>(bits, &mask_bits); // copy from bytes
const auto vbits = Set(du, mask_bits);
// 2 ^ {0,1, .., 31}, will not have more lanes than that.
@@ -2232,7 +2756,7 @@ template <class T, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
const ScalableTag<uint8_t> d8;
const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1));
- return detail::ConcatEven(b16, b16); // only lower half needed
+ return detail::ConcatEvenFull(b16, b16); // lower half
}
template <class T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
@@ -2242,7 +2766,7 @@ template <class T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
const ScalableTag<uint32_t> d32;
const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1));
- return U8FromU32(detail::ConcatEven(b64, b64)); // only lower half needed
+ return U8FromU32(detail::ConcatEvenFull(b64, b64)); // lower half
}
// Compacts groups of 8 u8 into 8 contiguous bits in a 64-bit lane.
@@ -2261,6 +2785,7 @@ HWY_INLINE svuint64_t BitsFromBool(svuint8_t x) {
} // namespace detail
// `p` points to at least 8 writable bytes.
+// TODO(janwas): specialize for HWY_SVE_256
template <class D>
HWY_API size_t StoreMaskBits(D d, svbool_t m, uint8_t* bits) {
svuint64_t bits_in_u64 =
@@ -2275,7 +2800,7 @@ HWY_API size_t StoreMaskBits(D d, svbool_t m, uint8_t* bits) {
// Non-full byte, need to clear the undefined upper bits. Can happen for
// capped/fractional vectors or large T and small hardware vectors.
if (num_bits < 8) {
- const int mask = (1 << num_bits) - 1;
+ const int mask = static_cast<int>((1ull << num_bits) - 1);
bits[0] = static_cast<uint8_t>(bits[0] & mask);
}
// Else: we wrote full bytes because num_bits is a power of two >= 8.
@@ -2306,7 +2831,7 @@ namespace detail {
return sv##OP##_##CHAR##BITS(a, b); \
}
-HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEven, mullb)
+HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEvenNative, mullb)
#undef HWY_SVE_MUL_EVEN
} // namespace detail
#endif
@@ -2314,27 +2839,28 @@ HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEven, mullb)
template <class V, class DW = RepartitionToWide<DFromV<V>>>
HWY_API VFromD<DW> MulEven(const V a, const V b) {
#if HWY_TARGET == HWY_SVE2
- return BitCast(DW(), detail::MulEven(a, b));
+ return BitCast(DW(), detail::MulEvenNative(a, b));
#else
const auto lo = Mul(a, b);
- const auto hi = detail::MulHigh(a, b);
+ const auto hi = MulHigh(a, b);
return BitCast(DW(), detail::InterleaveEven(lo, hi));
#endif
}
HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b) {
const auto lo = Mul(a, b);
- const auto hi = detail::MulHigh(a, b);
+ const auto hi = MulHigh(a, b);
return detail::InterleaveEven(lo, hi);
}
HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
const auto lo = Mul(a, b);
- const auto hi = detail::MulHigh(a, b);
+ const auto hi = MulHigh(a, b);
return detail::InterleaveOdd(lo, hi);
}
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
+
template <size_t N, int kPow2>
HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
svuint16_t a, svuint16_t b,
@@ -2352,9 +2878,38 @@ HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
+template <size_t N, int kPow2>
+HWY_API svint32_t ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32,
+ svint16_t a, svint16_t b,
+ const svint32_t sum0,
+ svint32_t& sum1) {
+#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
+ (void)d32;
+ sum1 = svmlalt_s32(sum1, a, b);
+ return svmlalb_s32(sum0, a, b);
+#else
+ const svbool_t pg = detail::PTrue(d32);
+ const svint32_t a0 = svunpklo_s32(a);
+ const svint32_t b0 = svunpklo_s32(b);
+ svint32_t a1, b1;
+ if (detail::IsFull(d32)) {
+ a1 = svunpkhi_s32(a);
+ b1 = svunpkhi_s32(b);
+ } else {
+ const Rebind<int16_t, decltype(d32)> d16h;
+ a1 = svunpklo_s32(UpperHalf(d16h, a));
+ b1 = svunpklo_s32(UpperHalf(d16h, b));
+ }
+ sum1 = svmla_s32_x(pg, sum1, a1, b1);
+ return svmla_s32_x(pg, sum0, a0, b0);
+#endif
+}
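
A scalar sketch of the contract (illustrative; the SVE2 path splits even/odd elements while the generic path splits lower/upper halves, which is why only sum0 + sum1 is meaningful to callers):

#include <cstddef>
#include <cstdint>

// One possible element-to-accumulator assignment: even-indexed products into
// sum0, odd-indexed into sum1. Callers later reduce sum0 + sum1.
inline void ReorderWidenMulAccumulateScalar(const int16_t* a, const int16_t* b,
                                            size_t n, int32_t& sum0,
                                            int32_t& sum1) {
  for (size_t i = 0; i < n; ++i) {
    const int32_t prod = static_cast<int32_t>(a[i]) * b[i];
    if (i & 1) { sum1 += prod; } else { sum0 += prod; }
  }
}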
+
// ------------------------------ AESRound / CLMul
-#if defined(__ARM_FEATURE_SVE2_AES)
+#if defined(__ARM_FEATURE_SVE2_AES) || \
+ ((HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128) && \
+ HWY_HAVE_RUNTIME_DISPATCH)
// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
#ifdef HWY_NATIVE_AES
@@ -2384,48 +2939,177 @@ HWY_API svuint64_t CLMulUpper(const svuint64_t a, const svuint64_t b) {
#endif // __ARM_FEATURE_SVE2_AES
// ------------------------------ Lt128
+
+namespace detail {
+#define HWY_SVE_DUP(BASE, CHAR, BITS, HALF, NAME, OP) \
+ template <size_t N, int kPow2> \
+ HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/, svbool_t m) { \
+ return sv##OP##_b##BITS(m, m); \
+ }
+
+HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupEvenB, trn1) // actually for bool
+HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupOddB, trn2) // actually for bool
+#undef HWY_SVE_DUP
+
+#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
template <class D>
-HWY_INLINE svbool_t Lt128(D /* d */, const svuint64_t a, const svuint64_t b) {
- static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
- // Truth table of Eq and Compare for Hi and Lo u64.
- // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
- // =H =L cH cL | out = cH | (=H & cL) = IfThenElse(=H, cL, cH)
- // 0 0 0 0 | 0
- // 0 0 0 1 | 0
- // 0 0 1 0 | 1
- // 0 0 1 1 | 1
- // 0 1 0 0 | 0
- // 0 1 0 1 | 0
- // 0 1 1 0 | 1
- // 1 0 0 0 | 0
- // 1 0 0 1 | 1
- // 1 1 0 0 | 0
- const svbool_t eqHL = Eq(a, b);
+HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b) {
+ static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
+ "D must be u64");
+ const svbool_t eqHx = Eq(a, b); // only odd lanes used
+ // Convert to vector: more pipelines can execute vector TRN* instructions
+ // than the predicate version.
+ const svuint64_t ltHL = VecFromMask(d, Lt(a, b));
+ // Move into upper lane: ltL if the upper half is equal, otherwise ltH.
+ // Requires an extra IfThenElse because INSR, EXT, TRN2 are unpredicated.
+ const svuint64_t ltHx = IfThenElse(eqHx, DupEven(ltHL), ltHL);
+ // Duplicate upper lane into lower.
+ return DupOdd(ltHx);
+}
+#endif
+} // namespace detail
+
+template <class D>
+HWY_INLINE svbool_t Lt128(D d, const svuint64_t a, const svuint64_t b) {
+#if HWY_TARGET == HWY_SVE_256
+ return MaskFromVec(detail::Lt128Vec(d, a, b));
+#else
+ static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
+ "D must be u64");
+ const svbool_t eqHx = Eq(a, b); // only odd lanes used
+ const svbool_t ltHL = Lt(a, b);
+ // Move into upper lane: ltL if the upper half is equal, otherwise ltH.
+ const svbool_t ltHx = svsel_b(eqHx, detail::DupEvenB(d, ltHL), ltHL);
+ // Duplicate upper lane into lower.
+ return detail::DupOddB(d, ltHx);
+#endif // HWY_TARGET != HWY_SVE_256
+}
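
Both paths compute the same 128-bit comparison and broadcast it to both lanes of each pair. Scalar sketch (illustrative): the high halves decide unless they are equal, in which case the low halves do.

#include <cstdint>

// a and b are 128-bit values stored as (high, low) pairs of u64.
inline bool Lt128Scalar(uint64_t a_hi, uint64_t a_lo, uint64_t b_hi,
                        uint64_t b_lo) {
  return (a_hi == b_hi) ? (a_lo < b_lo) : (a_hi < b_hi);
}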
+
+// ------------------------------ Lt128Upper
+
+template <class D>
+HWY_INLINE svbool_t Lt128Upper(D d, svuint64_t a, svuint64_t b) {
+ static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
+ "D must be u64");
const svbool_t ltHL = Lt(a, b);
- // trn (interleave even/odd) allow us to move and copy masks across lanes.
- const svbool_t cmpLL = svtrn1_b64(ltHL, ltHL);
- const svbool_t outHx = svsel_b(eqHL, cmpLL, ltHL); // See truth table above.
- return svtrn2_b64(outHx, outHx); // replicate to HH
+ return detail::DupOddB(d, ltHL);
+}
+
+// ------------------------------ Eq128, Ne128
+
+#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
+namespace detail {
+
+template <class D>
+HWY_INLINE svuint64_t Eq128Vec(D d, const svuint64_t a, const svuint64_t b) {
+ static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
+ "D must be u64");
+ // Convert to vector: more pipelines can execute vector TRN* instructions
+ // than the predicate version.
+ const svuint64_t eqHL = VecFromMask(d, Eq(a, b));
+ // Duplicate upper and lower.
+ const svuint64_t eqHH = DupOdd(eqHL);
+ const svuint64_t eqLL = DupEven(eqHL);
+ return And(eqLL, eqHH);
+}
+
+template <class D>
+HWY_INLINE svuint64_t Ne128Vec(D d, const svuint64_t a, const svuint64_t b) {
+ static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
+ "D must be u64");
+ // Convert to vector: more pipelines can execute vector TRN* instructions
+ // than the predicate version.
+ const svuint64_t neHL = VecFromMask(d, Ne(a, b));
+ // Duplicate upper and lower.
+ const svuint64_t neHH = DupOdd(neHL);
+ const svuint64_t neLL = DupEven(neHL);
+ return Or(neLL, neHH);
+}
+
+} // namespace detail
+#endif
+
+template <class D>
+HWY_INLINE svbool_t Eq128(D d, const svuint64_t a, const svuint64_t b) {
+#if HWY_TARGET == HWY_SVE_256
+ return MaskFromVec(detail::Eq128Vec(d, a, b));
+#else
+ static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
+ "D must be u64");
+ const svbool_t eqHL = Eq(a, b);
+ const svbool_t eqHH = detail::DupOddB(d, eqHL);
+ const svbool_t eqLL = detail::DupEvenB(d, eqHL);
+ return And(eqLL, eqHH);
+#endif // HWY_TARGET != HWY_SVE_256
+}
+
+template <class D>
+HWY_INLINE svbool_t Ne128(D d, const svuint64_t a, const svuint64_t b) {
+#if HWY_TARGET == HWY_SVE_256
+ return MaskFromVec(detail::Ne128Vec(d, a, b));
+#else
+ static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
+ "D must be u64");
+ const svbool_t neHL = Ne(a, b);
+ const svbool_t neHH = detail::DupOddB(d, neHL);
+ const svbool_t neLL = detail::DupEvenB(d, neHL);
+ return Or(neLL, neHH);
+#endif // HWY_TARGET != HWY_SVE_256
+}
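
Scalar sketch of Eq128/Ne128 (illustrative): the duplicated per-half comparisons are simply ANDed or ORed.

#include <cstdint>

inline bool Eq128Scalar(uint64_t a_hi, uint64_t a_lo, uint64_t b_hi,
                        uint64_t b_lo) {
  return (a_hi == b_hi) && (a_lo == b_lo);
}
inline bool Ne128Scalar(uint64_t a_hi, uint64_t a_lo, uint64_t b_hi,
                        uint64_t b_lo) {
  return (a_hi != b_hi) || (a_lo != b_lo);
}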
+
+// ------------------------------ Eq128Upper, Ne128Upper
+
+template <class D>
+HWY_INLINE svbool_t Eq128Upper(D d, svuint64_t a, svuint64_t b) {
+ static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
+ "D must be u64");
+ const svbool_t eqHL = Eq(a, b);
+ return detail::DupOddB(d, eqHL);
+}
+
+template <class D>
+HWY_INLINE svbool_t Ne128Upper(D d, svuint64_t a, svuint64_t b) {
+ static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
+ "D must be u64");
+ const svbool_t neHL = Ne(a, b);
+ return detail::DupOddB(d, neHL);
}
// ------------------------------ Min128, Max128 (Lt128)
template <class D>
HWY_INLINE svuint64_t Min128(D d, const svuint64_t a, const svuint64_t b) {
+#if HWY_TARGET == HWY_SVE_256
+ return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
+#else
return IfThenElse(Lt128(d, a, b), a, b);
+#endif
}
template <class D>
HWY_INLINE svuint64_t Max128(D d, const svuint64_t a, const svuint64_t b) {
- return IfThenElse(Lt128(d, a, b), b, a);
+#if HWY_TARGET == HWY_SVE_256
+ return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
+#else
+ return IfThenElse(Lt128(d, b, a), a, b);
+#endif
+}
+
+template <class D>
+HWY_INLINE svuint64_t Min128Upper(D d, const svuint64_t a, const svuint64_t b) {
+ return IfThenElse(Lt128Upper(d, a, b), a, b);
+}
+
+template <class D>
+HWY_INLINE svuint64_t Max128Upper(D d, const svuint64_t a, const svuint64_t b) {
+ return IfThenElse(Lt128Upper(d, b, a), a, b);
}
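
Scalar sketch of the selection (illustrative): whole (hi, lo) pairs are chosen by the Lt128 verdict; on equal inputs both Min128 and Max128 return b.

#include <cstdint>

struct U128 { uint64_t hi, lo; };

inline bool LessU128(U128 a, U128 b) {
  return (a.hi == b.hi) ? (a.lo < b.lo) : (a.hi < b.hi);
}
inline U128 Min128Scalar(U128 a, U128 b) { return LessU128(a, b) ? a : b; }
inline U128 Max128Scalar(U128 a, U128 b) { return LessU128(b, a) ? a : b; }
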
// ================================================== END MACROS
namespace detail { // for code folding
#undef HWY_IF_FLOAT_V
#undef HWY_IF_LANE_SIZE_V
-#undef HWY_IF_SIGNED_V
-#undef HWY_IF_UNSIGNED_V
+#undef HWY_SVE_ALL_PTRUE
#undef HWY_SVE_D
#undef HWY_SVE_FOREACH
#undef HWY_SVE_FOREACH_F
diff --git a/media/highway/src/hwy/ops/emu128-inl.h b/media/highway/src/hwy/ops/emu128-inl.h
index 0e590066c7..5063a6d959 100644
--- a/media/highway/src/hwy/ops/emu128-inl.h
+++ b/media/highway/src/hwy/ops/emu128-inl.h
@@ -101,9 +101,7 @@ using TFromV = TFromD<DFromV<V>>;
template <typename T, size_t N, typename FromT, size_t FromN>
HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> /* tag */, Vec128<FromT, FromN> v) {
Vec128<T, N> to;
- static_assert(sizeof(T) * N == sizeof(FromT) * FromN,
- "Casting does not change size");
- CopyBytes<sizeof(T) * N>(v.raw, to.raw);
+ CopySameSize(&v, &to);
return to;
}
@@ -135,13 +133,13 @@ HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> d) {
namespace detail {
-template <typename T, HWY_IF_FLOAT(T)>
-HWY_INLINE constexpr T IncrementWithWraparound(T t) {
+template <typename T>
+HWY_INLINE constexpr T IncrementWithWraparound(hwy::FloatTag /*tag*/, T t) {
return t + T{1};
}
-template <typename T, HWY_IF_NOT_FLOAT(T)>
-HWY_INLINE constexpr T IncrementWithWraparound(T t) {
+template <typename T>
+HWY_INLINE constexpr T IncrementWithWraparound(hwy::NonFloatTag /*tag*/, T t) {
using TU = MakeUnsigned<T>;
return static_cast<T>(static_cast<TU>(static_cast<TU>(t) + TU{1}) &
hwy::LimitsMax<TU>());
@@ -155,7 +153,7 @@ HWY_API Vec128<T, N> Iota(const Simd<T, N, 0> /* tag */, T2 first) {
T counter = static_cast<T>(first);
for (size_t i = 0; i < N; ++i) {
v.raw[i] = counter;
- counter = detail::IncrementWithWraparound(counter);
+ counter = detail::IncrementWithWraparound(hwy::IsFloatTag<T>(), counter);
}
return v;
}
@@ -285,8 +283,7 @@ template <typename TFrom, typename TTo, size_t N>
HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
Mask128<TFrom, N> mask) {
Mask128<TTo, N> to;
- static_assert(sizeof(TTo) * N == sizeof(TFrom) * N, "Must have same size");
- CopyBytes<sizeof(TTo) * N>(mask.bits, to.bits);
+ CopySameSize(&mask, &to);
return to;
}
@@ -294,15 +291,14 @@ HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
Mask128<T, N> mask;
- static_assert(sizeof(v) == sizeof(mask), "Must have same size");
- CopyBytes<sizeof(T) * N>(v.raw, mask.bits);
+ CopySameSize(&v, &mask);
return mask;
}
template <typename T, size_t N>
Vec128<T, N> VecFromMask(const Mask128<T, N> mask) {
Vec128<T, N> v;
- CopyBytes<sizeof(T) * N>(mask.bits, v.raw);
+ CopySameSize(&mask, &v);
return v;
}
@@ -384,6 +380,12 @@ HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
+template <typename T, size_t N>
+HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
+ const Simd<T, N, 0> d;
+ return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
+}
+
// ================================================== SHIFTS
// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)
@@ -544,8 +546,12 @@ HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
// ================================================== ARITHMETIC
-template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
-HWY_API Vec128<T, N> operator+(Vec128<T, N> a, Vec128<T, N> b) {
+// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
+namespace detail {
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Add(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
+ Vec128<T, N> b) {
for (size_t i = 0; i < N; ++i) {
const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
@@ -553,31 +559,46 @@ HWY_API Vec128<T, N> operator+(Vec128<T, N> a, Vec128<T, N> b) {
}
return a;
}
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> operator+(Vec128<T, N> a, const Vec128<T, N> b) {
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Sub(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
+ Vec128<T, N> b) {
for (size_t i = 0; i < N; ++i) {
- a.raw[i] += b.raw[i];
+ const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
+ const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
+ a.raw[i] = static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0)));
}
return a;
}
-template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
-HWY_API Vec128<T, N> operator-(Vec128<T, N> a, Vec128<T, N> b) {
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Add(hwy::FloatTag /*tag*/, Vec128<T, N> a,
+ const Vec128<T, N> b) {
for (size_t i = 0; i < N; ++i) {
- const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
- const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
- a.raw[i] = static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0)));
+ a.raw[i] += b.raw[i];
}
return a;
}
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> operator-(Vec128<T, N> a, const Vec128<T, N> b) {
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Sub(hwy::FloatTag /*tag*/, Vec128<T, N> a,
+ const Vec128<T, N> b) {
for (size_t i = 0; i < N; ++i) {
a.raw[i] -= b.raw[i];
}
return a;
}
+} // namespace detail
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> operator-(Vec128<T, N> a, const Vec128<T, N> b) {
+ return detail::Sub(hwy::IsFloatTag<T>(), a, b);
+}
+template <typename T, size_t N>
+HWY_API Vec128<T, N> operator+(Vec128<T, N> a, const Vec128<T, N> b) {
+ return detail::Add(hwy::IsFloatTag<T>(), a, b);
+}
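
A stripped-down sketch of the tag-dispatch pattern used here (FloatTag/NonFloatTag/IsFloatTagFor below are stand-ins, not the hwy definitions), shown because it replaces the previous SFINAE overloads for MSVC 2017 compatibility:

#include <type_traits>

struct FloatTag {};
struct NonFloatTag {};
template <typename T>
using IsFloatTagFor =
    typename std::conditional<std::is_floating_point<T>::value, FloatTag,
                              NonFloatTag>::type;

// Integer path: wrap around via unsigned arithmetic (well-defined overflow).
template <typename T>
T AddImpl(NonFloatTag, T a, T b) {
  using TU = typename std::make_unsigned<T>::type;
  return static_cast<T>(static_cast<TU>(a) + static_cast<TU>(b));
}
// Float path: ordinary addition.
template <typename T>
T AddImpl(FloatTag, T a, T b) { return a + b; }

template <typename T>
T Add(T a, T b) { return AddImpl(IsFloatTagFor<T>{}, a, b); }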
+
// ------------------------------ SumsOf8
template <size_t N>
@@ -612,8 +633,9 @@ HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, const Vec128<T, N> b) {
}
// ------------------------------ AverageRound
-template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
+template <typename T, size_t N>
HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, const Vec128<T, N> b) {
+ static_assert(!IsSigned<T>(), "Only for unsigned");
for (size_t i = 0; i < N; ++i) {
a.raw[i] = static_cast<T>((a.raw[i] + b.raw[i] + 1) / 2);
}
@@ -622,8 +644,11 @@ HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, const Vec128<T, N> b) {
// ------------------------------ Abs
-template <typename T, size_t N, HWY_IF_SIGNED(T)>
-HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
+// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
+namespace detail {
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Abs(SignedTag /*tag*/, Vec128<T, N> a) {
for (size_t i = 0; i < N; ++i) {
const T s = a.raw[i];
const T min = hwy::LimitsMin<T>();
@@ -631,26 +656,47 @@ HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
}
return a;
}
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> Abs(Vec128<T, N> v) {
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Abs(hwy::FloatTag /*tag*/, Vec128<T, N> v) {
for (size_t i = 0; i < N; ++i) {
v.raw[i] = std::abs(v.raw[i]);
}
return v;
}
+} // namespace detail
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
+ return detail::Abs(hwy::TypeTag<T>(), a);
+}
+
// ------------------------------ Min/Max
-template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
-HWY_API Vec128<T, N> Min(Vec128<T, N> a, const Vec128<T, N> b) {
+// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
+namespace detail {
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Min(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
+ const Vec128<T, N> b) {
for (size_t i = 0; i < N; ++i) {
a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
}
return a;
}
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Max(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
+ const Vec128<T, N> b) {
+ for (size_t i = 0; i < N; ++i) {
+ a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
+ }
+ return a;
+}
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> Min(Vec128<T, N> a, const Vec128<T, N> b) {
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Min(hwy::FloatTag /*tag*/, Vec128<T, N> a,
+ const Vec128<T, N> b) {
for (size_t i = 0; i < N; ++i) {
if (std::isnan(a.raw[i])) {
a.raw[i] = b.raw[i];
@@ -662,17 +708,9 @@ HWY_API Vec128<T, N> Min(Vec128<T, N> a, const Vec128<T, N> b) {
}
return a;
}
-
-template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
-HWY_API Vec128<T, N> Max(Vec128<T, N> a, const Vec128<T, N> b) {
- for (size_t i = 0; i < N; ++i) {
- a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
- }
- return a;
-}
-
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> Max(Vec128<T, N> a, const Vec128<T, N> b) {
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Max(hwy::FloatTag /*tag*/, Vec128<T, N> a,
+ const Vec128<T, N> b) {
for (size_t i = 0; i < N; ++i) {
if (std::isnan(a.raw[i])) {
a.raw[i] = b.raw[i];
@@ -685,44 +723,79 @@ HWY_API Vec128<T, N> Max(Vec128<T, N> a, const Vec128<T, N> b) {
return a;
}
+} // namespace detail
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Min(Vec128<T, N> a, const Vec128<T, N> b) {
+ return detail::Min(hwy::IsFloatTag<T>(), a, b);
+}
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Max(Vec128<T, N> a, const Vec128<T, N> b) {
+ return detail::Max(hwy::IsFloatTag<T>(), a, b);
+}
+
// ------------------------------ Neg
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
+// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
+namespace detail {
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Neg(hwy::NonFloatTag /*tag*/, Vec128<T, N> v) {
+ return Zero(Simd<T, N, 0>()) - v;
+}
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Neg(hwy::FloatTag /*tag*/, Vec128<T, N> v) {
return Xor(v, SignBit(Simd<T, N, 0>()));
}
-template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
+} // namespace detail
+
+template <typename T, size_t N>
HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
- return Zero(Simd<T, N, 0>()) - v;
+ return detail::Neg(hwy::IsFloatTag<T>(), v);
}
// ------------------------------ Mul/Div
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> operator*(Vec128<T, N> a, const Vec128<T, N> b) {
+// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
+namespace detail {
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Mul(hwy::FloatTag /*tag*/, Vec128<T, N> a,
+ const Vec128<T, N> b) {
for (size_t i = 0; i < N; ++i) {
a.raw[i] *= b.raw[i];
}
return a;
}
-template <typename T, size_t N, HWY_IF_SIGNED(T)>
-HWY_API Vec128<T, N> operator*(Vec128<T, N> a, const Vec128<T, N> b) {
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Mul(SignedTag /*tag*/, Vec128<T, N> a,
+ const Vec128<T, N> b) {
for (size_t i = 0; i < N; ++i) {
- a.raw[i] = static_cast<T>(int64_t(a.raw[i]) * b.raw[i]);
+ a.raw[i] = static_cast<T>(static_cast<int64_t>(a.raw[i]) * b.raw[i]);
}
return a;
}
-template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
-HWY_API Vec128<T, N> operator*(Vec128<T, N> a, const Vec128<T, N> b) {
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Mul(UnsignedTag /*tag*/, Vec128<T, N> a,
+ const Vec128<T, N> b) {
for (size_t i = 0; i < N; ++i) {
- a.raw[i] = static_cast<T>(uint64_t(a.raw[i]) * b.raw[i]);
+ a.raw[i] = static_cast<T>(static_cast<uint64_t>(a.raw[i]) * b.raw[i]);
}
return a;
}
+} // namespace detail
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> operator*(Vec128<T, N> a, const Vec128<T, N> b) {
+ return detail::Mul(hwy::TypeTag<T>(), a, b);
+}
+
template <typename T, size_t N>
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, const Vec128<T, N> b) {
for (size_t i = 0; i < N; ++i) {
@@ -736,7 +809,7 @@ template <size_t N>
HWY_API Vec128<int16_t, N> MulHigh(Vec128<int16_t, N> a,
const Vec128<int16_t, N> b) {
for (size_t i = 0; i < N; ++i) {
- a.raw[i] = static_cast<int16_t>((a.raw[i] * b.raw[i]) >> 16);
+ a.raw[i] = static_cast<int16_t>((int32_t{a.raw[i]} * b.raw[i]) >> 16);
}
return a;
}
@@ -855,10 +928,10 @@ HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
for (size_t i = 0; i < N; ++i) {
const float half = v.raw[i] * 0.5f;
uint32_t bits;
- CopyBytes<4>(&v.raw[i], &bits);
+ CopySameSize(&v.raw[i], &bits);
// Initial guess based on log2(f)
bits = 0x5F3759DF - (bits >> 1);
- CopyBytes<4>(&bits, &v.raw[i]);
+ CopySameSize(&bits, &v.raw[i]);
// One Newton-Raphson iteration
v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i]));
}
@@ -968,7 +1041,7 @@ Vec128<Float, N> Ceil(Vec128<Float, N> v) {
const bool positive = v.raw[i] > Float(0.0);
Bits bits;
- CopyBytes<sizeof(Bits)>(&v.raw[i], &bits);
+ CopySameSize(&v.raw[i], &bits);
const int exponent =
static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
@@ -988,7 +1061,7 @@ Vec128<Float, N> Ceil(Vec128<Float, N> v) {
if (positive) bits += (kMantissaMask + 1) >> exponent;
bits &= ~mantissa_mask;
- CopyBytes<sizeof(Bits)>(&bits, &v.raw[i]);
+ CopySameSize(&bits, &v.raw[i]);
}
return v;
}
@@ -1006,7 +1079,7 @@ Vec128<Float, N> Floor(Vec128<Float, N> v) {
const bool negative = v.raw[i] < Float(0.0);
Bits bits;
- CopyBytes<sizeof(Bits)>(&v.raw[i], &bits);
+ CopySameSize(&v.raw[i], &bits);
const int exponent =
static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
@@ -1026,7 +1099,7 @@ Vec128<Float, N> Floor(Vec128<Float, N> v) {
if (negative) bits += (kMantissaMask + 1) >> exponent;
bits &= ~mantissa_mask;
- CopyBytes<sizeof(Bits)>(&bits, &v.raw[i]);
+ CopySameSize(&bits, &v.raw[i]);
}
return v;
}
@@ -1039,7 +1112,7 @@ HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
for (size_t i = 0; i < N; ++i) {
// std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
MakeUnsigned<T> bits;
- memcpy(&bits, &v.raw[i], sizeof(T));
+ CopySameSize(&v.raw[i], &bits);
bits += bits;
bits >>= 1; // clear sign bit
// NaN if all exponent bits are set and the mantissa is not zero.
@@ -1048,8 +1121,9 @@ HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
return ret;
}
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
+template <typename T, size_t N>
HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
+ static_assert(IsFloat<T>(), "Only for float");
const Simd<T, N, 0> d;
const RebindToSigned<decltype(d)> di;
const VFromD<decltype(di)> vi = BitCast(di, v);
@@ -1058,8 +1132,9 @@ HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
}
// Returns whether normal/subnormal/zero.
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
+template <typename T, size_t N>
HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
+ static_assert(IsFloat<T>(), "Only for float");
const Simd<T, N, 0> d;
const RebindToUnsigned<decltype(d)> du;
const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
@@ -1146,6 +1221,52 @@ HWY_API Mask128<uint64_t> Lt128(Simd<uint64_t, 2, 0> /* tag */,
return ret;
}
+HWY_API Mask128<uint64_t> Lt128Upper(Simd<uint64_t, 2, 0> /* tag */,
+ Vec128<uint64_t> a,
+ const Vec128<uint64_t> b) {
+ const bool lt = a.raw[1] < b.raw[1];
+ Mask128<uint64_t> ret;
+ ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt);
+ return ret;
+}
+
+// ------------------------------ Eq128
+
+// Only makes sense for full vectors of u64.
+HWY_API Mask128<uint64_t> Eq128(Simd<uint64_t, 2, 0> /* tag */,
+ Vec128<uint64_t> a, const Vec128<uint64_t> b) {
+ const bool eq = a.raw[1] == b.raw[1] && a.raw[0] == b.raw[0];
+ Mask128<uint64_t> ret;
+ ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(eq);
+ return ret;
+}
+
+HWY_API Mask128<uint64_t> Ne128(Simd<uint64_t, 2, 0> /* tag */,
+ Vec128<uint64_t> a, const Vec128<uint64_t> b) {
+ const bool ne = a.raw[1] != b.raw[1] || a.raw[0] != b.raw[0];
+ Mask128<uint64_t> ret;
+ ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne);
+ return ret;
+}
+
+HWY_API Mask128<uint64_t> Eq128Upper(Simd<uint64_t, 2, 0> /* tag */,
+ Vec128<uint64_t> a,
+ const Vec128<uint64_t> b) {
+ const bool eq = a.raw[1] == b.raw[1];
+ Mask128<uint64_t> ret;
+ ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(eq);
+ return ret;
+}
+
+HWY_API Mask128<uint64_t> Ne128Upper(Simd<uint64_t, 2, 0> /* tag */,
+ Vec128<uint64_t> a,
+ const Vec128<uint64_t> b) {
+ const bool ne = a.raw[1] != b.raw[1];
+ Mask128<uint64_t> ret;
+ ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne);
+ return ret;
+}
+
// ------------------------------ Min128, Max128 (Lt128)
template <class D, class V = VFromD<D>>
@@ -1155,7 +1276,17 @@ HWY_API V Min128(D d, const V a, const V b) {
template <class D, class V = VFromD<D>>
HWY_API V Max128(D d, const V a, const V b) {
- return IfThenElse(Lt128(d, a, b), b, a);
+ return IfThenElse(Lt128(d, b, a), a, b);
+}
+
+template <class D, class V = VFromD<D>>
+HWY_API V Min128Upper(D d, const V a, const V b) {
+ return IfThenElse(Lt128Upper(d, a, b), a, b);
+}
+
+template <class D, class V = VFromD<D>>
+HWY_API V Max128Upper(D d, const V a, const V b) {
+ return IfThenElse(Lt128Upper(d, b, a), a, b);
}
// ================================================== MEMORY
@@ -1166,7 +1297,7 @@ template <typename T, size_t N>
HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */,
const T* HWY_RESTRICT aligned) {
Vec128<T, N> v;
- CopyBytes<sizeof(T) * N>(aligned, v.raw);
+ CopyBytes<sizeof(T) * N>(aligned, v.raw); // copy from array
return v;
}
@@ -1193,7 +1324,7 @@ HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d,
template <typename T, size_t N>
HWY_API void Store(const Vec128<T, N> v, Simd<T, N, 0> /* tag */,
T* HWY_RESTRICT aligned) {
- CopyBytes<sizeof(T) * N>(v.raw, aligned);
+ CopyBytes<sizeof(T) * N>(v.raw, aligned); // copy to array
}
template <typename T, size_t N>
@@ -1322,7 +1453,7 @@ HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* base,
static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
for (size_t i = 0; i < N; ++i) {
uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw[i];
- CopyBytes<sizeof(T)>(&v.raw[i], base8);
+ CopyBytes<sizeof(T)>(&v.raw[i], base8); // copy to bytes
}
}
@@ -1345,7 +1476,7 @@ HWY_API Vec128<T, N> GatherOffset(Simd<T, N, 0> /* tag */, const T* base,
for (size_t i = 0; i < N; ++i) {
const uint8_t* base8 =
reinterpret_cast<const uint8_t*>(base) + offset.raw[i];
- CopyBytes<sizeof(T)>(base8, &v.raw[i]);
+ CopyBytes<sizeof(T)>(base8, &v.raw[i]); // copy from bytes
}
return v;
}
@@ -1433,31 +1564,40 @@ HWY_API Vec128<ToT, N> DemoteTo(Simd<ToT, N, 0> /* tag */,
template <size_t N>
HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
- const RebindToUnsigned<decltype(dbf16)> du16;
const Repartition<uint32_t, decltype(dbf16)> du32;
- const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
- return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
+ const Vec128<uint32_t, N> b_in_lower = ShiftRight<16>(BitCast(du32, b));
+ // Avoid OddEven - we want the upper half of `a` even on big-endian systems.
+ const Vec128<uint32_t, N> a_mask = Set(du32, 0xFFFF0000);
+ return BitCast(dbf16, IfVecThenElse(a_mask, BitCast(du32, a), b_in_lower));
+}
+
+template <size_t N>
+HWY_API Vec128<int16_t, 2 * N> ReorderDemote2To(Simd<int16_t, 2 * N, 0> /*d16*/,
+ Vec128<int32_t, N> a,
+ Vec128<int32_t, N> b) {
+ const int16_t min = LimitsMin<int16_t>();
+ const int16_t max = LimitsMax<int16_t>();
+ Vec128<int16_t, 2 * N> ret;
+ for (size_t i = 0; i < N; ++i) {
+ ret.raw[i] = static_cast<int16_t>(HWY_MIN(HWY_MAX(min, a.raw[i]), max));
+ }
+ for (size_t i = 0; i < N; ++i) {
+ ret.raw[N + i] = static_cast<int16_t>(HWY_MIN(HWY_MAX(min, b.raw[i]), max));
+ }
+ return ret;
}
namespace detail {
HWY_INLINE void StoreU16ToF16(const uint16_t val,
hwy::float16_t* HWY_RESTRICT to) {
-#if HWY_NATIVE_FLOAT16
- CopyBytes<2>(&val, to);
-#else
- to->bits = val;
-#endif
+ CopySameSize(&val, to);
}
HWY_INLINE uint16_t U16FromF16(const hwy::float16_t* HWY_RESTRICT from) {
-#if HWY_NATIVE_FLOAT16
uint16_t bits16;
- CopyBytes<2>(from, &bits16);
+ CopySameSize(from, &bits16);
return bits16;
-#else
- return from->bits;
-#endif
}
} // namespace detail
@@ -1485,7 +1625,7 @@ HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> /* tag */,
const uint32_t biased_exp32 = biased_exp + (127 - 15);
const uint32_t mantissa32 = mantissa << (23 - 10);
const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
- CopyBytes<4>(&bits32, &ret.raw[i]);
+ CopySameSize(&bits32, &ret.raw[i]);
}
return ret;
}
@@ -1506,7 +1646,7 @@ HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> /* tag */,
Vec128<float16_t, N> ret;
for (size_t i = 0; i < N; ++i) {
uint32_t bits32;
- CopyBytes<4>(&v.raw[i], &bits32);
+ CopySameSize(&v.raw[i], &bits32);
const uint32_t sign = bits32 >> 31;
const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
const uint32_t mantissa32 = bits32 & 0x7FFFFF;
@@ -1554,8 +1694,12 @@ HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> /* tag */,
return ret;
}
-template <typename FromT, typename ToT, size_t N, HWY_IF_FLOAT(FromT)>
-HWY_API Vec128<ToT, N> ConvertTo(Simd<ToT, N, 0> /* tag */,
+// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
+namespace detail {
+
+template <typename FromT, typename ToT, size_t N>
+HWY_API Vec128<ToT, N> ConvertTo(hwy::FloatTag /*tag*/,
+ Simd<ToT, N, 0> /* tag */,
Vec128<FromT, N> from) {
static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
Vec128<ToT, N> ret;
@@ -1574,8 +1718,9 @@ HWY_API Vec128<ToT, N> ConvertTo(Simd<ToT, N, 0> /* tag */,
return ret;
}
-template <typename FromT, typename ToT, size_t N, HWY_IF_NOT_FLOAT(FromT)>
-HWY_API Vec128<ToT, N> ConvertTo(Simd<ToT, N, 0> /* tag */,
+template <typename FromT, typename ToT, size_t N>
+HWY_API Vec128<ToT, N> ConvertTo(hwy::NonFloatTag /*tag*/,
+ Simd<ToT, N, 0> /* tag */,
Vec128<FromT, N> from) {
static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
Vec128<ToT, N> ret;
@@ -1586,11 +1731,80 @@ HWY_API Vec128<ToT, N> ConvertTo(Simd<ToT, N, 0> /* tag */,
return ret;
}
+} // namespace detail
+
+template <typename FromT, typename ToT, size_t N>
+HWY_API Vec128<ToT, N> ConvertTo(Simd<ToT, N, 0> d, Vec128<FromT, N> from) {
+ return detail::ConvertTo(hwy::IsFloatTag<FromT>(), d, from);
+}
+
template <size_t N>
HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
return DemoteTo(Simd<uint8_t, N, 0>(), v);
}
+// ------------------------------ Truncations
+
+template <size_t N>
+HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
+ const Vec128<uint64_t, N> v) {
+ Vec128<uint8_t, N> ret;
+ for (size_t i = 0; i < N; ++i) {
+ ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
+ }
+ return ret;
+}
+
+template <size_t N>
+HWY_API Vec128<uint16_t, N> TruncateTo(Simd<uint16_t, N, 0> /* tag */,
+ const Vec128<uint64_t, N> v) {
+ Vec128<uint16_t, N> ret;
+ for (size_t i = 0; i < N; ++i) {
+ ret.raw[i] = static_cast<uint16_t>(v.raw[i] & 0xFFFF);
+ }
+ return ret;
+}
+
+template <size_t N>
+HWY_API Vec128<uint32_t, N> TruncateTo(Simd<uint32_t, N, 0> /* tag */,
+ const Vec128<uint64_t, N> v) {
+ Vec128<uint32_t, N> ret;
+ for (size_t i = 0; i < N; ++i) {
+ ret.raw[i] = static_cast<uint32_t>(v.raw[i] & 0xFFFFFFFFu);
+ }
+ return ret;
+}
+
+template <size_t N>
+HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
+ const Vec128<uint32_t, N> v) {
+ Vec128<uint8_t, N> ret;
+ for (size_t i = 0; i < N; ++i) {
+ ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
+ }
+ return ret;
+}
+
+template <size_t N>
+HWY_API Vec128<uint16_t, N> TruncateTo(Simd<uint16_t, N, 0> /* tag */,
+ const Vec128<uint32_t, N> v) {
+ Vec128<uint16_t, N> ret;
+ for (size_t i = 0; i < N; ++i) {
+ ret.raw[i] = static_cast<uint16_t>(v.raw[i] & 0xFFFF);
+ }
+ return ret;
+}
+
+template <size_t N>
+HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
+ const Vec128<uint16_t, N> v) {
+ Vec128<uint8_t, N> ret;
+ for (size_t i = 0; i < N; ++i) {
+ ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
+ }
+ return ret;
+}
+
// ================================================== COMBINE
template <typename T, size_t N>
@@ -1908,15 +2122,17 @@ HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
// ------------------------------ Shuffle*
// Swap 32-bit halves in 64-bit halves.
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+template <typename T, size_t N>
HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
+ static_assert(sizeof(T) == 4, "Only for 32-bit");
static_assert(N == 2 || N == 4, "Does not make sense for N=1");
return Reverse2(DFromV<decltype(v)>(), v);
}
// Swap 64-bit halves
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
+template <typename T>
HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
+ static_assert(sizeof(T) == 4, "Only for 32-bit");
Vec128<T> ret;
ret.raw[3] = v.raw[1];
ret.raw[2] = v.raw[0];
@@ -1924,8 +2140,9 @@ HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
ret.raw[0] = v.raw[2];
return ret;
}
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+template <typename T>
HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
+ static_assert(sizeof(T) == 8, "Only for 64-bit");
return Reverse2(DFromV<decltype(v)>(), v);
}
@@ -2055,9 +2272,8 @@ HWY_API bool AllFalse(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
template <typename T, size_t N>
HWY_API bool AllTrue(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
- using Bits = typename Mask128<T, N>::Raw;
- constexpr Bits kAll = static_cast<Bits>(~Bits{0});
- Bits and_sum = kAll;
+ constexpr uint64_t kAll = LimitsMax<typename Mask128<T, N>::Raw>();
+ uint64_t and_sum = kAll;
for (size_t i = 0; i < N; ++i) {
and_sum &= mask.bits[i];
}
@@ -2103,6 +2319,16 @@ HWY_API size_t CountTrue(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
}
template <typename T, size_t N>
+HWY_API size_t FindKnownFirstTrue(Simd<T, N, 0> /* tag */,
+ const Mask128<T, N> mask) {
+ for (size_t i = 0; i < N; ++i) {
+ if (mask.bits[i] != 0) return i;
+ }
+ HWY_DASSERT(false);
+ return 0;
+}
+
+template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
for (size_t i = 0; i < N; ++i) {
@@ -2136,6 +2362,31 @@ HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
return ret;
}
+// ------------------------------ CompressNot
+template <typename T, size_t N>
+HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, const Mask128<T, N> mask) {
+ size_t count = 0;
+ Vec128<T, N> ret;
+ for (size_t i = 0; i < N; ++i) {
+ if (!mask.bits[i]) {
+ ret.raw[count++] = v.raw[i];
+ }
+ }
+ for (size_t i = 0; i < N; ++i) {
+ if (mask.bits[i]) {
+ ret.raw[count++] = v.raw[i];
+ }
+ }
+ HWY_DASSERT(count == N);
+ return ret;
+}
+
+// ------------------------------ CompressBlocksNot
+HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
+ Mask128<uint64_t> /* m */) {
+ return v;
+}
+
// ------------------------------ CompressBits
template <typename T, size_t N>
HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
@@ -2176,23 +2427,37 @@ HWY_API size_t CompressBitsStore(Vec128<T, N> v,
}
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
+
template <size_t N>
HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
Vec128<bfloat16_t, 2 * N> a,
Vec128<bfloat16_t, 2 * N> b,
const Vec128<float, N> sum0,
Vec128<float, N>& sum1) {
- const Repartition<uint16_t, decltype(df32)> du16;
- const RebindToUnsigned<decltype(df32)> du32;
- const Vec128<uint16_t, 2 * N> zero = Zero(du16);
- const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
- const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
- const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
- const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
+ const Rebind<bfloat16_t, decltype(df32)> dbf16;
+ // Avoid ZipLower/Upper so this also works on big-endian systems.
+ const Vec128<float, N> a0 = PromoteTo(df32, LowerHalf(dbf16, a));
+ const Vec128<float, N> a1 = PromoteTo(df32, UpperHalf(dbf16, a));
+ const Vec128<float, N> b0 = PromoteTo(df32, LowerHalf(dbf16, b));
+ const Vec128<float, N> b1 = PromoteTo(df32, UpperHalf(dbf16, b));
sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
+template <size_t N>
+HWY_API Vec128<int32_t, N> ReorderWidenMulAccumulate(
+ Simd<int32_t, N, 0> d32, Vec128<int16_t, 2 * N> a, Vec128<int16_t, 2 * N> b,
+ const Vec128<int32_t, N> sum0, Vec128<int32_t, N>& sum1) {
+ const Rebind<int16_t, decltype(d32)> d16;
+ // Avoid ZipLower/Upper so this also works on big-endian systems.
+ const Vec128<int32_t, N> a0 = PromoteTo(d32, LowerHalf(d16, a));
+ const Vec128<int32_t, N> a1 = PromoteTo(d32, UpperHalf(d16, a));
+ const Vec128<int32_t, N> b0 = PromoteTo(d32, LowerHalf(d16, b));
+ const Vec128<int32_t, N> b1 = PromoteTo(d32, UpperHalf(d16, b));
+ sum1 = MulAdd(BitCast(d32, a1), BitCast(d32, b1), sum1);
+ return MulAdd(BitCast(d32, a0), BitCast(d32, b0), sum0);
+}
+
// ================================================== REDUCTIONS
template <typename T, size_t N>
@@ -2240,62 +2505,6 @@ HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
return Load(Full128<uint64_t>(), mul);
}
-// ================================================== Operator wrapper
-
-template <class V>
-HWY_API V Add(V a, V b) {
- return a + b;
-}
-template <class V>
-HWY_API V Sub(V a, V b) {
- return a - b;
-}
-
-template <class V>
-HWY_API V Mul(V a, V b) {
- return a * b;
-}
-template <class V>
-HWY_API V Div(V a, V b) {
- return a / b;
-}
-
-template <class V>
-V Shl(V a, V b) {
- return a << b;
-}
-template <class V>
-V Shr(V a, V b) {
- return a >> b;
-}
-
-template <class V>
-HWY_API auto Eq(V a, V b) -> decltype(a == b) {
- return a == b;
-}
-template <class V>
-HWY_API auto Ne(V a, V b) -> decltype(a == b) {
- return a != b;
-}
-template <class V>
-HWY_API auto Lt(V a, V b) -> decltype(a == b) {
- return a < b;
-}
-
-template <class V>
-HWY_API auto Gt(V a, V b) -> decltype(a == b) {
- return a > b;
-}
-template <class V>
-HWY_API auto Ge(V a, V b) -> decltype(a == b) {
- return a >= b;
-}
-
-template <class V>
-HWY_API auto Le(V a, V b) -> decltype(a == b) {
- return a <= b;
-}
-
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
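The emu128 changes above replace SFINAE constraints (HWY_IF_SIGNED, HWY_IF_UNSIGNED, HWY_IF_FLOAT) with tag dispatch so that MSVC 2017 can resolve the overloads: a single public function forwards to detail:: overloads selected by a tag type. Below is a minimal standalone sketch of the same pattern, using locally defined tag types (FloatTag, NonFloatTag, IsFloatTagFor) rather than Highway's own hwy::TypeTag / hwy::IsFloatTag; it is an illustration of the idiom, not Highway code.

#include <cstdio>
#include <type_traits>

// Illustrative stand-ins for the tag types used in the diff above.
struct FloatTag {};
struct NonFloatTag {};

template <typename T>
using IsFloatTagFor =
    typename std::conditional<std::is_floating_point<T>::value, FloatTag,
                              NonFloatTag>::type;

// One overload per tag instead of SFINAE on the public function.
template <typename T>
T Halve(FloatTag /*tag*/, T x) {
  return x * T(0.5);  // exact halving for floating point
}
template <typename T>
T Halve(NonFloatTag /*tag*/, T x) {
  return static_cast<T>(x >> 1);  // shift for integers
}

// Single public entry point; the tag picks the implementation.
template <typename T>
T Halve(T x) {
  return Halve(IsFloatTagFor<T>(), x);
}

int main() {
  std::printf("%d %f\n", Halve(7), Halve(7.0));  // prints: 3 3.5
  return 0;
}

The public entry point needs no enable_if, so older compilers only ever see one viable candidate per call.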
diff --git a/media/highway/src/hwy/ops/generic_ops-inl.h b/media/highway/src/hwy/ops/generic_ops-inl.h
index 6beeb42ed7..b01c5de0fb 100644
--- a/media/highway/src/hwy/ops/generic_ops-inl.h
+++ b/media/highway/src/hwy/ops/generic_ops-inl.h
@@ -1192,10 +1192,11 @@ HWY_API V CLMulUpper(V a, V b) {
// This algorithm requires vectors to be at least 16 bytes, which is the case
// for LMUL >= 2. If not, use the fallback below.
-template <typename V, HWY_IF_LANES_ARE(uint8_t, V), HWY_IF_GE128_D(DFromV<V>),
- HWY_IF_POW2_GE(DFromV<V>, HWY_MIN_POW2_FOR_128)>
+template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
+ HWY_IF_GE128_D(D), HWY_IF_POW2_GE(D, HWY_MIN_POW2_FOR_128)>
HWY_API V PopulationCount(V v) {
- const DFromV<V> d;
+ static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
+ const D d;
HWY_ALIGN constexpr uint8_t kLookup[16] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
};
@@ -1208,9 +1209,11 @@ HWY_API V PopulationCount(V v) {
// RVV has a specialization that avoids the Set().
#if HWY_TARGET != HWY_RVV
// Slower fallback for capped vectors.
-template <typename V, HWY_IF_LANES_ARE(uint8_t, V), HWY_IF_LT128_D(DFromV<V>)>
+template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
+ HWY_IF_LT128_D(D)>
HWY_API V PopulationCount(V v) {
- const DFromV<V> d;
+ static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
+ const D d;
// See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
v = Sub(v, And(ShiftRight<1>(v), Set(d, 0x55)));
v = Add(And(ShiftRight<2>(v), Set(d, 0x33)), And(v, Set(d, 0x33)));
@@ -1218,26 +1221,29 @@ HWY_API V PopulationCount(V v) {
}
#endif // HWY_TARGET != HWY_RVV
-template <typename V, HWY_IF_LANES_ARE(uint16_t, V)>
+template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 2)>
HWY_API V PopulationCount(V v) {
- const DFromV<V> d;
+ static_assert(IsSame<TFromD<D>, uint16_t>(), "V must be u16");
+ const D d;
const Repartition<uint8_t, decltype(d)> d8;
const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
return Add(ShiftRight<8>(vals), And(vals, Set(d, 0xFF)));
}
-template <typename V, HWY_IF_LANES_ARE(uint32_t, V)>
+template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 4)>
HWY_API V PopulationCount(V v) {
- const DFromV<V> d;
+ static_assert(IsSame<TFromD<D>, uint32_t>(), "V must be u32");
+ const D d;
Repartition<uint16_t, decltype(d)> d16;
auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
return Add(ShiftRight<16>(vals), And(vals, Set(d, 0xFF)));
}
#if HWY_HAVE_INTEGER64
-template <typename V, HWY_IF_LANES_ARE(uint64_t, V)>
+template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_API V PopulationCount(V v) {
- const DFromV<V> d;
+ static_assert(IsSame<TFromD<D>, uint64_t>(), "V must be u64");
+ const D d;
Repartition<uint32_t, decltype(d)> d32;
auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFF)));
@@ -1246,6 +1252,105 @@ HWY_API V PopulationCount(V v) {
#endif // HWY_NATIVE_POPCNT
+template <class V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 8),
+ HWY_IF_LT128_D(D)>
+HWY_API V operator*(V x, V y) {
+ return Set(D(), GetLane(x) * GetLane(y));
+}
+
+// "Include guard": skip if native 64-bit mul instructions are available.
+#if (defined(HWY_NATIVE_I64MULLO) == defined(HWY_TARGET_TOGGLE))
+#ifdef HWY_NATIVE_I64MULLO
+#undef HWY_NATIVE_I64MULLO
+#else
+#define HWY_NATIVE_I64MULLO
+#endif
+
+template <class V, class D64 = DFromV<V>, typename T = LaneType<V>,
+ HWY_IF_LANE_SIZE(T, 8), HWY_IF_UNSIGNED(T), HWY_IF_GE128_D(D64)>
+HWY_API V operator*(V x, V y) {
+ RepartitionToNarrow<D64> d32;
+ auto x32 = BitCast(d32, x);
+ auto y32 = BitCast(d32, y);
+ auto lolo = BitCast(d32, MulEven(x32, y32));
+ auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y))));
+ auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32));
+ auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo)));
+ return BitCast(D64{}, lolo + hi);
+}
+template <class V, class DI64 = DFromV<V>, typename T = LaneType<V>,
+ HWY_IF_LANE_SIZE(T, 8), HWY_IF_SIGNED(T), HWY_IF_GE128_D(DI64)>
+HWY_API V operator*(V x, V y) {
+ RebindToUnsigned<DI64> du64;
+ return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y));
+}
+
+#endif // HWY_NATIVE_I64MULLO
+
+// ================================================== Operator wrapper
+
+// These targets currently cannot define operators and have already defined
+// (only) the corresponding functions such as Add.
+#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE && \
+ HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && \
+ HWY_TARGET != HWY_SVE2_128
+
+template <class V>
+HWY_API V Add(V a, V b) {
+ return a + b;
+}
+template <class V>
+HWY_API V Sub(V a, V b) {
+ return a - b;
+}
+
+template <class V>
+HWY_API V Mul(V a, V b) {
+ return a * b;
+}
+template <class V>
+HWY_API V Div(V a, V b) {
+ return a / b;
+}
+
+template <class V>
+V Shl(V a, V b) {
+ return a << b;
+}
+template <class V>
+V Shr(V a, V b) {
+ return a >> b;
+}
+
+template <class V>
+HWY_API auto Eq(V a, V b) -> decltype(a == b) {
+ return a == b;
+}
+template <class V>
+HWY_API auto Ne(V a, V b) -> decltype(a == b) {
+ return a != b;
+}
+template <class V>
+HWY_API auto Lt(V a, V b) -> decltype(a == b) {
+ return a < b;
+}
+
+template <class V>
+HWY_API auto Gt(V a, V b) -> decltype(a == b) {
+ return a > b;
+}
+template <class V>
+HWY_API auto Ge(V a, V b) -> decltype(a == b) {
+ return a >= b;
+}
+
+template <class V>
+HWY_API auto Le(V a, V b) -> decltype(a == b) {
+ return a <= b;
+}
+
+#endif // HWY_TARGET for operators
+
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
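generic_ops-inl.h now provides a fallback operator* for 64-bit lanes that assembles the low 64 bits of the product from 32-bit partial products: the low*low term plus the two cross terms shifted left by 32 (the high*high term only affects bits 64 and above, so it is dropped). The same decomposition in plain scalar arithmetic, shown only as a sketch of the math rather than of the vector implementation:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Low 64 bits of x*y from 32-bit halves, mirroring the MulEven-based
// fallback: lolo + ((lohi + hilo) << 32). Unsigned wraparound makes the
// dropped carries irrelevant modulo 2^64.
uint64_t MulLow64(uint64_t x, uint64_t y) {
  const uint64_t xl = x & 0xFFFFFFFFu, xh = x >> 32;
  const uint64_t yl = y & 0xFFFFFFFFu, yh = y >> 32;
  const uint64_t lolo = xl * yl;
  const uint64_t lohi = xl * yh;
  const uint64_t hilo = xh * yl;
  return lolo + ((lohi + hilo) << 32);
}

int main() {
  const uint64_t x = 0x123456789ABCDEF0ull, y = 0x0FEDCBA987654321ull;
  assert(MulLow64(x, y) == x * y);  // matches the native 64-bit product
  std::printf("%016llx\n", static_cast<unsigned long long>(MulLow64(x, y)));
  return 0;
}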
diff --git a/media/highway/src/hwy/ops/rvv-inl.h b/media/highway/src/hwy/ops/rvv-inl.h
index 9ddb76b2c4..2a8fb52436 100644
--- a/media/highway/src/hwy/ops/rvv-inl.h
+++ b/media/highway/src/hwy/ops/rvv-inl.h
@@ -496,7 +496,9 @@ using VFromD = decltype(Set(D(), TFromD<D>()));
template <typename T, size_t N, int kPow2>
HWY_API VFromD<Simd<T, N, kPow2>> Zero(Simd<T, N, kPow2> d) {
- return Set(d, T(0));
+ // Cast to support bfloat16_t.
+ const RebindToUnsigned<decltype(d)> du;
+ return BitCast(d, Set(du, 0));
}
// ------------------------------ Undefined
@@ -949,16 +951,16 @@ HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Max, fmax, _ALL)
// ------------------------------ Mul
-// Only for internal use (Highway only promises Mul for 16/32-bit inputs).
-// Used by MulLower.
-namespace detail {
-HWY_RVV_FOREACH_U64(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
-} // namespace detail
-
-HWY_RVV_FOREACH_UI16(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
-HWY_RVV_FOREACH_UI32(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
+HWY_RVV_FOREACH_UI163264(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Mul, fmul, _ALL)
+// Per-target flag to prevent generic_ops-inl.h from defining i64 operator*.
+#ifdef HWY_NATIVE_I64MULLO
+#undef HWY_NATIVE_I64MULLO
+#else
+#define HWY_NATIVE_I64MULLO
+#endif
+
// ------------------------------ MulHigh
// Only for internal use (Highway only promises MulHigh for 16-bit inputs).
@@ -1087,9 +1089,9 @@ HWY_API auto TestBit(const V a, const V bit) -> decltype(Eq(a, bit)) {
}
// ------------------------------ Not
+// NOLINTNEXTLINE
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, Not, not )
-
// ------------------------------ And
// mask = f(mask_a, mask_b) (note arg2,arg1 order!)
@@ -1109,6 +1111,9 @@ HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Or, or)
// ------------------------------ Xor
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xor, xor)
+// ------------------------------ ExclusiveNeither
+HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, ExclusiveNeither, xnor)
+
#undef HWY_RVV_RETM_ARGMM
// ------------------------------ IfThenElse
@@ -1219,14 +1224,19 @@ HWY_API V IfNegativeThenElse(V v, V yes, V no) {
// ------------------------------ FindFirstTrue
-#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
- template <class D> \
- HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
- static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
- return vfirst_m_b##MLEN(m, Lanes(d)); \
+#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
+ template <class D> \
+ HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
+ static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
+ return vfirst_m_b##MLEN(m, Lanes(d)); \
+ } \
+ template <class D> \
+ HWY_API size_t FindKnownFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
+ static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
+ return static_cast<size_t>(vfirst_m_b##MLEN(m, Lanes(d))); \
}
-HWY_RVV_FOREACH_B(HWY_RVV_FIND_FIRST_TRUE, _, _)
+HWY_RVV_FOREACH_B(HWY_RVV_FIND_FIRST_TRUE, , _)
#undef HWY_RVV_FIND_FIRST_TRUE
// ------------------------------ AllFalse
@@ -1690,6 +1700,249 @@ HWY_API vuint8m2_t U8FromU32(const vuint32m8_t v) {
return vnclipu_wx_u8m2(vnclipu_wx_u16m4(v, 0, avl), 0, avl);
}
+// ------------------------------ Truncations
+
+template <size_t N>
+HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d,
+ const VFromD<Simd<uint64_t, N, 0>> v) {
+ const size_t avl = Lanes(d);
+ const vuint64m1_t v1 = vand(v, 0xFF, avl);
+ const vuint32mf2_t v2 = vnclipu_wx_u32mf2(v1, 0, avl);
+ const vuint16mf4_t v3 = vnclipu_wx_u16mf4(v2, 0, avl);
+ return vnclipu_wx_u8mf8(v3, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d,
+ const VFromD<Simd<uint64_t, N, 1>> v) {
+ const size_t avl = Lanes(d);
+ const vuint64m2_t v1 = vand(v, 0xFF, avl);
+ const vuint32m1_t v2 = vnclipu_wx_u32m1(v1, 0, avl);
+ const vuint16mf2_t v3 = vnclipu_wx_u16mf2(v2, 0, avl);
+ return vnclipu_wx_u8mf4(v3, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d,
+ const VFromD<Simd<uint64_t, N, 2>> v) {
+ const size_t avl = Lanes(d);
+ const vuint64m4_t v1 = vand(v, 0xFF, avl);
+ const vuint32m2_t v2 = vnclipu_wx_u32m2(v1, 0, avl);
+ const vuint16m1_t v3 = vnclipu_wx_u16m1(v2, 0, avl);
+ return vnclipu_wx_u8mf2(v3, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d,
+ const VFromD<Simd<uint64_t, N, 3>> v) {
+ const size_t avl = Lanes(d);
+ const vuint64m8_t v1 = vand(v, 0xFF, avl);
+ const vuint32m4_t v2 = vnclipu_wx_u32m4(v1, 0, avl);
+ const vuint16m2_t v3 = vnclipu_wx_u16m2(v2, 0, avl);
+ return vnclipu_wx_u8m1(v3, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint16mf4_t TruncateTo(Simd<uint16_t, N, -2> d,
+ const VFromD<Simd<uint64_t, N, 0>> v) {
+ const size_t avl = Lanes(d);
+ const vuint64m1_t v1 = vand(v, 0xFFFF, avl);
+ const vuint32mf2_t v2 = vnclipu_wx_u32mf2(v1, 0, avl);
+ return vnclipu_wx_u16mf4(v2, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint16mf2_t TruncateTo(Simd<uint16_t, N, -1> d,
+ const VFromD<Simd<uint64_t, N, 1>> v) {
+ const size_t avl = Lanes(d);
+ const vuint64m2_t v1 = vand(v, 0xFFFF, avl);
+ const vuint32m1_t v2 = vnclipu_wx_u32m1(v1, 0, avl);
+ return vnclipu_wx_u16mf2(v2, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint16m1_t TruncateTo(Simd<uint16_t, N, 0> d,
+ const VFromD<Simd<uint64_t, N, 2>> v) {
+ const size_t avl = Lanes(d);
+ const vuint64m4_t v1 = vand(v, 0xFFFF, avl);
+ const vuint32m2_t v2 = vnclipu_wx_u32m2(v1, 0, avl);
+ return vnclipu_wx_u16m1(v2, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint16m2_t TruncateTo(Simd<uint16_t, N, 1> d,
+ const VFromD<Simd<uint64_t, N, 3>> v) {
+ const size_t avl = Lanes(d);
+ const vuint64m8_t v1 = vand(v, 0xFFFF, avl);
+ const vuint32m4_t v2 = vnclipu_wx_u32m4(v1, 0, avl);
+ return vnclipu_wx_u16m2(v2, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint32mf2_t TruncateTo(Simd<uint32_t, N, -1> d,
+ const VFromD<Simd<uint64_t, N, 0>> v) {
+ const size_t avl = Lanes(d);
+ const vuint64m1_t v1 = vand(v, 0xFFFFFFFFu, avl);
+ return vnclipu_wx_u32mf2(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint32m1_t TruncateTo(Simd<uint32_t, N, 0> d,
+ const VFromD<Simd<uint64_t, N, 1>> v) {
+ const size_t avl = Lanes(d);
+ const vuint64m2_t v1 = vand(v, 0xFFFFFFFFu, avl);
+ return vnclipu_wx_u32m1(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint32m2_t TruncateTo(Simd<uint32_t, N, 1> d,
+ const VFromD<Simd<uint64_t, N, 2>> v) {
+ const size_t avl = Lanes(d);
+ const vuint64m4_t v1 = vand(v, 0xFFFFFFFFu, avl);
+ return vnclipu_wx_u32m2(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint32m4_t TruncateTo(Simd<uint32_t, N, 2> d,
+ const VFromD<Simd<uint64_t, N, 3>> v) {
+ const size_t avl = Lanes(d);
+ const vuint64m8_t v1 = vand(v, 0xFFFFFFFFu, avl);
+ return vnclipu_wx_u32m4(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d,
+ const VFromD<Simd<uint32_t, N, -1>> v) {
+ const size_t avl = Lanes(d);
+ const vuint32mf2_t v1 = vand(v, 0xFF, avl);
+ const vuint16mf4_t v2 = vnclipu_wx_u16mf4(v1, 0, avl);
+ return vnclipu_wx_u8mf8(v2, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d,
+ const VFromD<Simd<uint32_t, N, 0>> v) {
+ const size_t avl = Lanes(d);
+ const vuint32m1_t v1 = vand(v, 0xFF, avl);
+ const vuint16mf2_t v2 = vnclipu_wx_u16mf2(v1, 0, avl);
+ return vnclipu_wx_u8mf4(v2, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d,
+ const VFromD<Simd<uint32_t, N, 1>> v) {
+ const size_t avl = Lanes(d);
+ const vuint32m2_t v1 = vand(v, 0xFF, avl);
+ const vuint16m1_t v2 = vnclipu_wx_u16m1(v1, 0, avl);
+ return vnclipu_wx_u8mf2(v2, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d,
+ const VFromD<Simd<uint32_t, N, 2>> v) {
+ const size_t avl = Lanes(d);
+ const vuint32m4_t v1 = vand(v, 0xFF, avl);
+ const vuint16m2_t v2 = vnclipu_wx_u16m2(v1, 0, avl);
+ return vnclipu_wx_u8m1(v2, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8m2_t TruncateTo(Simd<uint8_t, N, 1> d,
+ const VFromD<Simd<uint32_t, N, 3>> v) {
+ const size_t avl = Lanes(d);
+ const vuint32m8_t v1 = vand(v, 0xFF, avl);
+ const vuint16m4_t v2 = vnclipu_wx_u16m4(v1, 0, avl);
+ return vnclipu_wx_u8m2(v2, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint16mf4_t TruncateTo(Simd<uint16_t, N, -2> d,
+ const VFromD<Simd<uint32_t, N, -1>> v) {
+ const size_t avl = Lanes(d);
+ const vuint32mf2_t v1 = vand(v, 0xFFFF, avl);
+ return vnclipu_wx_u16mf4(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint16mf2_t TruncateTo(Simd<uint16_t, N, -1> d,
+ const VFromD<Simd<uint32_t, N, 0>> v) {
+ const size_t avl = Lanes(d);
+ const vuint32m1_t v1 = vand(v, 0xFFFF, avl);
+ return vnclipu_wx_u16mf2(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint16m1_t TruncateTo(Simd<uint16_t, N, 0> d,
+ const VFromD<Simd<uint32_t, N, 1>> v) {
+ const size_t avl = Lanes(d);
+ const vuint32m2_t v1 = vand(v, 0xFFFF, avl);
+ return vnclipu_wx_u16m1(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint16m2_t TruncateTo(Simd<uint16_t, N, 1> d,
+ const VFromD<Simd<uint32_t, N, 2>> v) {
+ const size_t avl = Lanes(d);
+ const vuint32m4_t v1 = vand(v, 0xFFFF, avl);
+ return vnclipu_wx_u16m2(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint16m4_t TruncateTo(Simd<uint16_t, N, 2> d,
+ const VFromD<Simd<uint32_t, N, 3>> v) {
+ const size_t avl = Lanes(d);
+ const vuint32m8_t v1 = vand(v, 0xFFFF, avl);
+ return vnclipu_wx_u16m4(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d,
+ const VFromD<Simd<uint16_t, N, -2>> v) {
+ const size_t avl = Lanes(d);
+ const vuint16mf4_t v1 = vand(v, 0xFF, avl);
+ return vnclipu_wx_u8mf8(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d,
+ const VFromD<Simd<uint16_t, N, -1>> v) {
+ const size_t avl = Lanes(d);
+ const vuint16mf2_t v1 = vand(v, 0xFF, avl);
+ return vnclipu_wx_u8mf4(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d,
+ const VFromD<Simd<uint16_t, N, 0>> v) {
+ const size_t avl = Lanes(d);
+ const vuint16m1_t v1 = vand(v, 0xFF, avl);
+ return vnclipu_wx_u8mf2(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d,
+ const VFromD<Simd<uint16_t, N, 1>> v) {
+ const size_t avl = Lanes(d);
+ const vuint16m2_t v1 = vand(v, 0xFF, avl);
+ return vnclipu_wx_u8m1(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8m2_t TruncateTo(Simd<uint8_t, N, 1> d,
+ const VFromD<Simd<uint16_t, N, 2>> v) {
+ const size_t avl = Lanes(d);
+ const vuint16m4_t v1 = vand(v, 0xFF, avl);
+ return vnclipu_wx_u8m2(v1, 0, avl);
+}
+
+template <size_t N>
+HWY_API vuint8m4_t TruncateTo(Simd<uint8_t, N, 2> d,
+ const VFromD<Simd<uint16_t, N, 3>> v) {
+ const size_t avl = Lanes(d);
+ const vuint16m8_t v1 = vand(v, 0xFF, avl);
+ return vnclipu_wx_u8m4(v1, 0, avl);
+}
+
// ------------------------------ DemoteTo I
HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE, DemoteTo, vnclip_wx_, _DEMOTE_VIRT)
@@ -1776,6 +2029,11 @@ HWY_API VFromD<Simd<uint16_t, N, kPow2>> DemoteTo(
HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(int, SEW, LMUL) v) { \
return vfcvt_f_x_v_f##SEW##LMUL(v, Lanes(d)); \
} \
+ template <size_t N> \
+ HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \
+ HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(uint, SEW, LMUL) v) {\
+ return vfcvt_f_xu_v_f##SEW##LMUL(v, Lanes(d)); \
+ } \
/* Truncates (rounds toward zero). */ \
template <size_t N> \
HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(int, SEW, N, SHIFT) d, \
@@ -1823,7 +2081,8 @@ template <size_t kLanes, class D>
HWY_INLINE MFromD<D> FirstNPerBlock(D /* tag */) {
const RebindToUnsigned<D> du;
const RebindToSigned<D> di;
- const auto idx_mod = AndS(Iota0(du), LanesPerBlock(du) - 1);
+ using TU = TFromD<decltype(du)>;
+ const auto idx_mod = AndS(Iota0(du), static_cast<TU>(LanesPerBlock(du) - 1));
return LtS(BitCast(di, idx_mod), static_cast<TFromD<decltype(di)>>(kLanes));
}
@@ -2190,6 +2449,18 @@ HWY_RVV_FOREACH_UI163264(HWY_RVV_COMPRESS, Compress, compress, _ALL)
HWY_RVV_FOREACH_F(HWY_RVV_COMPRESS, Compress, compress, _ALL)
#undef HWY_RVV_COMPRESS
+// ------------------------------ CompressNot
+template <class V, class M>
+HWY_API V CompressNot(V v, const M mask) {
+ return Compress(v, Not(mask));
+}
+
+// ------------------------------ CompressBlocksNot
+template <class V, class M>
+HWY_API V CompressBlocksNot(V v, const M mask) {
+ return CompressNot(v, mask);
+}
+
// ------------------------------ CompressStore
template <class V, class M, class D>
HWY_API size_t CompressStore(const V v, const M mask, const D d,
@@ -2346,7 +2617,7 @@ HWY_API VI TableLookupBytes(const VT vt, const VI vi) {
// If the table is shorter, wrap around offsets so they do not reference
// undefined lanes in the newly extended vmt.
if (kPow2T < kPow2I) {
- offsets = detail::AndS(offsets, Lanes(dt8) - 1);
+ offsets = detail::AndS(offsets, static_cast<uint8_t>(Lanes(dt8) - 1));
}
const auto out = TableLookupLanes(vmt, Add(vmi, offsets));
return BitCast(di, detail::ChangeLMUL(di8, out));
@@ -2382,8 +2653,9 @@ HWY_API V ShiftLeftLanes(const D d, const V v) {
const auto shifted = detail::SlideUp(v, v, kLanes);
// Match x86 semantics by zeroing lower lanes in 128-bit blocks
const auto idx_mod =
- detail::AndS(detail::Iota0(di), detail::LanesPerBlock(di) - 1);
- const auto clear = detail::LtS(BitCast(di, idx_mod), static_cast<TI>(kLanes));
+ detail::AndS(BitCast(di, detail::Iota0(di)),
+ static_cast<TI>(detail::LanesPerBlock(di) - 1));
+ const auto clear = detail::LtS(idx_mod, static_cast<TI>(kLanes));
return IfThenZeroElse(clear, shifted);
}
@@ -2419,9 +2691,9 @@ HWY_API V ShiftRightLanes(const Simd<T, N, kPow2> d, V v) {
const auto shifted = detail::SlideDown(v, v, kLanes);
// Match x86 semantics by zeroing upper lanes in 128-bit blocks
const size_t lpb = detail::LanesPerBlock(di);
- const auto idx_mod = detail::AndS(detail::Iota0(di), lpb - 1);
- const auto keep =
- detail::LtS(BitCast(di, idx_mod), static_cast<TI>(lpb - kLanes));
+ const auto idx_mod =
+ detail::AndS(BitCast(di, detail::Iota0(di)), static_cast<TI>(lpb - 1));
+ const auto keep = detail::LtS(idx_mod, static_cast<TI>(lpb - kLanes));
return IfThenElseZero(keep, shifted);
}
@@ -2438,9 +2710,10 @@ template <class D, class V>
HWY_API V InterleaveLower(D d, const V a, const V b) {
static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
const RebindToUnsigned<decltype(d)> du;
+ using TU = TFromD<decltype(du)>;
const auto i = detail::Iota0(du);
- const auto idx_mod =
- ShiftRight<1>(detail::AndS(i, detail::LanesPerBlock(du) - 1));
+ const auto idx_mod = ShiftRight<1>(
+ detail::AndS(i, static_cast<TU>(detail::LanesPerBlock(du) - 1)));
const auto idx = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i));
const auto is_even = detail::EqS(detail::AndS(i, 1), 0u);
return IfThenElse(is_even, TableLookupLanes(a, idx),
@@ -2458,11 +2731,12 @@ template <class D, class V>
HWY_API V InterleaveUpper(const D d, const V a, const V b) {
static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
const RebindToUnsigned<decltype(d)> du;
+ using TU = TFromD<decltype(du)>;
const size_t lpb = detail::LanesPerBlock(du);
const auto i = detail::Iota0(du);
- const auto idx_mod = ShiftRight<1>(detail::AndS(i, lpb - 1));
+ const auto idx_mod = ShiftRight<1>(detail::AndS(i, static_cast<TU>(lpb - 1)));
const auto idx_lower = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i));
- const auto idx = detail::AddS(idx_lower, lpb / 2);
+ const auto idx = detail::AddS(idx_lower, static_cast<TU>(lpb / 2));
const auto is_even = detail::EqS(detail::AndS(i, 1), 0u);
return IfThenElse(is_even, TableLookupLanes(a, idx),
TableLookupLanes(b, idx));
@@ -2552,7 +2826,7 @@ HWY_API VFromD<D> MaxOfLanes(D d, const VFromD<D> v) {
// ------------------------------ PopulationCount (ShiftRight)
// Handles LMUL >= 2 or capped vectors, which generic_ops-inl cannot.
-template <typename V, class D = DFromV<V>, HWY_IF_LANES_ARE(uint8_t, V),
+template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
hwy::EnableIf<Pow2(D()) < 1 || MaxLanes(D()) < 16>* = nullptr>
HWY_API V PopulationCount(V v) {
// See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
@@ -2565,9 +2839,12 @@ HWY_API V PopulationCount(V v) {
template <class D>
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* const HWY_RESTRICT p) {
- const auto loaded = Load(d, p);
- // Broadcast the first block
- const auto idx = detail::AndS(detail::Iota0(d), detail::LanesPerBlock(d) - 1);
+ const VFromD<D> loaded = Load(d, p);
+ // idx must be unsigned for TableLookupLanes.
+ using TU = MakeUnsigned<TFromD<D>>;
+ const TU mask = static_cast<TU>(detail::LanesPerBlock(d) - 1);
+ // Broadcast the first block.
+ const VFromD<RebindToUnsigned<D>> idx = detail::AndS(detail::Iota0(d), mask);
return TableLookupLanes(loaded, idx);
}
@@ -2809,19 +3086,19 @@ HWY_API VFromD<DW> MulEven(const V a, const V b) {
// There is no 64x64 vwmul.
template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
HWY_INLINE V MulEven(const V a, const V b) {
- const auto lo = detail::Mul(a, b);
+ const auto lo = Mul(a, b);
const auto hi = detail::MulHigh(a, b);
return OddEven(detail::Slide1Up(hi), lo);
}
template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
HWY_INLINE V MulOdd(const V a, const V b) {
- const auto lo = detail::Mul(a, b);
+ const auto lo = Mul(a, b);
const auto hi = detail::MulHigh(a, b);
return OddEven(hi, detail::Slide1Down(lo));
}
-// ------------------------------ ReorderDemote2To (OddEven)
+// ------------------------------ ReorderDemote2To (OddEven, Combine)
template <size_t N, int kPow2>
HWY_API VFromD<Simd<uint16_t, N, kPow2>> ReorderDemote2To(
@@ -2834,22 +3111,42 @@ HWY_API VFromD<Simd<uint16_t, N, kPow2>> ReorderDemote2To(
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
+// If LMUL is not the max, Combine first to avoid another DemoteTo.
+template <size_t N, int kPow2, hwy::EnableIf<(kPow2 < 3)>* = nullptr,
+ class D32 = RepartitionToWide<Simd<int16_t, N, kPow2>>>
+HWY_API VFromD<Simd<int16_t, N, kPow2>> ReorderDemote2To(
+ Simd<int16_t, N, kPow2> d16, VFromD<D32> a, VFromD<D32> b) {
+ const Twice<D32> d32t;
+ const VFromD<decltype(d32t)> ab = Combine(d32t, a, b);
+ return DemoteTo(d16, ab);
+}
+
+// Max LMUL: must DemoteTo first, then Combine.
+template <size_t N, class V32 = VFromD<RepartitionToWide<Simd<int16_t, N, 3>>>>
+HWY_API VFromD<Simd<int16_t, N, 3>> ReorderDemote2To(Simd<int16_t, N, 3> d16,
+ V32 a, V32 b) {
+ const Half<decltype(d16)> d16h;
+ const VFromD<decltype(d16h)> a16 = DemoteTo(d16h, a);
+ const VFromD<decltype(d16h)> b16 = DemoteTo(d16h, b);
+ return Combine(d16, a16, b16);
+}
+
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
-template <class DF>
-using DU16FromDF = RepartitionToNarrow<RebindToUnsigned<DF>>;
+namespace detail {
-template <size_t N, int kPow2>
-HWY_API auto ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
- VFromD<DU16FromDF<decltype(df32)>> a,
- VFromD<DU16FromDF<decltype(df32)>> b,
- const VFromD<decltype(df32)> sum0,
- VFromD<decltype(df32)>& sum1)
- -> VFromD<decltype(df32)> {
- const DU16FromDF<decltype(df32)> du16;
- const RebindToUnsigned<decltype(df32)> du32;
+// Non-overloaded wrapper function so we can define DF32 in template args.
+template <
+ size_t N, int kPow2, class DF32 = Simd<float, N, kPow2>,
+ class VF32 = VFromD<DF32>,
+ class DU16 = RepartitionToNarrow<RebindToUnsigned<Simd<float, N, kPow2>>>>
+HWY_API VF32 ReorderWidenMulAccumulateBF16(Simd<float, N, kPow2> df32,
+ VFromD<DU16> a, VFromD<DU16> b,
+ const VF32 sum0, VF32& sum1) {
+ const DU16 du16;
+ const RebindToUnsigned<DF32> du32;
using VU32 = VFromD<decltype(du32)>;
- const VFromD<decltype(du16)> zero = Zero(du16);
+ const VFromD<DU16> zero = Zero(du16);
const VU32 a0 = ZipLower(du32, zero, BitCast(du16, a));
const VU32 a1 = ZipUpper(du32, zero, BitCast(du16, a));
const VU32 b0 = ZipLower(du32, zero, BitCast(du16, b));
@@ -2858,11 +3155,68 @@ HWY_API auto ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
-// ------------------------------ Lt128
+#define HWY_RVV_WIDEN_MACC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
+ SHIFT, MLEN, NAME, OP) \
+ template <size_t N> \
+ HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME( \
+ HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEWD, LMULD) sum, \
+ HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
+ return OP##CHAR##SEWD##LMULD(sum, a, b, Lanes(d)); \
+ }
+
+HWY_RVV_FOREACH_I16(HWY_RVV_WIDEN_MACC, WidenMulAcc, vwmacc_vv_, _EXT_VIRT)
+#undef HWY_RVV_WIDEN_MACC
+
+// If LMUL is not the max, we can WidenMul first (3 instructions).
+template <size_t N, int kPow2, hwy::EnableIf<(kPow2 < 3)>* = nullptr,
+ class D32 = Simd<int32_t, N, kPow2>, class V32 = VFromD<D32>,
+ class D16 = RepartitionToNarrow<D32>>
+HWY_API VFromD<D32> ReorderWidenMulAccumulateI16(Simd<int32_t, N, kPow2> d32,
+ VFromD<D16> a, VFromD<D16> b,
+ const V32 sum0, V32& sum1) {
+ const Twice<decltype(d32)> d32t;
+ using V32T = VFromD<decltype(d32t)>;
+ V32T sum = Combine(d32t, sum0, sum1);
+ sum = detail::WidenMulAcc(d32t, sum, a, b);
+ sum1 = UpperHalf(d32, sum);
+ return LowerHalf(d32, sum);
+}
+
+// Max LMUL: must LowerHalf first (4 instructions).
+template <size_t N, class D32 = Simd<int32_t, N, 3>, class V32 = VFromD<D32>,
+ class D16 = RepartitionToNarrow<D32>>
+HWY_API VFromD<D32> ReorderWidenMulAccumulateI16(Simd<int32_t, N, 3> d32,
+ VFromD<D16> a, VFromD<D16> b,
+ const V32 sum0, V32& sum1) {
+ const Half<D16> d16h;
+ using V16H = VFromD<decltype(d16h)>;
+ const V16H a0 = LowerHalf(d16h, a);
+ const V16H a1 = UpperHalf(d16h, a);
+ const V16H b0 = LowerHalf(d16h, b);
+ const V16H b1 = UpperHalf(d16h, b);
+ sum1 = detail::WidenMulAcc(d32, sum1, a1, b1);
+ return detail::WidenMulAcc(d32, sum0, a0, b0);
+}
+
+} // namespace detail
+
+template <size_t N, int kPow2, class VN, class VW>
+HWY_API VW ReorderWidenMulAccumulate(Simd<float, N, kPow2> d32, VN a, VN b,
+ const VW sum0, VW& sum1) {
+ return detail::ReorderWidenMulAccumulateBF16(d32, a, b, sum0, sum1);
+}
+
+template <size_t N, int kPow2, class VN, class VW>
+HWY_API VW ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32, VN a, VN b,
+ const VW sum0, VW& sum1) {
+ return detail::ReorderWidenMulAccumulateI16(d32, a, b, sum0, sum1);
+}
+// ------------------------------ Lt128
template <class D>
HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
- static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
+ static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
+ "D must be u64");
// Truth table of Eq and Compare for Hi and Lo u64.
// (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
// =H =L cH cL | out = cH | (=H & cL)
@@ -2885,6 +3239,56 @@ HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
return MaskFromVec(OddEven(vecHx, detail::Slide1Down(vecHx)));
}
+// ------------------------------ Lt128Upper
+template <class D>
+HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
+ static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
+ "D must be u64");
+ const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
+ // Replicate H to its neighbor.
+ return MaskFromVec(OddEven(ltHL, detail::Slide1Down(ltHL)));
+}
+
+// ------------------------------ Eq128
+template <class D>
+HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) {
+ static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
+ "D must be u64");
+ const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
+ const VFromD<D> eqLH = Reverse2(d, eqHL);
+ return MaskFromVec(And(eqHL, eqLH));
+}
+
+// ------------------------------ Eq128Upper
+template <class D>
+HWY_INLINE MFromD<D> Eq128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
+ static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
+ "D must be u64");
+ const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
+ // Replicate H to its neighbor.
+ return MaskFromVec(OddEven(eqHL, detail::Slide1Down(eqHL)));
+}
+
+// ------------------------------ Ne128
+template <class D>
+HWY_INLINE MFromD<D> Ne128(D d, const VFromD<D> a, const VFromD<D> b) {
+ static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
+ "D must be u64");
+ const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
+ const VFromD<D> neLH = Reverse2(d, neHL);
+ return MaskFromVec(Or(neHL, neLH));
+}
+
+// ------------------------------ Ne128Upper
+template <class D>
+HWY_INLINE MFromD<D> Ne128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
+ static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
+ "D must be u64");
+ const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
+ // Replicate H to its neighbor.
+ return MaskFromVec(OddEven(neHL, detail::Slide1Down(neHL)));
+}
+
// ------------------------------ Min128, Max128 (Lt128)
template <class D>
@@ -2915,6 +3319,16 @@ HWY_INLINE VFromD<D> Max128(D /* tag */, const VFromD<D> a, const VFromD<D> b) {
return OddEven(maxHL, IfThenElse(eqXH, maxHL, lo));
}
+template <class D>
+HWY_INLINE VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
+ return IfThenElse(Lt128Upper(d, a, b), a, b);
+}
+
+template <class D>
+HWY_INLINE VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
+ return IfThenElse(Lt128Upper(d, b, a), a, b);
+}
+
// ================================================== END MACROS
namespace detail { // for code folding
#undef HWY_RVV_AVL
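The new Lt128/Eq128/Ne128 ops for RVV treat each 128-bit block as a pair of u64 lanes and follow the truth table quoted in Lt128 (out = cH | (=H & cL)); the *Upper variants test only the high lane. A scalar sketch of the same predicates on a plain (hi, lo) struct, assuming the same lane layout as the vector code:

#include <cassert>
#include <cstdint>

// One 128-bit value as two u64 lanes, low lane first, matching the
// (hi, lo) pairs compared by Lt128/Eq128 above.
struct U128 {
  uint64_t lo;
  uint64_t hi;
};

// out = ltH | (eqH & ltL), the truth table from the Lt128 comment.
bool Lt128(U128 a, U128 b) {
  return (a.hi < b.hi) || (a.hi == b.hi && a.lo < b.lo);
}
bool Eq128(U128 a, U128 b) { return a.hi == b.hi && a.lo == b.lo; }
// The *Upper variants consider only the high (key) lane.
bool Lt128Upper(U128 a, U128 b) { return a.hi < b.hi; }
bool Eq128Upper(U128 a, U128 b) { return a.hi == b.hi; }

int main() {
  const U128 a{~0ull, 1};  // hi:lo = 1:FFFF...
  const U128 b{0, 2};      // hi:lo = 2:0000...
  assert(Lt128(a, b) && !Eq128(a, b));
  assert(Lt128Upper(a, b) && !Eq128Upper(a, b));
  return 0;
}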
diff --git a/media/highway/src/hwy/ops/scalar-inl.h b/media/highway/src/hwy/ops/scalar-inl.h
index 2ceed1a0bf..8b11828e62 100644
--- a/media/highway/src/hwy/ops/scalar-inl.h
+++ b/media/highway/src/hwy/ops/scalar-inl.h
@@ -102,7 +102,7 @@ template <typename T, typename FromT>
HWY_API Vec1<T> BitCast(Sisd<T> /* tag */, Vec1<FromT> v) {
static_assert(sizeof(T) <= sizeof(FromT), "Promoting is undefined");
T to;
- CopyBytes<sizeof(FromT)>(&v.raw, &to);
+ CopyBytes<sizeof(FromT)>(&v.raw, &to); // not same size - ok to shrink
return Vec1<T>(to);
}
@@ -128,6 +128,9 @@ HWY_API Vec1<T> Iota(const Sisd<T> /* tag */, const T2 first) {
return Vec1<T>(static_cast<T>(first));
}
+template <class D>
+using VFromD = decltype(Zero(D()));
+
// ================================================== LOGICAL
// ------------------------------ Not
@@ -257,21 +260,21 @@ HWY_API Mask1<TTo> RebindMask(Sisd<TTo> /*tag*/, Mask1<TFrom> m) {
template <typename T>
HWY_API Mask1<T> MaskFromVec(const Vec1<T> v) {
Mask1<T> mask;
- CopyBytes<sizeof(mask.bits)>(&v.raw, &mask.bits);
+ CopySameSize(&v, &mask);
return mask;
}
template <typename T>
Vec1<T> VecFromMask(const Mask1<T> mask) {
Vec1<T> v;
- CopyBytes<sizeof(v.raw)>(&mask.bits, &v.raw);
+ CopySameSize(&mask, &v);
return v;
}
template <typename T>
Vec1<T> VecFromMask(Sisd<T> /* tag */, const Mask1<T> mask) {
Vec1<T> v;
- CopyBytes<sizeof(v.raw)>(&mask.bits, &v.raw);
+ CopySameSize(&mask, &v);
return v;
}
@@ -338,6 +341,12 @@ HWY_API Mask1<T> Xor(const Mask1<T> a, Mask1<T> b) {
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
+template <typename T>
+HWY_API Mask1<T> ExclusiveNeither(const Mask1<T> a, Mask1<T> b) {
+ const Sisd<T> d;
+ return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
+}
+
// ================================================== SHIFTS
// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)
@@ -694,10 +703,10 @@ HWY_API Vec1<float> ApproximateReciprocalSqrt(const Vec1<float> v) {
float f = v.raw;
const float half = f * 0.5f;
uint32_t bits;
- CopyBytes<4>(&f, &bits);
+ CopySameSize(&f, &bits);
// Initial guess based on log2(f)
bits = 0x5F3759DF - (bits >> 1);
- CopyBytes<4>(&bits, &f);
+ CopySameSize(&bits, &f);
// One Newton-Raphson iteration
return Vec1<float>(f * (1.5f - (half * f * f)));
}
@@ -722,7 +731,7 @@ HWY_API Vec1<T> Round(const Vec1<T> v) {
const TI rounded = static_cast<TI>(v.raw + bias);
if (rounded == 0) return CopySignToAbs(Vec1<T>(0), v);
// Round to even
- if ((rounded & 1) && std::abs(rounded - v.raw) == T(0.5)) {
+ if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
}
return Vec1<T>(static_cast<T>(rounded));
@@ -775,7 +784,7 @@ V Ceiling(const V v) {
const bool positive = f > Float(0.0);
Bits bits;
- CopyBytes<sizeof(Bits)>(&v, &bits);
+ CopySameSize(&v, &bits);
const int exponent =
static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
@@ -792,7 +801,7 @@ V Ceiling(const V v) {
if (positive) bits += (kMantissaMask + 1) >> exponent;
bits &= ~mantissa_mask;
- CopyBytes<sizeof(Bits)>(&bits, &f);
+ CopySameSize(&bits, &f);
return V(f);
}
@@ -807,7 +816,7 @@ V Floor(const V v) {
const bool negative = f < Float(0.0);
Bits bits;
- CopyBytes<sizeof(Bits)>(&v, &bits);
+ CopySameSize(&v, &bits);
const int exponent =
static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
@@ -824,7 +833,7 @@ V Floor(const V v) {
if (negative) bits += (kMantissaMask + 1) >> exponent;
bits &= ~mantissa_mask;
- CopyBytes<sizeof(Bits)>(&bits, &f);
+ CopySameSize(&bits, &f);
return V(f);
}
@@ -886,7 +895,7 @@ template <typename T>
HWY_API Mask1<T> IsNaN(const Vec1<T> v) {
// std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
MakeUnsigned<T> bits;
- memcpy(&bits, &v, sizeof(v));
+ CopySameSize(&v, &bits);
bits += bits;
bits >>= 1; // clear sign bit
// NaN if all exponent bits are set and the mantissa is not zero.
@@ -926,7 +935,7 @@ HWY_API Mask1<double> IsFinite(const Vec1<double> v) {
template <typename T>
HWY_API Vec1<T> Load(Sisd<T> /* tag */, const T* HWY_RESTRICT aligned) {
T t;
- CopyBytes<sizeof(T)>(aligned, &t);
+ CopySameSize(aligned, &t);
return Vec1<T>(t);
}
@@ -952,7 +961,7 @@ HWY_API Vec1<T> LoadDup128(Sisd<T> d, const T* HWY_RESTRICT aligned) {
template <typename T>
HWY_API void Store(const Vec1<T> v, Sisd<T> /* tag */,
T* HWY_RESTRICT aligned) {
- CopyBytes<sizeof(T)>(&v.raw, aligned);
+ CopySameSize(&v.raw, aligned);
}
template <typename T>
@@ -1115,12 +1124,8 @@ HWY_API Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
}
HWY_API Vec1<float> PromoteTo(Sisd<float> /* tag */, const Vec1<float16_t> v) {
-#if HWY_NATIVE_FLOAT16
uint16_t bits16;
- CopyBytes<2>(&v.raw, &bits16);
-#else
- const uint16_t bits16 = v.raw.bits;
-#endif
+ CopySameSize(&v.raw, &bits16);
const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
const uint32_t mantissa = bits16 & 0x3FF;
@@ -1137,7 +1142,7 @@ HWY_API Vec1<float> PromoteTo(Sisd<float> /* tag */, const Vec1<float16_t> v) {
const uint32_t mantissa32 = mantissa << (23 - 10);
const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
float out;
- CopyBytes<4>(&bits32, &out);
+ CopySameSize(&bits32, &out);
return Vec1<float>(out);
}
@@ -1148,7 +1153,7 @@ HWY_API Vec1<float> PromoteTo(Sisd<float> d, const Vec1<bfloat16_t> v) {
HWY_API Vec1<float16_t> DemoteTo(Sisd<float16_t> /* tag */,
const Vec1<float> v) {
uint32_t bits32;
- CopyBytes<4>(&v.raw, &bits32);
+ CopySameSize(&v.raw, &bits32);
const uint32_t sign = bits32 >> 31;
const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
const uint32_t mantissa32 = bits32 & 0x7FFFFF;
@@ -1158,12 +1163,8 @@ HWY_API Vec1<float16_t> DemoteTo(Sisd<float16_t> /* tag */,
// Tiny or zero => zero.
Vec1<float16_t> out;
if (exp < -24) {
-#if HWY_NATIVE_FLOAT16
const uint16_t zero = 0;
- CopyBytes<2>(&zero, &out.raw);
-#else
- out.raw.bits = 0;
-#endif
+ CopySameSize(&zero, &out.raw);
return out;
}
@@ -1186,12 +1187,8 @@ HWY_API Vec1<float16_t> DemoteTo(Sisd<float16_t> /* tag */,
HWY_DASSERT(mantissa16 < 1024);
const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
HWY_DASSERT(bits16 < 0x10000);
-#if HWY_NATIVE_FLOAT16
const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe
- CopyBytes<2>(&narrowed, &out.raw);
-#else
- out.raw.bits = static_cast<uint16_t>(bits16);
-#endif
+ CopySameSize(&narrowed, &out.raw);
return out;
}
@@ -1224,6 +1221,38 @@ HWY_API Vec1<uint8_t> U8FromU32(const Vec1<uint32_t> v) {
return DemoteTo(Sisd<uint8_t>(), v);
}
+// ------------------------------ Truncations
+
+HWY_API Vec1<uint8_t> TruncateTo(Sisd<uint8_t> /* tag */,
+ const Vec1<uint64_t> v) {
+ return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
+}
+
+HWY_API Vec1<uint16_t> TruncateTo(Sisd<uint16_t> /* tag */,
+ const Vec1<uint64_t> v) {
+ return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)};
+}
+
+HWY_API Vec1<uint32_t> TruncateTo(Sisd<uint32_t> /* tag */,
+ const Vec1<uint64_t> v) {
+ return Vec1<uint32_t>{static_cast<uint32_t>(v.raw & 0xFFFFFFFFu)};
+}
+
+HWY_API Vec1<uint8_t> TruncateTo(Sisd<uint8_t> /* tag */,
+ const Vec1<uint32_t> v) {
+ return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
+}
+
+HWY_API Vec1<uint16_t> TruncateTo(Sisd<uint16_t> /* tag */,
+ const Vec1<uint32_t> v) {
+ return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)};
+}
+
+HWY_API Vec1<uint8_t> TruncateTo(Sisd<uint8_t> /* tag */,
+ const Vec1<uint16_t> v) {
+ return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
+}
+
// ================================================== COMBINE
// UpperHalf, ZeroExtendVector, Combine, Concat* are unsupported.
@@ -1356,7 +1385,7 @@ HWY_API Vec1<TI> TableLookupBytes(const Vec1<T> in, const Vec1<TI> indices) {
uint8_t in_bytes[sizeof(T)];
uint8_t idx_bytes[sizeof(T)];
uint8_t out_bytes[sizeof(T)];
- CopyBytes<sizeof(T)>(&in, &in_bytes);
+ CopyBytes<sizeof(T)>(&in, &in_bytes); // copy to bytes
CopyBytes<sizeof(T)>(&indices, &idx_bytes);
for (size_t i = 0; i < sizeof(T); ++i) {
out_bytes[i] = in_bytes[idx_bytes[i]];
@@ -1371,7 +1400,7 @@ HWY_API Vec1<TI> TableLookupBytesOr0(const Vec1<T> in, const Vec1<TI> indices) {
uint8_t in_bytes[sizeof(T)];
uint8_t idx_bytes[sizeof(T)];
uint8_t out_bytes[sizeof(T)];
- CopyBytes<sizeof(T)>(&in, &in_bytes);
+ CopyBytes<sizeof(T)>(&in, &in_bytes); // copy to bytes
CopyBytes<sizeof(T)>(&indices, &idx_bytes);
for (size_t i = 0; i < sizeof(T); ++i) {
out_bytes[i] = idx_bytes[i] & 0x80 ? 0 : in_bytes[idx_bytes[i]];
@@ -1445,6 +1474,11 @@ HWY_API intptr_t FindFirstTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
return mask.bits == 0 ? -1 : 0;
}
+template <typename T>
+HWY_API size_t FindKnownFirstTrue(Sisd<T> /* tag */, const Mask1<T> /* m */) {
+ return 0; // There is only one lane and we know it is true.
+}
+
// ------------------------------ Compress, CompressBits
template <typename T>
@@ -1454,7 +1488,13 @@ struct CompressIsPartition {
template <typename T>
HWY_API Vec1<T> Compress(Vec1<T> v, const Mask1<T> /* mask */) {
- // Upper lanes are undefined, so result is the same independent of mask.
+ // A single lane is already partitioned by definition.
+ return v;
+}
+
+template <typename T>
+HWY_API Vec1<T> CompressNot(Vec1<T> v, const Mask1<T> /* mask */) {
+ // A single lane is already partitioned by definition.
return v;
}
@@ -1501,6 +1541,14 @@ HWY_API Vec1<float> ReorderWidenMulAccumulate(Sisd<float> /* tag */,
Vec1<float>(F32FromBF16(b.raw)), sum0);
}
+HWY_API Vec1<int32_t> ReorderWidenMulAccumulate(Sisd<int32_t> /* tag */,
+ Vec1<int16_t> a,
+ Vec1<int16_t> b,
+ const Vec1<int32_t> sum0,
+ Vec1<int32_t>& /* sum1 */) {
+ return Vec1<int32_t>(a.raw * b.raw + sum0.raw);
+}
+
// ================================================== REDUCTIONS
// Sum of all lanes, i.e. the only one.
@@ -1517,62 +1565,6 @@ HWY_API Vec1<T> MaxOfLanes(Sisd<T> /* tag */, const Vec1<T> v) {
return v;
}
-// ================================================== Operator wrapper
-
-template <class V>
-HWY_API V Add(V a, V b) {
- return a + b;
-}
-template <class V>
-HWY_API V Sub(V a, V b) {
- return a - b;
-}
-
-template <class V>
-HWY_API V Mul(V a, V b) {
- return a * b;
-}
-template <class V>
-HWY_API V Div(V a, V b) {
- return a / b;
-}
-
-template <class V>
-V Shl(V a, V b) {
- return a << b;
-}
-template <class V>
-V Shr(V a, V b) {
- return a >> b;
-}
-
-template <class V>
-HWY_API auto Eq(V a, V b) -> decltype(a == b) {
- return a == b;
-}
-template <class V>
-HWY_API auto Ne(V a, V b) -> decltype(a == b) {
- return a != b;
-}
-template <class V>
-HWY_API auto Lt(V a, V b) -> decltype(a == b) {
- return a < b;
-}
-
-template <class V>
-HWY_API auto Gt(V a, V b) -> decltype(a == b) {
- return a > b;
-}
-template <class V>
-HWY_API auto Ge(V a, V b) -> decltype(a == b) {
- return a >= b;
-}
-
-template <class V>
-HWY_API auto Le(V a, V b) -> decltype(a == b) {
- return a <= b;
-}
-
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
diff --git a/media/highway/src/hwy/ops/set_macros-inl.h b/media/highway/src/hwy/ops/set_macros-inl.h
index 0b774421ac..c1189604bc 100644
--- a/media/highway/src/hwy/ops/set_macros-inl.h
+++ b/media/highway/src/hwy/ops/set_macros-inl.h
@@ -227,17 +227,25 @@
#define HWY_NAMESPACE N_NEON
-// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
+// Can use pragmas instead of -march compiler flag
+#if HWY_HAVE_RUNTIME_DISPATCH
+#if HWY_ARCH_ARM_V7
+#define HWY_TARGET_STR "+neon-vfpv4"
+#else
+#define HWY_TARGET_STR "+crypto"
+#endif // HWY_ARCH_ARM_V7
+#else
+// HWY_TARGET_STR remains undefined
+#endif
//-----------------------------------------------------------------------------
// SVE[2]
-#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE
+#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE || \
+ HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
// SVE only requires lane alignment, not natural alignment of the entire vector.
#define HWY_ALIGN alignas(8)
-#define HWY_MAX_BYTES 256
-
// Value ensures MaxLanes() is the tightest possible upper bound to reduce
// overallocation.
#define HWY_LANES(T) ((HWY_MAX_BYTES) / sizeof(T))
@@ -253,11 +261,28 @@
#if HWY_TARGET == HWY_SVE2
#define HWY_NAMESPACE N_SVE2
+#define HWY_MAX_BYTES 256
+#elif HWY_TARGET == HWY_SVE_256
+#define HWY_NAMESPACE N_SVE_256
+#define HWY_MAX_BYTES 32
+#elif HWY_TARGET == HWY_SVE2_128
+#define HWY_NAMESPACE N_SVE2_128
+#define HWY_MAX_BYTES 16
#else
#define HWY_NAMESPACE N_SVE
+#define HWY_MAX_BYTES 256
#endif
+// Can use pragmas instead of -march compiler flag
+#if HWY_HAVE_RUNTIME_DISPATCH
+#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
+#define HWY_TARGET_STR "+sve2-aes"
+#else
+#define HWY_TARGET_STR "+sve"
+#endif
+#else
// HWY_TARGET_STR remains undefined
+#endif
//-----------------------------------------------------------------------------
// WASM
@@ -281,8 +306,8 @@
#define HWY_TARGET_STR "simd128"
//-----------------------------------------------------------------------------
-// WASM2
-#elif HWY_TARGET == HWY_WASM2
+// WASM_EMU256
+#elif HWY_TARGET == HWY_WASM_EMU256
#define HWY_ALIGN alignas(32)
#define HWY_MAX_BYTES 32
@@ -297,7 +322,7 @@
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
-#define HWY_NAMESPACE N_WASM2
+#define HWY_NAMESPACE N_WASM_EMU256
#define HWY_TARGET_STR "simd128"
@@ -324,7 +349,7 @@
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
-#if defined(__riscv_zfh)
+#if defined(__riscv_zvfh)
#define HWY_HAVE_FLOAT16 1
#else
#define HWY_HAVE_FLOAT16 0
diff --git a/media/highway/src/hwy/ops/shared-inl.h b/media/highway/src/hwy/ops/shared-inl.h
index 1486d1f987..29c4303880 100644
--- a/media/highway/src/hwy/ops/shared-inl.h
+++ b/media/highway/src/hwy/ops/shared-inl.h
@@ -99,21 +99,21 @@ struct Simd {
namespace detail {
-#if HWY_HAVE_SCALABLE
-
template <typename T, size_t N, int kPow2>
constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
return N == HWY_LANES(T) && kPow2 == 0;
}
-#endif
-
// Returns the number of lanes (possibly zero) after applying a shift:
// - 0: no change;
// - [1,3]: a group of 2,4,8 [fractional] vectors;
// - [-3,-1]: a fraction of a vector from 1/8 to 1/2.
constexpr size_t ScaleByPower(size_t N, int pow2) {
+#if HWY_TARGET == HWY_RVV
return pow2 >= 0 ? (N << pow2) : (N >> (-pow2));
+#else
+ return pow2 >= 0 ? N : (N >> (-pow2));
+#endif
}
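
As a quick sanity check of the non-RVV branch above (a sketch, not part of the patch): positive pow2 describes vector groups and no longer scales the lane count, while negative pow2 still selects a fraction of a vector.

#include <cstddef>

// Illustrative standalone mirror of the non-RVV branch of ScaleByPower.
constexpr size_t ScaleByPowerNonRvv(size_t N, int pow2) {
  return pow2 >= 0 ? N : (N >> (-pow2));
}
static_assert(ScaleByPowerNonRvv(8, 2) == 8, "groups do not scale N");
static_assert(ScaleByPowerNonRvv(8, -2) == 2, "quarter of a vector");
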
// Struct wrappers enable validation of arguments via static_assert.
@@ -241,17 +241,13 @@ using Full128 = Simd<T, 16 / sizeof(T), 0>;
#define HWY_IF_GE128_D(D) \
hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) >= 16>* = nullptr
-// Same, but with a vector argument.
+// Same, but with a vector argument. ops/*-inl.h define their own TFromV.
#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
#define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)
#define HWY_IF_NOT_LANE_SIZE_V(V, bytes) HWY_IF_NOT_LANE_SIZE(TFromV<V>, bytes)
-// For implementing functions for a specific type.
-// IsSame<...>() in template arguments is broken on MSVC2015.
-#define HWY_IF_LANES_ARE(T, V) EnableIf<IsSameT<T, TFromV<V>>::value>* = nullptr
-
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D /* d */) {
return D::kPrivatePow2;
@@ -301,8 +297,7 @@ HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N, kPow2>) {
// We therefore pass by const& only on GCC and (Windows or ARM64). This alias
// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
// and possibly also other functions that are not inlined.
-#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG && \
- ((defined(_WIN32) || defined(_WIN64)) || HWY_ARCH_ARM_A64)
+#if HWY_COMPILER_GCC_ACTUAL && (HWY_OS_WIN || HWY_ARCH_ARM_A64)
template <class V>
using VecArg = const V&;
#else
diff --git a/media/highway/src/hwy/ops/wasm_128-inl.h b/media/highway/src/hwy/ops/wasm_128-inl.h
index c266c0cc5e..3831258fc5 100644
--- a/media/highway/src/hwy/ops/wasm_128-inl.h
+++ b/media/highway/src/hwy/ops/wasm_128-inl.h
@@ -60,7 +60,7 @@ struct Raw128<float> {
using type = __f32x4;
};
-} // namespace detail
+} // namespace detail
template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
@@ -1412,6 +1412,12 @@ HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
+template <typename T, size_t N>
+HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
+ const Simd<T, N, 0> d;
+ return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
+}
+
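
ExclusiveNeither above relies on the identity AndNot(x, y) == (~x & y), so AndNot(a, Not(b)) selects lanes where neither mask is set. A per-lane scalar sketch (not part of the patch):

// Illustrative only: one boolean lane of ExclusiveNeither.
static bool ExclusiveNeitherScalar(bool a, bool b) { return !a && !b; }
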
// ------------------------------ Shl (BroadcastSignBit, IfThenElse)
// The x86 multiply-by-Pow2() trick will not work because WASM saturates
@@ -3288,6 +3294,31 @@ HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
+// Specializations for partial vectors because i16x8_narrow_i32x4 sets lanes
+// above 2*N.
+HWY_API Vec128<int16_t, 2> ReorderDemote2To(Simd<int16_t, 2, 0> dn,
+ Vec128<int32_t, 1> a,
+ Vec128<int32_t, 1> b) {
+ const Half<decltype(dn)> dnh;
+ // Pretend the result has twice as many lanes so we can InterleaveLower.
+ const Vec128<int16_t, 2> an{DemoteTo(dnh, a).raw};
+ const Vec128<int16_t, 2> bn{DemoteTo(dnh, b).raw};
+ return InterleaveLower(an, bn);
+}
+HWY_API Vec128<int16_t, 4> ReorderDemote2To(Simd<int16_t, 4, 0> dn,
+ Vec128<int32_t, 2> a,
+ Vec128<int32_t, 2> b) {
+ const Half<decltype(dn)> dnh;
+ // Pretend the result has twice as many lanes so we can InterleaveLower.
+ const Vec128<int16_t, 4> an{DemoteTo(dnh, a).raw};
+ const Vec128<int16_t, 4> bn{DemoteTo(dnh, b).raw};
+ return InterleaveLower(an, bn);
+}
+HWY_API Vec128<int16_t> ReorderDemote2To(Full128<int16_t> /*d16*/,
+ Vec128<int32_t> a, Vec128<int32_t> b) {
+ return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(a.raw, b.raw)};
+}
+
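
For the partial-vector specializations above, a scalar sketch (not part of the patch, helper names illustrative) of one output pair: each input is saturated to int16 as i16x8_narrow_i32x4 does, then the two results are interleaved.

#include <cstdint>

// Illustrative only: saturate two int32 lanes and interleave them, mirroring
// DemoteTo + InterleaveLower in the specializations above.
static void ReorderDemote2Scalar(int32_t a, int32_t b, int16_t out[2]) {
  const auto sat = [](int32_t x) {
    if (x > 32767) return int16_t{32767};
    if (x < -32768) return static_cast<int16_t>(-32768);
    return static_cast<int16_t>(x);
  };
  out[0] = sat(a);
  out[1] = sat(b);
}
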
// For already range-limited input [0, 255].
template <size_t N>
HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
@@ -3296,6 +3327,70 @@ HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
}
+// ------------------------------ Truncations
+
+template <typename From, typename To, HWY_IF_UNSIGNED(From),
+ HWY_IF_UNSIGNED(To),
+ hwy::EnableIf<(sizeof(To) < sizeof(From))>* = nullptr>
+HWY_API Vec128<To, 1> TruncateTo(Simd<To, 1, 0> /* tag */,
+ const Vec128<From, 1> v) {
+ const Repartition<To, DFromV<decltype(v)>> d;
+ const auto v1 = BitCast(d, v);
+ return Vec128<To, 1>{v1.raw};
+}
+
+HWY_API Vec128<uint8_t, 2> TruncateTo(Simd<uint8_t, 2, 0> /* tag */,
+ const Vec128<uint64_t> v) {
+ const Full128<uint8_t> d;
+ const auto v1 = BitCast(d, v);
+ const auto v2 = ConcatEven(d, v1, v1);
+ const auto v4 = ConcatEven(d, v2, v2);
+ return LowerHalf(LowerHalf(LowerHalf(ConcatEven(d, v4, v4))));
+}
+
+HWY_API Vec128<uint16_t, 2> TruncateTo(Simd<uint16_t, 2, 0> /* tag */,
+ const Vec128<uint64_t> v) {
+ const Full128<uint16_t> d;
+ const auto v1 = BitCast(d, v);
+ const auto v2 = ConcatEven(d, v1, v1);
+ return LowerHalf(LowerHalf(ConcatEven(d, v2, v2)));
+}
+
+HWY_API Vec128<uint32_t, 2> TruncateTo(Simd<uint32_t, 2, 0> /* tag */,
+ const Vec128<uint64_t> v) {
+ const Full128<uint32_t> d;
+ const auto v1 = BitCast(d, v);
+ return LowerHalf(ConcatEven(d, v1, v1));
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
+ const Vec128<uint32_t, N> v) {
+ const Full128<uint8_t> d;
+ const auto v1 = Vec128<uint8_t>{v.raw};
+ const auto v2 = ConcatEven(d, v1, v1);
+ const auto v3 = ConcatEven(d, v2, v2);
+ return Vec128<uint8_t, N>{v3.raw};
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint16_t, N> TruncateTo(Simd<uint16_t, N, 0> /* tag */,
+ const Vec128<uint32_t, N> v) {
+ const Full128<uint16_t> d;
+ const auto v1 = Vec128<uint16_t>{v.raw};
+ const auto v2 = ConcatEven(d, v1, v1);
+ return Vec128<uint16_t, N>{v2.raw};
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
+ const Vec128<uint16_t, N> v) {
+ const Full128<uint8_t> d;
+ const auto v1 = Vec128<uint8_t>{v.raw};
+ const auto v2 = ConcatEven(d, v1, v1);
+ return Vec128<uint8_t, N>{v2.raw};
+}
+
// ------------------------------ Convert i32 <=> f32 (Round)
template <size_t N>
@@ -3303,6 +3398,11 @@ HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<float, N>{wasm_f32x4_convert_i32x4(v.raw)};
}
+template <size_t N>
+HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
+ const Vec128<uint32_t, N> v) {
+ return Vec128<float, N>{wasm_f32x4_convert_u32x4(v.raw)};
+}
// Truncates (rounds toward zero).
template <size_t N>
HWY_API Vec128<int32_t, N> ConvertTo(Simd<int32_t, N, 0> /* tag */,
@@ -3615,6 +3715,13 @@ HWY_API bool AllTrue(const Simd<T, N, 0> /* d */, const Mask128<T, N> m) {
}
template <typename T, size_t N>
+HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> /* tag */,
+ const Mask128<T, N> mask) {
+ const uint64_t bits = detail::BitsFromMask(mask);
+ return Num0BitsBelowLS1Bit_Nonzero64(bits);
+}
+
+template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
const uint64_t bits = detail::BitsFromMask(mask);
@@ -3625,8 +3732,8 @@ HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
namespace detail {
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Idx16x8FromBits(const uint64_t mask_bits) {
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
HWY_DASSERT(mask_bits < 256);
const Simd<T, N, 0> d;
const Rebind<uint8_t, decltype(d)> d8;
@@ -3638,6 +3745,7 @@ HWY_INLINE Vec128<T, N> Idx16x8FromBits(const uint64_t mask_bits) {
// with the doubling baked into the table. Unpacking nibbles is likely more
// costly than the higher cache footprint from storing bytes.
alignas(16) constexpr uint8_t table[256 * 8] = {
+ // PrintCompress16x8Tables
0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, //
@@ -3772,12 +3880,161 @@ HWY_INLINE Vec128<T, N> Idx16x8FromBits(const uint64_t mask_bits) {
return BitCast(d, pairs + Set(du, 0x0100));
}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Idx32x4FromBits(const uint64_t mask_bits) {
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
+ HWY_DASSERT(mask_bits < 256);
+ const Simd<T, N, 0> d;
+ const Rebind<uint8_t, decltype(d)> d8;
+ const Simd<uint16_t, N, 0> du;
+
+ // We need byte indices for TableLookupBytes (one vector's worth for each of
+ // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
+ // can instead store lane indices and convert to byte indices (2*lane + 0..1),
+ // with the doubling baked into the table. Unpacking nibbles is likely more
+ // costly than the higher cache footprint from storing bytes.
+ alignas(16) constexpr uint8_t table[256 * 8] = {
+ // PrintCompressNot16x8Tables
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, //
+ 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, //
+ 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, //
+ 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, //
+ 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, //
+ 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, //
+ 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, //
+ 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, //
+ 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, //
+ 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, //
+ 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, //
+ 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, //
+ 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, //
+ 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, //
+ 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, //
+ 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, //
+ 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, //
+ 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, //
+ 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, //
+ 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, //
+ 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, //
+ 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, //
+ 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, //
+ 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, //
+ 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, //
+ 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, //
+ 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, //
+ 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, //
+ 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, //
+ 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, //
+ 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, //
+ 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, //
+ 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, //
+ 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, //
+ 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, //
+ 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, //
+ 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, //
+ 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, //
+ 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, //
+ 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, //
+ 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, //
+ 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, //
+ 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, //
+ 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, //
+ 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, //
+ 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, //
+ 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, //
+ 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, //
+ 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, //
+ 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, //
+ 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, //
+ 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, //
+ 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, //
+ 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, //
+ 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, //
+ 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, //
+ 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, //
+ 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, //
+ 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, //
+ 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, //
+ 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, //
+ 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, //
+ 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, //
+ 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, //
+ 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, //
+ 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, //
+ 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, //
+ 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, //
+ 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, //
+ 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, //
+ 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, //
+ 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, //
+ 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, //
+ 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, //
+ 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, //
+ 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, //
+ 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, //
+ 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, //
+ 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, //
+ 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, //
+ 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, //
+ 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, //
+ 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, //
+ 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, //
+ 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, //
+ 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, //
+ 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, //
+ 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, //
+ 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, //
+ 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, //
+ 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, //
+ 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, //
+ 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, //
+ 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, //
+ 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, //
+ 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, //
+ 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, //
+ 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, //
+ 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, //
+ 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, //
+ 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, //
+ 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, //
+ 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, //
+ 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, //
+ 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, //
+ 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, //
+ 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, //
+ 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, //
+ 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, //
+ 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, //
+ 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, //
+ 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, //
+ 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, //
+ 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, //
+ 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, //
+ 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, //
+ 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, //
+ 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, //
+ 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, //
+ 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, //
+ 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14};
+
+ const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
+ const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
+ return BitCast(d, pairs + Set(du, 0x0100));
+}
+
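
The table above stores doubled lane indices (2*lane); ZipLower with itself plus 0x0100 then yields the byte pair (2*lane, 2*lane + 1) expected by TableLookupBytes. A scalar sketch of that expansion (not part of the patch):

#include <cstdint>

// Illustrative only: expand 8 doubled lane indices into the 16 byte indices
// produced by ZipLower(byte_idx, byte_idx) + Set(du, 0x0100) above.
static void DoubledLaneToByteIndices(const uint8_t doubled[8],
                                     uint8_t bytes[16]) {
  for (int i = 0; i < 8; ++i) {
    bytes[2 * i + 0] = doubled[i];                            // low byte
    bytes[2 * i + 1] = static_cast<uint8_t>(doubled[i] + 1);  // high byte
  }
}
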
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
HWY_DASSERT(mask_bits < 16);
// There are only 4 lanes, so we can afford to load the index vector directly.
- alignas(16) constexpr uint8_t packed_array[16 * 16] = {
+ alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
+ // PrintCompress32x4Tables
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
@@ -3796,15 +4053,43 @@ HWY_INLINE Vec128<T, N> Idx32x4FromBits(const uint64_t mask_bits) {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
const Simd<T, N, 0> d;
const Repartition<uint8_t, decltype(d)> d8;
- return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
+ return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Idx64x2FromBits(const uint64_t mask_bits) {
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
+ HWY_DASSERT(mask_bits < 16);
+
+ // There are only 4 lanes, so we can afford to load the index vector directly.
+ alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
+ // PrintCompressNot32x4Tables
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
+ 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+ 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
+ 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
+ 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
+ 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
+ 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
+ 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15};
+ const Simd<T, N, 0> d;
+ const Repartition<uint8_t, decltype(d)> d8;
+ return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
+HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
HWY_DASSERT(mask_bits < 4);
// There are only 2 lanes, so we can afford to load the index vector directly.
- alignas(16) constexpr uint8_t packed_array[4 * 16] = {
+ alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
+ // PrintCompress64x2Tables
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
@@ -3812,34 +4097,40 @@ HWY_INLINE Vec128<T, N> Idx64x2FromBits(const uint64_t mask_bits) {
const Simd<T, N, 0> d;
const Repartition<uint8_t, decltype(d)> d8;
- return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
+ return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
-// Helper functions called by both Compress and CompressStore - avoids a
-// redundant BitsFromMask in the latter.
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
+HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
+ HWY_DASSERT(mask_bits < 4);
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<2> /*tag*/, Vec128<T, N> v,
- const uint64_t mask_bits) {
- const auto idx = detail::Idx16x8FromBits<T, N>(mask_bits);
- const DFromV<decltype(v)> d;
- const RebindToSigned<decltype(d)> di;
- return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
+ // There are only 2 lanes, so we can afford to load the index vector directly.
+ alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
+ // PrintCompressNot64x2Tables
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+ const Simd<T, N, 0> d;
+ const Repartition<uint8_t, decltype(d)> d8;
+ return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
+// Helper functions called by both Compress and CompressStore - avoids a
+// redundant BitsFromMask in the latter.
+
template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<4> /*tag*/, Vec128<T, N> v,
- const uint64_t mask_bits) {
- const auto idx = detail::Idx32x4FromBits<T, N>(mask_bits);
+HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) {
+ const auto idx = detail::IdxFromBits<T, N>(mask_bits);
const DFromV<decltype(v)> d;
const RebindToSigned<decltype(d)> di;
return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
}
template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Compress(hwy::SizeTag<8> /*tag*/, Vec128<T, N> v,
- const uint64_t mask_bits) {
- const auto idx = detail::Idx64x2FromBits<T, N>(mask_bits);
+HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, const uint64_t mask_bits) {
+ const auto idx = detail::IdxFromNotBits<T, N>(mask_bits);
const DFromV<decltype(v)> d;
const RebindToSigned<decltype(d)> di;
return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
@@ -3852,10 +4143,62 @@ struct CompressIsPartition {
enum { value = 1 };
};
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
- const uint64_t mask_bits = detail::BitsFromMask(mask);
- return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
+// Single lane: no-op
+template <typename T>
+HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
+ return v;
+}
+
+// Two lanes: conditional swap
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
+ // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
+ const Full128<T> d;
+ const Vec128<T> m = VecFromMask(d, mask);
+ const Vec128<T> maskL = DupEven(m);
+ const Vec128<T> maskH = DupOdd(m);
+ const Vec128<T> swap = AndNot(maskL, maskH);
+ return IfVecThenElse(swap, Shuffle01(v), v);
+}
+
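
The two-lane case above only has to reorder when the upper lane is selected and the lower is not; a scalar sketch (not part of the patch):

#include <cstdint>

// Illustrative only: Compress on two 64-bit lanes with mask bits m0 (lower)
// and m1 (upper). Only the m1 && !m0 combination requires a swap.
static void Compress2Scalar(uint64_t v[2], bool m0, bool m1) {
  if (m1 && !m0) {
    const uint64_t t = v[0];
    v[0] = v[1];
    v[1] = t;
  }
}
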
+// General case
+template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
+HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
+ return detail::Compress(v, detail::BitsFromMask(mask));
+}
+
+// Single lane: no-op
+template <typename T>
+HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
+ return v;
+}
+
+// Two lanes: conditional swap
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
+ // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
+ const Full128<T> d;
+ const Vec128<T> m = VecFromMask(d, mask);
+ const Vec128<T> maskL = DupEven(m);
+ const Vec128<T> maskH = DupOdd(m);
+ const Vec128<T> swap = AndNot(maskH, maskL);
+ return IfVecThenElse(swap, Shuffle01(v), v);
+}
+
+// General case
+template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
+HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
+ // For partial vectors, we cannot pull the Not() into the table because
+ // BitsFromMask clears the upper bits.
+ if (N < 16 / sizeof(T)) {
+ return detail::Compress(v, detail::BitsFromMask(Not(mask)));
+ }
+ return detail::CompressNot(v, detail::BitsFromMask(mask));
+}
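
The partial-vector branch above can also be seen at the bit level: only the N valid mask bits may be inverted, otherwise lanes >= N would appear selected. A sketch of that constraint (not part of the patch):

#include <cstddef>
#include <cstdint>

// Illustrative only: equivalent of BitsFromMask(Not(mask)) for N valid lanes
// (N < 64, which holds for 128-bit vectors).
static uint64_t InvertPartialMaskBits(uint64_t mask_bits, size_t N) {
  return ~mask_bits & ((uint64_t{1} << N) - 1);
}
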
+// ------------------------------ CompressBlocksNot
+HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
+ Mask128<uint64_t> /* m */) {
+ return v;
}
// ------------------------------ CompressBits
@@ -3870,7 +4213,7 @@ HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
mask_bits &= (1ull << N) - 1;
}
- return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
+ return detail::Compress(v, mask_bits);
}
// ------------------------------ CompressStore
@@ -3878,7 +4221,7 @@ template <typename T, size_t N>
HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
const uint64_t mask_bits = detail::BitsFromMask(mask);
- const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
+ const auto c = detail::Compress(v, mask_bits);
StoreU(c, d, unaligned);
return PopCount(mask_bits);
}
@@ -3892,8 +4235,7 @@ HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
using TU = TFromD<decltype(du)>;
const uint64_t mask_bits = detail::BitsFromMask(m);
const size_t count = PopCount(mask_bits);
- const Vec128<TU, N> compressed =
- detail::Compress(hwy::SizeTag<sizeof(T)>(), BitCast(du, v), mask_bits);
+ const Vec128<TU, N> compressed = detail::Compress(BitCast(du, v), mask_bits);
const Mask128<T, N> store_mask = RebindMask(d, FirstN(du, count));
BlendedStore(BitCast(d, compressed), store_mask, d, unaligned);
return count;
@@ -3912,7 +4254,7 @@ HWY_API size_t CompressBitsStore(Vec128<T, N> v,
mask_bits &= (1ull << N) - 1;
}
- const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
+ const auto c = detail::Compress(v, mask_bits);
StoreU(c, d, unaligned);
return PopCount(mask_bits);
}
@@ -3961,6 +4303,16 @@ HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
+// Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is
+// safe.
+template <size_t N>
+HWY_API Vec128<int32_t, N> ReorderWidenMulAccumulate(
+ Simd<int32_t, N, 0> /*d32*/, Vec128<int16_t, 2 * N> a,
+ Vec128<int16_t, 2 * N> b, const Vec128<int32_t, N> sum0,
+ Vec128<int32_t, N>& /*sum1*/) {
+ return sum0 + Vec128<int32_t, N>{wasm_i32x4_dot_i16x8(a.raw, b.raw)};
+}
+
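
The i32x4_dot_i16x8 form above accumulates two adjacent int16 products per int32 lane; a scalar reference for one output lane (a sketch, not part of the patch):

#include <cstdint>

// Illustrative only: one int32 lane of the pairwise dot product added above.
static int32_t DotPairAccumulate(int16_t a0, int16_t a1, int16_t b0,
                                 int16_t b1, int32_t sum) {
  return sum + int32_t{a0} * b0 + int32_t{a1} * b1;
}
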
// ------------------------------ Reductions
namespace detail {
@@ -4049,26 +4401,76 @@ HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
return Max(v10, v01);
}
-// u16/i16
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
- const DFromV<decltype(v)> d;
- const Repartition<int32_t, decltype(d)> d32;
+template <size_t N, HWY_IF_GE32(uint16_t, N)>
+HWY_API Vec128<uint16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<uint16_t, N> v) {
+ const Simd<uint16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto min = MinOfLanes(d32, Min(even, odd));
+ const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
// Also broadcast into odd lanes.
- return BitCast(d, Or(min, ShiftLeft<16>(min)));
+ return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
+}
+template <size_t N, HWY_IF_GE32(int16_t, N)>
+HWY_API Vec128<int16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<int16_t, N> v) {
+ const Simd<int16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
+ // Sign-extend
+ const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
- const DFromV<decltype(v)> d;
- const Repartition<int32_t, decltype(d)> d32;
+
+template <size_t N, HWY_IF_GE32(uint16_t, N)>
+HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<uint16_t, N> v) {
+ const Simd<uint16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto min = MaxOfLanes(d32, Max(even, odd));
+ const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+}
+template <size_t N, HWY_IF_GE32(int16_t, N)>
+HWY_API Vec128<int16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<int16_t, N> v) {
+ const Simd<int16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
+ // Sign-extend
+ const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
// Also broadcast into odd lanes.
- return BitCast(d, Or(min, ShiftLeft<16>(min)));
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+}
+
+template <size_t N, HWY_IF_GE32(uint16_t, N)>
+HWY_API Vec128<uint16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<uint16_t, N> v) {
+ const Simd<uint16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
+ const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+}
+template <size_t N, HWY_IF_GE32(int16_t, N)>
+HWY_API Vec128<int16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<int16_t, N> v) {
+ const Simd<int16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
+ // Sign-extend
+ const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
} // namespace detail
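
The 16-bit reductions above split each 32-bit word into its even (low) and odd (high) 16-bit lanes, with explicit sign extension for the signed variants. A scalar sketch of that split (not part of the patch):

#include <cstdint>

// Illustrative only: recover the two int16 lanes packed into one uint32 word,
// as done above via ShiftLeft/ShiftRight on the 32-bit reinterpretation.
static void SplitI16Lanes(uint32_t word, int32_t* even, int32_t* odd) {
  *even = static_cast<int16_t>(word & 0xFFFFu);  // low lane, sign-extended
  *odd = static_cast<int16_t>(word >> 16);       // high lane, sign-extended
}
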
@@ -4089,19 +4491,10 @@ HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
// ------------------------------ Lt128
-namespace detail {
-
-template <size_t kLanes, typename T, size_t N>
-Mask128<T, N> ShiftMaskLeft(Mask128<T, N> m) {
- return MaskFromVec(ShiftLeftLanes<kLanes>(VecFromMask(Simd<T, N, 0>(), m)));
-}
-
-} // namespace detail
-
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
Vec128<T, N> b) {
- static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
+ static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
// Truth table of Eq and Lt for Hi and Lo u64.
// (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
// =H =L cH cL | out = cH | (=H & cL)
@@ -4116,83 +4509,78 @@ HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
// 1 0 0 1 | 1
// 1 1 0 0 | 0
const Mask128<T, N> eqHL = Eq(a, b);
- const Mask128<T, N> ltHL = Lt(a, b);
+ const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
// We need to bring cL to the upper lane/bit corresponding to cH. Comparing
// the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
- // comparison result leftwards requires only 4.
- const Mask128<T, N> ltLx = detail::ShiftMaskLeft<1>(ltHL);
- const Mask128<T, N> outHx = Or(ltHL, And(eqHL, ltLx));
- const Vec128<T, N> vecHx = VecFromMask(d, outHx);
- return MaskFromVec(InterleaveUpper(d, vecHx, vecHx));
+ // comparison result leftwards requires only 4. IfThenElse compiles to the
+ // same code as OrAnd().
+ const Vec128<T, N> ltLx = DupEven(ltHL);
+ const Vec128<T, N> outHx = IfThenElse(eqHL, ltLx, ltHL);
+ return MaskFromVec(DupOdd(outHx));
}
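
The truth table above reduces to out = cH | (=H & cL); as a scalar reference over two u64 halves (a sketch, not part of the patch):

#include <cstdint>

// Illustrative only: 128-bit unsigned less-than on (hi, lo) pairs, the
// per-block predicate computed by Lt128 above.
static bool Lt128Scalar(uint64_t a_hi, uint64_t a_lo, uint64_t b_hi,
                        uint64_t b_lo) {
  return (a_hi < b_hi) || (a_hi == b_hi && a_lo < b_lo);
}
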
-// ------------------------------ Min128, Max128 (Lt128)
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_INLINE Mask128<T, N> Lt128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
+ Vec128<T, N> b) {
+ const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
+ return MaskFromVec(InterleaveUpper(d, ltHL, ltHL));
+}
-// Without a native OddEven, it seems infeasible to go faster than Lt128.
-template <class D>
-HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) {
- return IfThenElse(Lt128(d, a, b), a, b);
+// ------------------------------ Eq128
+
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_INLINE Mask128<T, N> Eq128(Simd<T, N, 0> d, Vec128<T, N> a,
+ Vec128<T, N> b) {
+ static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
+ const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
+ return MaskFromVec(And(Reverse2(d, eqHL), eqHL));
}
-template <class D>
-HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
- return IfThenElse(Lt128(d, a, b), b, a);
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_INLINE Mask128<T, N> Eq128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
+ Vec128<T, N> b) {
+ const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
+ return MaskFromVec(InterleaveUpper(d, eqHL, eqHL));
}
-// ================================================== Operator wrapper
+// ------------------------------ Ne128
-template <class V>
-HWY_API V Add(V a, V b) {
- return a + b;
-}
-template <class V>
-HWY_API V Sub(V a, V b) {
- return a - b;
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_INLINE Mask128<T, N> Ne128(Simd<T, N, 0> d, Vec128<T, N> a,
+ Vec128<T, N> b) {
+ static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
+ const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
+ return MaskFromVec(Or(Reverse2(d, neHL), neHL));
}
-template <class V>
-HWY_API V Mul(V a, V b) {
- return a * b;
-}
-template <class V>
-HWY_API V Div(V a, V b) {
- return a / b;
+template <typename T, size_t N, HWY_IF_LE128(T, N)>
+HWY_INLINE Mask128<T, N> Ne128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
+ Vec128<T, N> b) {
+ const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
+ return MaskFromVec(InterleaveUpper(d, neHL, neHL));
}
-template <class V>
-V Shl(V a, V b) {
- return a << b;
-}
-template <class V>
-V Shr(V a, V b) {
- return a >> b;
-}
+// ------------------------------ Min128, Max128 (Lt128)
-template <class V>
-HWY_API auto Eq(V a, V b) -> decltype(a == b) {
- return a == b;
-}
-template <class V>
-HWY_API auto Ne(V a, V b) -> decltype(a == b) {
- return a != b;
-}
-template <class V>
-HWY_API auto Lt(V a, V b) -> decltype(a == b) {
- return a < b;
+// Without a native OddEven, it seems infeasible to go faster than Lt128.
+template <class D>
+HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) {
+ return IfThenElse(Lt128(d, a, b), a, b);
}
-template <class V>
-HWY_API auto Gt(V a, V b) -> decltype(a == b) {
- return a > b;
+template <class D>
+HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
+ return IfThenElse(Lt128(d, b, a), a, b);
}
-template <class V>
-HWY_API auto Ge(V a, V b) -> decltype(a == b) {
- return a >= b;
+
+template <class D>
+HWY_INLINE VFromD<D> Min128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
+ return IfThenElse(Lt128Upper(d, a, b), a, b);
}
-template <class V>
-HWY_API auto Le(V a, V b) -> decltype(a == b) {
- return a <= b;
+template <class D>
+HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
+ return IfThenElse(Lt128Upper(d, b, a), a, b);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
diff --git a/media/highway/src/hwy/ops/wasm_256-inl.h b/media/highway/src/hwy/ops/wasm_256-inl.h
index cae6c5735f..42f4fb2f44 100644
--- a/media/highway/src/hwy/ops/wasm_256-inl.h
+++ b/media/highway/src/hwy/ops/wasm_256-inl.h
@@ -592,7 +592,7 @@ HWY_API Vec256<int16_t> MulHigh(const Vec256<int16_t> a,
}
HWY_API Vec256<int16_t> MulFixedPoint15(Vec256<int16_t>, Vec256<int16_t>) {
- HWY_ASSERT(0);
+ HWY_ASSERT(0); // Not implemented
}
// Multiplies even lanes (0, 2 ..) and returns the double-width result.
@@ -1043,7 +1043,7 @@ HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
template <typename T>
 HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
- HWY_ASSERT(0);
+ HWY_ASSERT(0); // Not implemented
}
template <typename T, HWY_IF_FLOAT(T)>
@@ -1084,6 +1084,12 @@ HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
+template <typename T>
+HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
+ const Full256<T> d;
+ return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
+}
+
// ------------------------------ Shl (BroadcastSignBit, IfThenElse)
// The x86 multiply-by-Pow2() trick will not work because WASM saturates
@@ -1333,13 +1339,13 @@ HWY_API Vec256<T> GatherIndex(const Full256<T> d, const T* HWY_RESTRICT base,
// ------------------------------ ExtractLane
template <typename T, size_t N>
HWY_API T ExtractLane(const Vec128<T, N> v, size_t i) {
- HWY_ASSERT(0);
+ HWY_ASSERT(0); // Not implemented
}
// ------------------------------ InsertLane
template <typename T, size_t N>
HWY_API Vec128<T, N> InsertLane(const Vec128<T, N> v, size_t i, T t) {
- HWY_ASSERT(0);
+ HWY_ASSERT(0); // Not implemented
}
// ------------------------------ GetLane
@@ -1846,21 +1852,21 @@ HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
template <typename T>
HWY_API Vec256<T> Reverse2(Full256<T> d, const Vec256<T> v) {
- HWY_ASSERT(0);
+ HWY_ASSERT(0); // Not implemented
}
// ------------------------------ Reverse4
template <typename T>
HWY_API Vec256<T> Reverse4(Full256<T> d, const Vec256<T> v) {
- HWY_ASSERT(0);
+ HWY_ASSERT(0); // Not implemented
}
// ------------------------------ Reverse8
template <typename T>
HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
- HWY_ASSERT(0);
+ HWY_ASSERT(0); // Not implemented
}
// ------------------------------ InterleaveLower
@@ -2065,13 +2071,13 @@ HWY_API Vec256<T> ConcatEven(Full256<T> /* tag */, Vec256<T> hi, Vec256<T> lo) {
// ------------------------------ DupEven
template <typename T>
HWY_API Vec256<T> DupEven(Vec256<T> v) {
- HWY_ASSERT(0);
+ HWY_ASSERT(0); // Not implemented
}
// ------------------------------ DupOdd
template <typename T>
HWY_API Vec256<T> DupOdd(Vec256<T> v) {
- HWY_ASSERT(0);
+ HWY_ASSERT(0); // Not implemented
}
// ------------------------------ OddEven
@@ -2298,18 +2304,71 @@ HWY_API Vec128<bfloat16_t> ReorderDemote2To(Full128<bfloat16_t> dbf16,
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
+HWY_API Vec512<int16_t> ReorderDemote2To(Full512<int16_t> /*d16*/,
+ Vec512<int32_t> a, Vec512<int32_t> b) {
+ return Vec512<int16_t>{wasm_i16x8_narrow_i32x4(a.raw, b.raw)};
+}
+
// For already range-limited input [0, 255].
HWY_API Vec256<uint8_t> U8FromU32(const Vec256<uint32_t> v) {
const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
return Vec256<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
}
+// ------------------------------ Truncations
+
+HWY_API Vec256<uint8_t, 4> TruncateTo(Simd<uint8_t, 4, 0> /* tag */,
+ const Vec256<uint64_t> v) {
+ return Vec256<uint8_t, 4>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 8, 16, 24,
+ 0, 8, 16, 24, 0, 8, 16, 24, 0, 8,
+ 16, 24)};
+}
+
+HWY_API Vec256<uint16_t, 4> TruncateTo(Simd<uint16_t, 4, 0> /* tag */,
+ const Vec256<uint64_t> v) {
+ return Vec256<uint16_t, 4>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 8, 9,
+ 16, 17, 24, 25, 0, 1, 8, 9, 16,
+ 17, 24, 25)};
+}
+
+HWY_API Vec256<uint32_t, 4> TruncateTo(Simd<uint32_t, 4, 0> /* tag */,
+ const Vec256<uint64_t> v) {
+ return Vec256<uint32_t, 4>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 2, 3,
+ 8, 9, 10, 11, 16, 17, 18, 19,
+ 24, 25, 26, 27)};
+}
+
+HWY_API Vec256<uint8_t, 8> TruncateTo(Simd<uint8_t, 8, 0> /* tag */,
+ const Vec256<uint32_t> v) {
+ return Vec256<uint8_t, 8>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 4, 8, 12,
+ 16, 20, 24, 28, 0, 4, 8, 12, 16,
+ 20, 24, 28)};
+}
+
+HWY_API Vec256<uint16_t, 8> TruncateTo(Simd<uint16_t, 8, 0> /* tag */,
+ const Vec256<uint32_t> v) {
+ return Vec256<uint16_t, 8>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 4, 5,
+ 8, 9, 12, 13, 16, 17, 20, 21,
+ 24, 25, 28, 29)};
+}
+
+HWY_API Vec256<uint8_t, 16> TruncateTo(Simd<uint8_t, 16, 0> /* tag */,
+ const Vec256<uint16_t> v) {
+ return Vec256<uint8_t, 16>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 2, 4, 6,
+ 8, 10, 12, 14, 16, 18, 20, 22,
+ 24, 26, 28, 30)};
+}
+
// ------------------------------ Convert i32 <=> f32 (Round)
HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
const Vec256<int32_t> v) {
return Vec256<float>{wasm_f32x4_convert_i32x4(v.raw)};
}
+HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
+ const Vec256<uint32_t> v) {
+ return Vec256<float>{wasm_f32x4_convert_u32x4(v.raw)};
+}
// Truncates (rounds toward zero).
HWY_API Vec256<int32_t> ConvertTo(Full256<int32_t> /* tag */,
const Vec256<float> v) {
@@ -2519,6 +2578,13 @@ HWY_API bool AllTrue(const Full256<T> /* tag */, const Mask128<T> m) {
}
template <typename T>
+HWY_API size_t FindKnownFirstTrue(const Full256<T> /* tag */,
+ const Mask256<T> mask) {
+ const uint64_t bits = detail::BitsFromMask(mask);
+ return Num0BitsBelowLS1Bit_Nonzero64(bits);
+}
+
+template <typename T>
HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
const Mask256<T> mask) {
const uint64_t bits = detail::BitsFromMask(mask);
@@ -2758,6 +2824,18 @@ HWY_API Vec256<T> Compress(Vec256<T> v, const Mask256<T> mask) {
return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
}
+// ------------------------------ CompressNot
+template <typename T>
+HWY_API Vec256<T> CompressNot(Vec256<T> v, const Mask256<T> mask) {
+ return Compress(v, Not(mask));
+}
+
+// ------------------------------ CompressBlocksNot
+HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
+ Mask256<uint64_t> mask) {
+ HWY_ASSERT(0); // Not implemented
+}
+
// ------------------------------ CompressBits
template <typename T>
@@ -2858,6 +2936,14 @@ HWY_API Vec256<float> ReorderWidenMulAccumulate(Full256<float> df32,
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
+HWY_API Vec256<int32_t> ReorderWidenMulAccumulate(Full256<int32_t> /*d32*/,
+ Vec256<int16_t> a,
+ Vec256<int16_t> b,
+ const Vec256<int32_t> sum0,
+ Vec256<int32_t>& /*sum1*/) {
+ return sum0 + Vec256<int32_t>{wasm_i32x4_dot_i16x8(a.raw, b.raw)};
+}
+
// ------------------------------ Reductions
namespace detail {
@@ -2912,22 +2998,12 @@ HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
// u16/i16
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec256<T> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
- const Repartition<int32_t, Full256<T>> d32;
- const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
- const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto min = MinOfLanes(d32, Min(even, odd));
- // Also broadcast into odd lanes.
- return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
+HWY_API Vec256<T> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> /*v*/) {
+ HWY_ASSERT(0); // Not implemented
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec256<T> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
- const Repartition<int32_t, Full256<T>> d32;
- const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
- const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto min = MaxOfLanes(d32, Max(even, odd));
- // Also broadcast into odd lanes.
- return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
+HWY_API Vec256<T> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> /*v*/) {
+ HWY_ASSERT(0); // Not implemented
}
} // namespace detail
@@ -2952,11 +3028,32 @@ template <typename T>
HWY_INLINE Mask256<T> Lt128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
template <typename T>
+HWY_INLINE Mask256<T> Lt128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
+
+template <typename T>
+HWY_INLINE Mask256<T> Eq128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
+
+template <typename T>
+HWY_INLINE Mask256<T> Eq128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
+
+template <typename T>
+HWY_INLINE Mask256<T> Ne128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
+
+template <typename T>
+HWY_INLINE Mask256<T> Ne128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
+
+template <typename T>
HWY_INLINE Vec256<T> Min128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
template <typename T>
HWY_INLINE Vec256<T> Max128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
+template <typename T>
+HWY_INLINE Vec256<T> Min128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
+
+template <typename T>
+HWY_INLINE Vec256<T> Max128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
+
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
diff --git a/media/highway/src/hwy/ops/x86_128-inl.h b/media/highway/src/hwy/ops/x86_128-inl.h
index fb2a188dcd..68b156e5a2 100644
--- a/media/highway/src/hwy/ops/x86_128-inl.h
+++ b/media/highway/src/hwy/ops/x86_128-inl.h
@@ -17,6 +17,17 @@
// operations when compiling for those targets.
// External include guard in highway.h - see comment there.
+// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_GCC_ACTUAL
+#include "hwy/base.h"
+
+// Avoid uninitialized warnings in GCC's emmintrin.h - see
+// https://github.com/google/highway/issues/710 and pull/902
+HWY_DIAGNOSTICS(push)
+#if HWY_COMPILER_GCC_ACTUAL
+HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
+HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
+#endif
+
#include <emmintrin.h>
#include <stdio.h>
#if HWY_TARGET == HWY_SSSE3
@@ -27,23 +38,14 @@
#endif
#include <stddef.h>
#include <stdint.h>
+#include <string.h> // memcpy
-#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"
#if HWY_IS_MSAN
#include <sanitizer/msan_interface.h>
#endif
-// Clang 3.9 generates VINSERTF128 instead of the desired VBROADCASTF128,
-// which would free up port5. However, inline assembly isn't supported on
-// MSVC, results in incorrect output on GCC 8.3, and raises "invalid output size
-// for constraint" errors on Clang (https://gcc.godbolt.org/z/-Jt_-F), hence we
-// disable it.
-#ifndef HWY_LOADDUP_ASM
-#define HWY_LOADDUP_ASM 0
-#endif
-
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
@@ -143,7 +145,7 @@ struct RawMask128<8> {
} // namespace detail
-template <typename T, size_t N>
+template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
using Raw = typename detail::RawMask128<sizeof(T)>::type;
@@ -583,16 +585,26 @@ HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
// ------------------------------ Neg
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
+// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
+namespace detail {
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Neg(hwy::FloatTag /*tag*/, const Vec128<T, N> v) {
return Xor(v, SignBit(DFromV<decltype(v)>()));
}
-template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
-HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Neg(hwy::NonFloatTag /*tag*/, const Vec128<T, N> v) {
return Zero(DFromV<decltype(v)>()) - v;
}
+} // namespace detail
+
+template <typename T, size_t N>
+HWY_INLINE Vec128<T, N> Neg(const Vec128<T, N> v) {
+ return detail::Neg(hwy::IsFloatTag<T>(), v);
+}
+
// ------------------------------ Abs
// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
@@ -813,7 +825,7 @@ HWY_API Vec128<double, N> IfThenZeroElse(Mask128<double, N> mask,
// For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently.
#if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS)
-#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC >= 700 || \
+#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \
HWY_COMPILER_CLANG >= 800
#define HWY_COMPILER_HAS_MASK_INTRINSICS 1
#else
@@ -971,6 +983,47 @@ HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
#endif
}
+template <typename T, size_t N>
+HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<1> /*tag*/,
+ const Mask128<T, N> a,
+ const Mask128<T, N> b) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+ return Mask128<T, N>{_kxnor_mask16(a.raw, b.raw)};
+#else
+ return Mask128<T, N>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
+#endif
+}
+template <typename T, size_t N>
+HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<2> /*tag*/,
+ const Mask128<T, N> a,
+ const Mask128<T, N> b) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+ return Mask128<T, N>{_kxnor_mask8(a.raw, b.raw)};
+#else
+ return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
+#endif
+}
+template <typename T, size_t N>
+HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<4> /*tag*/,
+ const Mask128<T, N> a,
+ const Mask128<T, N> b) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+ return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)};
+#else
+ return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)};
+#endif
+}
+template <typename T, size_t N>
+HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
+ const Mask128<T, N> a,
+ const Mask128<T, N> b) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+ return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0x3)};
+#else
+ return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0x3)};
+#endif
+}
+
} // namespace detail
template <typename T, size_t N>
@@ -996,9 +1049,15 @@ HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
// Flip only the valid bits.
+ // TODO(janwas): use _knot intrinsics if N >= 8.
return Xor(m, Mask128<T, N>::FromBits((1ull << N) - 1));
}
+template <typename T, size_t N>
+HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
+ return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
+}
+
#else // AVX2 or below
// ------------------------------ Mask
@@ -1096,6 +1155,12 @@ HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
+template <typename T, size_t N>
+HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
+ const Simd<T, N, 0> d;
+ return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
+}
+
#endif // HWY_TARGET <= HWY_AVX3
// ------------------------------ ShiftLeft
@@ -1206,8 +1271,9 @@ HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
// Swap 32-bit halves in 64-bit halves.
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
+template <typename T, size_t N>
HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
+ static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
static_assert(N == 2 || N == 4, "Does not make sense for N=1");
return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0xB1)};
}
@@ -1688,7 +1754,7 @@ HWY_API Mask128<double, N> operator==(const Vec128<double, N> a,
// ------------------------------ Inequality
-// This cannot have T as a template argument, otherwise it is not more
+// This cannot have T as a template argument, otherwise it is not more
// specialized than rewritten operator== in C++20, leading to compile
// errors: https://gcc.godbolt.org/z/xsrPhPvPT.
template <size_t N>
@@ -1745,57 +1811,71 @@ HWY_API Mask128<double, N> operator!=(const Vec128<double, N> a,
// ------------------------------ Strict inequality
-// Signed/float <
+namespace detail {
+
template <size_t N>
-HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
+HWY_INLINE Mask128<int8_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int8_t, N> a,
+ Vec128<int8_t, N> b) {
return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)};
}
template <size_t N>
-HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
- Vec128<int16_t, N> b) {
+HWY_INLINE Mask128<int16_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int16_t, N> a,
+ Vec128<int16_t, N> b) {
return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)};
}
template <size_t N>
-HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
- Vec128<int32_t, N> b) {
+HWY_INLINE Mask128<int32_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int32_t, N> a,
+ Vec128<int32_t, N> b) {
return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)};
}
-template <typename T, size_t N, HWY_IF_UNSIGNED(T)>
-HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
+template <size_t N>
+HWY_INLINE Mask128<int64_t, N> Gt(hwy::SignedTag /*tag*/,
+ const Vec128<int64_t, N> a,
+ const Vec128<int64_t, N> b) {
+#if HWY_TARGET == HWY_SSSE3
+ // See https://stackoverflow.com/questions/65166174/:
+ const Simd<int64_t, N, 0> d;
+ const RepartitionToNarrow<decltype(d)> d32;
+ const Vec128<int64_t, N> m_eq32{Eq(BitCast(d32, a), BitCast(d32, b)).raw};
+ const Vec128<int64_t, N> m_gt32{Gt(BitCast(d32, a), BitCast(d32, b)).raw};
+ // If a.upper is greater, upper := true. Otherwise, if a.upper == b.upper:
+ // upper := b-a (unsigned comparison result of lower). Otherwise: upper := 0.
+ const __m128i upper = OrAnd(m_gt32, m_eq32, Sub(b, a)).raw;
+ // Duplicate upper to lower half.
+ return Mask128<int64_t, N>{_mm_shuffle_epi32(upper, _MM_SHUFFLE(3, 3, 1, 1))};
+#else
+ return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)}; // SSE4.2
+#endif
+}
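
Without SSE4.2's _mm_cmpgt_epi64, the int64 comparison above is assembled from 32-bit halves: the signed comparison of the upper halves decides unless they are equal, in which case an unsigned comparison of the lower halves decides. A scalar model of that decomposition (a sketch, not the vector code):

#include <cassert>
#include <cstdint>

static bool GreaterI64(int64_t a, int64_t b) {
  const int32_t ah = static_cast<int32_t>(a >> 32);
  const int32_t bh = static_cast<int32_t>(b >> 32);
  const uint32_t al = static_cast<uint32_t>(a);
  const uint32_t bl = static_cast<uint32_t>(b);
  if (ah != bh) return ah > bh;  // upper halves decide (signed)
  return al > bl;                // equal uppers: lower halves decide (unsigned)
}

int main() {
  const int64_t v[] = {0, 1, -1, INT64_MIN, INT64_MAX, 0x100000000LL, -0x100000000LL};
  for (int64_t a : v) {
    for (int64_t b : v) assert(GreaterI64(a, b) == (a > b));
  }
}
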
+
+template <typename T, size_t N>
+HWY_INLINE Mask128<T, N> Gt(hwy::UnsignedTag /*tag*/, Vec128<T, N> a,
+ Vec128<T, N> b) {
const DFromV<decltype(a)> du;
const RebindToSigned<decltype(du)> di;
const Vec128<T, N> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
- return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
+ const auto sa = BitCast(di, Xor(a, msb));
+ const auto sb = BitCast(di, Xor(b, msb));
+ return RebindMask(du, Gt(hwy::SignedTag(), sa, sb));
}
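
The unsigned path above reuses the signed comparison by XOR-ing both operands with the MSB constant (LimitsMax >> 1) + 1, which maps unsigned order onto signed order. A scalar sketch of that bias trick:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t kMsb = 0x80000000u;  // (LimitsMax<uint32_t>() >> 1) + 1
  const uint32_t v[] = {0u, 1u, 0x7FFFFFFFu, 0x80000000u, 0xFFFFFFFFu};
  for (uint32_t a : v) {
    for (uint32_t b : v) {
      const int32_t sa = static_cast<int32_t>(a ^ kMsb);
      const int32_t sb = static_cast<int32_t>(b ^ kMsb);
      // Signed compare of biased values matches the unsigned compare.
      assert((sa > sb) == (a > b));
    }
  }
}
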
template <size_t N>
-HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
+HWY_INLINE Mask128<float, N> Gt(hwy::FloatTag /*tag*/, Vec128<float, N> a,
+ Vec128<float, N> b) {
return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)};
}
template <size_t N>
-HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
+HWY_INLINE Mask128<double, N> Gt(hwy::FloatTag /*tag*/, Vec128<double, N> a,
+ Vec128<double, N> b) {
return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)};
}
-template <size_t N>
-HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
- const Vec128<int64_t, N> b) {
-#if HWY_TARGET == HWY_SSSE3
- // If the upper half is less than or greater, this is the answer.
- const __m128i m_gt = _mm_cmpgt_epi32(a.raw, b.raw);
-
- // Otherwise, the lower half decides.
- const __m128i m_eq = _mm_cmpeq_epi32(a.raw, b.raw);
- const __m128i lo_in_hi = _mm_shuffle_epi32(m_gt, _MM_SHUFFLE(2, 2, 0, 0));
- const __m128i lo_gt = _mm_and_si128(m_eq, lo_in_hi);
+} // namespace detail
- const __m128i gt = _mm_or_si128(lo_gt, m_gt);
- // Copy result in upper 32 bits to lower 32 bits.
- return Mask128<int64_t, N>{_mm_shuffle_epi32(gt, _MM_SHUFFLE(3, 3, 1, 1))};
-#else
- return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)}; // SSE4.2
-#endif
+template <typename T, size_t N>
+HWY_INLINE Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
+ return detail::Gt(hwy::TypeTag<T>(), a, b);
}
// ------------------------------ Weak inequality
@@ -1893,7 +1973,7 @@ template <typename T>
HWY_API Vec64<T> Load(Full64<T> /* tag */, const T* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
__m128i v = _mm_setzero_si128();
- CopyBytes<8>(p, &v);
+ CopyBytes<8>(p, &v); // not same size
return Vec64<T>{v};
#else
return Vec64<T>{_mm_loadl_epi64(reinterpret_cast<const __m128i*>(p))};
@@ -1904,7 +1984,7 @@ HWY_API Vec128<float, 2> Load(Full64<float> /* tag */,
const float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
__m128 v = _mm_setzero_ps();
- CopyBytes<8>(p, &v);
+ CopyBytes<8>(p, &v); // not same size
return Vec128<float, 2>{v};
#else
const __m128 hi = _mm_setzero_ps();
@@ -1916,7 +1996,7 @@ HWY_API Vec64<double> Load(Full64<double> /* tag */,
const double* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
__m128d v = _mm_setzero_pd();
- CopyBytes<8>(p, &v);
+ CopyBytes<8>(p, &v); // not same size
return Vec64<double>{v};
#else
return Vec64<double>{_mm_load_sd(p)};
@@ -1927,7 +2007,7 @@ HWY_API Vec128<float, 1> Load(Full32<float> /* tag */,
const float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
__m128 v = _mm_setzero_ps();
- CopyBytes<4>(p, &v);
+ CopyBytes<4>(p, &v); // not same size
return Vec128<float, 1>{v};
#else
return Vec128<float, 1>{_mm_load_ss(p)};
@@ -1940,11 +2020,11 @@ HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */, const T* HWY_RESTRICT p) {
constexpr size_t kSize = sizeof(T) * N;
#if HWY_SAFE_PARTIAL_LOAD_STORE
__m128 v = _mm_setzero_ps();
- CopyBytes<kSize>(p, &v);
+ CopyBytes<kSize>(p, &v); // not same size
return Vec128<T, N>{v};
#else
int32_t bits = 0;
- CopyBytes<kSize>(p, &bits);
+ CopyBytes<kSize>(p, &bits); // not same size
return Vec128<T, N>{_mm_cvtsi32_si128(bits)};
#endif
}
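
When HWY_SAFE_PARTIAL_LOAD_STORE is set, the partial loads above copy only the valid bytes into a zero-initialized register via CopyBytes instead of issuing a full-width load that could touch memory past the buffer. A minimal memcpy-based sketch of the same idea (hypothetical helper, not the Highway implementation):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Loads `bytes` (< 16) from p into the low part of a zeroed 16-byte buffer,
// so reading the whole buffer afterwards never touches memory past p+bytes.
static void SafePartialLoad(const void* p, size_t bytes, uint8_t out[16]) {
  std::memset(out, 0, 16);     // upper lanes become zero
  std::memcpy(out, p, bytes);  // only the valid bytes are read
}

int main() {
  const uint8_t src[3] = {1, 2, 3};
  uint8_t reg[16];
  SafePartialLoad(src, 3, reg);  // reg = {1, 2, 3, 0, ..., 0}
}
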
@@ -2094,7 +2174,7 @@ HWY_API void StoreU(const Vec128<double> v, Full128<double> /* tag */,
template <typename T>
HWY_API void Store(Vec64<T> v, Full64<T> /* tag */, T* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
- CopyBytes<8>(&v, p);
+ CopyBytes<8>(&v, p); // not same size
#else
_mm_storel_epi64(reinterpret_cast<__m128i*>(p), v.raw);
#endif
@@ -2102,7 +2182,7 @@ HWY_API void Store(Vec64<T> v, Full64<T> /* tag */, T* HWY_RESTRICT p) {
HWY_API void Store(const Vec128<float, 2> v, Full64<float> /* tag */,
float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
- CopyBytes<8>(&v, p);
+ CopyBytes<8>(&v, p); // not same size
#else
_mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw);
#endif
@@ -2110,7 +2190,7 @@ HWY_API void Store(const Vec128<float, 2> v, Full64<float> /* tag */,
HWY_API void Store(const Vec64<double> v, Full64<double> /* tag */,
double* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
- CopyBytes<8>(&v, p);
+ CopyBytes<8>(&v, p); // not same size
#else
_mm_storel_pd(p, v.raw);
#endif
@@ -2119,12 +2199,12 @@ HWY_API void Store(const Vec64<double> v, Full64<double> /* tag */,
// Any <= 32 bit except <float, 1>
template <typename T, size_t N, HWY_IF_LE32(T, N)>
HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
- CopyBytes<sizeof(T) * N>(&v, p);
+ CopyBytes<sizeof(T) * N>(&v, p); // not same size
}
HWY_API void Store(const Vec128<float, 1> v, Full32<float> /* tag */,
float* HWY_RESTRICT p) {
#if HWY_SAFE_PARTIAL_LOAD_STORE
- CopyBytes<4>(&v, p);
+ CopyBytes<4>(&v, p); // not same size
#else
_mm_store_ss(p, v.raw);
#endif
@@ -2155,7 +2235,7 @@ HWY_API void ScalarMaskedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
Store(BitCast(di, VecFromMask(d, m)), di, mask);
for (size_t i = 0; i < N; ++i) {
if (mask[i]) {
- CopyBytes<sizeof(T)>(buf + i, p + i);
+ CopySameSize(buf + i, p + i);
}
}
}
@@ -2662,8 +2742,9 @@ HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
}
// ------------------------------ ZeroIfNegative (BroadcastSignBit)
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
+template <typename T, size_t N>
HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
+ static_assert(IsFloat<T>(), "Only works for float");
const DFromV<decltype(v)> d;
#if HWY_TARGET == HWY_SSSE3
const RebindToSigned<decltype(d)> di;
@@ -3617,9 +3698,9 @@ HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
return lanes[kLane];
#else
// Bug in the intrinsic, returns int but should be float.
- const int bits = _mm_extract_ps(v.raw, kLane);
+ const int32_t bits = _mm_extract_ps(v.raw, kLane);
float ret;
- CopyBytes<4>(&bits, &ret);
+ CopySameSize(&bits, &ret);
return ret;
#endif
}
@@ -3796,7 +3877,7 @@ HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
return Load(d, lanes);
#else
MakeSigned<T> ti;
- CopyBytes<sizeof(T)>(&t, &ti); // don't just cast because T might be float.
+ CopySameSize(&t, &ti); // don't just cast because T might be float.
return Vec128<T, N>{_mm_insert_epi32(v.raw, ti, kLane)};
#endif
}
@@ -3812,7 +3893,7 @@ HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
return Load(d, lanes);
#else
MakeSigned<T> ti;
- CopyBytes<sizeof(T)>(&t, &ti); // don't just cast because T might be float.
+ CopySameSize(&t, &ti); // don't just cast because T might be float.
return Vec128<T, N>{_mm_insert_epi64(v.raw, ti, kLane)};
#endif
}
@@ -4489,17 +4570,29 @@ HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
-template <typename T, HWY_IF_NOT_FLOAT(T)>
-HWY_API Vec128<T> ZeroExtendVector(Full128<T> /* tag */, Vec64<T> lo) {
+// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
+namespace detail {
+
+template <typename T>
+HWY_INLINE Vec128<T> ZeroExtendVector(hwy::NonFloatTag /*tag*/,
+ Full128<T> /* d */, Vec64<T> lo) {
return Vec128<T>{_mm_move_epi64(lo.raw)};
}
-template <typename T, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T> ZeroExtendVector(Full128<T> d, Vec64<T> lo) {
+template <typename T>
+HWY_INLINE Vec128<T> ZeroExtendVector(hwy::FloatTag /*tag*/, Full128<T> d,
+ Vec64<T> lo) {
const RebindToUnsigned<decltype(d)> du;
return BitCast(d, ZeroExtendVector(du, BitCast(Half<decltype(du)>(), lo)));
}
+} // namespace detail
+
+template <typename T>
+HWY_API Vec128<T> ZeroExtendVector(Full128<T> d, Vec64<T> lo) {
+ return detail::ZeroExtendVector(hwy::IsFloatTag<T>(), d, lo);
+}
+
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
@@ -4633,10 +4726,11 @@ HWY_API Vec32<T> ConcatOdd(Simd<T, 4, 0> d, Vec32<T> hi, Vec32<T> lo) {
// 16-bit full
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
- const Repartition<uint32_t, decltype(d)> dw;
- // Right-shift 16 bits per u32 so we can pack.
- const Vec128<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
- const Vec128<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
+ // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns
+ // 0xFFFF8000, which correctly saturates to 0x8000.
+ const Repartition<int32_t, decltype(d)> dw;
+ const Vec128<int32_t> uH = ShiftRight<16>(BitCast(dw, hi));
+ const Vec128<int32_t> uL = ShiftRight<16>(BitCast(dw, lo));
return Vec128<T>{_mm_packs_epi32(uL.raw, uH.raw)};
}
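
The change above switches to an arithmetic (signed) >>16 before _mm_packs_epi32: a lane such as 0x8000xxxx becomes 0xFFFF8000 (-32768), which the signed saturating pack maps back to exactly 0x8000, whereas a logical shift would yield 32768 and saturate to 0x7FFF. A scalar check of that corner case (sketch only):

#include <cassert>
#include <cstdint>

// Scalar model of one lane of _mm_packs_epi32: clamp int32 to [-32768, 32767].
static int16_t PackSigned(int32_t x) {
  if (x > 32767) return 32767;
  if (x < -32768) return -32768;
  return static_cast<int16_t>(x);
}

int main() {
  const uint32_t lane = 0x8000ABCDu;  // odd u16 half is 0x8000
  const int32_t arith = static_cast<int32_t>(lane) >> 16;  // 0xFFFF8000 == -32768
  const int32_t logical = static_cast<int32_t>(lane >> 16); // 0x00008000 == 32768
  assert(static_cast<uint16_t>(PackSigned(arith)) == 0x8000);    // correct
  assert(static_cast<uint16_t>(PackSigned(logical)) == 0x7FFF);  // would be wrong
}
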
@@ -4713,12 +4807,22 @@ HWY_API Vec32<T> ConcatEven(Simd<T, 4, 0> d, Vec32<T> hi, Vec32<T> lo) {
// 16-bit full
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
- const Repartition<uint32_t, decltype(d)> dw;
+#if HWY_TARGET <= HWY_SSE4
// Isolate lower 16 bits per u32 so we can pack.
+ const Repartition<uint32_t, decltype(d)> dw;
const Vec128<uint32_t> mask = Set(dw, 0x0000FFFF);
const Vec128<uint32_t> uH = And(BitCast(dw, hi), mask);
const Vec128<uint32_t> uL = And(BitCast(dw, lo), mask);
- return Vec128<T>{_mm_packs_epi32(uL.raw, uH.raw)};
+ return Vec128<T>{_mm_packus_epi32(uL.raw, uH.raw)};
+#else
+ // packs_epi32 saturates 0x8000 to 0x7FFF. Instead ConcatEven within the two
+ // inputs, then concatenate them.
+ alignas(16) const T kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C};
+ const Vec128<T> shuf = BitCast(d, Load(d, kCompactEvenU16));
+ const Vec128<T> L = TableLookupBytes(lo, shuf);
+ const Vec128<T> H = TableLookupBytes(hi, shuf);
+ return ConcatLowerLower(d, H, L);
+#endif
}
// 16-bit x4
@@ -4875,8 +4979,8 @@ HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
// two from loading float exponents, which is considerably faster (according
// to LLVM-MCA) than scalar or testing bits: https://gcc.godbolt.org/z/9G7Y9v.
-#if HWY_TARGET > HWY_AVX3 // AVX2 or older
namespace detail {
+#if HWY_TARGET > HWY_AVX3 // AVX2 or older
// Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
@@ -4909,39 +5013,38 @@ HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
return Vec128<MakeUnsigned<T>, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))};
}
-} // namespace detail
#endif // HWY_TARGET > HWY_AVX3
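
On targets without per-lane variable 16-bit shifts, Shl below multiplies by detail::Pow2(bits): 2^s is manufactured by writing s into a float's exponent field, so x << s becomes x * 2^s. A scalar sketch of that exponent trick, assuming 0 <= s <= 15 (the vector code also handles rounding via cvtps, omitted here):

#include <cassert>
#include <cstdint>
#include <cstring>

// Builds 2^s (s in [0, 15]) by placing s into the biased exponent of a float.
static uint16_t Pow2U16(uint32_t s) {
  const uint32_t bits = (s + 127u) << 23;  // biased exponent, zero mantissa
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return static_cast<uint16_t>(f);  // exact: 2^s fits in 16 bits
}

int main() {
  for (uint32_t s = 0; s < 16; ++s) {
    const uint16_t x = 0x1234;
    assert(static_cast<uint16_t>(x * Pow2U16(s)) == static_cast<uint16_t>(x << s));
  }
}
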
template <size_t N>
-HWY_API Vec128<uint16_t, N> operator<<(const Vec128<uint16_t, N> v,
- const Vec128<uint16_t, N> bits) {
+HWY_API Vec128<uint16_t, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint16_t, N> v,
+ Vec128<uint16_t, N> bits) {
#if HWY_TARGET <= HWY_AVX3
return Vec128<uint16_t, N>{_mm_sllv_epi16(v.raw, bits.raw)};
#else
- return v * detail::Pow2(bits);
+ return v * Pow2(bits);
#endif
}
-HWY_API Vec128<uint16_t, 1> operator<<(const Vec128<uint16_t, 1> v,
- const Vec128<uint16_t, 1> bits) {
+HWY_API Vec128<uint16_t, 1> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint16_t, 1> v,
+ Vec128<uint16_t, 1> bits) {
return Vec128<uint16_t, 1>{_mm_sll_epi16(v.raw, bits.raw)};
}
template <size_t N>
-HWY_API Vec128<uint32_t, N> operator<<(const Vec128<uint32_t, N> v,
- const Vec128<uint32_t, N> bits) {
+HWY_API Vec128<uint32_t, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint32_t, N> v,
+ Vec128<uint32_t, N> bits) {
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
- return v * detail::Pow2(bits);
+ return v * Pow2(bits);
#else
return Vec128<uint32_t, N>{_mm_sllv_epi32(v.raw, bits.raw)};
#endif
}
-HWY_API Vec128<uint32_t, 1> operator<<(const Vec128<uint32_t, 1> v,
- const Vec128<uint32_t, 1> bits) {
+HWY_API Vec128<uint32_t, 1> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint32_t, 1> v,
+ const Vec128<uint32_t, 1> bits) {
return Vec128<uint32_t, 1>{_mm_sll_epi32(v.raw, bits.raw)};
}
-HWY_API Vec128<uint64_t> operator<<(const Vec128<uint64_t> v,
- const Vec128<uint64_t> bits) {
+HWY_API Vec128<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint64_t> v,
+ Vec128<uint64_t> bits) {
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
// Individual shifts and combine
const Vec128<uint64_t> out0{_mm_sll_epi64(v.raw, bits.raw)};
@@ -4952,17 +5055,26 @@ HWY_API Vec128<uint64_t> operator<<(const Vec128<uint64_t> v,
return Vec128<uint64_t>{_mm_sllv_epi64(v.raw, bits.raw)};
#endif
}
-HWY_API Vec64<uint64_t> operator<<(const Vec64<uint64_t> v,
- const Vec64<uint64_t> bits) {
+HWY_API Vec64<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec64<uint64_t> v,
+ Vec64<uint64_t> bits) {
return Vec64<uint64_t>{_mm_sll_epi64(v.raw, bits.raw)};
}
// Signed left shift is the same as unsigned.
-template <typename T, size_t N, HWY_IF_SIGNED(T)>
-HWY_API Vec128<T, N> operator<<(const Vec128<T, N> v, const Vec128<T, N> bits) {
+template <typename T, size_t N>
+HWY_API Vec128<T, N> Shl(hwy::SignedTag /*tag*/, Vec128<T, N> v,
+ Vec128<T, N> bits) {
const DFromV<decltype(v)> di;
const RebindToUnsigned<decltype(di)> du;
- return BitCast(di, BitCast(du, v) << BitCast(du, bits));
+ return BitCast(di,
+ Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
+}
+
+} // namespace detail
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
+ return detail::Shl(hwy::TypeTag<T>(), v, bits);
}
// ------------------------------ Shr (mul, mask, BroadcastSignBit)
@@ -5130,6 +5242,15 @@ HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
+// Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe.
+template <size_t N>
+HWY_API Vec128<int32_t, N> ReorderWidenMulAccumulate(
+ Simd<int32_t, N, 0> /*d32*/, Vec128<int16_t, 2 * N> a,
+ Vec128<int16_t, 2 * N> b, const Vec128<int32_t, N> sum0,
+ Vec128<int32_t, N>& /*sum1*/) {
+ return sum0 + Vec128<int32_t, N>{_mm_madd_epi16(a.raw, b.raw)};
+}
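
The new int16 overload of ReorderWidenMulAccumulate maps directly onto _mm_madd_epi16, which multiplies adjacent 16-bit pairs and adds each pair into one 32-bit lane. A scalar model of a single output lane (illustrative; the intrinsic wraps at 32 bits in the one pathological all-(-32768) case):

#include <cstdint>

// One 32-bit output lane: sum0 + a[2i]*b[2i] + a[2i+1]*b[2i+1].
// Unsigned arithmetic avoids UB and matches 32-bit wraparound.
int32_t MaddLane(int16_t a0, int16_t a1, int16_t b0, int16_t b1, int32_t sum0) {
  const int32_t prod0 = static_cast<int32_t>(a0) * b0;
  const int32_t prod1 = static_cast<int32_t>(a1) * b1;
  return static_cast<int32_t>(static_cast<uint32_t>(sum0) +
                              static_cast<uint32_t>(prod0) +
                              static_cast<uint32_t>(prod1));
}
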
+
// ================================================== CONVERT
// ------------------------------ Promotions (part w/ narrow lanes -> full)
@@ -5401,6 +5522,30 @@ HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
+// Specializations for partial vectors because packs_epi32 sets lanes above 2*N.
+HWY_API Vec128<int16_t, 2> ReorderDemote2To(Simd<int16_t, 2, 0> dn,
+ Vec128<int32_t, 1> a,
+ Vec128<int32_t, 1> b) {
+ const Half<decltype(dn)> dnh;
+ // Pretend the result has twice as many lanes so we can InterleaveLower.
+ const Vec128<int16_t, 2> an{DemoteTo(dnh, a).raw};
+ const Vec128<int16_t, 2> bn{DemoteTo(dnh, b).raw};
+ return InterleaveLower(an, bn);
+}
+HWY_API Vec128<int16_t, 4> ReorderDemote2To(Simd<int16_t, 4, 0> dn,
+ Vec128<int32_t, 2> a,
+ Vec128<int32_t, 2> b) {
+ const Half<decltype(dn)> dnh;
+ // Pretend the result has twice as many lanes so we can InterleaveLower.
+ const Vec128<int16_t, 4> an{DemoteTo(dnh, a).raw};
+ const Vec128<int16_t, 4> bn{DemoteTo(dnh, b).raw};
+ return InterleaveLower(an, bn);
+}
+HWY_API Vec128<int16_t> ReorderDemote2To(Full128<int16_t> /*d16*/,
+ Vec128<int32_t> a, Vec128<int32_t> b) {
+ return Vec128<int16_t>{_mm_packs_epi32(a.raw, b.raw)};
+}
+
template <size_t N>
HWY_API Vec128<float, N> DemoteTo(Simd<float, N, 0> /* tag */,
const Vec128<double, N> v) {
@@ -5433,7 +5578,7 @@ HWY_INLINE auto FixConversionOverflow(DI di, VFromD<DF> original,
// ++: normal >0 : OK
const auto converted = VFromD<DI>{converted_raw};
const auto sign_wrong = AndNot(BitCast(di, original), converted);
-#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
+#if HWY_COMPILER_GCC_ACTUAL
// Critical GCC 11 compiler bug (possibly also GCC 10): omits the Xor; also
// Add() if using that instead. Work around with one more instruction.
const RebindToUnsigned<DI> du;
@@ -5466,6 +5611,65 @@ HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
return LowerHalf(LowerHalf(BitCast(d8, quad)));
}
+// ------------------------------ Truncations
+
+template <typename From, typename To,
+ hwy::EnableIf<(sizeof(To) < sizeof(From))>* = nullptr>
+HWY_API Vec128<To, 1> TruncateTo(Simd<To, 1, 0> /* tag */,
+ const Vec128<From, 1> v) {
+ static_assert(!IsSigned<To>() && !IsSigned<From>(), "Unsigned only");
+ const Repartition<To, DFromV<decltype(v)>> d;
+ const auto v1 = BitCast(d, v);
+ return Vec128<To, 1>{v1.raw};
+}
+
+HWY_API Vec128<uint8_t, 2> TruncateTo(Simd<uint8_t, 2, 0> /* tag */,
+ const Vec128<uint64_t, 2> v) {
+ const Full128<uint8_t> d8;
+ alignas(16) static constexpr uint8_t kMap[16] = {0, 8, 0, 8, 0, 8, 0, 8,
+ 0, 8, 0, 8, 0, 8, 0, 8};
+ return LowerHalf(LowerHalf(LowerHalf(TableLookupBytes(v, Load(d8, kMap)))));
+}
+
+HWY_API Vec128<uint16_t, 2> TruncateTo(Simd<uint16_t, 2, 0> /* tag */,
+ const Vec128<uint64_t, 2> v) {
+ const Full128<uint16_t> d16;
+ alignas(16) static constexpr uint16_t kMap[8] = {
+ 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u};
+ return LowerHalf(LowerHalf(TableLookupBytes(v, Load(d16, kMap))));
+}
+
+HWY_API Vec128<uint32_t, 2> TruncateTo(Simd<uint32_t, 2, 0> /* tag */,
+ const Vec128<uint64_t, 2> v) {
+ return Vec128<uint32_t, 2>{_mm_shuffle_epi32(v.raw, 0x88)};
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
+ const Vec128<uint32_t, N> v) {
+ const Repartition<uint8_t, DFromV<decltype(v)>> d;
+ alignas(16) static constexpr uint8_t kMap[16] = {
+ 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu,
+ 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu};
+ return LowerHalf(LowerHalf(TableLookupBytes(v, Load(d, kMap))));
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint16_t, N> TruncateTo(Simd<uint16_t, N, 0> /* tag */,
+ const Vec128<uint32_t, N> v) {
+ const Repartition<uint16_t, DFromV<decltype(v)>> d;
+ const auto v1 = BitCast(d, v);
+ return LowerHalf(ConcatEven(d, v1, v1));
+}
+
+template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
+HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
+ const Vec128<uint16_t, N> v) {
+ const Repartition<uint8_t, DFromV<decltype(v)>> d;
+ const auto v1 = BitCast(d, v);
+ return LowerHalf(ConcatEven(d, v1, v1));
+}
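
All of the new TruncateTo overloads implement the same semantic via byte shuffles: keep only the low sizeof(To) bytes of each lane and pack them contiguously. A scalar reference of that semantic, handy for checking the shuffle maps (hypothetical helper name):

#include <cstddef>
#include <cstdint>

// Scalar reference: truncate each wide lane to its low bits, e.g. u32 -> u8.
template <typename To, typename From>
void TruncateLanes(const From* in, To* out, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    out[i] = static_cast<To>(in[i]);  // keeps the least-significant bytes only
  }
}

// Example: {0x11223344, 0xAABBCCDD} -> {0x44, 0xDD}
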
+
// ------------------------------ Integer <=> fp (ShiftRight, OddEven)
template <size_t N>
@@ -5475,6 +5679,26 @@ HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
}
template <size_t N>
+HWY_API Vec128<float, N> ConvertTo(HWY_MAYBE_UNUSED Simd<float, N, 0> df,
+ const Vec128<uint32_t, N> v) {
+#if HWY_TARGET <= HWY_AVX3
+ return Vec128<float, N>{_mm_cvtepu32_ps(v.raw)};
+#else
+ // Based on wim's approach (https://stackoverflow.com/questions/34066228/)
+ const RebindToUnsigned<decltype(df)> du32;
+ const RebindToSigned<decltype(df)> d32;
+
+ const auto msk_lo = Set(du32, 0xFFFF);
+ const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16
+
+ // Extract the 16 lowest/highest significant bits of v and cast to signed int
+ const auto v_lo = BitCast(d32, And(v, msk_lo));
+ const auto v_hi = BitCast(d32, ShiftRight<16>(v));
+ return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo));
+#endif
+}
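
Without AVX-512's _mm_cvtepu32_ps, the conversion above splits each u32 into high and low 16-bit halves (each converts to float exactly) and recombines them with MulAdd: 65536.0f * float(hi) + float(lo). A scalar sketch of the decomposition, assuming FMA-style single rounding as in the vector MulAdd:

#include <cmath>
#include <cstdint>

float U32ToFloat(uint32_t v) {
  const float hi = static_cast<float>(v >> 16);      // exact: value < 2^16
  const float lo = static_cast<float>(v & 0xFFFFu);  // exact: value < 2^16
  return std::fmaf(65536.0f, hi, lo);                // one rounding, like MulAdd
}
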
+
+template <size_t N>
HWY_API Vec128<double, N> ConvertTo(Simd<double, N, 0> dd,
const Vec128<int64_t, N> v) {
#if HWY_TARGET <= HWY_AVX3
@@ -5498,6 +5722,33 @@ HWY_API Vec128<double, N> ConvertTo(Simd<double, N, 0> dd,
#endif
}
+template <size_t N>
+HWY_API Vec128<double, N> ConvertTo(HWY_MAYBE_UNUSED Simd<double, N, 0> dd,
+ const Vec128<uint64_t, N> v) {
+#if HWY_TARGET <= HWY_AVX3
+ return Vec128<double, N>{_mm_cvtepu64_pd(v.raw)};
+#else
+ // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
+ const RebindToUnsigned<decltype(dd)> d64;
+ using VU = VFromD<decltype(d64)>;
+
+ const VU msk_lo = Set(d64, 0xFFFFFFFF);
+ const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32
+
+ // Extract the 32 lowest/highest significant bits of v
+ const VU v_lo = And(v, msk_lo);
+ const VU v_hi = ShiftRight<32>(v);
+
+ auto uint64_to_double128_fast = [&dd](VU w) HWY_ATTR {
+ w = Or(w, VU{detail::BitCastToInteger(Set(dd, 0x0010000000000000).raw)});
+ return BitCast(dd, w) - Set(dd, 0x0010000000000000);
+ };
+
+ const auto v_lo_dbl = uint64_to_double128_fast(v_lo);
+ return MulAdd(cnst2_32_dbl, uint64_to_double128_fast(v_hi), v_lo_dbl);
+#endif
+}
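
The non-AVX-512 path relies on the classic 2^52 trick: OR a 32-bit value into the low mantissa bits of the double 2^52 (the 0x0010000000000000 constant in the code; its bit pattern is 0x4330000000000000), reinterpret, and subtract 2^52 to recover the value exactly; the two halves are then combined via MulAdd by 2^32. A scalar sketch of the per-half trick:

#include <cassert>
#include <cstdint>
#include <cstring>

// Converts a value < 2^32 to double via the 2^52 trick (no int->fp instruction).
static double U32ToDoubleTrick(uint32_t v) {
  const uint64_t bits = 0x4330000000000000ull | v;  // 2^52 with v in the mantissa
  double d;
  std::memcpy(&d, &bits, sizeof(d));
  return d - 4503599627370496.0;  // subtract 2^52; exact for v < 2^32
}

int main() {
  assert(U32ToDoubleTrick(0u) == 0.0);
  assert(U32ToDoubleTrick(0xFFFFFFFFu) == 4294967295.0);
}
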
+
// Truncates (rounds toward zero).
template <size_t N>
HWY_API Vec128<int32_t, N> ConvertTo(const Simd<int32_t, N, 0> di,
@@ -5573,8 +5824,9 @@ HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
#if HWY_TARGET == HWY_SSSE3
// Toward nearest integer, ties to even
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
+template <typename T, size_t N>
HWY_API Vec128<T, N> Round(const Vec128<T, N> v) {
+ static_assert(IsFloat<T>(), "Only for float");
// Rely on rounding after addition with a large value such that no mantissa
// bits remain (assuming the current mode is nearest-even). We may need a
// compiler flag for precise floating-point to prevent "optimizing" this out.
@@ -5592,16 +5844,18 @@ namespace detail {
// Truncating to integer and converting back to float is correct except when the
// input magnitude is large, in which case the input was already an integer
// (because mantissa >> exponent is zero).
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
+template <typename T, size_t N>
HWY_INLINE Mask128<T, N> UseInt(const Vec128<T, N> v) {
+ static_assert(IsFloat<T>(), "Only for float");
return Abs(v) < Set(Simd<T, N, 0>(), MantissaEnd<T>());
}
} // namespace detail
// Toward zero, aka truncate
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
+template <typename T, size_t N>
HWY_API Vec128<T, N> Trunc(const Vec128<T, N> v) {
+ static_assert(IsFloat<T>(), "Only for float");
const Simd<T, N, 0> df;
const RebindToSigned<decltype(df)> di;
@@ -5612,8 +5866,9 @@ HWY_API Vec128<T, N> Trunc(const Vec128<T, N> v) {
}
// Toward +infinity, aka ceiling
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
+template <typename T, size_t N>
HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) {
+ static_assert(IsFloat<T>(), "Only for float");
const Simd<T, N, 0> df;
const RebindToSigned<decltype(df)> di;
@@ -5627,8 +5882,9 @@ HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) {
}
// Toward -infinity, aka floor
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
+template <typename T, size_t N>
HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
+ static_assert(IsFloat<T>(), "Only for float");
const Simd<T, N, 0> df;
const RebindToSigned<decltype(df)> di;
@@ -5737,8 +5993,9 @@ HWY_API Mask128<double, N> IsFinite(const Vec128<double, N> v) {
#else
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
+template <typename T, size_t N>
HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
+ static_assert(IsFloat<T>(), "Only for float");
const Simd<T, N, 0> d;
const RebindToSigned<decltype(d)> di;
const VFromD<decltype(di)> vi = BitCast(di, v);
@@ -5747,8 +6004,9 @@ HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
}
// Returns whether normal/subnormal/zero.
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
+template <typename T, size_t N>
HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
+ static_assert(IsFloat<T>(), "Only for float");
const Simd<T, N, 0> d;
const RebindToUnsigned<decltype(d)> du;
const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
@@ -5844,8 +6102,8 @@ HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
// Non-full byte, need to clear the undefined upper bits.
if (N < 8) {
- const int mask = (1 << N) - 1;
- bits[0] = static_cast<uint8_t>(bits[0] & mask);
+ const int mask_bits = (1 << N) - 1;
+ bits[0] = static_cast<uint8_t>(bits[0] & mask_bits);
}
return kNumBytes;
@@ -5863,6 +6121,13 @@ HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */,
}
template <typename T, size_t N>
+HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> /* tag */,
+ const Mask128<T, N> mask) {
+ const uint32_t mask_bits = static_cast<uint32_t>(mask.raw) & ((1u << N) - 1);
+ return Num0BitsBelowLS1Bit_Nonzero32(mask_bits);
+}
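
FindKnownFirstTrue masks off the undefined upper bits and returns the index of the lowest set bit; Num0BitsBelowLS1Bit_Nonzero32 is a count-trailing-zeros. A scalar sketch (the "Known" contract means the caller guarantees at least one lane is true; num_lanes < 32 assumed):

#include <cassert>
#include <cstddef>
#include <cstdint>

size_t FirstTrueLane(uint32_t mask_bits, size_t num_lanes) {
  mask_bits &= (1u << num_lanes) - 1;  // clear undefined upper bits
  assert(mask_bits != 0);              // precondition: at least one lane true
  size_t i = 0;
  while (!(mask_bits & 1u)) {          // count trailing zeros
    mask_bits >>= 1;
    ++i;
  }
  return i;
}
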
+
+template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
const uint32_t mask_bits = static_cast<uint32_t>(mask.raw) & ((1u << N) - 1);
@@ -5983,6 +6248,12 @@ HWY_INLINE Vec128<uint16_t> IndicesForCompress16(uint64_t mask_bits) {
} // namespace detail
#endif // HWY_TARGET != HWY_AVX3_DL
+// Single lane: no-op
+template <typename T>
+HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
+ return v;
+}
+
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
const Simd<T, N, 0> d;
@@ -6003,28 +6274,47 @@ HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
return Vec128<T, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)};
}
-template <size_t N>
+template <size_t N, HWY_IF_GE64(float, N)>
HWY_API Vec128<float, N> Compress(Vec128<float, N> v, Mask128<float, N> mask) {
return Vec128<float, N>{_mm_maskz_compress_ps(mask.raw, v.raw)};
}
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
HWY_DASSERT(mask.raw < 4);
// There are only 2 lanes, so we can afford to load the index vector directly.
- alignas(16) constexpr uint8_t packed_array[64] = {
+ alignas(16) constexpr uint8_t u8_indices[64] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
- const Simd<T, N, 0> d;
+ const Full128<T> d;
const Repartition<uint8_t, decltype(d)> d8;
- const auto index = Load(d8, packed_array + 16 * mask.raw);
+ const auto index = Load(d8, u8_indices + 16 * mask.raw);
return BitCast(d, TableLookupBytes(BitCast(d8, v), index));
}
+// ------------------------------ CompressNot (Compress)
+
+// Single lane: no-op
+template <typename T>
+HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
+ return v;
+}
+
+template <typename T, size_t N>
+HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
+ return Compress(v, Not(mask));
+}
+
+// ------------------------------ CompressBlocksNot
+HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
+ Mask128<uint64_t> /* m */) {
+ return v;
+}
+
// ------------------------------ CompressBits (LoadMaskBits)
template <typename T, size_t N>
@@ -6303,6 +6593,13 @@ HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */,
}
template <typename T, size_t N>
+HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> /* tag */,
+ const Mask128<T, N> mask) {
+ const uint64_t mask_bits = detail::BitsFromMask(mask);
+ return Num0BitsBelowLS1Bit_Nonzero64(mask_bits);
+}
+
+template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
const uint64_t mask_bits = detail::BitsFromMask(mask);
@@ -6313,6 +6610,7 @@ HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
namespace detail {
+// Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6.
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
HWY_DASSERT(mask_bits < 256);
@@ -6328,6 +6626,7 @@ HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
// Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
// is likely more costly than the higher cache footprint from storing bytes.
alignas(16) constexpr uint8_t table[2048] = {
+ // PrintCompress16x8Tables
0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, //
4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, //
@@ -6462,12 +6761,164 @@ HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
return BitCast(d, pairs + Set(du, 0x0100));
}
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
+HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
+ uint64_t mask_bits) {
+ HWY_DASSERT(mask_bits < 256);
+ const Rebind<uint8_t, decltype(d)> d8;
+ const Simd<uint16_t, N, 0> du;
+
+ // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
+ // byte indices for PSHUFB (one vector's worth for each of 256 combinations of
+ // 8 mask bits). Loading them directly would require 4 KiB. We can instead
+ // store lane indices and convert to byte indices (2*lane + 0..1), with the
+ // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
+ // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
+ // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
+ // is likely more costly than the higher cache footprint from storing bytes.
+ alignas(16) constexpr uint8_t table[2048] = {
+ // PrintCompressNot16x8Tables
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, //
+ 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, //
+ 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, //
+ 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, //
+ 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, //
+ 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, //
+ 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, //
+ 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, //
+ 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, //
+ 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, //
+ 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, //
+ 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, //
+ 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, //
+ 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, //
+ 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, //
+ 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, //
+ 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, //
+ 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, //
+ 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, //
+ 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, //
+ 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, //
+ 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, //
+ 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, //
+ 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, //
+ 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, //
+ 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, //
+ 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, //
+ 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, //
+ 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, //
+ 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, //
+ 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, //
+ 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, //
+ 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, //
+ 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, //
+ 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, //
+ 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, //
+ 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, //
+ 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, //
+ 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, //
+ 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, //
+ 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, //
+ 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, //
+ 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, //
+ 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, //
+ 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, //
+ 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, //
+ 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, //
+ 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, //
+ 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, //
+ 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, //
+ 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, //
+ 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, //
+ 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, //
+ 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, //
+ 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, //
+ 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, //
+ 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, //
+ 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, //
+ 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, //
+ 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, //
+ 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, //
+ 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, //
+ 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, //
+ 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, //
+ 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, //
+ 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, //
+ 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, //
+ 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, //
+ 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, //
+ 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, //
+ 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, //
+ 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, //
+ 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, //
+ 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, //
+ 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, //
+ 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, //
+ 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, //
+ 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, //
+ 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, //
+ 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, //
+ 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, //
+ 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, //
+ 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, //
+ 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, //
+ 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, //
+ 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, //
+ 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, //
+ 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, //
+ 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, //
+ 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, //
+ 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, //
+ 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, //
+ 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, //
+ 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, //
+ 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, //
+ 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, //
+ 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, //
+ 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, //
+ 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, //
+ 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, //
+ 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, //
+ 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, //
+ 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, //
+ 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, //
+ 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, //
+ 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, //
+ 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, //
+ 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, //
+ 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, //
+ 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, //
+ 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, //
+ 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, //
+ 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, //
+ 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, //
+ 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, //
+ 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, //
+ 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, //
+ 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, //
+ 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, //
+ 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, //
+ 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, //
+ 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14};
+
+ const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
+ const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
+ return BitCast(d, pairs + Set(du, 0x0100));
+}
+
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
HWY_DASSERT(mask_bits < 16);
// There are only 4 lanes, so we can afford to load the index vector directly.
- alignas(16) constexpr uint8_t packed_array[256] = {
+ alignas(16) constexpr uint8_t u8_indices[256] = {
+ // PrintCompress32x4Tables
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //
4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, //
@@ -6486,7 +6937,35 @@ HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
const Repartition<uint8_t, decltype(d)> d8;
- return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
+ return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
+HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
+ uint64_t mask_bits) {
+ HWY_DASSERT(mask_bits < 16);
+
+ // There are only 4 lanes, so we can afford to load the index vector directly.
+ alignas(16) constexpr uint8_t u8_indices[256] = {
+ // PrintCompressNot32x4Tables
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5,
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3,
+ 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+ 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
+ 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1,
+ 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1,
+ 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
+ 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5,
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3,
+ 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15};
+
+ const Repartition<uint8_t, decltype(d)> d8;
+ return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
@@ -6494,14 +6973,32 @@ HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
HWY_DASSERT(mask_bits < 4);
// There are only 2 lanes, so we can afford to load the index vector directly.
- alignas(16) constexpr uint8_t packed_array[64] = {
+ alignas(16) constexpr uint8_t u8_indices[64] = {
+ // PrintCompress64x2Tables
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
const Repartition<uint8_t, decltype(d)> d8;
- return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
+ return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
+}
+
+template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
+HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
+ uint64_t mask_bits) {
+ HWY_DASSERT(mask_bits < 4);
+
+ // There are only 2 lanes, so we can afford to load the index vector directly.
+ alignas(16) constexpr uint8_t u8_indices[64] = {
+ // PrintCompressNot64x2Tables
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+ const Repartition<uint8_t, decltype(d)> d8;
+ return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
template <typename T, size_t N>
@@ -6514,11 +7011,75 @@ HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) {
return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
}
+template <typename T, size_t N>
+HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) {
+ const Simd<T, N, 0> d;
+ const RebindToUnsigned<decltype(d)> du;
+
+ HWY_DASSERT(mask_bits < (1ull << N));
+ const auto indices = BitCast(du, detail::IndicesFromNotBits(d, mask_bits));
+ return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
+}
+
} // namespace detail
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> m) {
- return detail::CompressBits(v, detail::BitsFromMask(m));
+// Single lane: no-op
+template <typename T>
+HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
+ return v;
+}
+
+// Two lanes: conditional swap
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
+ // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
+ const Full128<T> d;
+ const Vec128<T> m = VecFromMask(d, mask);
+ const Vec128<T> maskL = DupEven(m);
+ const Vec128<T> maskH = DupOdd(m);
+ const Vec128<T> swap = AndNot(maskL, maskH);
+ return IfVecThenElse(swap, Shuffle01(v), v);
+}
+
+// General case
+template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
+HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
+ return detail::CompressBits(v, detail::BitsFromMask(mask));
+}
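
All Compress variants implement the semantic the index tables above encode: lanes whose mask bit is set move to the front in order, and CompressNot keeps the lanes whose bit is clear. A scalar reference, useful for validating the generated tables (sketch only):

#include <cstddef>
#include <cstdint>

// Scalar reference for Compress: selected lanes are packed to the front,
// preserving order; remaining output lanes are left unspecified here.
template <typename T>
size_t CompressScalar(const T* in, uint64_t mask_bits, size_t n, T* out) {
  size_t count = 0;
  for (size_t i = 0; i < n; ++i) {
    if ((mask_bits >> i) & 1) out[count++] = in[i];
  }
  return count;  // number of selected lanes
}
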
+
+// Single lane: no-op
+template <typename T>
+HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
+ return v;
+}
+
+// Two lanes: conditional swap
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
+ // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
+ const Full128<T> d;
+ const Vec128<T> m = VecFromMask(d, mask);
+ const Vec128<T> maskL = DupEven(m);
+ const Vec128<T> maskH = DupOdd(m);
+ const Vec128<T> swap = AndNot(maskH, maskL);
+ return IfVecThenElse(swap, Shuffle01(v), v);
+}
+
+// General case
+template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
+HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
+ // For partial vectors, we cannot pull the Not() into the table because
+ // BitsFromMask clears the upper bits.
+ if (N < 16 / sizeof(T)) {
+ return detail::CompressBits(v, detail::BitsFromMask(Not(mask)));
+ }
+ return detail::CompressNotBits(v, detail::BitsFromMask(mask));
+}
+
+// ------------------------------ CompressBlocksNot
+HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
+ Mask128<uint64_t> /* m */) {
+ return v;
}
template <typename T, size_t N>
@@ -6699,24 +7260,76 @@ HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
return Max(v10, v01);
}
-// u16/i16
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
-HWY_API Vec128<T, N> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
- const Repartition<int32_t, Simd<T, N, 0>> d32;
+template <size_t N, HWY_IF_GE32(uint16_t, N)>
+HWY_API Vec128<uint16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<uint16_t, N> v) {
+ const Simd<uint16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto min = MinOfLanes(d32, Min(even, odd));
+ const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
// Also broadcast into odd lanes.
- return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
+ return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
+}
+template <size_t N, HWY_IF_GE32(int16_t, N)>
+HWY_API Vec128<int16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<int16_t, N> v) {
+ const Simd<int16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
+ // Sign-extend
+ const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
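
The 16-bit reductions above work around missing 16-bit horizontal ops by widening: the even and odd 16-bit halves of each 32-bit lane are separated (zero- or sign-extended), reduced at 32 bits, and broadcast back into both 16-bit positions. A scalar sketch of the signed splitting step, mirroring the ShiftLeft/ShiftRight pair in the code above:

#include <cstdint>

// Splits one 32-bit lane into its even (low) and odd (high) int16 halves,
// each sign-extended to 32 bits.
void SplitI16Halves(uint32_t lane, int32_t* even, int32_t* odd) {
  *even = static_cast<int32_t>(lane << 16) >> 16;  // sign-extend low half
  *odd = static_cast<int32_t>(lane) >> 16;         // sign-extend high half
}
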
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_GE32(T, N)>
-HWY_API Vec128<T, N> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128<T, N> v) {
- const Repartition<int32_t, Simd<T, N, 0>> d32;
+
+template <size_t N, HWY_IF_GE32(uint16_t, N)>
+HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<uint16_t, N> v) {
+ const Simd<uint16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto min = MaxOfLanes(d32, Max(even, odd));
+ const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+}
+template <size_t N, HWY_IF_GE32(int16_t, N)>
+HWY_API Vec128<int16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<int16_t, N> v) {
+ const Simd<int16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
+ // Sign-extend
+ const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
// Also broadcast into odd lanes.
- return BitCast(Simd<T, N, 0>(), Or(min, ShiftLeft<16>(min)));
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+}
+
+template <size_t N, HWY_IF_GE32(uint16_t, N)>
+HWY_API Vec128<uint16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<uint16_t, N> v) {
+ const Simd<uint16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
+ const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+}
+template <size_t N, HWY_IF_GE32(int16_t, N)>
+HWY_API Vec128<int16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec128<int16_t, N> v) {
+ const Simd<int16_t, N, 0> d;
+ const RepartitionToWide<decltype(d)> d32;
+ // Sign-extend
+ const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
} // namespace detail
@@ -6742,7 +7355,8 @@ namespace detail {
// Returns vector-mask for Lt128. Also used by x86_256/x86_512.
template <class D, class V = VFromD<D>>
HWY_INLINE V Lt128Vec(const D d, const V a, const V b) {
- static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
+ static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
+ "D must be u64");
// Truth table of Eq and Lt for Hi and Lo u64.
// (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
// =H =L cH cL | out = cH | (=H & cL)
@@ -6756,92 +7370,116 @@ HWY_INLINE V Lt128Vec(const D d, const V a, const V b) {
// 1 0 0 0 | 0
// 1 0 0 1 | 1
// 1 1 0 0 | 0
- const V eqHL = VecFromMask(d, Eq(a, b));
+ const auto eqHL = Eq(a, b);
const V ltHL = VecFromMask(d, Lt(a, b));
const V ltLX = ShiftLeftLanes<1>(ltHL);
- const V vecHx = OrAnd(ltHL, eqHL, ltLX);
+ const V vecHx = IfThenElse(eqHL, ltLX, ltHL);
return InterleaveUpper(d, vecHx, vecHx);
}
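
Lt128Vec treats each 128-bit block as a pair of u64 lanes (high half in the upper lane) and realizes the truth table above: "high less" OR ("high equal" AND "low less"), broadcast to both lanes. A scalar model of that comparison:

#include <cassert>
#include <cstdint>

// 128-bit unsigned less-than built from two 64-bit halves.
static bool Lt128Scalar(uint64_t a_hi, uint64_t a_lo, uint64_t b_hi, uint64_t b_lo) {
  return (a_hi < b_hi) || (a_hi == b_hi && a_lo < b_lo);
}

int main() {
  assert(Lt128Scalar(0, 5, 1, 0));   // smaller high half wins
  assert(Lt128Scalar(7, 1, 7, 2));   // equal highs: low half decides
  assert(!Lt128Scalar(7, 3, 7, 2));
}
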
-} // namespace detail
-
+// Returns vector-mask for Eq128. Also used by x86_256/x86_512.
template <class D, class V = VFromD<D>>
-HWY_API MFromD<D> Lt128(D d, const V a, const V b) {
- return MaskFromVec(detail::Lt128Vec(d, a, b));
+HWY_INLINE V Eq128Vec(const D d, const V a, const V b) {
+ static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
+ "D must be u64");
+ const auto eqHL = VecFromMask(d, Eq(a, b));
+ const auto eqLH = Reverse2(d, eqHL);
+ return And(eqHL, eqLH);
}
-// ------------------------------ Min128, Max128 (Lt128)
+template <class D, class V = VFromD<D>>
+HWY_INLINE V Ne128Vec(const D d, const V a, const V b) {
+ static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
+ "D must be u64");
+ const auto neHL = VecFromMask(d, Ne(a, b));
+ const auto neLH = Reverse2(d, neHL);
+ return Or(neHL, neLH);
+}
-// Avoids the extra MaskFromVec in Lt128.
template <class D, class V = VFromD<D>>
-HWY_API V Min128(D d, const V a, const V b) {
- return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
+HWY_INLINE V Lt128UpperVec(const D d, const V a, const V b) {
+ // No specialization required for AVX-512: Mask <-> Vec is fast, and
+ // copying mask bits to their neighbor seems infeasible.
+ const V ltHL = VecFromMask(d, Lt(a, b));
+ return InterleaveUpper(d, ltHL, ltHL);
}
template <class D, class V = VFromD<D>>
-HWY_API V Max128(D d, const V a, const V b) {
- return IfVecThenElse(detail::Lt128Vec(d, a, b), b, a);
+HWY_INLINE V Eq128UpperVec(const D d, const V a, const V b) {
+ // No specialization required for AVX-512: Mask <-> Vec is fast, and
+ // copying mask bits to their neighbor seems infeasible.
+ const V eqHL = VecFromMask(d, Eq(a, b));
+ return InterleaveUpper(d, eqHL, eqHL);
}
-// ================================================== Operator wrapper
+template <class D, class V = VFromD<D>>
+HWY_INLINE V Ne128UpperVec(const D d, const V a, const V b) {
+ // No specialization required for AVX-512: Mask <-> Vec is fast, and
+ // copying mask bits to their neighbor seems infeasible.
+ const V neHL = VecFromMask(d, Ne(a, b));
+ return InterleaveUpper(d, neHL, neHL);
+}
-// These apply to all x86_*-inl.h because there are no restrictions on V.
+} // namespace detail
-template <class V>
-HWY_API V Add(V a, V b) {
- return a + b;
-}
-template <class V>
-HWY_API V Sub(V a, V b) {
- return a - b;
+template <class D, class V = VFromD<D>>
+HWY_API MFromD<D> Lt128(D d, const V a, const V b) {
+ return MaskFromVec(detail::Lt128Vec(d, a, b));
}
-template <class V>
-HWY_API V Mul(V a, V b) {
- return a * b;
-}
-template <class V>
-HWY_API V Div(V a, V b) {
- return a / b;
+template <class D, class V = VFromD<D>>
+HWY_API MFromD<D> Eq128(D d, const V a, const V b) {
+ return MaskFromVec(detail::Eq128Vec(d, a, b));
}
-template <class V>
-V Shl(V a, V b) {
- return a << b;
+template <class D, class V = VFromD<D>>
+HWY_API MFromD<D> Ne128(D d, const V a, const V b) {
+ return MaskFromVec(detail::Ne128Vec(d, a, b));
}
-template <class V>
-V Shr(V a, V b) {
- return a >> b;
+
+template <class D, class V = VFromD<D>>
+HWY_API MFromD<D> Lt128Upper(D d, const V a, const V b) {
+ return MaskFromVec(detail::Lt128UpperVec(d, a, b));
}
-template <class V>
-HWY_API auto Eq(V a, V b) -> decltype(a == b) {
- return a == b;
+template <class D, class V = VFromD<D>>
+HWY_API MFromD<D> Eq128Upper(D d, const V a, const V b) {
+ return MaskFromVec(detail::Eq128UpperVec(d, a, b));
}
-template <class V>
-HWY_API auto Ne(V a, V b) -> decltype(a == b) {
- return a != b;
+
+template <class D, class V = VFromD<D>>
+HWY_API MFromD<D> Ne128Upper(D d, const V a, const V b) {
+ return MaskFromVec(detail::Ne128UpperVec(d, a, b));
}
-template <class V>
-HWY_API auto Lt(V a, V b) -> decltype(a == b) {
- return a < b;
+
+// ------------------------------ Min128, Max128 (Lt128)
+
+// Avoids the extra MaskFromVec in Lt128.
+template <class D, class V = VFromD<D>>
+HWY_API V Min128(D d, const V a, const V b) {
+ return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
}
-template <class V>
-HWY_API auto Gt(V a, V b) -> decltype(a == b) {
- return a > b;
+template <class D, class V = VFromD<D>>
+HWY_API V Max128(D d, const V a, const V b) {
+ return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
}
-template <class V>
-HWY_API auto Ge(V a, V b) -> decltype(a == b) {
- return a >= b;
+
+template <class D, class V = VFromD<D>>
+HWY_API V Min128Upper(D d, const V a, const V b) {
+ return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b);
}
-template <class V>
-HWY_API auto Le(V a, V b) -> decltype(a == b) {
- return a <= b;
+template <class D, class V = VFromD<D>>
+HWY_API V Max128Upper(D d, const V a, const V b) {
+ return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
+
+// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
+// the warning seems to be issued at the call site of intrinsics, i.e. our code.
+HWY_DIAGNOSTICS(pop)
diff --git a/media/highway/src/hwy/ops/x86_256-inl.h b/media/highway/src/hwy/ops/x86_256-inl.h
index d41d256daa..12a83cbfc6 100644
--- a/media/highway/src/hwy/ops/x86_256-inl.h
+++ b/media/highway/src/hwy/ops/x86_256-inl.h
@@ -20,11 +20,20 @@
// WARNING: most operations do not cross 128-bit block boundaries. In
// particular, "Broadcast", pack and zip behavior may be surprising.
+// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_CLANGCL
+#include "hwy/base.h"
+
+// Avoid uninitialized warnings in GCC's avx512fintrin.h - see
+// https://github.com/google/highway/issues/710
+HWY_DIAGNOSTICS(push)
+#if HWY_COMPILER_GCC_ACTUAL
+HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
+HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
+#endif
+
// Must come before HWY_COMPILER_CLANGCL
#include <immintrin.h> // AVX2+
-#include "hwy/base.h"
-
#if HWY_COMPILER_CLANGCL
// Including <immintrin.h> should be enough, but Clang's headers helpfully skip
// including these headers when _MSC_VER is defined, like when using clang-cl.
@@ -40,6 +49,7 @@
#include <stddef.h>
#include <stdint.h>
+#include <string.h> // memcpy
#if HWY_IS_MSAN
#include <sanitizer/msan_interface.h>
@@ -595,8 +605,9 @@ HWY_API Vec256<double> IfThenZeroElse(Mask256<double> mask, Vec256<double> no) {
return Vec256<double>{_mm256_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
}
-template <typename T, HWY_IF_FLOAT(T)>
+template <typename T>
HWY_API Vec256<T> ZeroIfNegative(const Vec256<T> v) {
+ static_assert(IsSigned<T>(), "Only for float");
// AVX3 MaskFromVec only looks at the MSB
return IfThenZeroElse(MaskFromVec(v), v);
}
@@ -753,6 +764,43 @@ HWY_INLINE Mask256<T> Xor(hwy::SizeTag<8> /*tag*/, const Mask256<T> a,
#endif
}
+template <typename T>
+HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<1> /*tag*/,
+ const Mask256<T> a, const Mask256<T> b) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+ return Mask256<T>{_kxnor_mask32(a.raw, b.raw)};
+#else
+ return Mask256<T>{static_cast<__mmask32>(~(a.raw ^ b.raw) & 0xFFFFFFFF)};
+#endif
+}
+template <typename T>
+HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<2> /*tag*/,
+ const Mask256<T> a, const Mask256<T> b) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+ return Mask256<T>{_kxnor_mask16(a.raw, b.raw)};
+#else
+ return Mask256<T>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
+#endif
+}
+template <typename T>
+HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<4> /*tag*/,
+ const Mask256<T> a, const Mask256<T> b) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+ return Mask256<T>{_kxnor_mask8(a.raw, b.raw)};
+#else
+ return Mask256<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
+#endif
+}
+template <typename T>
+HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
+ const Mask256<T> a, const Mask256<T> b) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+ return Mask256<T>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)};
+#else
+ return Mask256<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)};
+#endif
+}
+
} // namespace detail
template <typename T>
@@ -782,6 +830,11 @@ HWY_API Mask256<T> Not(const Mask256<T> m) {
return Xor(m, Mask256<T>::FromBits((1ull << N) - 1));
}
+template <typename T>
+HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
+ return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
+}
+
#else // AVX2
// ------------------------------ Mask
@@ -833,8 +886,9 @@ HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
return AndNot(VecFromMask(Full256<T>(), mask), no);
}
-template <typename T, HWY_IF_FLOAT(T)>
+template <typename T>
HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
+ static_assert(IsSigned<T>(), "Only for float");
const auto zero = Zero(Full256<T>());
// AVX2 IfThenElse only looks at the MSB for 32/64-bit lanes
return IfThenElse(MaskFromVec(v), zero, v);
@@ -871,6 +925,12 @@ HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
+template <typename T>
+HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
+ const Full256<T> d;
+ return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
+}
+
#endif // HWY_TARGET <= HWY_AVX3
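The two ExclusiveNeither paths above are only equivalent when a and b are not both true: the AVX-512 k-register form computes ~(a ^ b) while the AVX2 fallback computes ~a & ~b, and these agree exactly under that precondition. A single-bit check of the equivalence (illustration only, not part of this patch):

static_assert((~(0u ^ 0u) & 1u) == ((~0u & ~0u) & 1u), "both false: forms agree");
static_assert((~(1u ^ 0u) & 1u) == ((~1u & ~0u) & 1u), "only a set: forms agree");
static_assert((~(0u ^ 1u) & 1u) == ((~0u & ~1u) & 1u), "only b set: forms agree");
// With both bits set, ~(a ^ b) is 1 but ~a & ~b is 0, hence the precondition.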
// ================================================== COMPARE
@@ -1135,11 +1195,10 @@ HWY_API Mask256<double> operator==(const Vec256<double> a,
// ------------------------------ Inequality
-template <typename T, HWY_IF_NOT_FLOAT(T)>
+template <typename T>
HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
return Not(a == b);
}
-
HWY_API Mask256<float> operator!=(const Vec256<float> a,
const Vec256<float> b) {
return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_NEQ_OQ)};
@@ -1151,6 +1210,9 @@ HWY_API Mask256<double> operator!=(const Vec256<double> a,
// ------------------------------ Strict inequality
+// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
+namespace detail {
+
// Pre-9.3 GCC immintrin.h uses char, which may be unsigned, causing cmpgt_epi8
// to perform an unsigned comparison instead of the intended signed. Workaround
// is to cast to an explicitly signed type. See https://godbolt.org/z/PL7Ujy
@@ -1160,7 +1222,8 @@ HWY_API Mask256<double> operator!=(const Vec256<double> a,
#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 0
#endif
-HWY_API Mask256<int8_t> operator>(Vec256<int8_t> a, Vec256<int8_t> b) {
+HWY_API Mask256<int8_t> Gt(hwy::SignedTag /*tag*/, Vec256<int8_t> a,
+ Vec256<int8_t> b) {
#if HWY_AVX2_GCC_CMPGT8_WORKAROUND
using i8x32 = signed char __attribute__((__vector_size__(32)));
return Mask256<int8_t>{static_cast<__m256i>(reinterpret_cast<i8x32>(a.raw) >
@@ -1169,34 +1232,43 @@ HWY_API Mask256<int8_t> operator>(Vec256<int8_t> a, Vec256<int8_t> b) {
return Mask256<int8_t>{_mm256_cmpgt_epi8(a.raw, b.raw)};
#endif
}
-HWY_API Mask256<int16_t> operator>(const Vec256<int16_t> a,
- const Vec256<int16_t> b) {
+HWY_API Mask256<int16_t> Gt(hwy::SignedTag /*tag*/, Vec256<int16_t> a,
+ Vec256<int16_t> b) {
return Mask256<int16_t>{_mm256_cmpgt_epi16(a.raw, b.raw)};
}
-HWY_API Mask256<int32_t> operator>(const Vec256<int32_t> a,
- const Vec256<int32_t> b) {
+HWY_API Mask256<int32_t> Gt(hwy::SignedTag /*tag*/, Vec256<int32_t> a,
+ Vec256<int32_t> b) {
return Mask256<int32_t>{_mm256_cmpgt_epi32(a.raw, b.raw)};
}
-HWY_API Mask256<int64_t> operator>(const Vec256<int64_t> a,
- const Vec256<int64_t> b) {
+HWY_API Mask256<int64_t> Gt(hwy::SignedTag /*tag*/, Vec256<int64_t> a,
+ Vec256<int64_t> b) {
return Mask256<int64_t>{_mm256_cmpgt_epi64(a.raw, b.raw)};
}
-template <typename T, HWY_IF_UNSIGNED(T)>
-HWY_API Mask256<T> operator>(const Vec256<T> a, const Vec256<T> b) {
+template <typename T>
+HWY_INLINE Mask256<T> Gt(hwy::UnsignedTag /*tag*/, Vec256<T> a, Vec256<T> b) {
const Full256<T> du;
const RebindToSigned<decltype(du)> di;
const Vec256<T> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
}
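The unsigned overload above uses the classic trick of XORing both operands with the sign bit, which maps unsigned order onto signed order so the signed compare path can be reused. A scalar sketch for one 8-bit lane (UnsignedGtViaSigned is a hypothetical name, not part of this patch):

#include <stdint.h>

static inline bool UnsignedGtViaSigned(uint8_t a, uint8_t b) {
  // Flipping the MSB maps [0, 255] onto [-128, 127] monotonically.
  return static_cast<int8_t>(a ^ 0x80u) > static_cast<int8_t>(b ^ 0x80u);
}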
-HWY_API Mask256<float> operator>(const Vec256<float> a, const Vec256<float> b) {
+HWY_API Mask256<float> Gt(hwy::FloatTag /*tag*/, Vec256<float> a,
+ Vec256<float> b) {
return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GT_OQ)};
}
-HWY_API Mask256<double> operator>(Vec256<double> a, Vec256<double> b) {
+HWY_API Mask256<double> Gt(hwy::FloatTag /*tag*/, Vec256<double> a,
+ Vec256<double> b) {
return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GT_OQ)};
}
+} // namespace detail
+
+template <typename T>
+HWY_API Mask256<T> operator>(Vec256<T> a, Vec256<T> b) {
+ return detail::Gt(hwy::TypeTag<T>(), a, b);
+}
+
// ------------------------------ Weak inequality
HWY_API Mask256<float> operator>=(const Vec256<float> a,
@@ -1857,16 +1929,27 @@ HWY_API Vec256<int8_t> ShiftRightSame(Vec256<int8_t> v, const int bits) {
// ------------------------------ Neg (Xor, Sub)
-template <typename T, HWY_IF_FLOAT(T)>
-HWY_API Vec256<T> Neg(const Vec256<T> v) {
+// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
+namespace detail {
+
+template <typename T>
+HWY_INLINE Vec256<T> Neg(hwy::FloatTag /*tag*/, const Vec256<T> v) {
return Xor(v, SignBit(Full256<T>()));
}
-template <typename T, HWY_IF_NOT_FLOAT(T)>
-HWY_API Vec256<T> Neg(const Vec256<T> v) {
+// Not floating-point
+template <typename T>
+HWY_INLINE Vec256<T> Neg(hwy::NonFloatTag /*tag*/, const Vec256<T> v) {
return Zero(Full256<T>()) - v;
}
+} // namespace detail
+
+template <typename T>
+HWY_API Vec256<T> Neg(const Vec256<T> v) {
+ return detail::Neg(hwy::IsFloatTag<T>(), v);
+}
+
// ------------------------------ Floating-point mul / div
HWY_API Vec256<float> operator*(const Vec256<float> a, const Vec256<float> b) {
@@ -2065,8 +2148,9 @@ HWY_API Mask256<double> IsFinite(const Vec256<double> v) {
#else
-template <typename T, HWY_IF_FLOAT(T)>
+template <typename T>
HWY_API Mask256<T> IsInf(const Vec256<T> v) {
+ static_assert(IsFloat<T>(), "Only for float");
const Full256<T> d;
const RebindToSigned<decltype(d)> di;
const VFromD<decltype(di)> vi = BitCast(di, v);
@@ -2075,8 +2159,9 @@ HWY_API Mask256<T> IsInf(const Vec256<T> v) {
}
// Returns whether normal/subnormal/zero.
-template <typename T, HWY_IF_FLOAT(T)>
+template <typename T>
HWY_API Mask256<T> IsFinite(const Vec256<T> v) {
+ static_assert(IsFloat<T>(), "Only for float");
const Full256<T> d;
const RebindToUnsigned<decltype(d)> du;
const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
@@ -2206,11 +2291,7 @@ HWY_API Vec256<double> MaskedLoad(Mask256<double> m, Full256<double> d,
// 3-cycle cost of moving data between 128-bit halves and avoids port 5.
template <typename T>
HWY_API Vec256<T> LoadDup128(Full256<T> /* tag */, const T* HWY_RESTRICT p) {
-#if HWY_LOADDUP_ASM
- __m256i out;
- asm("vbroadcasti128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
- return Vec256<T>{out};
-#elif HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG && HWY_COMPILER_MSVC < 1931
+#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
// Workaround for incorrect results with _mm256_broadcastsi128_si256. Note
// that MSVC also lacks _mm256_zextsi128_si256, but cast (which leaves the
// upper half undefined) is fine because we're overwriting that anyway.
@@ -2225,11 +2306,7 @@ HWY_API Vec256<T> LoadDup128(Full256<T> /* tag */, const T* HWY_RESTRICT p) {
}
HWY_API Vec256<float> LoadDup128(Full256<float> /* tag */,
const float* const HWY_RESTRICT p) {
-#if HWY_LOADDUP_ASM
- __m256 out;
- asm("vbroadcastf128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
- return Vec256<float>{out};
-#elif HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG && HWY_COMPILER_MSVC < 1931
+#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
const __m128 v128 = LoadU(Full128<float>(), p).raw;
return Vec256<float>{
_mm256_insertf128_ps(_mm256_castps128_ps256(v128), v128, 1)};
@@ -2239,11 +2316,7 @@ HWY_API Vec256<float> LoadDup128(Full256<float> /* tag */,
}
HWY_API Vec256<double> LoadDup128(Full256<double> /* tag */,
const double* const HWY_RESTRICT p) {
-#if HWY_LOADDUP_ASM
- __m256d out;
- asm("vbroadcastf128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
- return Vec256<double>{out};
-#elif HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG && HWY_COMPILER_MSVC < 1931
+#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
const __m128d v128 = LoadU(Full128<double>(), p).raw;
return Vec256<double>{
_mm256_insertf128_pd(_mm256_castpd128_pd256(v128), v128, 1)};
@@ -2344,7 +2417,7 @@ HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
Store(BitCast(du, VecFromMask(d, m)), du, mask);
for (size_t i = 0; i < 32 / sizeof(T); ++i) {
if (mask[i]) {
- CopyBytes<sizeof(T)>(buf + i, p + i);
+ CopySameSize(buf + i, p + i);
}
}
}
@@ -2651,33 +2724,43 @@ HWY_API T GetLane(const Vec256<T> v) {
// compiler could decide to optimize out code that relies on this.
//
// The newer _mm256_zextsi128_si256 intrinsic fixes this by specifying the
-// zeroing, but it is not available on MSVC nor GCC until 10.1. For older GCC,
-// we can still obtain the desired code thanks to pattern recognition; note that
-// the expensive insert instruction is not actually generated, see
-// https://gcc.godbolt.org/z/1MKGaP.
+// zeroing, but it is not available on MSVC until 15.7 nor GCC until 10.1. For
+// older GCC, we can still obtain the desired code thanks to pattern
+// recognition; note that the expensive insert instruction is not actually
+// generated, see https://gcc.godbolt.org/z/1MKGaP.
+
+#if !defined(HWY_HAVE_ZEXT)
+#if (HWY_COMPILER_MSVC && HWY_COMPILER_MSVC >= 1915) || \
+ (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 500) || \
+ (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1000)
+#define HWY_HAVE_ZEXT 1
+#else
+#define HWY_HAVE_ZEXT 0
+#endif
+#endif // defined(HWY_HAVE_ZEXT)
template <typename T>
HWY_API Vec256<T> ZeroExtendVector(Full256<T> /* tag */, Vec128<T> lo) {
-#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
- return Vec256<T>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
+#if HWY_HAVE_ZEXT
+ return Vec256<T>{_mm256_zextsi128_si256(lo.raw)};
#else
- return Vec256<T>{_mm256_zextsi128_si256(lo.raw)};
+ return Vec256<T>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
#endif
}
HWY_API Vec256<float> ZeroExtendVector(Full256<float> /* tag */,
Vec128<float> lo) {
-#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
- return Vec256<float>{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.raw, 0)};
-#else
+#if HWY_HAVE_ZEXT
return Vec256<float>{_mm256_zextps128_ps256(lo.raw)};
+#else
+ return Vec256<float>{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.raw, 0)};
#endif
}
HWY_API Vec256<double> ZeroExtendVector(Full256<double> /* tag */,
Vec128<double> lo) {
-#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
- return Vec256<double>{_mm256_insertf128_pd(_mm256_setzero_pd(), lo.raw, 0)};
-#else
+#if HWY_HAVE_ZEXT
return Vec256<double>{_mm256_zextpd128_pd256(lo.raw)};
+#else
+ return Vec256<double>{_mm256_insertf128_pd(_mm256_setzero_pd(), lo.raw, 0)};
#endif
}
@@ -3657,12 +3740,14 @@ HWY_API Vec256<TI> TableLookupBytes(const Vec128<T, N> bytes,
// ------------------------------ Shl (Mul, ZipLower)
-#if HWY_TARGET > HWY_AVX3 // AVX2 or older
namespace detail {
+#if HWY_TARGET > HWY_AVX3 // AVX2 or older
+
// Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
+template <typename T>
HWY_INLINE Vec256<MakeUnsigned<T>> Pow2(const Vec256<T> v) {
+ static_assert(sizeof(T) == 2, "Only for 16-bit");
const Full256<T> d;
const RepartitionToWide<decltype(d)> dw;
const Rebind<float, decltype(dw)> df;
@@ -3680,63 +3765,66 @@ HWY_INLINE Vec256<MakeUnsigned<T>> Pow2(const Vec256<T> v) {
return Vec256<MakeUnsigned<T>>{_mm256_packus_epi32(bits0.raw, bits1.raw)};
}
-} // namespace detail
#endif // HWY_TARGET > HWY_AVX3
-HWY_API Vec256<uint16_t> operator<<(const Vec256<uint16_t> v,
- const Vec256<uint16_t> bits) {
+HWY_INLINE Vec256<uint16_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint16_t> v,
+ Vec256<uint16_t> bits) {
#if HWY_TARGET <= HWY_AVX3
return Vec256<uint16_t>{_mm256_sllv_epi16(v.raw, bits.raw)};
#else
- return v * detail::Pow2(bits);
+ return v * Pow2(bits);
#endif
}
-HWY_API Vec256<uint32_t> operator<<(const Vec256<uint32_t> v,
- const Vec256<uint32_t> bits) {
+HWY_INLINE Vec256<uint32_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint32_t> v,
+ Vec256<uint32_t> bits) {
return Vec256<uint32_t>{_mm256_sllv_epi32(v.raw, bits.raw)};
}
-HWY_API Vec256<uint64_t> operator<<(const Vec256<uint64_t> v,
- const Vec256<uint64_t> bits) {
+HWY_INLINE Vec256<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint64_t> v,
+ Vec256<uint64_t> bits) {
return Vec256<uint64_t>{_mm256_sllv_epi64(v.raw, bits.raw)};
}
-// Signed left shift is the same as unsigned.
-template <typename T, HWY_IF_SIGNED(T)>
-HWY_API Vec256<T> operator<<(const Vec256<T> v, const Vec256<T> bits) {
+template <typename T>
+HWY_INLINE Vec256<T> Shl(hwy::SignedTag /*tag*/, Vec256<T> v, Vec256<T> bits) {
+ // Signed left shifts are the same as unsigned.
const Full256<T> di;
const Full256<MakeUnsigned<T>> du;
- return BitCast(di, BitCast(du, v) << BitCast(du, bits));
+ return BitCast(di,
+ Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
+}
+
+} // namespace detail
+
+template <typename T>
+HWY_API Vec256<T> operator<<(Vec256<T> v, Vec256<T> bits) {
+ return detail::Shl(hwy::TypeTag<T>(), v, bits);
}
// ------------------------------ Shr (MulHigh, IfThenElse, Not)
-HWY_API Vec256<uint16_t> operator>>(const Vec256<uint16_t> v,
- const Vec256<uint16_t> bits) {
+HWY_API Vec256<uint16_t> operator>>(Vec256<uint16_t> v, Vec256<uint16_t> bits) {
#if HWY_TARGET <= HWY_AVX3
return Vec256<uint16_t>{_mm256_srlv_epi16(v.raw, bits.raw)};
#else
- const Full256<uint16_t> d;
+ Full256<uint16_t> d;
// For bits=0, we cannot mul by 2^16, so fix the result later.
- const auto out = MulHigh(v, detail::Pow2(Set(d, 16) - bits));
+ auto out = MulHigh(v, detail::Pow2(Set(d, 16) - bits));
// Replace output with input where bits == 0.
return IfThenElse(bits == Zero(d), v, out);
#endif
}
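The 16-bit shift emulations above rely on two identities: shifting left by n is multiplying by 2^n, and shifting right by n is taking the upper 16 bits of a multiply by 2^(16-n); the bits == 0 case is patched afterwards because 2^16 does not fit in a 16-bit lane. Scalar sketches of both identities (hypothetical helpers, not part of this patch):

#include <stdint.h>

static inline uint16_t ShlViaMul(uint16_t v, unsigned n) {  // n in [0, 16)
  return static_cast<uint16_t>(v * (1u << n));
}

static inline uint16_t ShrViaMulHigh(uint16_t v, unsigned n) {  // n in [1, 16)
  const uint32_t pow2 = 1u << (16 - n);            // what Pow2(16 - n) produces
  return static_cast<uint16_t>((v * pow2) >> 16);  // equals v >> n
}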
-HWY_API Vec256<uint32_t> operator>>(const Vec256<uint32_t> v,
- const Vec256<uint32_t> bits) {
+HWY_API Vec256<uint32_t> operator>>(Vec256<uint32_t> v, Vec256<uint32_t> bits) {
return Vec256<uint32_t>{_mm256_srlv_epi32(v.raw, bits.raw)};
}
-HWY_API Vec256<uint64_t> operator>>(const Vec256<uint64_t> v,
- const Vec256<uint64_t> bits) {
+HWY_API Vec256<uint64_t> operator>>(Vec256<uint64_t> v, Vec256<uint64_t> bits) {
return Vec256<uint64_t>{_mm256_srlv_epi64(v.raw, bits.raw)};
}
-HWY_API Vec256<int16_t> operator>>(const Vec256<int16_t> v,
- const Vec256<int16_t> bits) {
+HWY_API Vec256<int16_t> operator>>(Vec256<int16_t> v, Vec256<int16_t> bits) {
#if HWY_TARGET <= HWY_AVX3
return Vec256<int16_t>{_mm256_srav_epi16(v.raw, bits.raw)};
#else
@@ -3744,13 +3832,11 @@ HWY_API Vec256<int16_t> operator>>(const Vec256<int16_t> v,
#endif
}
-HWY_API Vec256<int32_t> operator>>(const Vec256<int32_t> v,
- const Vec256<int32_t> bits) {
+HWY_API Vec256<int32_t> operator>>(Vec256<int32_t> v, Vec256<int32_t> bits) {
return Vec256<int32_t>{_mm256_srav_epi32(v.raw, bits.raw)};
}
-HWY_API Vec256<int64_t> operator>>(const Vec256<int64_t> v,
- const Vec256<int64_t> bits) {
+HWY_API Vec256<int64_t> operator>>(Vec256<int64_t> v, Vec256<int64_t> bits) {
#if HWY_TARGET <= HWY_AVX3
return Vec256<int64_t>{_mm256_srav_epi64(v.raw, bits.raw)};
#else
@@ -3835,6 +3921,14 @@ HWY_API Vec256<float> ReorderWidenMulAccumulate(Full256<float> df32,
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
+HWY_API Vec256<int32_t> ReorderWidenMulAccumulate(Full256<int32_t> /*d32*/,
+ Vec256<int16_t> a,
+ Vec256<int16_t> b,
+ const Vec256<int32_t> sum0,
+ Vec256<int32_t>& /*sum1*/) {
+ return sum0 + Vec256<int32_t>{_mm256_madd_epi16(a.raw, b.raw)};
+}
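The i16 overload above maps onto the pairwise multiply-add instruction: each 32-bit output lane receives sum0[i] + a[2i]*b[2i] + a[2i+1]*b[2i+1]. A scalar model of one output lane (MaddPair is a hypothetical name, not part of this patch):

#include <stdint.h>

static inline int32_t MaddPair(int32_t sum0, int16_t a0, int16_t b0,
                               int16_t a1, int16_t b1) {
  return sum0 + static_cast<int32_t>(a0) * b0 + static_cast<int32_t>(a1) * b1;
}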
+
// ================================================== CONVERT
// ------------------------------ Promotions (part w/ narrow lanes -> full)
@@ -4015,6 +4109,11 @@ HWY_API Vec256<bfloat16_t> ReorderDemote2To(Full256<bfloat16_t> dbf16,
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
+HWY_API Vec256<int16_t> ReorderDemote2To(Full256<int16_t> /*d16*/,
+ Vec256<int32_t> a, Vec256<int32_t> b) {
+ return Vec256<int16_t>{_mm256_packs_epi32(a.raw, b.raw)};
+}
+
HWY_API Vec128<float> DemoteTo(Full128<float> /* tag */,
const Vec256<double> v) {
return Vec128<float>{_mm256_cvtpd_ps(v.raw)};
@@ -4040,6 +4139,107 @@ HWY_API Vec128<uint8_t, 8> U8FromU32(const Vec256<uint32_t> v) {
return BitCast(Full64<uint8_t>(), pair);
}
+// ------------------------------ Truncations
+
+namespace detail {
+
+// LO and HI each hold four indices of bytes within a 128-bit block.
+template <uint32_t LO, uint32_t HI, typename T>
+HWY_INLINE Vec128<uint32_t> LookupAndConcatHalves(Vec256<T> v) {
+ const Full256<uint32_t> d32;
+
+#if HWY_TARGET <= HWY_AVX3_DL
+ alignas(32) constexpr uint32_t kMap[8] = {
+ LO, HI, 0x10101010 + LO, 0x10101010 + HI, 0, 0, 0, 0};
+ const auto result = _mm256_permutexvar_epi8(Load(d32, kMap).raw, v.raw);
+#else
+ alignas(32) static constexpr uint32_t kMap[8] = {LO, HI, ~0u, ~0u,
+ ~0u, ~0u, LO, HI};
+ const auto quad = TableLookupBytes(v, Load(d32, kMap));
+ const auto result = _mm256_permute4x64_epi64(quad.raw, 0xCC);
+ // Possible alternative:
+ // const auto lo = LowerHalf(quad);
+ // const auto hi = UpperHalf(Full128<uint32_t>(), quad);
+ // const auto result = lo | hi;
+#endif
+
+ return Vec128<uint32_t>{_mm256_castsi256_si128(result)};
+}
+
+// LO and HI each hold two indices of bytes within a 128-bit block.
+template <uint16_t LO, uint16_t HI, typename T>
+HWY_INLINE Vec128<uint32_t, 2> LookupAndConcatQuarters(Vec256<T> v) {
+ const Full256<uint16_t> d16;
+
+#if HWY_TARGET <= HWY_AVX3_DL
+ alignas(32) constexpr uint16_t kMap[16] = {
+ LO, HI, 0x1010 + LO, 0x1010 + HI, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ const auto result = _mm256_permutexvar_epi8(Load(d16, kMap).raw, v.raw);
+ return LowerHalf(Vec128<uint32_t>{_mm256_castsi256_si128(result)});
+#else
+ constexpr uint16_t ff = static_cast<uint16_t>(~0u);
+ alignas(32) static constexpr uint16_t kMap[16] = {
+ LO, ff, HI, ff, ff, ff, ff, ff, ff, ff, ff, ff, LO, ff, HI, ff};
+ const auto quad = TableLookupBytes(v, Load(d16, kMap));
+ const auto mixed = _mm256_permute4x64_epi64(quad.raw, 0xCC);
+ const auto half = _mm256_castsi256_si128(mixed);
+ return LowerHalf(Vec128<uint32_t>{_mm_packus_epi32(half, half)});
+#endif
+}
+
+} // namespace detail
+
+HWY_API Vec128<uint8_t, 4> TruncateTo(Simd<uint8_t, 4, 0> /* tag */,
+ const Vec256<uint64_t> v) {
+ const Full256<uint32_t> d32;
+#if HWY_TARGET <= HWY_AVX3_DL
+ alignas(32) constexpr uint32_t kMap[8] = {0x18100800u, 0, 0, 0, 0, 0, 0, 0};
+ const auto result = _mm256_permutexvar_epi8(Load(d32, kMap).raw, v.raw);
+ return LowerHalf(LowerHalf(LowerHalf(Vec256<uint8_t>{result})));
+#else
+ alignas(32) static constexpr uint32_t kMap[8] = {0xFFFF0800u, ~0u, ~0u, ~0u,
+ 0x0800FFFFu, ~0u, ~0u, ~0u};
+ const auto quad = TableLookupBytes(v, Load(d32, kMap));
+ const auto lo = LowerHalf(quad);
+ const auto hi = UpperHalf(Full128<uint32_t>(), quad);
+ const auto result = lo | hi;
+ return LowerHalf(LowerHalf(Vec128<uint8_t>{result.raw}));
+#endif
+}
+
+HWY_API Vec128<uint16_t, 4> TruncateTo(Simd<uint16_t, 4, 0> /* tag */,
+ const Vec256<uint64_t> v) {
+ const auto result = detail::LookupAndConcatQuarters<0x100, 0x908>(v);
+ return Vec128<uint16_t, 4>{result.raw};
+}
+
+HWY_API Vec128<uint32_t> TruncateTo(Simd<uint32_t, 4, 0> /* tag */,
+ const Vec256<uint64_t> v) {
+ const Full256<uint32_t> d32;
+ alignas(32) constexpr uint32_t kEven[8] = {0, 2, 4, 6, 0, 2, 4, 6};
+ const auto v32 =
+ TableLookupLanes(BitCast(d32, v), SetTableIndices(d32, kEven));
+ return LowerHalf(Vec256<uint32_t>{v32.raw});
+}
+
+HWY_API Vec128<uint8_t, 8> TruncateTo(Simd<uint8_t, 8, 0> /* tag */,
+ const Vec256<uint32_t> v) {
+ const auto full = detail::LookupAndConcatQuarters<0x400, 0xC08>(v);
+ return Vec128<uint8_t, 8>{full.raw};
+}
+
+HWY_API Vec128<uint16_t> TruncateTo(Simd<uint16_t, 8, 0> /* tag */,
+ const Vec256<uint32_t> v) {
+ const auto full = detail::LookupAndConcatHalves<0x05040100, 0x0D0C0908>(v);
+ return Vec128<uint16_t>{full.raw};
+}
+
+HWY_API Vec128<uint8_t> TruncateTo(Simd<uint8_t, 16, 0> /* tag */,
+ const Vec256<uint16_t> v) {
+ const auto full = detail::LookupAndConcatHalves<0x06040200, 0x0E0C0A08>(v);
+ return Vec128<uint8_t>{full.raw};
+}
+
// ------------------------------ Integer <=> fp (ShiftRight, OddEven)
HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
@@ -4069,6 +4269,53 @@ HWY_API Vec256<double> ConvertTo(Full256<double> dd, const Vec256<int64_t> v) {
#endif
}
+HWY_API Vec256<float> ConvertTo(HWY_MAYBE_UNUSED Full256<float> df,
+ const Vec256<uint32_t> v) {
+#if HWY_TARGET <= HWY_AVX3
+ return Vec256<float>{_mm256_cvtepu32_ps(v.raw)};
+#else
+ // Based on wim's approach (https://stackoverflow.com/questions/34066228/)
+ const RebindToUnsigned<decltype(df)> du32;
+ const RebindToSigned<decltype(df)> d32;
+
+ const auto msk_lo = Set(du32, 0xFFFF);
+ const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16
+
+ // Extract the lower/upper 16 bits of v and cast each to signed int
+ const auto v_lo = BitCast(d32, And(v, msk_lo));
+ const auto v_hi = BitCast(d32, ShiftRight<16>(v));
+
+ return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo));
+#endif
+}
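The AVX2 fallback above splits each 32-bit value into 16-bit halves that fit in the existing signed conversion path and recombines them with one MulAdd by 2^16. A scalar model (U32ToF32Model is a hypothetical name, not part of this patch):

#include <stdint.h>

static inline float U32ToF32Model(uint32_t v) {
  const int32_t lo = static_cast<int32_t>(v & 0xFFFFu);  // low 16 bits
  const int32_t hi = static_cast<int32_t>(v >> 16);      // high 16 bits
  return 65536.0f * static_cast<float>(hi) + static_cast<float>(lo);
}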
+
+HWY_API Vec256<double> ConvertTo(HWY_MAYBE_UNUSED Full256<double> dd,
+ const Vec256<uint64_t> v) {
+#if HWY_TARGET <= HWY_AVX3
+ return Vec256<double>{_mm256_cvtepu64_pd(v.raw)};
+#else
+ // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
+ const RebindToUnsigned<decltype(dd)> d64;
+ using VU = VFromD<decltype(d64)>;
+
+ const VU msk_lo = Set(d64, 0xFFFFFFFFULL);
+ const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32
+
+ // Extract the lower 32 bits of v
+ const VU v_lo = And(v, msk_lo);
+ const VU v_hi = ShiftRight<32>(v);
+
+ auto uint64_to_double256_fast = [&dd](Vec256<uint64_t> w) HWY_ATTR {
+ w = Or(w, Vec256<uint64_t>{
+ detail::BitCastToInteger(Set(dd, 0x0010000000000000).raw)});
+ return BitCast(dd, w) - Set(dd, 0x0010000000000000);
+ };
+
+ const auto v_lo_dbl = uint64_to_double256_fast(v_lo);
+ return MulAdd(cnst2_32_dbl, uint64_to_double256_fast(v_hi), v_lo_dbl);
+#endif
+}
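The uint64_to_double256_fast lambda above relies on the 2^52 bias trick: ORing a value below 2^32 into the bit pattern of the double 2^52 yields exactly 2^52 + w, so subtracting 2^52 recovers w without an integer-to-double instruction. A scalar model of one half (U32ViaBias is a hypothetical name, not part of this patch):

#include <stdint.h>
#include <string.h>  // memcpy

static inline double U32ViaBias(uint64_t w) {  // requires w < 2^32
  const double two52 = 4503599627370496.0;  // 2^52, bit pattern 0x4330000000000000
  uint64_t bits;
  memcpy(&bits, &two52, sizeof(bits));
  bits |= w;  // w becomes the mantissa; the value is now exactly 2^52 + w
  double d;
  memcpy(&d, &bits, sizeof(d));
  return d - two52;
}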
+
// Truncates (rounds toward zero).
HWY_API Vec256<int32_t> ConvertTo(Full256<int32_t> d, const Vec256<float> v) {
return detail::FixConversionOverflow(d, v, _mm256_cvttps_epi32(v.raw));
@@ -4258,8 +4505,8 @@ HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
// Non-full byte, need to clear the undefined upper bits.
if (N < 8) {
- const int mask = static_cast<int>((1ull << N) - 1);
- bits[0] = static_cast<uint8_t>(bits[0] & mask);
+ const int mask_bits = static_cast<int>((1ull << N) - 1);
+ bits[0] = static_cast<uint8_t>(bits[0] & mask_bits);
}
return kNumBytes;
}
@@ -4272,9 +4519,15 @@ HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
}
template <typename T>
-HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
- const Mask256<T> mask) {
- return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask.raw)) : -1;
+HWY_API size_t FindKnownFirstTrue(const Full256<T> /* tag */,
+ const Mask256<T> mask) {
+ return Num0BitsBelowLS1Bit_Nonzero32(mask.raw);
+}
+
+template <typename T>
+HWY_API intptr_t FindFirstTrue(const Full256<T> d, const Mask256<T> mask) {
+ return mask.raw ? static_cast<intptr_t>(FindKnownFirstTrue(d, mask))
+ : intptr_t{-1};
}
// Beware: the suffix indicates the number of mask bits, not lane size!
@@ -4373,8 +4626,10 @@ template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
// See CompressIsPartition.
alignas(16) constexpr uint64_t packed_array[16] = {
- 0x3210, 0x3210, 0x3201, 0x3210, 0x3102, 0x3120, 0x3021, 0x3210,
- 0x2103, 0x2130, 0x2031, 0x2310, 0x1032, 0x1320, 0x0321, 0x3210};
+ // PrintCompress64x4NibbleTables
+ 0x00003210, 0x00003210, 0x00003201, 0x00003210, 0x00003102, 0x00003120,
+ 0x00003021, 0x00003210, 0x00002103, 0x00002130, 0x00002031, 0x00002310,
+ 0x00001032, 0x00001320, 0x00000321, 0x00003210};
// For lane i, shift the i-th 4-bit index down to bits [0, 2) -
// _mm256_permutexvar_epi64 will ignore the upper bits.
@@ -4386,8 +4641,39 @@ HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
return TableLookupLanes(v, indices);
}
-// ------------------------------ CompressBits (LoadMaskBits)
+// ------------------------------ CompressNot (Compress)
+
+template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
+HWY_API Vec256<T> CompressNot(Vec256<T> v, const Mask256<T> mask) {
+ return Compress(v, Not(mask));
+}
+
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> mask) {
+ // See CompressIsPartition.
+ alignas(16) constexpr uint64_t packed_array[16] = {
+ // PrintCompressNot64x4NibbleTables
+ 0x00003210, 0x00000321, 0x00001320, 0x00001032, 0x00002310, 0x00002031,
+ 0x00002130, 0x00002103, 0x00003210, 0x00003021, 0x00003120, 0x00003102,
+ 0x00003210, 0x00003201, 0x00003210, 0x00003210};
+
+ // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
+ // _mm256_permutexvar_epi64 will ignore the upper bits.
+ const Full256<T> d;
+ const RebindToUnsigned<decltype(d)> du64;
+ const auto packed = Set(du64, packed_array[mask.raw]);
+ alignas(64) constexpr uint64_t shifts[4] = {0, 4, 8, 12};
+ const auto indices = Indices256<T>{(packed >> Load(du64, shifts)).raw};
+ return TableLookupLanes(v, indices);
+}
+
+// ------------------------------ CompressBlocksNot
+
+HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
+ Mask256<uint64_t> mask) {
+ return CompressNot(v, mask);
+}
+
+// ------------------------------ CompressBits (LoadMaskBits)
template <typename T>
HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
return Compress(v, LoadMaskBits(Full256<T>(), bits));
@@ -4478,8 +4764,6 @@ HWY_API size_t CompressStore(Vec256<double> v, Mask256<double> mask,
// ------------------------------ CompressBlendedStore (CompressStore)
-#if HWY_TARGET <= HWY_AVX3
-
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
T* HWY_RESTRICT unaligned) {
@@ -4504,35 +4788,6 @@ HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
#endif
}
-#else // AVX2
-
-template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
-HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
- T* HWY_RESTRICT unaligned) {
- const size_t count = CountTrue(m);
- BlendedStore(FirstN(d, count), d, Compress(v, m));
- return count;
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
- T* HWY_RESTRICT unaligned) {
- const size_t count = CountTrue(d, m);
- const Vec256<T> compressed = Compress(v, m);
-#if HWY_MEM_OPS_MIGHT_FAULT
- // BlendedStore tests mask for each lane, but we know that the mask is
- // FirstN, so we can just copy.
- alignas(32) T buf[16];
- Store(compressed, d, buf);
- memcpy(unaligned, buf, count * sizeof(T));
-#else
- BlendedStore(compressed, FirstN(d, count), d, unaligned);
-#endif
- return count;
-}
-
-#endif // AVX2
-
// ------------------------------ CompressBitsStore (LoadMaskBits)
template <typename T>
@@ -4716,6 +4971,13 @@ HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
}
template <typename T>
+HWY_API size_t FindKnownFirstTrue(const Full256<T> /* tag */,
+ const Mask256<T> mask) {
+ const uint64_t mask_bits = detail::BitsFromMask(mask);
+ return Num0BitsBelowLS1Bit_Nonzero64(mask_bits);
+}
+
+template <typename T>
HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
const Mask256<T> mask) {
const uint64_t mask_bits = detail::BitsFromMask(mask);
@@ -4727,8 +4989,7 @@ HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
namespace detail {
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
- uint64_t mask_bits) {
+HWY_INLINE Vec256<uint32_t> IndicesFromBits(Full256<T> d, uint64_t mask_bits) {
const RebindToUnsigned<decltype(d)> d32;
// We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
// of SetTableIndices would require 8 KiB, a large part of L1D. The other
@@ -4736,49 +4997,50 @@ HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
// and unavailable in 32-bit builds. We instead compress each index into 4
// bits, for a total of 1 KiB.
alignas(16) constexpr uint32_t packed_array[256] = {
- 0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
- 0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
- 0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
- 0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
- 0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
- 0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
- 0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
- 0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
- 0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
- 0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
- 0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
- 0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
- 0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
- 0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
- 0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
- 0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
- 0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
- 0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
- 0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
- 0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
- 0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
- 0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
- 0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
- 0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
- 0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
- 0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
- 0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
- 0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
- 0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
- 0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
- 0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
- 0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
- 0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
- 0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
- 0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
- 0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
- 0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
- 0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
- 0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
- 0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
- 0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
- 0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
- 0x10765432, 0x17654320, 0x07654321, 0x76543210};
+ // PrintCompress32x8Tables
+ 0x76543210, 0x76543218, 0x76543209, 0x76543298, 0x7654310a, 0x765431a8,
+ 0x765430a9, 0x76543a98, 0x7654210b, 0x765421b8, 0x765420b9, 0x76542b98,
+ 0x765410ba, 0x76541ba8, 0x76540ba9, 0x7654ba98, 0x7653210c, 0x765321c8,
+ 0x765320c9, 0x76532c98, 0x765310ca, 0x76531ca8, 0x76530ca9, 0x7653ca98,
+ 0x765210cb, 0x76521cb8, 0x76520cb9, 0x7652cb98, 0x76510cba, 0x7651cba8,
+ 0x7650cba9, 0x765cba98, 0x7643210d, 0x764321d8, 0x764320d9, 0x76432d98,
+ 0x764310da, 0x76431da8, 0x76430da9, 0x7643da98, 0x764210db, 0x76421db8,
+ 0x76420db9, 0x7642db98, 0x76410dba, 0x7641dba8, 0x7640dba9, 0x764dba98,
+ 0x763210dc, 0x76321dc8, 0x76320dc9, 0x7632dc98, 0x76310dca, 0x7631dca8,
+ 0x7630dca9, 0x763dca98, 0x76210dcb, 0x7621dcb8, 0x7620dcb9, 0x762dcb98,
+ 0x7610dcba, 0x761dcba8, 0x760dcba9, 0x76dcba98, 0x7543210e, 0x754321e8,
+ 0x754320e9, 0x75432e98, 0x754310ea, 0x75431ea8, 0x75430ea9, 0x7543ea98,
+ 0x754210eb, 0x75421eb8, 0x75420eb9, 0x7542eb98, 0x75410eba, 0x7541eba8,
+ 0x7540eba9, 0x754eba98, 0x753210ec, 0x75321ec8, 0x75320ec9, 0x7532ec98,
+ 0x75310eca, 0x7531eca8, 0x7530eca9, 0x753eca98, 0x75210ecb, 0x7521ecb8,
+ 0x7520ecb9, 0x752ecb98, 0x7510ecba, 0x751ecba8, 0x750ecba9, 0x75ecba98,
+ 0x743210ed, 0x74321ed8, 0x74320ed9, 0x7432ed98, 0x74310eda, 0x7431eda8,
+ 0x7430eda9, 0x743eda98, 0x74210edb, 0x7421edb8, 0x7420edb9, 0x742edb98,
+ 0x7410edba, 0x741edba8, 0x740edba9, 0x74edba98, 0x73210edc, 0x7321edc8,
+ 0x7320edc9, 0x732edc98, 0x7310edca, 0x731edca8, 0x730edca9, 0x73edca98,
+ 0x7210edcb, 0x721edcb8, 0x720edcb9, 0x72edcb98, 0x710edcba, 0x71edcba8,
+ 0x70edcba9, 0x7edcba98, 0x6543210f, 0x654321f8, 0x654320f9, 0x65432f98,
+ 0x654310fa, 0x65431fa8, 0x65430fa9, 0x6543fa98, 0x654210fb, 0x65421fb8,
+ 0x65420fb9, 0x6542fb98, 0x65410fba, 0x6541fba8, 0x6540fba9, 0x654fba98,
+ 0x653210fc, 0x65321fc8, 0x65320fc9, 0x6532fc98, 0x65310fca, 0x6531fca8,
+ 0x6530fca9, 0x653fca98, 0x65210fcb, 0x6521fcb8, 0x6520fcb9, 0x652fcb98,
+ 0x6510fcba, 0x651fcba8, 0x650fcba9, 0x65fcba98, 0x643210fd, 0x64321fd8,
+ 0x64320fd9, 0x6432fd98, 0x64310fda, 0x6431fda8, 0x6430fda9, 0x643fda98,
+ 0x64210fdb, 0x6421fdb8, 0x6420fdb9, 0x642fdb98, 0x6410fdba, 0x641fdba8,
+ 0x640fdba9, 0x64fdba98, 0x63210fdc, 0x6321fdc8, 0x6320fdc9, 0x632fdc98,
+ 0x6310fdca, 0x631fdca8, 0x630fdca9, 0x63fdca98, 0x6210fdcb, 0x621fdcb8,
+ 0x620fdcb9, 0x62fdcb98, 0x610fdcba, 0x61fdcba8, 0x60fdcba9, 0x6fdcba98,
+ 0x543210fe, 0x54321fe8, 0x54320fe9, 0x5432fe98, 0x54310fea, 0x5431fea8,
+ 0x5430fea9, 0x543fea98, 0x54210feb, 0x5421feb8, 0x5420feb9, 0x542feb98,
+ 0x5410feba, 0x541feba8, 0x540feba9, 0x54feba98, 0x53210fec, 0x5321fec8,
+ 0x5320fec9, 0x532fec98, 0x5310feca, 0x531feca8, 0x530feca9, 0x53feca98,
+ 0x5210fecb, 0x521fecb8, 0x520fecb9, 0x52fecb98, 0x510fecba, 0x51fecba8,
+ 0x50fecba9, 0x5fecba98, 0x43210fed, 0x4321fed8, 0x4320fed9, 0x432fed98,
+ 0x4310feda, 0x431feda8, 0x430feda9, 0x43feda98, 0x4210fedb, 0x421fedb8,
+ 0x420fedb9, 0x42fedb98, 0x410fedba, 0x41fedba8, 0x40fedba9, 0x4fedba98,
+ 0x3210fedc, 0x321fedc8, 0x320fedc9, 0x32fedc98, 0x310fedca, 0x31fedca8,
+ 0x30fedca9, 0x3fedca98, 0x210fedcb, 0x21fedcb8, 0x20fedcb9, 0x2fedcb98,
+ 0x10fedcba, 0x1fedcba8, 0x0fedcba9, 0xfedcba98};
// No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31.
// Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
@@ -4786,36 +5048,122 @@ HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
// latency, it may be faster to use LoadDup128 and PSHUFB.
const auto packed = Set(d32, packed_array[mask_bits]);
alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
- return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
+ return packed >> Load(d32, shifts);
}
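Each packed_array entry above stores eight 4-bit lane indices in a single u32; broadcasting that word and shifting each copy right by 4*i leaves lane i's index in the low nibble, which the 32-bit permute then reads (it ignores bits 3..31). A scalar sketch of the unpacking (NibbleIndex is a hypothetical name, not part of this patch):

#include <stdint.h>

static inline uint32_t NibbleIndex(uint32_t packed, unsigned lane) {  // lane < 8
  return (packed >> (4 * lane)) & 0xFu;
}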
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
- uint64_t mask_bits) {
+HWY_INLINE Vec256<uint32_t> IndicesFromBits(Full256<T> d, uint64_t mask_bits) {
const Repartition<uint32_t, decltype(d)> d32;
// For 64-bit, we still need 32-bit indices because there is no 64-bit
// permutevar, but there are only 4 lanes, so we can afford to skip the
// unpacking and load the entire index vector directly.
- alignas(32) constexpr uint32_t packed_array[128] = {
- 0, 1, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, //
- 2, 3, 0, 1, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, //
- 4, 5, 0, 1, 2, 3, 6, 7, /**/ 0, 1, 4, 5, 2, 3, 6, 7, //
- 2, 3, 4, 5, 0, 1, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, //
- 6, 7, 0, 1, 2, 3, 4, 5, /**/ 0, 1, 6, 7, 2, 3, 4, 5, //
- 2, 3, 6, 7, 0, 1, 4, 5, /**/ 0, 1, 2, 3, 6, 7, 4, 5, //
- 4, 5, 6, 7, 0, 1, 2, 3, /**/ 0, 1, 4, 5, 6, 7, 2, 3, //
- 2, 3, 4, 5, 6, 7, 0, 1, /**/ 0, 1, 2, 3, 4, 5, 6, 7};
- return Indices256<uint32_t>{Load(d32, packed_array + 8 * mask_bits).raw};
+ alignas(32) constexpr uint32_t u32_indices[128] = {
+ // PrintCompress64x4PairTables
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7,
+ 10, 11, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 7,
+ 12, 13, 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 2, 3, 6, 7,
+ 10, 11, 12, 13, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 6, 7,
+ 14, 15, 0, 1, 2, 3, 4, 5, 8, 9, 14, 15, 2, 3, 4, 5,
+ 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 10, 11, 14, 15, 4, 5,
+ 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 12, 13, 14, 15, 2, 3,
+ 10, 11, 12, 13, 14, 15, 0, 1, 8, 9, 10, 11, 12, 13, 14, 15};
+ return Load(d32, u32_indices + 8 * mask_bits);
}
+template <typename T, HWY_IF_LANE_SIZE(T, 4)>
+HWY_INLINE Vec256<uint32_t> IndicesFromNotBits(Full256<T> d,
+ uint64_t mask_bits) {
+ const RebindToUnsigned<decltype(d)> d32;
+ // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
+ // of SetTableIndices would require 8 KiB, a large part of L1D. The other
+ // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles)
+ // and unavailable in 32-bit builds. We instead compress each index into 4
+ // bits, for a total of 1 KiB.
+ alignas(16) constexpr uint32_t packed_array[256] = {
+ // PrintCompressNot32x8Tables
+ 0xfedcba98, 0x8fedcba9, 0x9fedcba8, 0x98fedcba, 0xafedcb98, 0xa8fedcb9,
+ 0xa9fedcb8, 0xa98fedcb, 0xbfedca98, 0xb8fedca9, 0xb9fedca8, 0xb98fedca,
+ 0xbafedc98, 0xba8fedc9, 0xba9fedc8, 0xba98fedc, 0xcfedba98, 0xc8fedba9,
+ 0xc9fedba8, 0xc98fedba, 0xcafedb98, 0xca8fedb9, 0xca9fedb8, 0xca98fedb,
+ 0xcbfeda98, 0xcb8feda9, 0xcb9feda8, 0xcb98feda, 0xcbafed98, 0xcba8fed9,
+ 0xcba9fed8, 0xcba98fed, 0xdfecba98, 0xd8fecba9, 0xd9fecba8, 0xd98fecba,
+ 0xdafecb98, 0xda8fecb9, 0xda9fecb8, 0xda98fecb, 0xdbfeca98, 0xdb8feca9,
+ 0xdb9feca8, 0xdb98feca, 0xdbafec98, 0xdba8fec9, 0xdba9fec8, 0xdba98fec,
+ 0xdcfeba98, 0xdc8feba9, 0xdc9feba8, 0xdc98feba, 0xdcafeb98, 0xdca8feb9,
+ 0xdca9feb8, 0xdca98feb, 0xdcbfea98, 0xdcb8fea9, 0xdcb9fea8, 0xdcb98fea,
+ 0xdcbafe98, 0xdcba8fe9, 0xdcba9fe8, 0xdcba98fe, 0xefdcba98, 0xe8fdcba9,
+ 0xe9fdcba8, 0xe98fdcba, 0xeafdcb98, 0xea8fdcb9, 0xea9fdcb8, 0xea98fdcb,
+ 0xebfdca98, 0xeb8fdca9, 0xeb9fdca8, 0xeb98fdca, 0xebafdc98, 0xeba8fdc9,
+ 0xeba9fdc8, 0xeba98fdc, 0xecfdba98, 0xec8fdba9, 0xec9fdba8, 0xec98fdba,
+ 0xecafdb98, 0xeca8fdb9, 0xeca9fdb8, 0xeca98fdb, 0xecbfda98, 0xecb8fda9,
+ 0xecb9fda8, 0xecb98fda, 0xecbafd98, 0xecba8fd9, 0xecba9fd8, 0xecba98fd,
+ 0xedfcba98, 0xed8fcba9, 0xed9fcba8, 0xed98fcba, 0xedafcb98, 0xeda8fcb9,
+ 0xeda9fcb8, 0xeda98fcb, 0xedbfca98, 0xedb8fca9, 0xedb9fca8, 0xedb98fca,
+ 0xedbafc98, 0xedba8fc9, 0xedba9fc8, 0xedba98fc, 0xedcfba98, 0xedc8fba9,
+ 0xedc9fba8, 0xedc98fba, 0xedcafb98, 0xedca8fb9, 0xedca9fb8, 0xedca98fb,
+ 0xedcbfa98, 0xedcb8fa9, 0xedcb9fa8, 0xedcb98fa, 0xedcbaf98, 0xedcba8f9,
+ 0xedcba9f8, 0xedcba98f, 0xfedcba98, 0xf8edcba9, 0xf9edcba8, 0xf98edcba,
+ 0xfaedcb98, 0xfa8edcb9, 0xfa9edcb8, 0xfa98edcb, 0xfbedca98, 0xfb8edca9,
+ 0xfb9edca8, 0xfb98edca, 0xfbaedc98, 0xfba8edc9, 0xfba9edc8, 0xfba98edc,
+ 0xfcedba98, 0xfc8edba9, 0xfc9edba8, 0xfc98edba, 0xfcaedb98, 0xfca8edb9,
+ 0xfca9edb8, 0xfca98edb, 0xfcbeda98, 0xfcb8eda9, 0xfcb9eda8, 0xfcb98eda,
+ 0xfcbaed98, 0xfcba8ed9, 0xfcba9ed8, 0xfcba98ed, 0xfdecba98, 0xfd8ecba9,
+ 0xfd9ecba8, 0xfd98ecba, 0xfdaecb98, 0xfda8ecb9, 0xfda9ecb8, 0xfda98ecb,
+ 0xfdbeca98, 0xfdb8eca9, 0xfdb9eca8, 0xfdb98eca, 0xfdbaec98, 0xfdba8ec9,
+ 0xfdba9ec8, 0xfdba98ec, 0xfdceba98, 0xfdc8eba9, 0xfdc9eba8, 0xfdc98eba,
+ 0xfdcaeb98, 0xfdca8eb9, 0xfdca9eb8, 0xfdca98eb, 0xfdcbea98, 0xfdcb8ea9,
+ 0xfdcb9ea8, 0xfdcb98ea, 0xfdcbae98, 0xfdcba8e9, 0xfdcba9e8, 0xfdcba98e,
+ 0xfedcba98, 0xfe8dcba9, 0xfe9dcba8, 0xfe98dcba, 0xfeadcb98, 0xfea8dcb9,
+ 0xfea9dcb8, 0xfea98dcb, 0xfebdca98, 0xfeb8dca9, 0xfeb9dca8, 0xfeb98dca,
+ 0xfebadc98, 0xfeba8dc9, 0xfeba9dc8, 0xfeba98dc, 0xfecdba98, 0xfec8dba9,
+ 0xfec9dba8, 0xfec98dba, 0xfecadb98, 0xfeca8db9, 0xfeca9db8, 0xfeca98db,
+ 0xfecbda98, 0xfecb8da9, 0xfecb9da8, 0xfecb98da, 0xfecbad98, 0xfecba8d9,
+ 0xfecba9d8, 0xfecba98d, 0xfedcba98, 0xfed8cba9, 0xfed9cba8, 0xfed98cba,
+ 0xfedacb98, 0xfeda8cb9, 0xfeda9cb8, 0xfeda98cb, 0xfedbca98, 0xfedb8ca9,
+ 0xfedb9ca8, 0xfedb98ca, 0xfedbac98, 0xfedba8c9, 0xfedba9c8, 0xfedba98c,
+ 0xfedcba98, 0xfedc8ba9, 0xfedc9ba8, 0xfedc98ba, 0xfedcab98, 0xfedca8b9,
+ 0xfedca9b8, 0xfedca98b, 0xfedcba98, 0xfedcb8a9, 0xfedcb9a8, 0xfedcb98a,
+ 0xfedcba98, 0xfedcba89, 0xfedcba98, 0xfedcba98};
+
+ // No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31.
+ // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
+ // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing
+ // latency, it may be faster to use LoadDup128 and PSHUFB.
+ const auto packed = Set(d32, packed_array[mask_bits]);
+ alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
+ return packed >> Load(d32, shifts);
+}
+
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_INLINE Vec256<uint32_t> IndicesFromNotBits(Full256<T> d,
+ uint64_t mask_bits) {
+ const Repartition<uint32_t, decltype(d)> d32;
+
+ // For 64-bit, we still need 32-bit indices because there is no 64-bit
+ // permutevar, but there are only 4 lanes, so we can afford to skip the
+ // unpacking and load the entire index vector directly.
+ alignas(32) constexpr uint32_t u32_indices[128] = {
+ // PrintCompressNot64x4PairTables
+ 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9,
+ 8, 9, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11,
+ 8, 9, 10, 11, 14, 15, 12, 13, 10, 11, 14, 15, 8, 9, 12, 13,
+ 8, 9, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13,
+ 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 8, 9, 14, 15,
+ 8, 9, 12, 13, 10, 11, 14, 15, 12, 13, 8, 9, 10, 11, 14, 15,
+ 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 8, 9, 12, 13, 14, 15,
+ 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15};
+ return Load(d32, u32_indices + 8 * mask_bits);
+}
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
const Full256<T> d;
const Repartition<uint32_t, decltype(d)> du32;
HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
- const auto indices = IndicesFromBits(d, mask_bits);
+ // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
+ // no instruction for 4x64).
+ const Indices256<uint32_t> indices{IndicesFromBits(d, mask_bits).raw};
return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
}
@@ -4859,12 +5207,41 @@ HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
}
}
+template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
+HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
+ const Full256<T> d;
+ const Repartition<uint32_t, decltype(d)> du32;
+
+ HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
+ // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
+ // no instruction for 4x64).
+ const Indices256<uint32_t> indices{IndicesFromNotBits(d, mask_bits).raw};
+ return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
+}
+
+// LUTs are infeasible for 2^16 possible masks, so splice together two
+// half-vector Compress.
+template <typename T, HWY_IF_LANE_SIZE(T, 2)>
+HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
+ // Compress ensures only the lower 16 bits are set, so flip those.
+ return Compress(v, mask_bits ^ 0xFFFF);
+}
+
} // namespace detail
template <typename T>
HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> m) {
- const uint64_t mask_bits = detail::BitsFromMask(m);
- return detail::Compress(v, mask_bits);
+ return detail::Compress(v, detail::BitsFromMask(m));
+}
+
+template <typename T>
+HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> m) {
+ return detail::CompressNot(v, detail::BitsFromMask(m));
+}
+
+HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
+ Mask256<uint64_t> mask) {
+ return CompressNot(v, mask);
}
template <typename T>
@@ -4897,12 +5274,27 @@ HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
return count;
}
-template <typename T>
+template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
T* HWY_RESTRICT unaligned) {
const uint64_t mask_bits = detail::BitsFromMask(m);
const size_t count = PopCount(mask_bits);
- BlendedStore(detail::Compress(v, mask_bits), FirstN(d, count), d, unaligned);
+
+ const Repartition<uint32_t, decltype(d)> du32;
+ HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
+ // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
+ // no instruction for 4x64). Nibble MSB encodes FirstN.
+ const Vec256<uint32_t> idx_and_mask = detail::IndicesFromBits(d, mask_bits);
+ // Shift nibble MSB into MSB
+ const Mask256<uint32_t> mask32 = MaskFromVec(ShiftLeft<28>(idx_and_mask));
+ // First cast to unsigned (RebindMask cannot change lane size)
+ const Mask256<MakeUnsigned<T>> mask_u{mask32.raw};
+ const Mask256<T> mask = RebindMask(d, mask_u);
+ const Vec256<T> compressed =
+ BitCast(d, TableLookupLanes(BitCast(du32, v),
+ Indices256<uint32_t>{idx_and_mask.raw}));
+
+ BlendedStore(compressed, mask, d, unaligned);
// Workaround for MSAN not marking output as initialized (b/233326619)
#if HWY_IS_MSAN
__msan_unpoison(unaligned, count * sizeof(T));
@@ -4910,6 +5302,25 @@ HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
return count;
}
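In the AVX2 CompressBlendedStore above, each 4-bit table entry does double duty: bits 0..2 give the source lane for the permute (which ignores higher index bits), and bit 3 marks whether the destination lane falls within the first CountTrue lanes, so shifting it into the sign bit yields the blend mask directly. A scalar sketch of decoding one nibble (DecodeNibble is a hypothetical name, not part of this patch):

#include <stdint.h>

static inline void DecodeNibble(uint32_t nibble, uint32_t* src_lane, bool* store) {
  *src_lane = nibble & 7u;       // index consumed by the 32-bit permute
  *store = (nibble & 8u) != 0;   // nibble MSB: lane lies within FirstN(count)
}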
+template <typename T, HWY_IF_LANE_SIZE(T, 2)>
+HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
+ T* HWY_RESTRICT unaligned) {
+ const uint64_t mask_bits = detail::BitsFromMask(m);
+ const size_t count = PopCount(mask_bits);
+ const Vec256<T> compressed = detail::Compress(v, mask_bits);
+
+#if HWY_MEM_OPS_MIGHT_FAULT // true if HWY_IS_MSAN
+ // BlendedStore tests mask for each lane, but we know that the mask is
+ // FirstN, so we can just copy.
+ alignas(32) T buf[16];
+ Store(compressed, d, buf);
+ memcpy(unaligned, buf, count * sizeof(T));
+#else
+ BlendedStore(compressed, FirstN(d, count), d, unaligned);
+#endif
+ return count;
+}
+
template <typename T>
HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
Full256<T> d, T* HWY_RESTRICT unaligned) {
@@ -5113,29 +5524,75 @@ HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
return Max(v10, v01);
}
-// u16/i16
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec256<T> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
- const Repartition<int32_t, Full256<T>> d32;
+HWY_API Vec256<uint16_t> SumOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec256<uint16_t> v) {
+ const Full256<uint16_t> d;
+ const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto min = MinOfLanes(d32, Min(even, odd));
+ const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
// Also broadcast into odd lanes.
- return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
+ return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
+}
+HWY_API Vec256<int16_t> SumOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec256<int16_t> v) {
+ const Full256<int16_t> d;
+ const RepartitionToWide<decltype(d)> d32;
+ // Sign-extend
+ const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec256<T> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
- const Repartition<int32_t, Full256<T>> d32;
+
+HWY_API Vec256<uint16_t> MinOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec256<uint16_t> v) {
+ const Full256<uint16_t> d;
+ const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
- const auto min = MaxOfLanes(d32, Max(even, odd));
+ const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
// Also broadcast into odd lanes.
- return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+}
+HWY_API Vec256<int16_t> MinOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec256<int16_t> v) {
+ const Full256<int16_t> d;
+ const RepartitionToWide<decltype(d)> d32;
+ // Sign-extend
+ const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+}
+
+HWY_API Vec256<uint16_t> MaxOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec256<uint16_t> v) {
+ const Full256<uint16_t> d;
+ const RepartitionToWide<decltype(d)> d32;
+ const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto max = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(max)), BitCast(d, max));
+}
+HWY_API Vec256<int16_t> MaxOfLanes(hwy::SizeTag<2> /* tag */,
+ Vec256<int16_t> v) {
+ const Full256<int16_t> d;
+ const RepartitionToWide<decltype(d)> d32;
+ // Sign-extend
+ const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto max = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(max)), BitCast(d, max));
}
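The 16-bit reductions above widen the vector into even and odd 32-bit halves: unsigned inputs are masked with 0xFFFF, signed inputs are sign-extended by a ShiftLeft/ShiftRight pair, and the 32-bit result is broadcast back into both 16-bit positions via OddEven. A scalar model of the widening step (hypothetical helpers, not part of this patch):

#include <stdint.h>

static inline uint32_t EvenLaneU16(uint32_t packed) {  // low 16 bits, zero-extended
  return packed & 0xFFFFu;
}

static inline int32_t EvenLaneI16(uint32_t packed) {   // low 16 bits, sign-extended
  return static_cast<int32_t>(packed << 16) >> 16;     // arithmetic right shift
}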
} // namespace detail
-// Supported for {uif}32x8, {uif}64x4. Returns the sum in each lane.
+// Supported for {uif}{32,64},{ui}16. Returns the broadcasted result.
template <typename T>
HWY_API Vec256<T> SumOfLanes(Full256<T> d, const Vec256<T> vHL) {
const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
@@ -5156,3 +5613,7 @@ HWY_API Vec256<T> MaxOfLanes(Full256<T> d, const Vec256<T> vHL) {
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
+
+// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
+// the warning seems to be issued at the call site of intrinsics, i.e. our code.
+HWY_DIAGNOSTICS(pop)
diff --git a/media/highway/src/hwy/ops/x86_512-inl.h b/media/highway/src/hwy/ops/x86_512-inl.h
index 7df638d364..09b14a9374 100644
--- a/media/highway/src/hwy/ops/x86_512-inl.h
+++ b/media/highway/src/hwy/ops/x86_512-inl.h
@@ -25,7 +25,7 @@
// Avoid uninitialized warnings in GCC's avx512fintrin.h - see
// https://github.com/google/highway/issues/710
HWY_DIAGNOSTICS(push)
-#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
+#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
#endif
@@ -467,7 +467,7 @@ namespace detail {
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_INLINE Mask512<T> FirstN(size_t n) {
Mask512<T> m;
- const uint32_t all = ~uint32_t(0);
+ const uint32_t all = ~uint32_t{0};
// BZHI only looks at the lower 8 bits of n!
m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u32(all, n));
return m;
@@ -475,7 +475,7 @@ HWY_INLINE Mask512<T> FirstN(size_t n) {
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_INLINE Mask512<T> FirstN(size_t n) {
- const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t(0);
+ const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t{0};
return Mask512<T>{static_cast<__mmask64>(bits)};
}
@@ -486,7 +486,7 @@ template <typename T>
HWY_API Mask512<T> FirstN(const Full512<T> /*tag*/, size_t n) {
#if HWY_ARCH_X86_64
Mask512<T> m;
- const uint64_t all = ~uint64_t(0);
+ const uint64_t all = ~uint64_t{0};
// BZHI only looks at the lower 8 bits of n!
m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u64(all, n));
return m;
@@ -1164,6 +1164,22 @@ HWY_API Vec512<uint16_t> operator*(Vec512<uint16_t> a, Vec512<uint16_t> b) {
HWY_API Vec512<uint32_t> operator*(Vec512<uint32_t> a, Vec512<uint32_t> b) {
return Vec512<uint32_t>{_mm512_mullo_epi32(a.raw, b.raw)};
}
+HWY_API Vec512<uint64_t> operator*(Vec512<uint64_t> a, Vec512<uint64_t> b) {
+ return Vec512<uint64_t>{_mm512_mullo_epi64(a.raw, b.raw)};
+}
+HWY_API Vec256<uint64_t> operator*(Vec256<uint64_t> a, Vec256<uint64_t> b) {
+ return Vec256<uint64_t>{_mm256_mullo_epi64(a.raw, b.raw)};
+}
+HWY_API Vec128<uint64_t> operator*(Vec128<uint64_t> a, Vec128<uint64_t> b) {
+ return Vec128<uint64_t>{_mm_mullo_epi64(a.raw, b.raw)};
+}
+
+// Per-target flag to prevent generic_ops-inl.h from defining i64 operator*.
+#ifdef HWY_NATIVE_I64MULLO
+#undef HWY_NATIVE_I64MULLO
+#else
+#define HWY_NATIVE_I64MULLO
+#endif
// Signed
HWY_API Vec512<int16_t> operator*(Vec512<int16_t> a, Vec512<int16_t> b) {
@@ -1172,7 +1188,15 @@ HWY_API Vec512<int16_t> operator*(Vec512<int16_t> a, Vec512<int16_t> b) {
HWY_API Vec512<int32_t> operator*(Vec512<int32_t> a, Vec512<int32_t> b) {
return Vec512<int32_t>{_mm512_mullo_epi32(a.raw, b.raw)};
}
-
+HWY_API Vec512<int64_t> operator*(Vec512<int64_t> a, Vec512<int64_t> b) {
+ return Vec512<int64_t>{_mm512_mullo_epi64(a.raw, b.raw)};
+}
+HWY_API Vec256<int64_t> operator*(Vec256<int64_t> a, Vec256<int64_t> b) {
+ return Vec256<int64_t>{_mm256_mullo_epi64(a.raw, b.raw)};
+}
+HWY_API Vec128<int64_t> operator*(Vec128<int64_t> a, Vec128<int64_t> b) {
+ return Vec128<int64_t>{_mm_mullo_epi64(a.raw, b.raw)};
+}
// Returns the upper 16 bits of a * b in each lane.
HWY_API Vec512<uint16_t> MulHigh(Vec512<uint16_t> a, Vec512<uint16_t> b) {
return Vec512<uint16_t>{_mm512_mulhi_epu16(a.raw, b.raw)};
@@ -1751,6 +1775,43 @@ HWY_INLINE Mask512<T> Xor(hwy::SizeTag<8> /*tag*/, const Mask512<T> a,
#endif
}
+template <typename T>
+HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<1> /*tag*/,
+ const Mask512<T> a, const Mask512<T> b) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+ return Mask512<T>{_kxnor_mask64(a.raw, b.raw)};
+#else
+ return Mask512<T>{~(a.raw ^ b.raw)};
+#endif
+}
+template <typename T>
+HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<2> /*tag*/,
+ const Mask512<T> a, const Mask512<T> b) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+ return Mask512<T>{_kxnor_mask32(a.raw, b.raw)};
+#else
+ return Mask512<T>{static_cast<__mmask32>(~(a.raw ^ b.raw) & 0xFFFFFFFF)};
+#endif
+}
+template <typename T>
+HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<4> /*tag*/,
+ const Mask512<T> a, const Mask512<T> b) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+ return Mask512<T>{_kxnor_mask16(a.raw, b.raw)};
+#else
+ return Mask512<T>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
+#endif
+}
+template <typename T>
+HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
+ const Mask512<T> a, const Mask512<T> b) {
+#if HWY_COMPILER_HAS_MASK_INTRINSICS
+ return Mask512<T>{_kxnor_mask8(a.raw, b.raw)};
+#else
+ return Mask512<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
+#endif
+}
+
} // namespace detail
template <typename T>
@@ -1778,6 +1839,11 @@ HWY_API Mask512<T> Xor(const Mask512<T> a, Mask512<T> b) {
return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
}
+template <typename T>
+HWY_API Mask512<T> ExclusiveNeither(const Mask512<T> a, Mask512<T> b) {
+ return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
+}
+
// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
HWY_API Vec512<int8_t> BroadcastSignBit(const Vec512<int8_t> v) {
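
The fallback paths above show that ExclusiveNeither is a lane-wise XNOR of the two mask registers. A sketch of one way it could be used, assuming (as the name suggests) the two masks are never both true for the same lane; the function name and thresholds are made up for illustration:

    #include <stddef.h>
    #include "hwy/highway.h"

    namespace example {
    namespace hn = hwy::HWY_NAMESPACE;

    // Counts lanes where neither predicate holds. The predicates are disjoint
    // (a lane cannot be both negative and above the threshold), so the XNOR
    // fallback ~(a ^ b) indeed reads as "neither a nor b".
    size_t CountOrdinary(const float* HWY_RESTRICT x, size_t n) {
      const hn::ScalableTag<float> d;
      const size_t N = hn::Lanes(d);
      size_t count = 0;
      for (size_t i = 0; i + N <= n; i += N) {
        const auto v = hn::LoadU(d, x + i);
        const auto neg = hn::Lt(v, hn::Zero(d));
        const auto huge = hn::Gt(v, hn::Set(d, 1e30f));
        count += hn::CountTrue(d, hn::ExclusiveNeither(neg, huge));
      }
      return count;
    }
    }  // namespace example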
@@ -1894,40 +1960,19 @@ HWY_API Vec512<double> MaskedLoad(Mask512<double> m, Full512<double> /* tag */,
template <typename T>
HWY_API Vec512<T> LoadDup128(Full512<T> /* tag */,
const T* const HWY_RESTRICT p) {
- // Clang 3.9 generates VINSERTF128 which is slower, but inline assembly leads
- // to "invalid output size for constraint" without -mavx512:
- // https://gcc.godbolt.org/z/-Jt_-F
-#if HWY_LOADDUP_ASM
- __m512i out;
- asm("vbroadcasti128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
- return Vec512<T>{out};
-#else
const auto x4 = LoadU(Full128<T>(), p);
return Vec512<T>{_mm512_broadcast_i32x4(x4.raw)};
-#endif
}
HWY_API Vec512<float> LoadDup128(Full512<float> /* tag */,
const float* const HWY_RESTRICT p) {
-#if HWY_LOADDUP_ASM
- __m512 out;
- asm("vbroadcastf128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
- return Vec512<float>{out};
-#else
const __m128 x4 = _mm_loadu_ps(p);
return Vec512<float>{_mm512_broadcast_f32x4(x4)};
-#endif
}
HWY_API Vec512<double> LoadDup128(Full512<double> /* tag */,
const double* const HWY_RESTRICT p) {
-#if HWY_LOADDUP_ASM
- __m512d out;
- asm("vbroadcastf128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
- return Vec512<double>{out};
-#else
const __m128d x2 = _mm_loadu_pd(p);
return Vec512<double>{_mm512_broadcast_f64x2(x2)};
-#endif
}
// ------------------------------ Store
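
Dropping the HWY_LOADDUP_ASM path leaves the _mm512_broadcast_*x4 intrinsics as the only implementation; the observable behaviour is unchanged: one 128-bit block is loaded and repeated across the whole vector, which is what the block-wise table lookups elsewhere in this patch rely on. A usage sketch; the shuffle table and function name are illustrative:

    #include <stdint.h>
    #include "hwy/highway.h"

    namespace example {
    namespace hn = hwy::HWY_NAMESPACE;

    // Applies the same per-block byte shuffle to every 128-bit block by
    // broadcasting a 16-byte index table into all blocks.
    template <class V>
    V ReverseBytesPerBlock(V v) {
      const hn::ScalableTag<uint8_t> d;
      alignas(16) static constexpr uint8_t kRev[16] = {
          15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
      const auto table = hn::LoadDup128(d, kRev);  // same 16 bytes in each block
      return hn::TableLookupBytes(v, table);
    }
    }  // namespace example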
@@ -2218,39 +2263,28 @@ HWY_API T GetLane(const Vec512<T> v) {
// ------------------------------ ZeroExtendVector
-// Unfortunately the initial _mm512_castsi256_si512 intrinsic leaves the upper
-// bits undefined. Although it makes sense for them to be zero (EVEX encoded
-// instructions have that effect), a compiler could decide to optimize out code
-// that relies on this.
-//
-// The newer _mm512_zextsi256_si512 intrinsic fixes this by specifying the
-// zeroing, but it is not available on GCC until 10.1. For older GCC, we can
-// still obtain the desired code thanks to pattern recognition; note that the
-// expensive insert instruction is not actually generated, see
-// https://gcc.godbolt.org/z/1MKGaP.
-
template <typename T>
HWY_API Vec512<T> ZeroExtendVector(Full512<T> /* tag */, Vec256<T> lo) {
-#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
- return Vec512<T>{_mm512_inserti32x8(_mm512_setzero_si512(), lo.raw, 0)};
-#else
+#if HWY_HAVE_ZEXT // See definition/comment in x86_256-inl.h.
return Vec512<T>{_mm512_zextsi256_si512(lo.raw)};
+#else
+ return Vec512<T>{_mm512_inserti32x8(_mm512_setzero_si512(), lo.raw, 0)};
#endif
}
HWY_API Vec512<float> ZeroExtendVector(Full512<float> /* tag */,
Vec256<float> lo) {
-#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
- return Vec512<float>{_mm512_insertf32x8(_mm512_setzero_ps(), lo.raw, 0)};
-#else
+#if HWY_HAVE_ZEXT
return Vec512<float>{_mm512_zextps256_ps512(lo.raw)};
+#else
+ return Vec512<float>{_mm512_insertf32x8(_mm512_setzero_ps(), lo.raw, 0)};
#endif
}
HWY_API Vec512<double> ZeroExtendVector(Full512<double> /* tag */,
Vec256<double> lo) {
-#if !HWY_COMPILER_CLANG && HWY_COMPILER_GCC && (HWY_COMPILER_GCC < 1000)
- return Vec512<double>{_mm512_insertf64x4(_mm512_setzero_pd(), lo.raw, 0)};
-#else
+#if HWY_HAVE_ZEXT
return Vec512<double>{_mm512_zextpd256_pd512(lo.raw)};
+#else
+ return Vec512<double>{_mm512_insertf64x4(_mm512_setzero_pd(), lo.raw, 0)};
#endif
}
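
HWY_HAVE_ZEXT (defined in x86_256-inl.h per the comment) now selects between the _mm512_zext* intrinsics, which guarantee zeroed upper bits, and the insert-into-zero fallback; both give the same result. A small sketch of the guarantee callers rely on; the helper name is illustrative:

    #include "hwy/highway.h"

    namespace example {
    namespace hn = hwy::HWY_NAMESPACE;

    // Widens a half-width vector by zero-filling the upper lanes; because the
    // upper half is zero, it cannot affect the reduction.
    float SumOfLowerHalf(hn::Vec<hn::Half<hn::ScalableTag<float>>> lo) {
      const hn::ScalableTag<float> d;
      const auto full = hn::ZeroExtendVector(d, lo);
      return hn::GetLane(hn::SumOfLanes(d, full));
    }
    }  // namespace example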
@@ -3293,6 +3327,11 @@ HWY_API Vec512<bfloat16_t> ReorderDemote2To(Full512<bfloat16_t> dbf16,
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
+HWY_API Vec512<int16_t> ReorderDemote2To(Full512<int16_t> /*d16*/,
+ Vec512<int32_t> a, Vec512<int32_t> b) {
+ return Vec512<int16_t>{_mm512_packs_epi32(a.raw, b.raw)};
+}
+
HWY_API Vec256<float> DemoteTo(Full256<float> /* tag */,
const Vec512<double> v) {
return Vec256<float>{_mm512_cvtpd_ps(v.raw)};
@@ -3319,6 +3358,106 @@ HWY_API Vec128<uint8_t, 16> U8FromU32(const Vec512<uint32_t> v) {
return LowerHalf(LowerHalf(bytes));
}
+// ------------------------------ Truncations
+
+HWY_API Vec128<uint8_t, 8> TruncateTo(Simd<uint8_t, 8, 0> d,
+ const Vec512<uint64_t> v) {
+#if HWY_TARGET == HWY_AVX3_DL
+ (void)d;
+ const Full512<uint8_t> d8;
+ alignas(16) static constexpr uint8_t k8From64[16] = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 0, 8, 16, 24, 32, 40, 48, 56};
+ const Vec512<uint8_t> bytes{
+ _mm512_permutexvar_epi8(LoadDup128(d8, k8From64).raw, v.raw)};
+ return LowerHalf(LowerHalf(LowerHalf(bytes)));
+#else
+ const Full512<uint32_t> d32;
+ alignas(64) constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14,
+ 0, 2, 4, 6, 8, 10, 12, 14};
+ const Vec512<uint32_t> even{
+ _mm512_permutexvar_epi32(Load(d32, kEven).raw, v.raw)};
+ return TruncateTo(d, LowerHalf(even));
+#endif
+}
+
+HWY_API Vec128<uint16_t, 8> TruncateTo(Simd<uint16_t, 8, 0> /* tag */,
+ const Vec512<uint64_t> v) {
+ const Full512<uint16_t> d16;
+ alignas(16) static constexpr uint16_t k16From64[8] = {
+ 0, 4, 8, 12, 16, 20, 24, 28};
+ const Vec512<uint16_t> bytes{
+ _mm512_permutexvar_epi16(LoadDup128(d16, k16From64).raw, v.raw)};
+ return LowerHalf(LowerHalf(bytes));
+}
+
+HWY_API Vec256<uint32_t> TruncateTo(Simd<uint32_t, 8, 0> /* tag */,
+ const Vec512<uint64_t> v) {
+ const Full512<uint32_t> d32;
+ alignas(64) constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14,
+ 0, 2, 4, 6, 8, 10, 12, 14};
+ const Vec512<uint32_t> even{
+ _mm512_permutexvar_epi32(Load(d32, kEven).raw, v.raw)};
+ return LowerHalf(even);
+}
+
+HWY_API Vec128<uint8_t, 16> TruncateTo(Simd<uint8_t, 16, 0> /* tag */,
+ const Vec512<uint32_t> v) {
+#if HWY_TARGET == HWY_AVX3_DL
+ const Full512<uint8_t> d8;
+ alignas(16) static constexpr uint8_t k8From32[16] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60};
+ const Vec512<uint8_t> bytes{
+ _mm512_permutexvar_epi32(LoadDup128(d8, k8From32).raw, v.raw)};
+#else
+ const Full512<uint32_t> d32;
+ // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the
+ // lowest 4 bytes.
+ alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
+ ~0u};
+ const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32));
+ // Gather the lowest 4 bytes of 4 128-bit blocks.
+ alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
+ const Vec512<uint8_t> bytes{
+ _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
+#endif
+ return LowerHalf(LowerHalf(bytes));
+}
+
+HWY_API Vec256<uint16_t> TruncateTo(Simd<uint16_t, 16, 0> /* tag */,
+ const Vec512<uint32_t> v) {
+ const Full512<uint16_t> d16;
+ alignas(64) static constexpr uint16_t k16From32[32] = {
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
+ const Vec512<uint16_t> bytes{
+ _mm512_permutexvar_epi16(Load(d16, k16From32).raw, v.raw)};
+ return LowerHalf(bytes);
+}
+
+HWY_API Vec256<uint8_t> TruncateTo(Simd<uint8_t, 32, 0> /* tag */,
+ const Vec512<uint16_t> v) {
+#if HWY_TARGET == HWY_AVX3_DL
+ const Full512<uint8_t> d8;
+ alignas(64) static constexpr uint8_t k8From16[64] = {
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
+ const Vec512<uint8_t> bytes{
+ _mm512_permutexvar_epi8(Load(d8, k8From16).raw, v.raw)};
+#else
+ const Full512<uint32_t> d32;
+ alignas(16) static constexpr uint32_t k16From32[4] = {
+ 0x06040200u, 0x0E0C0A08u, 0x06040200u, 0x0E0C0A08u};
+ const auto quads = TableLookupBytes(v, LoadDup128(d32, k16From32));
+ alignas(64) static constexpr uint32_t kIndex32[16] = {
+ 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
+ const Vec512<uint8_t> bytes{
+ _mm512_permutexvar_epi32(Load(d32, kIndex32).raw, quads.raw)};
+#endif
+ return LowerHalf(bytes);
+}
+
// ------------------------------ Convert integer <=> floating point
HWY_API Vec512<float> ConvertTo(Full512<float> /* tag */,
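
TruncateTo keeps only the low bits of each lane (the low byte or word of each 32/64-bit lane, as the index tables above show) rather than saturating like DemoteTo. A minimal sketch; the function name is illustrative:

    #include <stddef.h>
    #include <stdint.h>
    #include "hwy/highway.h"

    namespace example {
    namespace hn = hwy::HWY_NAMESPACE;

    // Narrows 32-bit values to 8 bits by discarding the upper 24 bits of each lane.
    void StoreLowBytes(const uint32_t* HWY_RESTRICT in,
                       uint8_t* HWY_RESTRICT out, size_t n) {
      const hn::ScalableTag<uint32_t> d32;
      const hn::Rebind<uint8_t, decltype(d32)> d8;  // same lane count, 8-bit lanes
      const size_t N = hn::Lanes(d32);
      for (size_t i = 0; i + N <= n; i += N) {
        const auto v = hn::LoadU(d32, in + i);
        hn::StoreU(hn::TruncateTo(d8, v), d8, out + i);  // keeps bits [0, 8)
      }
    }
    }  // namespace example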
@@ -3331,6 +3470,16 @@ HWY_API Vec512<double> ConvertTo(Full512<double> /* tag */,
return Vec512<double>{_mm512_cvtepi64_pd(v.raw)};
}
+HWY_API Vec512<float> ConvertTo(Full512<float> /* tag*/,
+ const Vec512<uint32_t> v) {
+ return Vec512<float>{_mm512_cvtepu32_ps(v.raw)};
+}
+
+HWY_API Vec512<double> ConvertTo(Full512<double> /* tag*/,
+ const Vec512<uint64_t> v) {
+ return Vec512<double>{_mm512_cvtepu64_pd(v.raw)};
+}
+
// Truncates (rounds toward zero).
HWY_API Vec512<int32_t> ConvertTo(Full512<int32_t> d, const Vec512<float> v) {
return detail::FixConversionOverflow(d, v, _mm512_cvttps_epi32(v.raw));
@@ -3544,15 +3693,21 @@ HWY_API size_t CountTrue(const Full512<T> /* tag */, const Mask512<T> mask) {
}
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
-HWY_API intptr_t FindFirstTrue(const Full512<T> /* tag */,
- const Mask512<T> mask) {
- return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask.raw)) : -1;
+HWY_API size_t FindKnownFirstTrue(const Full512<T> /* tag */,
+ const Mask512<T> mask) {
+ return Num0BitsBelowLS1Bit_Nonzero32(mask.raw);
}
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API intptr_t FindFirstTrue(const Full512<T> /* tag */,
- const Mask512<T> mask) {
- return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask.raw)) : -1;
+HWY_API size_t FindKnownFirstTrue(const Full512<T> /* tag */,
+ const Mask512<T> mask) {
+ return Num0BitsBelowLS1Bit_Nonzero64(mask.raw);
+}
+
+template <typename T>
+HWY_API intptr_t FindFirstTrue(const Full512<T> d, const Mask512<T> mask) {
+ return mask.raw ? static_cast<intptr_t>(FindKnownFirstTrue(d, mask))
+ : intptr_t{-1};
}
// ------------------------------ Compress
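
FindKnownFirstTrue drops the mask.raw == 0 check and therefore requires at least one true lane, while FindFirstTrue keeps returning -1 for an all-false mask. A sketch of a search loop that uses the cheaper variant only after establishing that a match exists; names are illustrative:

    #include <stddef.h>
    #include <stdint.h>
    #include "hwy/highway.h"

    namespace example {
    namespace hn = hwy::HWY_NAMESPACE;

    // Returns the index of the first element equal to key, or n if not found.
    size_t IndexOf(const int32_t* HWY_RESTRICT data, size_t n, int32_t key) {
      const hn::ScalableTag<int32_t> d;
      const size_t N = hn::Lanes(d);
      const auto vkey = hn::Set(d, key);
      size_t i = 0;
      for (; i + N <= n; i += N) {
        const auto eq = hn::Eq(hn::LoadU(d, data + i), vkey);
        if (!hn::AllFalse(d, eq)) {
          // At least one lane matched, so the "Known" variant is safe here.
          return i + hn::FindKnownFirstTrue(d, eq);
        }
      }
      for (; i < n; ++i) {
        if (data[i] == key) return i;
      }
      return n;
    }
    }  // namespace example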
@@ -3570,6 +3725,9 @@ template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> Compress(Vec512<T> v, Mask512<T> mask) {
// See CompressIsPartition. u64 is faster than u32.
alignas(16) constexpr uint64_t packed_array[256] = {
+ // From PrintCompress32x8Tables, without the FirstN extension (there is
+ // no benefit to including them because 64-bit CompressStore is anyway
+ // masked, but also no harm because TableLookupLanes ignores the MSB).
0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
@@ -3678,7 +3836,7 @@ HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
- const auto idx = LoadU(du, iota + 32 - num0);
+ const Vec512<uint16_t> idx = LoadU(du, iota + 32 - num0);
const Vec512<uint16_t> cu{_mm512_mask_permutexvar_epi16(
demoted0.raw, m_upper, idx.raw, demoted1.raw)};
#endif // HWY_TARGET == HWY_AVX3_DL
@@ -3686,6 +3844,79 @@ HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) {
return BitCast(d, cu);
}
+// ------------------------------ CompressNot
+
+template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
+HWY_API Vec512<T> CompressNot(Vec512<T> v, const Mask512<T> mask) {
+ return Compress(v, Not(mask));
+}
+
+template <typename T, HWY_IF_LANE_SIZE(T, 8)>
+HWY_API Vec512<T> CompressNot(Vec512<T> v, Mask512<T> mask) {
+ // See CompressIsPartition. u64 is faster than u32.
+ alignas(16) constexpr uint64_t packed_array[256] = {
+ // From PrintCompressNot32x8Tables, without the FirstN extension (there is
+ // no benefit to including them because 64-bit CompressStore is anyway
+ // masked, but also no harm because TableLookupLanes ignores the MSB).
+ 0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
+ 0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
+ 0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
+ 0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
+ 0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
+ 0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
+ 0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
+ 0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
+ 0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
+ 0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
+ 0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
+ 0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
+ 0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
+ 0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
+ 0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
+ 0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
+ 0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
+ 0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
+ 0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
+ 0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
+ 0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
+ 0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
+ 0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
+ 0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
+ 0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
+ 0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
+ 0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
+ 0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
+ 0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
+ 0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
+ 0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
+ 0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
+ 0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
+ 0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
+ 0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
+ 0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
+ 0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
+ 0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
+ 0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
+ 0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
+ 0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
+ 0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
+ 0x76543210, 0x76543201, 0x76543210, 0x76543210};
+
+ // For lane i, shift the i-th 4-bit index down to bits [0, 3) -
+ // _mm512_permutexvar_epi64 will ignore the upper bits.
+ const Full512<T> d;
+ const RebindToUnsigned<decltype(d)> du64;
+ const auto packed = Set(du64, packed_array[mask.raw]);
+ alignas(64) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
+ const auto indices = Indices512<T>{(packed >> Load(du64, shifts)).raw};
+ return TableLookupLanes(v, indices);
+}
+
+HWY_API Vec512<uint64_t> CompressBlocksNot(Vec512<uint64_t> v,
+ Mask512<uint64_t> mask) {
+ return CompressNot(v, mask);
+}
+
// ------------------------------ CompressBits
template <typename T>
HWY_API Vec512<T> CompressBits(Vec512<T> v, const uint8_t* HWY_RESTRICT bits) {
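
Each packed_array entry above encodes eight 4-bit source-lane indices, with lane 0's index in the low nibble (the shifts table is 0, 4, ..., 28). A standalone scalar sketch of that decoding, handy for checking a table entry by hand; it is not Highway code:

    #include <stdint.h>
    #include <stdio.h>

    // Decodes one nibble-packed permutation: output lane i takes its value from
    // input lane (packed >> (4 * i)) & 0xF.
    void PrintPermutation(uint64_t packed) {
      for (int i = 0; i < 8; ++i) {
        printf("out[%d] = in[%d]\n", i,
               static_cast<int>((packed >> (4 * i)) & 0xF));
      }
    }

    // Example: packed_array[1] == 0x07654321 (mask bit 0 set) yields
    // out[0]=in[1], ..., out[6]=in[7], out[7]=in[0], i.e. the lane whose mask
    // bit is set moves to the end and the remaining lanes keep their order.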
@@ -4050,6 +4281,14 @@ HWY_API Vec512<float> ReorderWidenMulAccumulate(Full512<float> df32,
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
+HWY_API Vec512<int32_t> ReorderWidenMulAccumulate(Full512<int32_t> /*d32*/,
+ Vec512<int16_t> a,
+ Vec512<int16_t> b,
+ const Vec512<int32_t> sum0,
+ Vec512<int32_t>& /*sum1*/) {
+ return sum0 + Vec512<int32_t>{_mm512_madd_epi16(a.raw, b.raw)};
+}
+
// ------------------------------ Reductions
// Returns the sum in each lane.
@@ -4071,6 +4310,23 @@ HWY_API Vec512<float> SumOfLanes(Full512<float> d, Vec512<float> v) {
HWY_API Vec512<double> SumOfLanes(Full512<double> d, Vec512<double> v) {
return Set(d, _mm512_reduce_add_pd(v.raw));
}
+HWY_API Vec512<uint16_t> SumOfLanes(Full512<uint16_t> d, Vec512<uint16_t> v) {
+ const RepartitionToWide<decltype(d)> d32;
+ const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto sum = SumOfLanes(d32, even + odd);
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
+}
+HWY_API Vec512<int16_t> SumOfLanes(Full512<int16_t> d, Vec512<int16_t> v) {
+ const RepartitionToWide<decltype(d)> d32;
+ // Sign-extend
+ const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto sum = SumOfLanes(d32, even + odd);
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
+}
// Returns the minimum in each lane.
HWY_API Vec512<int32_t> MinOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {
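
The 16-bit reductions promote even and odd lanes to 32 bits (masked or sign-extended), reduce in 32 bits, then broadcast the low 16 bits of the result back into both halves of each lane pair via OddEven. An equivalent standalone scalar sketch of the unsigned case, for reference only:

    #include <stddef.h>
    #include <stdint.h>

    // Mirrors the vector code: accumulate even and odd uint16_t lanes in 32 bits.
    // The vector version then writes the low 16 bits of this sum to every lane.
    uint32_t SumOfU16Lanes(const uint16_t* lanes, size_t n) {  // n is even
      uint32_t even = 0, odd = 0;
      for (size_t i = 0; i < n; i += 2) {
        even += lanes[i];     // low half of each 32-bit pair
        odd += lanes[i + 1];  // high half, shifted down in the vector code
      }
      return even + odd;
    }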
@@ -4091,14 +4347,22 @@ HWY_API Vec512<float> MinOfLanes(Full512<float> d, Vec512<float> v) {
HWY_API Vec512<double> MinOfLanes(Full512<double> d, Vec512<double> v) {
return Set(d, _mm512_reduce_min_pd(v.raw));
}
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec512<T> MinOfLanes(Full512<T> d, Vec512<T> v) {
- const Repartition<int32_t, decltype(d)> d32;
+HWY_API Vec512<uint16_t> MinOfLanes(Full512<uint16_t> d, Vec512<uint16_t> v) {
+ const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MinOfLanes(d32, Min(even, odd));
// Also broadcast into odd lanes.
- return BitCast(d, Or(min, ShiftLeft<16>(min)));
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+}
+HWY_API Vec512<int16_t> MinOfLanes(Full512<int16_t> d, Vec512<int16_t> v) {
+ const RepartitionToWide<decltype(d)> d32;
+ // Sign-extend
+ const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto min = MinOfLanes(d32, Min(even, odd));
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
// Returns the maximum in each lane.
@@ -4120,14 +4384,22 @@ HWY_API Vec512<float> MaxOfLanes(Full512<float> d, Vec512<float> v) {
HWY_API Vec512<double> MaxOfLanes(Full512<double> d, Vec512<double> v) {
return Set(d, _mm512_reduce_max_pd(v.raw));
}
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec512<T> MaxOfLanes(Full512<T> d, Vec512<T> v) {
- const Repartition<int32_t, decltype(d)> d32;
+HWY_API Vec512<uint16_t> MaxOfLanes(Full512<uint16_t> d, Vec512<uint16_t> v) {
+ const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto min = MaxOfLanes(d32, Max(even, odd));
// Also broadcast into odd lanes.
- return BitCast(d, Or(min, ShiftLeft<16>(min)));
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
+}
+HWY_API Vec512<int16_t> MaxOfLanes(Full512<int16_t> d, Vec512<int16_t> v) {
+ const RepartitionToWide<decltype(d)> d32;
+ // Sign-extend
+ const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
+ const auto odd = ShiftRight<16>(BitCast(d32, v));
+ const auto min = MaxOfLanes(d32, Max(even, odd));
+ // Also broadcast into odd lanes.
+ return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
diff --git a/media/highway/src/hwy/per_target.cc b/media/highway/src/hwy/per_target.cc
new file mode 100644
index 0000000000..4cbf152328
--- /dev/null
+++ b/media/highway/src/hwy/per_target.cc
@@ -0,0 +1,50 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/per_target.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/per_target.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+// On SVE, Lanes rounds down to a power of two, but we want to know the actual
+// size here. Otherwise, hypothetical SVE with 48 bytes would round down to 32
+// and we'd enable HWY_SVE_256, and then fail reverse_test because Reverse on
+// HWY_SVE_256 requires the actual vector to be a power of two.
+#if HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE_256
+size_t GetVectorBytes() { return detail::AllHardwareLanes(hwy::SizeTag<1>()); }
+#else
+size_t GetVectorBytes() { return Lanes(ScalableTag<uint8_t>()); }
+#endif
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(GetVectorBytes); // Local function.
+} // namespace
+
+size_t VectorBytes() { return HWY_DYNAMIC_DISPATCH(GetVectorBytes)(); }
+
+} // namespace hwy
+#endif // HWY_ONCE
diff --git a/media/highway/src/hwy/per_target.h b/media/highway/src/hwy/per_target.h
new file mode 100644
index 0000000000..da85de3226
--- /dev/null
+++ b/media/highway/src/hwy/per_target.h
@@ -0,0 +1,37 @@
+// Copyright 2022 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_PER_TARGET_H_
+#define HIGHWAY_HWY_PER_TARGET_H_
+
+#include <stddef.h>
+
+// Per-target functions.
+
+namespace hwy {
+
+// Returns size in bytes of a vector, i.e. `Lanes(ScalableTag<uint8_t>())`.
+//
+// Do not cache the result, which may change after calling DisableTargets, or
+// if software requests a different vector size (e.g. when entering/exiting SME
+// streaming mode). Instead call this right before the code that depends on the
+// result, without any DisableTargets or SME transition in-between. Note that
+// this involves an indirect call, so prefer not to call this frequently nor
+// unnecessarily.
+size_t VectorBytes();
+
+} // namespace hwy
+
+#endif // HIGHWAY_HWY_PER_TARGET_H_
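
A short usage sketch for the new VectorBytes() accessor; as the header warns, query it close to the point of use rather than caching it. The buffer-sizing use case is illustrative:

    #include <stddef.h>
    #include <stdint.h>
    #include <vector>
    #include "hwy/per_target.h"

    // Rounds a scratch buffer up to a whole number of vectors of whichever
    // target dynamic dispatch actually chose at runtime.
    std::vector<uint8_t> MakeScratch(size_t min_bytes) {
      const size_t vec = hwy::VectorBytes();  // indirect call; do not cache
      const size_t rounded = (min_bytes + vec - 1) / vec * vec;
      return std::vector<uint8_t>(rounded);
    }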
diff --git a/media/highway/src/hwy/print-inl.h b/media/highway/src/hwy/print-inl.h
index 5223f3e608..d256657ebc 100644
--- a/media/highway/src/hwy/print-inl.h
+++ b/media/highway/src/hwy/print-inl.h
@@ -15,11 +15,11 @@
// Print() function
-#include <inttypes.h>
#include <stdint.h>
-#include "hwy/base.h"
-#include "hwy/tests/test_util.h"
+#include "hwy/aligned_allocator.h"
+#include "hwy/highway.h"
+#include "hwy/print.h"
// Per-target include guard
#if defined(HIGHWAY_HWY_PRINT_INL_H_) == \
@@ -34,32 +34,12 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_NOINLINE void PrintValue(T value) {
- uint8_t byte;
- CopyBytes<1>(&value, &byte); // endian-safe: we ensured sizeof(T)=1.
- fprintf(stderr, "0x%02X,", byte);
-}
-
-#if HWY_HAVE_FLOAT16
-HWY_NOINLINE void PrintValue(float16_t value) {
- uint16_t bits;
- CopyBytes<2>(&value, &bits);
- fprintf(stderr, "0x%02X,", bits);
-}
-#endif
-
-template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
-HWY_NOINLINE void PrintValue(T value) {
- fprintf(stderr, "%g,", double(value));
-}
-
// Prints lanes around `lane`, in memory order.
template <class D, class V = Vec<D>>
void Print(const D d, const char* caption, VecArg<V> v, size_t lane_u = 0,
size_t max_lanes = 7) {
- using T = TFromD<D>;
const size_t N = Lanes(d);
+ using T = TFromD<D>;
auto lanes = AllocateAligned<T>(N);
Store(v, d, lanes.get());
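
Print stores the vector and writes up to max_lanes values around the requested lane to stderr. A debugging sketch using static dispatch for brevity (this header is normally pulled in per target via foreach_target):

    #include "hwy/highway.h"
    #include "hwy/print-inl.h"

    namespace example {
    namespace hn = hwy::HWY_NAMESPACE;

    void DebugDump() {
      const hn::ScalableTag<float> d;
      const auto v = hn::Iota(d, 0);  // 0, 1, 2, ...
      hn::Print(d, "iota", v);        // defaults: lane 0, up to 7 lanes
      hn::Print(d, "iota tail", v, hn::Lanes(d) - 1, 3);  // window near the end
    }
    }  // namespace example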
diff --git a/media/highway/src/hwy/print.cc b/media/highway/src/hwy/print.cc
index 1ceed59a25..0b52cde1b9 100644
--- a/media/highway/src/hwy/print.cc
+++ b/media/highway/src/hwy/print.cc
@@ -15,6 +15,9 @@
#include "hwy/print.h"
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS // before inttypes.h
+#endif
#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>
@@ -28,12 +31,13 @@ HWY_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100) {
const char prefix = info.is_float ? 'f' : (info.is_signed ? 'i' : 'u');
// Omit the xN suffix for scalars.
if (N == 1) {
- snprintf(string100, 64, "%c%" PRIu64, prefix,
- static_cast<uint64_t>(info.sizeof_t * 8));
+ // NOLINTNEXTLINE
+ snprintf(string100, 64, "%c%d", prefix,
+ static_cast<int>(info.sizeof_t * 8));
} else {
- snprintf(string100, 64, "%c%" PRIu64 "x%" PRIu64, prefix,
- static_cast<uint64_t>(info.sizeof_t * 8),
- static_cast<uint64_t>(N));
+ // NOLINTNEXTLINE
+ snprintf(string100, 64, "%c%dx%d", prefix,
+ static_cast<int>(info.sizeof_t * 8), static_cast<int>(N));
}
}
@@ -42,39 +46,39 @@ HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr,
if (info.sizeof_t == 1) {
uint8_t byte;
CopyBytes<1>(ptr, &byte); // endian-safe: we ensured sizeof(T)=1.
- snprintf(string100, 100, "0x%02X", byte);
+ snprintf(string100, 100, "0x%02X", byte); // NOLINT
} else if (info.sizeof_t == 2) {
uint16_t bits;
CopyBytes<2>(ptr, &bits);
- snprintf(string100, 100, "0x%04X", bits);
+ snprintf(string100, 100, "0x%04X", bits); // NOLINT
} else if (info.sizeof_t == 4) {
if (info.is_float) {
float value;
CopyBytes<4>(ptr, &value);
- snprintf(string100, 100, "%g", double(value));
+ snprintf(string100, 100, "%g", static_cast<double>(value)); // NOLINT
} else if (info.is_signed) {
int32_t value;
CopyBytes<4>(ptr, &value);
- snprintf(string100, 100, "%d", value);
+ snprintf(string100, 100, "%d", value); // NOLINT
} else {
uint32_t value;
CopyBytes<4>(ptr, &value);
- snprintf(string100, 100, "%u", value);
+ snprintf(string100, 100, "%u", value); // NOLINT
}
} else {
HWY_ASSERT(info.sizeof_t == 8);
if (info.is_float) {
double value;
CopyBytes<8>(ptr, &value);
- snprintf(string100, 100, "%g", value);
+ snprintf(string100, 100, "%g", value); // NOLINT
} else if (info.is_signed) {
int64_t value;
CopyBytes<8>(ptr, &value);
- snprintf(string100, 100, "%" PRIi64 "", value);
+ snprintf(string100, 100, "%" PRIi64 "", value); // NOLINT
} else {
uint64_t value;
CopyBytes<8>(ptr, &value);
- snprintf(string100, 100, "%" PRIu64 "", value);
+ snprintf(string100, 100, "%" PRIu64 "", value); // NOLINT
}
}
}
diff --git a/media/highway/src/hwy/print.h b/media/highway/src/hwy/print.h
index b2aaf5ab07..13792866a3 100644
--- a/media/highway/src/hwy/print.h
+++ b/media/highway/src/hwy/print.h
@@ -19,6 +19,7 @@
// Helpers for printing vector lanes.
#include <stddef.h>
+#include <stdio.h>
#include "hwy/base.h"
#include "hwy/highway_export.h"
@@ -53,6 +54,20 @@ HWY_DLLEXPORT void PrintArray(const TypeInfo& info, const char* caption,
size_t lane_u = 0, size_t max_lanes = 7);
} // namespace detail
+
+template <typename T>
+HWY_NOINLINE void PrintValue(T value) {
+ char str[100];
+ detail::ToString(hwy::detail::MakeTypeInfo<T>(), &value, str);
+ fprintf(stderr, "%s,", str);
+}
+
+template <typename T>
+HWY_NOINLINE void PrintArray(const T* value, size_t count) {
+ detail::PrintArray(hwy::detail::MakeTypeInfo<T>(), "", value, count, 0,
+ count);
+}
+
} // namespace hwy
#endif // HWY_PRINT_H_
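
PrintValue and PrintArray, now declared here, are plain scalar helpers built on detail::ToString, so they can be called from any translation unit that links libhwy. A tiny sketch; the values are illustrative:

    #include <stdint.h>
    #include "hwy/print.h"

    void DumpForDebugging() {
      const int32_t values[4] = {1, -2, 3, -4};
      hwy::PrintValue(values[0]);  // writes "1," to stderr
      hwy::PrintArray(values, 4);  // prints all four with type-aware formatting
    }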
diff --git a/media/highway/src/hwy/targets.cc b/media/highway/src/hwy/targets.cc
index ec7705342f..2fde4db9ac 100644
--- a/media/highway/src/hwy/targets.cc
+++ b/media/highway/src/hwy/targets.cc
@@ -15,6 +15,10 @@
#include "hwy/targets.h"
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS // before inttypes.h
+#endif
+#include <inttypes.h> // PRIx64
#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
@@ -22,7 +26,7 @@
#include <atomic>
-#include "hwy/base.h"
+#include "hwy/per_target.h" // VectorBytes
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
#include "sanitizer/common_interface_defs.h" // __sanitizer_print_stack_trace
@@ -37,7 +41,11 @@
#else // !HWY_COMPILER_MSVC
#include <cpuid.h>
#endif // HWY_COMPILER_MSVC
-#endif // HWY_ARCH_X86
+
+#elif HWY_ARCH_ARM && HWY_OS_LINUX
+#include <asm/hwcap.h>
+#include <sys/auxv.h>
+#endif // HWY_ARCH_*
namespace hwy {
namespace {
@@ -58,7 +66,7 @@ HWY_INLINE void Cpuid(const uint32_t level, const uint32_t count,
for (int i = 0; i < 4; ++i) {
abcd[i] = regs[i];
}
-#else // HWY_COMPILER_MSVC
+#else // HWY_COMPILER_MSVC
uint32_t a;
uint32_t b;
uint32_t c;
@@ -76,7 +84,7 @@ HWY_INLINE void Cpuid(const uint32_t level, const uint32_t count,
uint32_t ReadXCR0() {
#if HWY_COMPILER_MSVC
return static_cast<uint32_t>(_xgetbv(0));
-#else // HWY_COMPILER_MSVC
+#else // HWY_COMPILER_MSVC
uint32_t xcr0, xcr0_high;
const uint32_t index = 0;
asm volatile(".byte 0x0F, 0x01, 0xD0"
@@ -88,15 +96,12 @@ uint32_t ReadXCR0() {
#endif // HWY_ARCH_X86
-// Not function-local => no compiler-generated locking.
-std::atomic<uint32_t> supported_{0}; // Not yet initialized
-
// When running tests, this value can be set to the mocked supported targets
// mask. Only written to from a single thread before the test starts.
-uint32_t supported_targets_for_test_ = 0;
+int64_t supported_targets_for_test_ = 0;
// Mask of targets disabled at runtime with DisableTargets.
-uint32_t supported_mask_{LimitsMax<uint32_t>()};
+int64_t supported_mask_ = LimitsMax<int64_t>();
#if HWY_ARCH_X86
// Arbitrary bit indices indicating which instruction set extensions are
@@ -184,77 +189,13 @@ constexpr uint64_t kGroupAVX3_DL =
#endif // HWY_ARCH_X86
-} // namespace
-
-HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
- Abort(const char* file, int line, const char* format, ...) {
- char buf[2000];
- va_list args;
- va_start(args, format);
- vsnprintf(buf, sizeof(buf), format, args);
- va_end(args);
-
- fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);
-
-// If compiled with any sanitizer, they can also print a stack trace.
-#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
- __sanitizer_print_stack_trace();
-#endif // HWY_IS_*
- fflush(stderr);
-
-// Now terminate the program:
-#if HWY_ARCH_RVV
- exit(1); // trap/abort just freeze Spike.
-#elif HWY_IS_DEBUG_BUILD && !HWY_COMPILER_MSVC
- // Facilitates breaking into a debugger, but don't use this in non-debug
- // builds because it looks like "illegal instruction", which is misleading.
- __builtin_trap();
-#else
- abort(); // Compile error without this due to HWY_NORETURN.
-#endif
-}
-
-HWY_DLLEXPORT void DisableTargets(uint32_t disabled_targets) {
- supported_mask_ =
- ~(disabled_targets & ~static_cast<uint32_t>(HWY_ENABLED_BASELINE));
- // We can call Update() here to initialize the mask but that will trigger a
- // call to SupportedTargets() which we use in tests to tell whether any of the
- // highway dynamic dispatch functions were used.
- GetChosenTarget().DeInit();
-}
-
-HWY_DLLEXPORT void SetSupportedTargetsForTest(uint32_t targets) {
- // Reset the cached supported_ value to 0 to force a re-evaluation in the
- // next call to SupportedTargets() which will use the mocked value set here
- // if not zero.
- supported_.store(0, std::memory_order_release);
- supported_targets_for_test_ = targets;
- GetChosenTarget().DeInit();
-}
-
-HWY_DLLEXPORT bool SupportedTargetsCalledForTest() {
- return supported_.load(std::memory_order_acquire) != 0;
-}
-
-HWY_DLLEXPORT uint32_t SupportedTargets() {
- uint32_t bits = supported_.load(std::memory_order_acquire);
- // Already initialized?
- if (HWY_LIKELY(bits != 0)) {
- return bits & supported_mask_;
- }
-
- // When running tests, this allows to mock the current supported targets.
- if (HWY_UNLIKELY(supported_targets_for_test_ != 0)) {
- // Store the value to signal that this was used.
- supported_.store(supported_targets_for_test_, std::memory_order_release);
- return supported_targets_for_test_ & supported_mask_;
- }
-
-#if defined(HWY_COMPILE_ONLY_SCALAR)
- bits = HWY_SCALAR;
-#else
- bits = HWY_EMU128;
-#endif
+// Returns targets supported by the CPU, independently of DisableTargets.
+// Factored out of SupportedTargets to make its structure more obvious. Note
+// that x86 CPUID may take several hundred cycles.
+int64_t DetectTargets() {
+ // Apps will use only one of these (the default is EMU128), but compile flags
+ // for this TU may differ from that of the app, so allow both.
+ int64_t bits = HWY_SCALAR | HWY_EMU128;
#if HWY_ARCH_X86
bool has_osxsave = false;
@@ -327,8 +268,8 @@ HWY_DLLEXPORT uint32_t SupportedTargets() {
// are not preserved across context switches.
if (has_osxsave) {
const uint32_t xcr0 = ReadXCR0();
- const uint32_t min_avx3 = HWY_AVX3 | HWY_AVX3_DL;
- const uint32_t min_avx2 = HWY_AVX2 | min_avx3;
+ const int64_t min_avx3 = HWY_AVX3 | HWY_AVX3_DL;
+ const int64_t min_avx2 = HWY_AVX2 | min_avx3;
// XMM
if (!IsBitSet(xcr0, 1)) {
bits &= ~(HWY_SSSE3 | HWY_SSE4 | min_avx2);
@@ -344,10 +285,58 @@ HWY_DLLEXPORT uint32_t SupportedTargets() {
}
if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
- fprintf(stderr, "WARNING: CPU supports %zx but software requires %x\n",
- size_t(bits), HWY_ENABLED_BASELINE);
+ fprintf(stderr,
+ "WARNING: CPU supports %" PRIx64 " but software requires %" PRIx64
+ "\n",
+ bits, static_cast<int64_t>(HWY_ENABLED_BASELINE));
}
-#else
+
+#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
+ using CapBits = unsigned long; // NOLINT
+ const CapBits hw = getauxval(AT_HWCAP);
+ (void)hw;
+
+#if HWY_ARCH_ARM_A64
+
+#if defined(HWCAP_AES)
+ // aarch64 always has NEON and VFPv4, but not necessarily AES, which we
+ // require and thus must still check for.
+ if (hw & HWCAP_AES) {
+ bits |= HWY_NEON;
+ }
+#endif // HWCAP_AES
+
+#if defined(HWCAP_SVE)
+ if (hw & HWCAP_SVE) {
+ bits |= HWY_SVE;
+ }
+#endif
+
+#if defined(HWCAP2_SVE2) && defined(HWCAP2_SVEAES)
+ const CapBits hw2 = getauxval(AT_HWCAP2);
+ if ((hw2 & HWCAP2_SVE2) && (hw2 & HWCAP2_SVEAES)) {
+ bits |= HWY_SVE2;
+ }
+#endif
+
+#else // HWY_ARCH_ARM_A64
+
+// Some old auxv.h / hwcap.h do not define these. If not, treat as unsupported.
+// Note that AES has a different HWCAP bit compared to aarch64.
+#if defined(HWCAP_NEON) && defined(HWCAP_VFPv4)
+ if ((hw & HWCAP_NEON) && (hw & HWCAP_VFPv4)) {
+ bits |= HWY_NEON;
+ }
+#endif
+
+#endif // HWY_ARCH_ARM_A64
+ if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
+ fprintf(stderr,
+ "WARNING: CPU supports %" PRIx64 " but software requires %" PRIx64
+ "\n",
+ bits, static_cast<int64_t>(HWY_ENABLED_BASELINE));
+ }
+#else // HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
// TODO(janwas): detect for other platforms and check for baseline
// This file is typically compiled without HWY_IS_TEST, but targets_test has
// it set, and will expect all of its HWY_TARGETS (= all attainable) to be
@@ -355,8 +344,86 @@ HWY_DLLEXPORT uint32_t SupportedTargets() {
bits |= HWY_ENABLED_BASELINE;
#endif // HWY_ARCH_X86
- supported_.store(bits, std::memory_order_release);
- return bits & supported_mask_;
+ return bits;
+}
+
+} // namespace
+
+HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
+ Abort(const char* file, int line, const char* format, ...) {
+ char buf[2000];
+ va_list args;
+ va_start(args, format);
+ vsnprintf(buf, sizeof(buf), format, args);
+ va_end(args);
+
+ fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);
+
+// If compiled with any sanitizer, they can also print a stack trace.
+#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
+ __sanitizer_print_stack_trace();
+#endif // HWY_IS_*
+ fflush(stderr);
+
+// Now terminate the program:
+#if HWY_ARCH_RVV
+ exit(1); // trap/abort just freeze Spike.
+#elif HWY_IS_DEBUG_BUILD && !HWY_COMPILER_MSVC
+ // Facilitates breaking into a debugger, but don't use this in non-debug
+ // builds because it looks like "illegal instruction", which is misleading.
+ __builtin_trap();
+#else
+ abort(); // Compile error without this due to HWY_NORETURN.
+#endif
+}
+
+HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets) {
+ supported_mask_ = static_cast<int64_t>(~disabled_targets);
+ // This will take effect on the next call to SupportedTargets, which is
+ // called right before GetChosenTarget::Update. However, calling Update here
+ // would make it appear that HWY_DYNAMIC_DISPATCH was called, which we want
+ // to check in tests. We instead de-initialize such that the next
+ // HWY_DYNAMIC_DISPATCH calls GetChosenTarget::Update via FunctionCache.
+ GetChosenTarget().DeInit();
+}
+
+HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets) {
+ supported_targets_for_test_ = targets;
+ GetChosenTarget().DeInit(); // see comment above
+}
+
+HWY_DLLEXPORT int64_t SupportedTargets() {
+ int64_t targets = supported_targets_for_test_;
+ if (HWY_LIKELY(targets == 0)) {
+ // Mock not active. Re-detect instead of caching just in case we're on a
+ // heterogeneous ISA (also requires some app support to pin threads). This
+ // is only reached on the first HWY_DYNAMIC_DISPATCH or after each call to
+ // DisableTargets or SetSupportedTargetsForTest.
+ targets = DetectTargets();
+
+ // VectorBytes invokes HWY_DYNAMIC_DISPATCH. To prevent infinite recursion,
+ // first set up ChosenTarget. No need to Update() again afterwards with the
+ // final targets - that will be done by a caller of this function.
+ GetChosenTarget().Update(targets);
+
+ // Now that we can call VectorBytes, check for targets with specific sizes.
+ if (HWY_ARCH_ARM_A64) {
+ const size_t vec_bytes = VectorBytes(); // uncached, see declaration
+ if ((targets & HWY_SVE) && vec_bytes == 32) {
+ targets = static_cast<int64_t>(targets | HWY_SVE_256);
+ } else {
+ targets = static_cast<int64_t>(targets & ~HWY_SVE_256);
+ }
+ if ((targets & HWY_SVE2) && vec_bytes == 16) {
+ targets = static_cast<int64_t>(targets | HWY_SVE2_128);
+ } else {
+ targets = static_cast<int64_t>(targets & ~HWY_SVE2_128);
+ }
+ } // HWY_ARCH_ARM_A64
+ }
+
+ targets &= supported_mask_;
+ return targets == 0 ? HWY_STATIC_TARGET : targets;
}
HWY_DLLEXPORT ChosenTarget& GetChosenTarget() {
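
With the int64_t masks, DisableTargets and SupportedTargets compose as before, except that the result can no longer be 0. A sketch of the intended call pattern; the choice of targets to disable is illustrative:

    #include <stdint.h>
    #include <stdio.h>
    #include "hwy/targets.h"

    void AvoidAvx512() {
      // Remove AVX3 targets from dynamic dispatch, e.g. to sidestep frequency
      // throttling; the static/baseline target always remains available.
      hwy::DisableTargets(HWY_AVX3 | HWY_AVX3_DL);
      const int64_t targets = hwy::SupportedTargets();  // never 0
      const int64_t best = targets & (~targets + 1);    // lowest bit = best
      fprintf(stderr, "Best remaining target: %s\n", hwy::TargetName(best));
    }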
@@ -364,14 +431,4 @@ HWY_DLLEXPORT ChosenTarget& GetChosenTarget() {
return chosen_target;
}
-HWY_DLLEXPORT void ChosenTarget::Update() {
- // The supported variable contains the current CPU supported targets shifted
- // to the location expected by the ChosenTarget mask. We enabled SCALAR
- // regardless of whether it was compiled since it is also used as the
- // fallback mechanism to the baseline target.
- uint32_t supported = HWY_CHOSEN_TARGET_SHIFT(hwy::SupportedTargets()) |
- HWY_CHOSEN_TARGET_MASK_SCALAR;
- StoreMask(supported);
-}
-
} // namespace hwy
diff --git a/media/highway/src/hwy/targets.h b/media/highway/src/hwy/targets.h
index 3bdb647e3d..2d9afbff47 100644
--- a/media/highway/src/hwy/targets.h
+++ b/media/highway/src/hwy/targets.h
@@ -31,11 +31,12 @@
namespace hwy {
-// Returns (cached) bitfield of enabled targets that are supported on this CPU.
-// Implemented in targets.cc; unconditionally compiled to support the use case
-// of binary-only distributions. The HWY_SUPPORTED_TARGETS wrapper may allow
-// eliding calls to this function.
-HWY_DLLEXPORT uint32_t SupportedTargets();
+// Returns bitfield of enabled targets that are supported on this CPU; there is
+// always at least one such target, hence the return value is never 0. The
+// targets returned may change after calling DisableTargets. This function is
+// always defined, but the HWY_SUPPORTED_TARGETS wrapper may allow eliding
+// calls to it if there is only a single target enabled.
+HWY_DLLEXPORT int64_t SupportedTargets();
// Evaluates to a function call, or literal if there is a single target.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0
@@ -44,40 +45,36 @@ HWY_DLLEXPORT uint32_t SupportedTargets();
#define HWY_SUPPORTED_TARGETS hwy::SupportedTargets()
#endif
-// Disable from runtime dispatch the mask of compiled in targets. Targets that
-// were not enabled at compile time are ignored. This function is useful to
-// disable a target supported by the CPU that is known to have bugs or when a
-// lower target is desired. For this reason, attempts to disable targets which
-// are in HWY_ENABLED_BASELINE have no effect so SupportedTargets() always
-// returns at least the baseline target.
-HWY_DLLEXPORT void DisableTargets(uint32_t disabled_targets);
-
-// Set the mock mask of CPU supported targets instead of the actual CPU
-// supported targets computed in SupportedTargets(). The return value of
-// SupportedTargets() will still be affected by the DisableTargets() mask
-// regardless of this mock, to prevent accidentally adding targets that are
-// known to be buggy in the current CPU. Call with a mask of 0 to disable the
-// mock and use the actual CPU supported targets instead.
-HWY_DLLEXPORT void SetSupportedTargetsForTest(uint32_t targets);
-
-// Returns whether the SupportedTargets() function was called since the last
-// SetSupportedTargetsForTest() call.
-HWY_DLLEXPORT bool SupportedTargetsCalledForTest();
+// Subsequent SupportedTargets will not return targets whose bit(s) are set in
+// `disabled_targets`. Exception: if SupportedTargets would return 0, it will
+// instead return HWY_STATIC_TARGET (there must always be one target to call).
+//
+// This function is useful for disabling targets known to be buggy, or if the
+// best available target is undesirable (perhaps due to throttling or memory
+// bandwidth limitations). Use SetSupportedTargetsForTest instead of this
+// function for iteratively enabling specific targets for testing.
+HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets);
+
+// Subsequent SupportedTargets will return the given set of targets, except
+// those disabled via DisableTargets. Call with a mask of 0 to disable the mock
+// and return to the normal SupportedTargets behavior. Used to run tests for
+// all targets.
+HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets);
// Return the list of targets in HWY_TARGETS supported by the CPU as a list of
// individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list
// is affected by the current SetSupportedTargetsForTest() mock if any.
-HWY_INLINE std::vector<uint32_t> SupportedAndGeneratedTargets() {
- std::vector<uint32_t> ret;
- for (uint32_t targets = SupportedTargets() & HWY_TARGETS; targets != 0;
+HWY_INLINE std::vector<int64_t> SupportedAndGeneratedTargets() {
+ std::vector<int64_t> ret;
+ for (int64_t targets = SupportedTargets() & HWY_TARGETS; targets != 0;
targets = targets & (targets - 1)) {
- uint32_t current_target = targets & ~(targets - 1);
+ int64_t current_target = targets & ~(targets - 1);
ret.push_back(current_target);
}
return ret;
}
-static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
+static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
switch (target) {
#if HWY_ARCH_X86
case HWY_SSSE3:
@@ -93,22 +90,28 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
#endif
#if HWY_ARCH_ARM
+ case HWY_SVE2_128:
+ return "SVE2_128";
+ case HWY_SVE_256:
+ return "SVE_256";
case HWY_SVE2:
return "SVE2";
case HWY_SVE:
return "SVE";
case HWY_NEON:
- return "Neon";
+ return "NEON";
#endif
#if HWY_ARCH_PPC
case HWY_PPC8:
- return "Power8";
+ return "PPC8";
#endif
#if HWY_ARCH_WASM
case HWY_WASM:
- return "Wasm";
+ return "WASM";
+ case HWY_WASM_EMU256:
+ return "WASM_EMU256";
#endif
#if HWY_ARCH_RVV
@@ -117,9 +120,9 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
#endif
case HWY_EMU128:
- return "Emu128";
+ return "EMU128";
case HWY_SCALAR:
- return "Scalar";
+ return "SCALAR";
default:
return "Unknown"; // must satisfy gtest IsValidParamName()
@@ -132,7 +135,7 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
// For the ChosenTarget mask and index we use a different bit arrangement than
// in the HWY_TARGETS mask. Only the targets involved in the current
// architecture are used in this mask, and therefore only the least significant
-// (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the uint32_t mask are used. The least
+// (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the int64_t mask are used. The least
// significant bit is set when the mask is not initialized, the next
// HWY_MAX_DYNAMIC_TARGETS more significant bits are a range of bits from the
// HWY_TARGETS or SupportedTargets() mask for the given architecture shifted to
@@ -146,81 +149,111 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
// scalar.
// The HWY_SCALAR/HWY_EMU128 bit in the ChosenTarget mask format.
-#define HWY_CHOSEN_TARGET_MASK_SCALAR (1u << (HWY_MAX_DYNAMIC_TARGETS + 1))
+#define HWY_CHOSEN_TARGET_MASK_SCALAR (1LL << (HWY_MAX_DYNAMIC_TARGETS + 1))
// Converts from a HWY_TARGETS mask to a ChosenTarget mask format for the
// current architecture.
#define HWY_CHOSEN_TARGET_SHIFT(X) \
((((X) >> (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS)) & \
- ((1u << HWY_MAX_DYNAMIC_TARGETS) - 1)) \
+ ((1LL << HWY_MAX_DYNAMIC_TARGETS) - 1)) \
<< 1)
// The HWY_TARGETS mask in the ChosenTarget mask format.
#define HWY_CHOSEN_TARGET_MASK_TARGETS \
- (HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1u)
+ (HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1LL)
#if HWY_ARCH_X86
// Maximum number of dynamic targets, changing this value is an ABI incompatible
// change
-#define HWY_MAX_DYNAMIC_TARGETS 10
+#define HWY_MAX_DYNAMIC_TARGETS 15
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_X86
// These must match the order in which the HWY_TARGETS are defined
// starting by the least significant (HWY_HIGHEST_TARGET_BIT + 1 -
// HWY_MAX_DYNAMIC_TARGETS) bit. This list must contain exactly
// HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR. The first entry
// corresponds to the best target. Don't include a "," at the end of the list.
-#define HWY_CHOOSE_TARGET_LIST(func_name) \
- nullptr, /* reserved */ \
- nullptr, /* reserved */ \
- HWY_CHOOSE_AVX3_DL(func_name), /* AVX3_DL */ \
- HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \
- HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \
- nullptr, /* AVX */ \
- HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \
- HWY_CHOOSE_SSSE3(func_name), /* SSSE3 */ \
- nullptr, /* SSE3 */ \
- nullptr /* SSE2 */
+#define HWY_CHOOSE_TARGET_LIST(func_name) \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ HWY_CHOOSE_AVX3_DL(func_name), /* AVX3_DL */ \
+ HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \
+ HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \
+ nullptr, /* AVX */ \
+ HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \
+ HWY_CHOOSE_SSSE3(func_name), /* SSSE3 */ \
+ nullptr , /* reserved - SSE3? */ \
+ nullptr /* reserved - SSE2? */
#elif HWY_ARCH_ARM
// See HWY_ARCH_X86 above for details.
-#define HWY_MAX_DYNAMIC_TARGETS 4
+#define HWY_MAX_DYNAMIC_TARGETS 15
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM
+#define HWY_CHOOSE_TARGET_LIST(func_name) \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ HWY_CHOOSE_SVE2_128(func_name), /* SVE2 128-bit */ \
+ HWY_CHOOSE_SVE_256(func_name), /* SVE 256-bit */ \
+ HWY_CHOOSE_SVE2(func_name), /* SVE2 */ \
+ HWY_CHOOSE_SVE(func_name), /* SVE */ \
+ HWY_CHOOSE_NEON(func_name), /* NEON */ \
+ nullptr /* reserved - Helium? */
+
+#elif HWY_ARCH_RVV
+// See HWY_ARCH_X86 above for details.
+#define HWY_MAX_DYNAMIC_TARGETS 9
+#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV
#define HWY_CHOOSE_TARGET_LIST(func_name) \
- HWY_CHOOSE_SVE2(func_name), /* SVE2 */ \
- HWY_CHOOSE_SVE(func_name), /* SVE */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
nullptr, /* reserved */ \
- HWY_CHOOSE_NEON(func_name) /* NEON */
+ nullptr, /* reserved */ \
+ HWY_CHOOSE_RVV(func_name), /* RVV */ \
+ nullptr /* reserved */
#elif HWY_ARCH_PPC
// See HWY_ARCH_X86 above for details.
-#define HWY_MAX_DYNAMIC_TARGETS 5
+#define HWY_MAX_DYNAMIC_TARGETS 9
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC
-#define HWY_CHOOSE_TARGET_LIST(func_name) \
- nullptr, /* reserved */ \
- nullptr, /* reserved */ \
- HWY_CHOOSE_PPC8(func_name), /* PPC8 */ \
- nullptr, /* VSX */ \
- nullptr /* AltiVec */
+#define HWY_CHOOSE_TARGET_LIST(func_name) \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ HWY_CHOOSE_PPC8(func_name), /* PPC8 */ \
+ nullptr, /* reserved (VSX or AltiVec) */ \
+ nullptr /* reserved (VSX or AltiVec) */
#elif HWY_ARCH_WASM
// See HWY_ARCH_X86 above for details.
-#define HWY_MAX_DYNAMIC_TARGETS 4
+#define HWY_MAX_DYNAMIC_TARGETS 9
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
-#define HWY_CHOOSE_TARGET_LIST(func_name) \
- nullptr, /* reserved */ \
- nullptr, /* reserved */ \
- HWY_CHOOSE_WASM2(func_name), /* WASM2 */ \
- HWY_CHOOSE_WASM(func_name) /* WASM */
-
-#elif HWY_ARCH_RVV
-// See HWY_ARCH_X86 above for details.
-#define HWY_MAX_DYNAMIC_TARGETS 4
-#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV
-#define HWY_CHOOSE_TARGET_LIST(func_name) \
- nullptr, /* reserved */ \
- nullptr, /* reserved */ \
- nullptr, /* reserved */ \
- HWY_CHOOSE_RVV(func_name) /* RVV */
+#define HWY_CHOOSE_TARGET_LIST(func_name) \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ nullptr, /* reserved */ \
+ HWY_CHOOSE_WASM_EMU256(func_name), /* WASM_EMU256 */ \
+ HWY_CHOOSE_WASM(func_name), /* WASM */ \
+ nullptr /* reserved */
#else
// Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though
@@ -229,41 +262,51 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR
#endif
+// Bitfield of supported and enabled targets. The format differs from that of
+// HWY_TARGETS; the lowest bit governs the first function pointer (which is
+// special in that it calls FunctionCache, then Update, then dispatches to the
+// actual implementation) in the tables created by HWY_EXPORT. Monostate (see
+// GetChosenTarget), thread-safe except on RVV.
struct ChosenTarget {
public:
- // Update the ChosenTarget mask based on the current CPU supported
- // targets.
- HWY_DLLEXPORT void Update();
+ // Reset bits according to `targets` (typically the return value of
+ // SupportedTargets()). Postcondition: IsInitialized() == true.
+ void Update(int64_t targets) {
+ // These are `targets` shifted downwards, see above. Also include SCALAR
+ // (corresponds to the last entry in the function table) as fallback.
+ StoreMask(HWY_CHOSEN_TARGET_SHIFT(targets) | HWY_CHOSEN_TARGET_MASK_SCALAR);
+ }
- // Reset the ChosenTarget to the uninitialized state.
+ // Reset to the uninitialized state, so that FunctionCache will call Update
+ // during the next HWY_DYNAMIC_DISPATCH, and IsInitialized returns false.
void DeInit() { StoreMask(1); }
- // Whether the ChosenTarget was initialized. This is useful to know whether
- // any HWY_DYNAMIC_DISPATCH function was called.
+ // Whether Update was called. This indicates whether any HWY_DYNAMIC_DISPATCH
+ // function was called, which we check in tests.
bool IsInitialized() const { return LoadMask() != 1; }
// Return the index in the dynamic dispatch table to be used by the current
// CPU. Note that this method must be in the header file so it uses the value
// of HWY_CHOSEN_TARGET_MASK_TARGETS defined in the translation unit that
- // calls it, which may be different from others. This allows to only consider
+ // calls it, which may be different from others. This means we only enable
// those targets that were actually compiled in this module.
size_t HWY_INLINE GetIndex() const {
- return hwy::Num0BitsBelowLS1Bit_Nonzero32(LoadMask() &
- HWY_CHOSEN_TARGET_MASK_TARGETS);
+ return hwy::Num0BitsBelowLS1Bit_Nonzero64(
+ static_cast<uint64_t>(LoadMask() & HWY_CHOSEN_TARGET_MASK_TARGETS));
}
private:
// TODO(janwas): remove #if once <atomic> is available
#if HWY_ARCH_RVV
- uint32_t LoadMask() const { return mask_; }
- void StoreMask(uint32_t mask) { mask_ = mask; }
+ int64_t LoadMask() const { return mask_; }
+ void StoreMask(int64_t mask) { mask_ = mask; }
- uint32_t mask_{1}; // Initialized to 1 so GetIndex() returns 0.
+ int64_t mask_{1}; // Initialized to 1 so GetIndex() returns 0.
#else
- uint32_t LoadMask() const { return mask_.load(); }
- void StoreMask(uint32_t mask) { mask_.store(mask); }
+ int64_t LoadMask() const { return mask_.load(); }
+ void StoreMask(int64_t mask) { mask_.store(mask); }
- std::atomic<uint32_t> mask_{1}; // Initialized to 1 so GetIndex() returns 0.
+ std::atomic<int64_t> mask_{1}; // Initialized to 1 so GetIndex() returns 0.
#endif // HWY_ARCH_RVV
};
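
A worked restatement of the ChosenTarget encoding described above, using the x86 value HWY_MAX_DYNAMIC_TARGETS = 15 from this patch: bit 0 is the "uninitialized" flag, bits 1..15 hold the per-architecture targets shifted down from HWY_TARGETS, and bit 16 is the SCALAR/EMU128 fallback. The constants below only mirror the macros; the highest-target bit value is a hypothetical stand-in:

    #include <stdint.h>

    constexpr int kMaxDynamicTargets = 15;  // x86 value in this patch
    constexpr int kHighestTargetBit = 15;   // hypothetical stand-in for
                                            // HWY_HIGHEST_TARGET_BIT_X86

    // Mirrors HWY_CHOSEN_TARGET_SHIFT: move the architecture's target bits down
    // so they occupy bits 1..kMaxDynamicTargets of the ChosenTarget mask.
    constexpr int64_t ChosenTargetShift(int64_t targets) {
      return ((targets >> (kHighestTargetBit + 1 - kMaxDynamicTargets)) &
              ((int64_t{1} << kMaxDynamicTargets) - 1))
             << 1;
    }

    constexpr int64_t kMaskScalar = int64_t{1} << (kMaxDynamicTargets + 1);

    static_assert(ChosenTargetShift(int64_t{1} << kHighestTargetBit) ==
                      (int64_t{1} << kMaxDynamicTargets),
                  "the highest target bit maps to the last dynamic slot");
    static_assert(kMaskScalar == (int64_t{1} << 16), "fallback bit is bit 16");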
diff --git a/media/highway/src/hwy/targets_test.cc b/media/highway/src/hwy/targets_test.cc
index 984f4c76eb..e58a6fa463 100644
--- a/media/highway/src/hwy/targets_test.cc
+++ b/media/highway/src/hwy/targets_test.cc
@@ -22,7 +22,7 @@ namespace fake {
#define DECLARE_FUNCTION(TGT) \
namespace N_##TGT { \
/* Function argument is just to ensure/demonstrate they are possible. */ \
- uint32_t FakeFunction(int) { return HWY_##TGT; } \
+ int64_t FakeFunction(int) { return HWY_##TGT; } \
}
DECLARE_FUNCTION(AVX3_DL)
@@ -33,6 +33,8 @@ DECLARE_FUNCTION(SSSE3)
DECLARE_FUNCTION(NEON)
DECLARE_FUNCTION(SVE)
DECLARE_FUNCTION(SVE2)
+DECLARE_FUNCTION(SVE_256)
+DECLARE_FUNCTION(SVE2_128)
DECLARE_FUNCTION(PPC8)
DECLARE_FUNCTION(WASM)
DECLARE_FUNCTION(RVV)
@@ -41,13 +43,13 @@ DECLARE_FUNCTION(EMU128)
HWY_EXPORT(FakeFunction);
-void CallFunctionForTarget(uint32_t target, int line) {
+void CallFunctionForTarget(int64_t target, int line) {
if ((HWY_TARGETS & target) == 0) return;
hwy::SetSupportedTargetsForTest(target);
// Call Update() first to make &HWY_DYNAMIC_DISPATCH() return
// the pointer to the already cached function.
- hwy::GetChosenTarget().Update();
+ hwy::GetChosenTarget().Update(hwy::SupportedTargets());
EXPECT_EQ(target, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
@@ -55,7 +57,11 @@ void CallFunctionForTarget(uint32_t target, int line) {
// also calls the right function.
hwy::GetChosenTarget().DeInit();
+#if HWY_DISPATCH_WORKAROUND
+ EXPECT_EQ(HWY_STATIC_TARGET, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
+#else
EXPECT_EQ(target, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
+#endif
// Second call uses the cached value from the previous call.
EXPECT_EQ(target, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
@@ -71,12 +77,14 @@ void CheckFakeFunction() {
CallFunctionForTarget(HWY_NEON, __LINE__);
CallFunctionForTarget(HWY_SVE, __LINE__);
CallFunctionForTarget(HWY_SVE2, __LINE__);
+ CallFunctionForTarget(HWY_SVE_256, __LINE__);
+ CallFunctionForTarget(HWY_SVE2_128, __LINE__);
CallFunctionForTarget(HWY_PPC8, __LINE__);
CallFunctionForTarget(HWY_WASM, __LINE__);
CallFunctionForTarget(HWY_RVV, __LINE__);
// The tables only have space for either HWY_SCALAR or HWY_EMU128; the former
// is opt-in only.
-#if defined(HWY_COMPILE_ONLY_SCALAR)
+#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
CallFunctionForTarget(HWY_SCALAR, __LINE__);
#else
CallFunctionForTarget(HWY_EMU128, __LINE__);
@@ -101,25 +109,22 @@ class HwyTargetsTest : public testing::Test {
TEST_F(HwyTargetsTest, ChosenTargetOrderTest) { fake::CheckFakeFunction(); }
TEST_F(HwyTargetsTest, DisabledTargetsTest) {
- DisableTargets(~0u);
-#if HWY_ARCH_X86
- // Check that the baseline can't be disabled.
- HWY_ASSERT(HWY_ENABLED_BASELINE == SupportedTargets());
-#else
- // TODO(janwas): update when targets.cc changes
- HWY_ASSERT(HWY_TARGETS == SupportedTargets());
-#endif
+ DisableTargets(~0LL);
+ // Check that disabling everything at least leaves the static target.
+ HWY_ASSERT(HWY_STATIC_TARGET == SupportedTargets());
DisableTargets(0); // Reset the mask.
- uint32_t current_targets = SupportedTargets();
- if ((current_targets & ~static_cast<uint32_t>(HWY_ENABLED_BASELINE)) == 0) {
+ const int64_t current_targets = SupportedTargets();
+ const int64_t enabled_baseline = static_cast<int64_t>(HWY_ENABLED_BASELINE);
+ // Exclude these two because they are always returned by SupportedTargets.
+ const int64_t fallback = HWY_SCALAR | HWY_EMU128;
+ if ((current_targets & ~enabled_baseline & ~fallback) == 0) {
// We can't test anything else if the only compiled target is the baseline.
return;
}
+
// Get the lowest bit in the mask (the best target) and disable that one.
- uint32_t best_target = current_targets & (~current_targets + 1);
- // The lowest target shouldn't be one in the baseline.
- HWY_ASSERT((best_target & ~static_cast<uint32_t>(HWY_ENABLED_BASELINE)) != 0);
+ const int64_t best_target = current_targets & (~current_targets + 1);
DisableTargets(best_target);
// Check that the other targets are still enabled.
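
The `current_targets & (~current_targets + 1)` expression above isolates the lowest set bit, which by the ordering of target bits is the best compiled target. A tiny standalone illustration of the trick:

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t targets = 0x16;                   // bits 1, 2 and 4 set
  const int64_t best = targets & (~targets + 1);  // same as targets & -targets
  std::printf("best (lowest) bit: 0x%llx\n",
              static_cast<unsigned long long>(best));  // prints 0x2
  return 0;
}
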
diff --git a/media/highway/src/hwy/tests/arithmetic_test.cc b/media/highway/src/hwy/tests/arithmetic_test.cc
index c982e7d10f..1fbbd29add 100644
--- a/media/highway/src/hwy/tests/arithmetic_test.cc
+++ b/media/highway/src/hwy/tests/arithmetic_test.cc
@@ -13,13 +13,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/arithmetic_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
@@ -175,6 +174,23 @@ HWY_NOINLINE void TestAllAbs() {
ForFloatTypes(ForPartialVectors<TestFloatAbs>());
}
+struct TestNeg {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v0 = Zero(d);
+ const auto vn = Set(d, T(-3));
+ const auto vp = Set(d, T(3));
+ HWY_ASSERT_VEC_EQ(d, v0, Neg(v0));
+ HWY_ASSERT_VEC_EQ(d, vp, Neg(vn));
+ HWY_ASSERT_VEC_EQ(d, vn, Neg(vp));
+ }
+};
+
+HWY_NOINLINE void TestAllNeg() {
+ ForSignedTypes(ForPartialVectors<TestNeg>());
+ ForFloatTypes(ForPartialVectors<TestNeg>());
+}
+
struct TestUnsignedMinMax {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -261,16 +277,15 @@ HWY_NOINLINE void TestAllMinMax() {
ForFloatTypes(ForPartialVectors<TestFloatMinMax>());
}
-class TestMinMax128 {
- template <class D>
- static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
- alignas(16) uint64_t in[2];
- in[0] = lo;
- in[1] = hi;
- return LoadDup128(d, in);
- }
+template <class D>
+static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
+ alignas(16) uint64_t in[2];
+ in[0] = lo;
+ in[1] = hi;
+ return LoadDup128(d, in);
+}
- public:
+struct TestMinMax128 {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using V = Vec<D>;
@@ -339,149 +354,73 @@ HWY_NOINLINE void TestAllMinMax128() {
ForGEVectors<128, TestMinMax128>()(uint64_t());
}
-
-struct TestSumOfLanes {
- template <typename T, class D>
- HWY_NOINLINE void operator()(T /*unused*/, D d) {
- const size_t N = Lanes(d);
- auto in_lanes = AllocateAligned<T>(N);
-
- // Lane i = bit i, higher lanes 0
- double sum = 0.0;
- // Avoid setting sign bit and cap at double precision
- constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
- for (size_t i = 0; i < N; ++i) {
- in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
- sum += static_cast<double>(in_lanes[i]);
- }
- HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)),
- SumOfLanes(d, Load(d, in_lanes.get())));
-
- // Lane i = i (iota) to include upper lanes
- sum = 0.0;
- for (size_t i = 0; i < N; ++i) {
- sum += static_cast<double>(i);
- }
- HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)), SumOfLanes(d, Iota(d, 0)));
- }
-};
-
-HWY_NOINLINE void TestAllSumOfLanes() {
- ForUIF3264(ForPartialVectors<TestSumOfLanes>());
-}
-
-struct TestMinOfLanes {
- template <typename T, class D>
- HWY_NOINLINE void operator()(T /*unused*/, D d) {
- const size_t N = Lanes(d);
- auto in_lanes = AllocateAligned<T>(N);
-
- // Lane i = bit i, higher lanes = 2 (not the minimum)
- T min = HighestValue<T>();
- // Avoid setting sign bit and cap at double precision
- constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
- for (size_t i = 0; i < N; ++i) {
- in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 2;
- min = HWY_MIN(min, in_lanes[i]);
- }
- HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
-
- // Lane i = N - i to include upper lanes
- min = HighestValue<T>();
- for (size_t i = 0; i < N; ++i) {
- in_lanes[i] = static_cast<T>(N - i); // no 8-bit T so no wraparound
- min = HWY_MIN(min, in_lanes[i]);
- }
- HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
- }
-};
-
-struct TestMaxOfLanes {
+struct TestMinMax128Upper {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using V = Vec<D>;
const size_t N = Lanes(d);
- auto in_lanes = AllocateAligned<T>(N);
-
- T max = LowestValue<T>();
- // Avoid setting sign bit and cap at double precision
- constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
- for (size_t i = 0; i < N; ++i) {
- in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
- max = HWY_MAX(max, in_lanes[i]);
- }
- HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
-
- // Lane i = i to include upper lanes
- max = LowestValue<T>();
- for (size_t i = 0; i < N; ++i) {
- in_lanes[i] = static_cast<T>(i); // no 8-bit T so no wraparound
- max = HWY_MAX(max, in_lanes[i]);
- }
- HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
- }
-};
+ auto a_lanes = AllocateAligned<T>(N);
+ auto b_lanes = AllocateAligned<T>(N);
+ auto min_lanes = AllocateAligned<T>(N);
+ auto max_lanes = AllocateAligned<T>(N);
+ RandomState rng;
-HWY_NOINLINE void TestAllMinMaxOfLanes() {
- const ForPartialVectors<TestMinOfLanes> test_min;
- const ForPartialVectors<TestMaxOfLanes> test_max;
- ForUIF3264(test_min);
- ForUIF3264(test_max);
- test_min(uint16_t());
- test_max(uint16_t());
- test_min(int16_t());
- test_max(int16_t());
-}
+ const V v00 = Zero(d);
+ const V v01 = Make128(d, 0, 1);
+ const V v10 = Make128(d, 1, 0);
+ const V v11 = Add(v01, v10);
-struct TestSumsOf8 {
- template <typename T, class D>
- HWY_NOINLINE void operator()(T /*unused*/, D d) {
- RandomState rng;
+ // Same arg
+ HWY_ASSERT_VEC_EQ(d, v00, Min128Upper(d, v00, v00));
+ HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v01, v01));
+ HWY_ASSERT_VEC_EQ(d, v10, Min128Upper(d, v10, v10));
+ HWY_ASSERT_VEC_EQ(d, v11, Min128Upper(d, v11, v11));
+ HWY_ASSERT_VEC_EQ(d, v00, Max128Upper(d, v00, v00));
+ HWY_ASSERT_VEC_EQ(d, v01, Max128Upper(d, v01, v01));
+ HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v10, v10));
+ HWY_ASSERT_VEC_EQ(d, v11, Max128Upper(d, v11, v11));
+
+ // Equivalent but not equal (chooses second arg)
+ HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v00, v01));
+ HWY_ASSERT_VEC_EQ(d, v11, Min128Upper(d, v10, v11));
+ HWY_ASSERT_VEC_EQ(d, v00, Min128Upper(d, v01, v00));
+ HWY_ASSERT_VEC_EQ(d, v10, Min128Upper(d, v11, v10));
+ HWY_ASSERT_VEC_EQ(d, v00, Max128Upper(d, v01, v00));
+ HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v11, v10));
+ HWY_ASSERT_VEC_EQ(d, v01, Max128Upper(d, v00, v01));
+ HWY_ASSERT_VEC_EQ(d, v11, Max128Upper(d, v10, v11));
- const size_t N = Lanes(d);
- if (N < 8) return;
- const Repartition<uint64_t, D> du64;
+ // First arg less
+ HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v01, v10));
+ HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v01, v10));
- auto in_lanes = AllocateAligned<T>(N);
- auto sum_lanes = AllocateAligned<uint64_t>(N / 8);
+ // Second arg less
+ HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v10, v01));
+ HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v10, v01));
- for (size_t rep = 0; rep < 100; ++rep) {
+ // Also check 128-bit blocks are independent
+ for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < N; ++i) {
- in_lanes[i] = Random64(&rng) & 0xFF;
+ a_lanes[i] = Random64(&rng);
+ b_lanes[i] = Random64(&rng);
}
-
- for (size_t idx_sum = 0; idx_sum < N / 8; ++idx_sum) {
- uint64_t sum = 0;
- for (size_t i = 0; i < 8; ++i) {
- sum += in_lanes[idx_sum * 8 + i];
- }
- sum_lanes[idx_sum] = sum;
+ const V a = Load(d, a_lanes.get());
+ const V b = Load(d, b_lanes.get());
+ for (size_t i = 0; i < N; i += 2) {
+ const bool lt = a_lanes[i + 1] < b_lanes[i + 1];
+ min_lanes[i + 0] = lt ? a_lanes[i + 0] : b_lanes[i + 0];
+ min_lanes[i + 1] = lt ? a_lanes[i + 1] : b_lanes[i + 1];
+ max_lanes[i + 0] = lt ? b_lanes[i + 0] : a_lanes[i + 0];
+ max_lanes[i + 1] = lt ? b_lanes[i + 1] : a_lanes[i + 1];
}
-
- const Vec<D> in = Load(d, in_lanes.get());
- HWY_ASSERT_VEC_EQ(du64, sum_lanes.get(), SumsOf8(in));
+ HWY_ASSERT_VEC_EQ(d, min_lanes.get(), Min128Upper(d, a, b));
+ HWY_ASSERT_VEC_EQ(d, max_lanes.get(), Max128Upper(d, a, b));
}
}
};
-HWY_NOINLINE void TestAllSumsOf8() {
- ForGEVectors<64, TestSumsOf8>()(uint8_t());
-}
-
-struct TestNeg {
- template <typename T, class D>
- HWY_NOINLINE void operator()(T /*unused*/, D d) {
- const auto v0 = Zero(d);
- const auto vn = Set(d, T(-3));
- const auto vp = Set(d, T(3));
- HWY_ASSERT_VEC_EQ(d, v0, Neg(v0));
- HWY_ASSERT_VEC_EQ(d, vp, Neg(vn));
- HWY_ASSERT_VEC_EQ(d, vn, Neg(vp));
- }
-};
-
-HWY_NOINLINE void TestAllNeg() {
- ForSignedTypes(ForPartialVectors<TestNeg>());
- ForFloatTypes(ForPartialVectors<TestNeg>());
+HWY_NOINLINE void TestAllMinMax128Upper() {
+ ForGEVectors<128, TestMinMax128Upper>()(uint64_t());
}
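
A scalar restatement of the reference computed in the random loop above: Min128Upper/Max128Upper decide per 128-bit block using only the upper u64 lane, then copy the entire winning block. Sketch over plain arrays:

#include <cstddef>
#include <cstdint>

// num_lanes is even; lanes i and i+1 form one 128-bit block (i+1 is upper).
void Min128UpperRef(const uint64_t* a, const uint64_t* b, uint64_t* out,
                    size_t num_lanes) {
  for (size_t i = 0; i < num_lanes; i += 2) {
    const bool a_wins = a[i + 1] < b[i + 1];  // ties keep the second argument
    out[i + 0] = a_wins ? a[i + 0] : b[i + 0];
    out[i + 1] = a_wins ? a[i + 1] : b[i + 1];
  }
}
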
// NOLINTNEXTLINE(google-readability-namespace-comments)
@@ -495,14 +434,12 @@ namespace hwy {
HWY_BEFORE_TEST(HwyArithmeticTest);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic);
-HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax);
-HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs);
-HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSumOfLanes);
-HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMaxOfLanes);
-HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSumsOf8);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128);
+HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128Upper);
} // namespace hwy
#endif
diff --git a/media/highway/src/hwy/tests/blockwise_shift_test.cc b/media/highway/src/hwy/tests/blockwise_shift_test.cc
index ed10b04f31..d14fb86e3d 100644
--- a/media/highway/src/hwy/tests/blockwise_shift_test.cc
+++ b/media/highway/src/hwy/tests/blockwise_shift_test.cc
@@ -15,11 +15,11 @@
#include <stddef.h>
#include <stdint.h>
-#include <string.h>
+#include <string.h> // memcpy
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/blockwise_shift_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
@@ -63,17 +63,17 @@ struct TestShiftBytes {
auto expected = AllocateAligned<T>(N);
uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
- const size_t kBlockSize = HWY_MIN(N8, 16);
- for (size_t block = 0; block < N8; block += kBlockSize) {
+ const size_t block_size = HWY_MIN(N8, 16);
+ for (size_t block = 0; block < N8; block += block_size) {
expected_bytes[block] = 0;
- memcpy(expected_bytes + block + 1, in_bytes + block, kBlockSize - 1);
+ memcpy(expected_bytes + block + 1, in_bytes + block, block_size - 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(v));
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(d, v));
- for (size_t block = 0; block < N8; block += kBlockSize) {
- memcpy(expected_bytes + block, in_bytes + block + 1, kBlockSize - 1);
- expected_bytes[block + kBlockSize - 1] = 0;
+ for (size_t block = 0; block < N8; block += block_size) {
+ memcpy(expected_bytes + block, in_bytes + block + 1, block_size - 1);
+ expected_bytes[block + block_size - 1] = 0;
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightBytes<1>(d, v));
#else
@@ -152,7 +152,7 @@ template <int kBytes>
struct TestCombineShiftRightBytes {
template <class T, class D>
HWY_NOINLINE void operator()(T, D d) {
- const size_t kBlockSize = 16;
+ constexpr size_t kBlockSize = 16;
static_assert(kBytes < kBlockSize, "Shift count is per block");
const Repartition<uint8_t, D> d8;
const size_t N8 = Lanes(d8);
@@ -170,6 +170,7 @@ struct TestCombineShiftRightBytes {
lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
}
for (size_t i = 0; i < N8; i += kBlockSize) {
+ // Arguments are not the same size.
CopyBytes<kBlockSize>(&lo_bytes[i], combined);
CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize);
CopyBytes<kBlockSize>(combined + kBytes, &expected_bytes[i]);
@@ -194,7 +195,7 @@ struct TestCombineShiftRightLanes {
auto hi_bytes = AllocateAligned<uint8_t>(N8);
auto lo_bytes = AllocateAligned<uint8_t>(N8);
auto expected_bytes = AllocateAligned<uint8_t>(N8);
- const size_t kBlockSize = 16;
+ constexpr size_t kBlockSize = 16;
uint8_t combined[2 * kBlockSize];
// Random inputs in each lane
@@ -205,6 +206,7 @@ struct TestCombineShiftRightLanes {
lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
}
for (size_t i = 0; i < N8; i += kBlockSize) {
+ // Arguments are not the same size.
CopyBytes<kBlockSize>(&lo_bytes[i], combined);
CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize);
CopyBytes<kBlockSize>(combined + kLanes * sizeof(T),
@@ -226,7 +228,8 @@ struct TestCombineShiftRight {
HWY_NOINLINE void operator()(T t, D d) {
// Scalar does not define CombineShiftRightBytes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
- constexpr int kMaxBytes = HWY_MIN(16, int(MaxLanes(d) * sizeof(T)));
+ constexpr int kMaxBytes =
+ HWY_MIN(16, static_cast<int>(MaxLanes(d) * sizeof(T)));
constexpr int kMaxLanes = kMaxBytes / static_cast<int>(sizeof(T));
TestCombineShiftRightBytes<kMaxBytes - 1>()(t, d);
TestCombineShiftRightBytes<HWY_MAX(kMaxBytes / 2, 1)>()(t, d);
diff --git a/media/highway/src/hwy/tests/blockwise_test.cc b/media/highway/src/hwy/tests/blockwise_test.cc
index 63a4fe4376..41097eeca5 100644
--- a/media/highway/src/hwy/tests/blockwise_test.cc
+++ b/media/highway/src/hwy/tests/blockwise_test.cc
@@ -19,7 +19,7 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/blockwise_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
@@ -147,7 +147,7 @@ struct TestTableLookupBytes {
const uint8_t prev_index = index_bytes[i];
expected_bytes[i] = 0;
- const int idx = 0x80 + (int(Random32(&rng) & 7) << 4);
+ const int idx = 0x80 + (static_cast<int>(Random32(&rng) & 7) << 4);
HWY_ASSERT(0x80 <= idx && idx < 256);
index_bytes[i] = static_cast<uint8_t>(idx);
@@ -248,25 +248,54 @@ struct TestZipLower {
const auto even = Load(d, even_lanes.get());
const auto odd = Load(d, odd_lanes.get());
+ const Repartition<WideT, D> dw;
+#if HWY_TARGET == HWY_SCALAR
+ // Safely handle big-endian
+ const auto expected = Set(dw, static_cast<WideT>(1ULL << (sizeof(T) * 8)));
+#else
const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N);
-
for (size_t i = 0; i < N; i += 2) {
const size_t base = (i / blockN) * blockN;
const size_t mod = i % blockN;
zip_lanes[i + 0] = even_lanes[mod / 2 + base];
zip_lanes[i + 1] = odd_lanes[mod / 2 + base];
}
- const Repartition<WideT, D> dw;
const auto expected =
Load(dw, reinterpret_cast<const WideT*>(zip_lanes.get()));
+#endif // HWY_TARGET == HWY_SCALAR
HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(even, odd));
HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(dw, even, odd));
}
};
+HWY_NOINLINE void TestAllZipLower() {
+ const ForDemoteVectors<TestZipLower> lower_unsigned;
+ lower_unsigned(uint8_t());
+ lower_unsigned(uint16_t());
+#if HWY_HAVE_INTEGER64
+ lower_unsigned(uint32_t()); // generates u64
+#endif
+
+ const ForDemoteVectors<TestZipLower> lower_signed;
+ lower_signed(int8_t());
+ lower_signed(int16_t());
+#if HWY_HAVE_INTEGER64
+ lower_signed(int32_t()); // generates i64
+#endif
+
+ // No float - concatenating f32 does not result in a f64
+}
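
The index arithmetic above encodes ZipLower's per-block behavior: within each 16-byte block, the result interleaves the lower halves of the two inputs. The same mapping, restated over plain arrays:

#include <cstddef>

template <typename T>
void ZipLowerLanes(const T* even, const T* odd, T* zipped, size_t N) {
  const size_t per_block = 16 / sizeof(T);
  const size_t blockN = per_block < N ? per_block : N;
  for (size_t i = 0; i < N; i += 2) {
    const size_t base = (i / blockN) * blockN;
    const size_t mod = i % blockN;
    zipped[i + 0] = even[base + mod / 2];  // lower half of this block of even
    zipped[i + 1] = odd[base + mod / 2];   // interleaved with odd
  }
}
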
+
+// Remove this test (so it does not show as having run) if the only target is
+// HWY_SCALAR, which does not support this op.
+#if HWY_TARGETS != HWY_SCALAR
+
struct TestZipUpper {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET == HWY_SCALAR
+ (void)d;
+#else
using WideT = MakeWide<T>;
static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
@@ -295,24 +324,11 @@ struct TestZipUpper {
const auto expected =
Load(dw, reinterpret_cast<const WideT*>(zip_lanes.get()));
HWY_ASSERT_VEC_EQ(dw, expected, ZipUpper(dw, even, odd));
+#endif // HWY_TARGET == HWY_SCALAR
}
};
-HWY_NOINLINE void TestAllZip() {
- const ForDemoteVectors<TestZipLower> lower_unsigned;
- lower_unsigned(uint8_t());
- lower_unsigned(uint16_t());
-#if HWY_HAVE_INTEGER64
- lower_unsigned(uint32_t()); // generates u64
-#endif
-
- const ForDemoteVectors<TestZipLower> lower_signed;
- lower_signed(int8_t());
- lower_signed(int16_t());
-#if HWY_HAVE_INTEGER64
- lower_signed(int32_t()); // generates i64
-#endif
-
+HWY_NOINLINE void TestAllZipUpper() {
const ForShrinkableVectors<TestZipUpper> upper_unsigned;
upper_unsigned(uint8_t());
upper_unsigned(uint16_t());
@@ -330,6 +346,8 @@ HWY_NOINLINE void TestAllZip() {
// No float - concatenating f32 does not result in a f64
}
+#endif // HWY_TARGETS != HWY_SCALAR
+
class TestSpecialShuffle32 {
public:
template <class T, class D>
@@ -424,7 +442,10 @@ HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllBroadcast);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesSame);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesMixed);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllInterleave);
-HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZip);
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipLower);
+#if HWY_TARGETS != HWY_SCALAR
+HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipUpper);
+#endif
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllSpecialShuffles);
} // namespace hwy
diff --git a/media/highway/src/hwy/tests/combine_test.cc b/media/highway/src/hwy/tests/combine_test.cc
index 495a9bb2e8..b99f07a7dd 100644
--- a/media/highway/src/hwy/tests/combine_test.cc
+++ b/media/highway/src/hwy/tests/combine_test.cc
@@ -15,11 +15,11 @@
#include <stddef.h>
#include <stdint.h>
+#include <string.h> // memcpy
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/combine_test.cc"
-#include "hwy/foreach_target.h"
-
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
@@ -235,6 +235,14 @@ struct TestConcatOddEven {
const auto odd = Add(even, Set(d, 1));
HWY_ASSERT_VEC_EQ(d, odd, ConcatOdd(d, hi, lo));
HWY_ASSERT_VEC_EQ(d, even, ConcatEven(d, hi, lo));
+
+ // This test catches inadvertent saturation.
+ const auto min = Set(d, LowestValue<T>());
+ const auto max = Set(d, HighestValue<T>());
+ HWY_ASSERT_VEC_EQ(d, max, ConcatOdd(d, max, max));
+ HWY_ASSERT_VEC_EQ(d, max, ConcatEven(d, max, max));
+ HWY_ASSERT_VEC_EQ(d, min, ConcatOdd(d, min, min));
+ HWY_ASSERT_VEC_EQ(d, min, ConcatEven(d, min, min));
#else
(void)d;
#endif
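
The new min/max assertions guard against implementations that build ConcatOdd/ConcatEven from saturating narrowing: since these ops only select lanes, the type's limits must pass through unchanged. A hedged per-lane model, assuming (as the assertion order suggests) that `lo` supplies the lower half of the result:

#include <cstddef>

template <typename T>
void ConcatOddRef(const T* hi, const T* lo, T* out, size_t N) {
  for (size_t i = 0; i < N / 2; ++i) {
    out[i] = lo[2 * i + 1];          // odd-indexed lanes of lo -> lower half
    out[N / 2 + i] = hi[2 * i + 1];  // odd-indexed lanes of hi -> upper half
  }
}
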
diff --git a/media/highway/src/hwy/tests/compare_test.cc b/media/highway/src/hwy/tests/compare_test.cc
index fbecea849d..a96e29fc62 100644
--- a/media/highway/src/hwy/tests/compare_test.cc
+++ b/media/highway/src/hwy/tests/compare_test.cc
@@ -19,7 +19,7 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/compare_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
@@ -149,8 +149,21 @@ struct TestStrictInt {
}
};
+// S-SSE3 bug (#795): same upper, differing MSB in lower
+struct TestStrictInt64 {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto m0 = MaskFalse(d);
+ const auto m1 = MaskTrue(d);
+ HWY_ASSERT_MASK_EQ(d, m0, Lt(Set(d, 0x380000000LL), Set(d, 0x300000001LL)));
+ HWY_ASSERT_MASK_EQ(d, m1, Lt(Set(d, 0xF00000000LL), Set(d, 0xF80000000LL)));
+ HWY_ASSERT_MASK_EQ(d, m1, Lt(Set(d, 0xF00000000LL), Set(d, 0xF80000001LL)));
+ }
+};
+
HWY_NOINLINE void TestAllStrictInt() {
ForSignedTypes(ForPartialVectors<TestStrictInt>());
+ ForPartialVectors<TestStrictInt64>()(int64_t());
}
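
The constants in TestStrictInt64 exercise the case the comment hints at: when a 64-bit signed comparison is emulated from 32-bit halves, the low half must be compared as unsigned. A hypothetical composition, purely to illustrate what the regression test guards:

#include <cstdint>

bool SignedLt64ViaHalves(int64_t a, int64_t b) {
  const int64_t a_hi = a >> 32;  // arithmetic shift: signed upper half
  const int64_t b_hi = b >> 32;
  const uint32_t a_lo = static_cast<uint32_t>(a);
  const uint32_t b_lo = static_cast<uint32_t>(b);
  // Treating a_lo/b_lo as signed would misorder 0x380000000 vs 0x300000001,
  // whose halves differ only in the MSB of the low word.
  return (a_hi < b_hi) || (a_hi == b_hi && a_lo < b_lo);
}
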
struct TestStrictFloat {
@@ -219,16 +232,15 @@ HWY_NOINLINE void TestAllWeakFloat() {
ForFloatTypes(ForPartialVectors<TestWeakFloat>());
}
-class TestLt128 {
- template <class D>
- static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
- alignas(16) uint64_t in[2];
- in[0] = lo;
- in[1] = hi;
- return LoadDup128(d, in);
- }
+template <class D>
+static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
+ alignas(16) uint64_t in[2];
+ in[0] = lo;
+ in[1] = hi;
+ return LoadDup128(d, in);
+}
- public:
+struct TestLt128 {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using V = Vec<D>;
@@ -276,6 +288,204 @@ class TestLt128 {
HWY_NOINLINE void TestAllLt128() { ForGEVectors<128, TestLt128>()(uint64_t()); }
+struct TestLt128Upper {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using V = Vec<D>;
+ const V v00 = Zero(d);
+ const V v01 = Make128(d, 0, 1);
+ const V v10 = Make128(d, 1, 0);
+ const V v11 = Add(v01, v10);
+
+ const auto mask_false = MaskFalse(d);
+ const auto mask_true = MaskTrue(d);
+
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v00, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v01, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v10, v10));
+
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v00, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, v11));
+
+ // Reversed order
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v01, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v10, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v11, v01));
+
+ // Also check 128-bit blocks are independent
+ const V iota = Iota(d, 1);
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, iota, Add(iota, v01)));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, iota, Add(iota, v10)));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, Add(iota, v01), iota));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, Add(iota, v10), iota));
+
+ // Max value
+ const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v11));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v00, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v10, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v11, vm));
+ }
+};
+
+HWY_NOINLINE void TestAllLt128Upper() {
+ ForGEVectors<128, TestLt128Upper>()(uint64_t());
+}
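
Restating the behavior these assertions describe over plain arrays: Lt128Upper compares only the upper u64 of each 128-bit block, and the resulting mask covers both lanes of that block (an assumption drawn from the mask_true/mask_false checks above):

#include <cstddef>
#include <cstdint>

// num_lanes is even; lanes i and i+1 form one 128-bit block (i+1 is upper).
void Lt128UpperRef(const uint64_t* a, const uint64_t* b, bool* mask,
                   size_t num_lanes) {
  for (size_t i = 0; i < num_lanes; i += 2) {
    const bool lt = a[i + 1] < b[i + 1];
    mask[i + 0] = lt;
    mask[i + 1] = lt;
  }
}
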
+
+struct TestEq128 { // Also Ne128
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using V = Vec<D>;
+ const V v00 = Zero(d);
+ const V v01 = Make128(d, 0, 1);
+ const V v10 = Make128(d, 1, 0);
+ const V v11 = Add(v01, v10);
+
+ const auto mask_false = MaskFalse(d);
+ const auto mask_true = MaskTrue(d);
+
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v00, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v01, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v10, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v00, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v01, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v10, v10));
+
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v00, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v11));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v00, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v11));
+
+ // Reversed order
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v10, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v11, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v10, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v11, v01));
+
+ // Also check 128-bit blocks are independent
+ const V iota = Iota(d, 1);
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, iota, Add(iota, v01)));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, iota, Add(iota, v10)));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, Add(iota, v01), iota));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, Add(iota, v10), iota));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, iota, Add(iota, v01)));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, iota, Add(iota, v10)));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, Add(iota, v01), iota));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, Add(iota, v10), iota));
+
+ // Max value
+ const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, vm, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, vm, vm));
+
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v11));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v00, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v10, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v11, vm));
+
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v11));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v00, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v10, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v11, vm));
+ }
+};
+
+HWY_NOINLINE void TestAllEq128() { ForGEVectors<128, TestEq128>()(uint64_t()); }
+
+struct TestEq128Upper { // Also Ne128Upper
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ using V = Vec<D>;
+ const V v00 = Zero(d);
+ const V v01 = Make128(d, 0, 1);
+ const V v10 = Make128(d, 1, 0);
+ const V v11 = Add(v01, v10);
+
+ const auto mask_false = MaskFalse(d);
+ const auto mask_true = MaskTrue(d);
+
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v00, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v01, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v10, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v00, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v01, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v10, v10));
+
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v00, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v00, v01));
+
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, v11));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, v11));
+
+ // Reversed order
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v01, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v01, v00));
+
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v10, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v11, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v10, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v11, v01));
+
+ // Also check 128-bit blocks are independent
+ const V iota = Iota(d, 1);
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, iota, Add(iota, v01)));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, iota, Add(iota, v01)));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, iota, Add(iota, v10)));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, iota, Add(iota, v10)));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, Add(iota, v01), iota));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, Add(iota, v01), iota));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, Add(iota, v10), iota));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, Add(iota, v10), iota));
+
+ // Max value
+ const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
+ HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, vm, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, vm, vm));
+
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v11));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v00, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v10, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v11, vm));
+
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v00));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v01));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v10));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v11));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v00, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v10, vm));
+ HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v11, vm));
+ }
+};
+
+HWY_NOINLINE void TestAllEq128Upper() {
+ ForGEVectors<128, TestEq128Upper>()(uint64_t());
+}
+
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
@@ -291,6 +501,9 @@ HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllLt128);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllLt128Upper);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEq128);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEq128Upper);
} // namespace hwy
#endif
diff --git a/media/highway/src/hwy/tests/compress_test.cc b/media/highway/src/hwy/tests/compress_test.cc
index 861c9c299c..e2d0ef0ba9 100644
--- a/media/highway/src/hwy/tests/compress_test.cc
+++ b/media/highway/src/hwy/tests/compress_test.cc
@@ -13,7 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include <inttypes.h> // PRIu64
#include <stddef.h>
#include <stdint.h>
#include <string.h> // memset
@@ -24,7 +23,7 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/compress_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
@@ -32,41 +31,38 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
-// For regenerating tables used in the implementation
+// Regenerate tables used in the implementation, instead of testing.
#define HWY_PRINT_TABLES 0
-class TestCompress {
- template <class D, class DI, typename T = TFromD<D>, typename TI = TFromD<DI>>
- void CheckStored(D d, DI di, size_t expected_pos, size_t actual_pos,
- size_t num_to_check, const AlignedFreeUniquePtr<T[]>& in,
- const AlignedFreeUniquePtr<TI[]>& mask_lanes,
- const AlignedFreeUniquePtr<T[]>& expected, const T* actual_u,
- int line) {
- if (expected_pos != actual_pos) {
- hwy::Abort(
- __FILE__, line,
- "Size mismatch for %s: expected %" PRIu64 ", actual %" PRIu64 "\n",
- TypeName(T(), Lanes(d)).c_str(), static_cast<uint64_t>(expected_pos),
- static_cast<uint64_t>(actual_pos));
- }
- // Modified from AssertVecEqual - we may not be checking all lanes.
- for (size_t i = 0; i < num_to_check; ++i) {
- if (!IsEqual(expected[i], actual_u[i])) {
- const size_t N = Lanes(d);
- fprintf(stderr,
- "Mismatch at i=%" PRIu64 " of %" PRIu64 ", line %d:\n\n",
- static_cast<uint64_t>(i), static_cast<uint64_t>(num_to_check),
- line);
- Print(di, "mask", Load(di, mask_lanes.get()), 0, N);
- Print(d, "in", Load(d, in.get()), 0, N);
- Print(d, "expect", Load(d, expected.get()), 0, N);
- Print(d, "actual", Load(d, actual_u), 0, N);
- HWY_ASSERT(false);
- }
+#if !HWY_PRINT_TABLES || HWY_IDE
+
+template <class D, class DI, typename T = TFromD<D>, typename TI = TFromD<DI>>
+void CheckStored(D d, DI di, size_t expected_pos, size_t actual_pos,
+ size_t num_to_check, const AlignedFreeUniquePtr<T[]>& in,
+ const AlignedFreeUniquePtr<TI[]>& mask_lanes,
+ const AlignedFreeUniquePtr<T[]>& expected, const T* actual_u,
+ int line) {
+ if (expected_pos != actual_pos) {
+ hwy::Abort(__FILE__, line, "Size mismatch for %s: expected %d, actual %d\n",
+ TypeName(T(), Lanes(d)).c_str(), static_cast<int>(expected_pos),
+ static_cast<int>(actual_pos));
+ }
+ // Modified from AssertVecEqual - we may not be checking all lanes.
+ for (size_t i = 0; i < num_to_check; ++i) {
+ if (!IsEqual(expected[i], actual_u[i])) {
+ const size_t N = Lanes(d);
+ fprintf(stderr, "Mismatch at i=%d of %d, line %d:\n\n",
+ static_cast<int>(i), static_cast<int>(num_to_check), line);
+ Print(di, "mask", Load(di, mask_lanes.get()), 0, N);
+ Print(d, "in", Load(d, in.get()), 0, N);
+ Print(d, "expect", Load(d, expected.get()), 0, N);
+ Print(d, "actual", Load(d, actual_u), 0, N);
+ HWY_ASSERT(false);
}
}
+}
- public:
+struct TestCompress {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
RandomState rng;
@@ -97,7 +93,7 @@ class TestCompress {
for (size_t i = 0; i < N; ++i) {
const uint64_t bits = Random32(&rng);
in_lanes[i] = T(); // cannot initialize float16_t directly.
- CopyBytes<sizeof(T)>(&bits, &in_lanes[i]);
+ CopyBytes<sizeof(T)>(&bits, &in_lanes[i]); // not same size
mask_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
if (mask_lanes[i] > 0) {
expected[expected_pos++] = in_lanes[i];
@@ -131,19 +127,25 @@ class TestCompress {
CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
mask_lanes, expected, actual_u, __LINE__);
+ // CompressNot
+ memset(actual_u, 0, N * sizeof(T));
+ StoreU(CompressNot(in, Not(mask)), d, actual_u);
+ CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
+ mask_lanes, expected, actual_u, __LINE__);
+
// CompressStore
memset(actual_u, 0, N * sizeof(T));
const size_t size1 = CompressStore(in, mask, d, actual_u);
- // expected_pos instead of num_to_check because this op is not affected
- // by CompressIsPartition.
+ // expected_pos instead of num_to_check because this op is not
+ // affected by CompressIsPartition.
CheckStored(d, di, expected_pos, size1, expected_pos, in_lanes,
mask_lanes, expected, actual_u, __LINE__);
// CompressBlendedStore
memset(actual_u, 0, N * sizeof(T));
const size_t size2 = CompressBlendedStore(in, mask, d, actual_u);
- // expected_pos instead of num_to_check because this op only writes the
- // mask=true lanes.
+ // expected_pos instead of num_to_check because this op only writes
+ // the mask=true lanes.
CheckStored(d, di, expected_pos, size2, expected_pos, in_lanes,
mask_lanes, expected, actual_u, __LINE__);
// Subsequent lanes are untouched.
@@ -160,8 +162,8 @@ class TestCompress {
// CompressBitsStore
memset(actual_u, 0, N * sizeof(T));
const size_t size3 = CompressBitsStore(in, bits.get(), d, actual_u);
- // expected_pos instead of num_to_check because this op is not affected
- // by CompressIsPartition.
+ // expected_pos instead of num_to_check because this op is not
+ // affected by CompressIsPartition.
CheckStored(d, di, expected_pos, size3, expected_pos, in_lanes,
mask_lanes, expected, actual_u, __LINE__);
} // rep
@@ -169,8 +171,81 @@ class TestCompress {
} // operator()
};
-#if HWY_PRINT_TABLES
+HWY_NOINLINE void TestAllCompress() {
+ ForUIF163264(ForPartialVectors<TestCompress>());
+}
+
+struct TestCompressBlocks {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET == HWY_SCALAR
+ (void)d;
+#else
+ static_assert(sizeof(T) == 8 && !IsSigned<T>(), "Should be u64");
+ RandomState rng;
+
+ using TI = MakeSigned<T>; // For mask > 0 comparison
+ const Rebind<TI, D> di;
+ const size_t N = Lanes(d);
+
+ auto in_lanes = AllocateAligned<T>(N);
+ auto mask_lanes = AllocateAligned<TI>(N);
+ auto expected = AllocateAligned<T>(N);
+ auto actual = AllocateAligned<T>(N);
+
+ // Each lane should have a chance of having mask=true.
+ for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+ size_t expected_pos = 0;
+ for (size_t i = 0; i < N; i += 2) {
+ const uint64_t bits = Random32(&rng);
+ in_lanes[i + 1] = in_lanes[i] = T(); // cannot set float16_t directly.
+ CopyBytes<sizeof(T)>(&bits, &in_lanes[i]); // not same size
+ CopyBytes<sizeof(T)>(&bits, &in_lanes[i + 1]); // not same size
+ mask_lanes[i + 1] = mask_lanes[i] = TI{(Random32(&rng) & 8) ? 1 : 0};
+ if (mask_lanes[i] > 0) {
+ expected[expected_pos++] = in_lanes[i];
+ expected[expected_pos++] = in_lanes[i + 1];
+ }
+ }
+ size_t num_to_check;
+ if (CompressIsPartition<T>::value) {
+ // For non-native Compress, also check that mask=false lanes were
+ // moved to the back of the vector (highest indices).
+ size_t extra = expected_pos;
+ for (size_t i = 0; i < N; ++i) {
+ if (mask_lanes[i] == 0) {
+ expected[extra++] = in_lanes[i];
+ }
+ }
+ HWY_ASSERT(extra == N);
+ num_to_check = N;
+ } else {
+ // For native Compress, only the mask=true lanes are defined.
+ num_to_check = expected_pos;
+ }
+
+ const auto in = Load(d, in_lanes.get());
+ const auto mask = RebindMask(d, Gt(Load(di, mask_lanes.get()), Zero(di)));
+
+ // CompressBlocksNot
+ memset(actual.get(), 0, N * sizeof(T));
+ StoreU(CompressBlocksNot(in, Not(mask)), d, actual.get());
+ CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
+ mask_lanes, expected, actual.get(), __LINE__);
+ } // rep
+#endif // HWY_TARGET == HWY_SCALAR
+ } // operator()
+};
+
+HWY_NOINLINE void TestAllCompressBlocks() {
+ ForGE128Vectors<TestCompressBlocks>()(uint64_t());
+}
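
The expected array built in TestCompressBlocks above amounts to a stable partition: mask=true lanes move to the front in their original order, and (when CompressIsPartition is true) the mask=false lanes follow, also in order; CompressNot with the inverted mask is checked to produce the same result. A scalar reference:

#include <cstddef>
#include <vector>

template <typename T>
std::vector<T> CompressRef(const std::vector<T>& in,
                           const std::vector<bool>& mask) {
  std::vector<T> out;
  out.reserve(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    if (mask[i]) out.push_back(in[i]);  // kept lanes, original order
  }
  for (size_t i = 0; i < in.size(); ++i) {
    if (!mask[i]) out.push_back(in[i]);  // remainder; only defined when
  }                                      // CompressIsPartition is true
  return out;
}
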
+
+#endif // !HWY_PRINT_TABLES
+
+#if HWY_PRINT_TABLES || HWY_IDE
namespace detail { // for code folding
+
void PrintCompress16x8Tables() {
printf("======================================= 16x8\n");
constexpr size_t N = 8; // 128-bit SIMD
@@ -200,11 +275,11 @@ void PrintCompress16x8Tables() {
printf("\n");
}
-// Similar to the above, but uses native 16-bit shuffle instead of bytes.
-void PrintCompress16x16HalfTables() {
- printf("======================================= 16x16Half\n");
- constexpr size_t N = 8;
- for (uint64_t code = 0; code < (1ull << N); ++code) {
+void PrintCompressNot16x8Tables() {
+ printf("======================================= Not 16x8\n");
+ constexpr size_t N = 8; // 128-bit SIMD
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
std::array<uint8_t, N> indices{0};
size_t pos = 0;
// All lanes where mask = true
@@ -221,19 +296,61 @@ void PrintCompress16x16HalfTables() {
}
HWY_ASSERT(pos == N);
+ // Doubled (for converting lane to byte indices)
for (size_t i = 0; i < N; ++i) {
- printf("%d,", indices[i]);
+ printf("%d,", 2 * indices[i]);
}
- printf(code & 1 ? "//\n" : "/**/");
+ printf(not_code & 1 ? "//\n" : "/**/");
}
printf("\n");
}
-// Compressed to nibbles
+// Compressed to nibbles, unpacked via variable right shift. Also includes
+// FirstN bits in the nibble MSB.
void PrintCompress32x8Tables() {
- printf("======================================= 32x8\n");
- constexpr size_t N = 8; // AVX2
+ printf("======================================= 32/64x8\n");
+ constexpr size_t N = 8; // AVX2 or 64-bit AVX3
for (uint64_t code = 0; code < (1ull << N); ++code) {
+ const size_t count = PopCount(code);
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Convert to nibbles
+ uint64_t packed = 0;
+ for (size_t i = 0; i < N; ++i) {
+ HWY_ASSERT(indices[i] < N);
+ if (i < count) {
+ indices[i] |= N;
+ HWY_ASSERT(indices[i] < 0x10);
+ }
+ packed += indices[i] << (i * 4);
+ }
+
+ HWY_ASSERT(packed < (1ull << (N * 4)));
+ printf("0x%08x,", static_cast<uint32_t>(packed));
+ }
+ printf("\n");
+}
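
A worked example of the packing loop above: for mask code 0b00000101 (lanes 0 and 2 true, so count = 2), the index order is {0, 2, 1, 3, 4, 5, 6, 7}; the first count entries get the FirstN bit (|= 8), and the nibbles pack little-end first into 0x765431A8. Standalone check:

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t indices[8] = {0 | 8, 2 | 8, 1, 3, 4, 5, 6, 7};
  uint64_t packed = 0;
  for (int i = 0; i < 8; ++i) {
    packed += static_cast<uint64_t>(indices[i]) << (i * 4);
  }
  std::printf("0x%08x\n", static_cast<uint32_t>(packed));  // prints 0x765431a8
  return 0;
}
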
+
+void PrintCompressNot32x8Tables() {
+ printf("======================================= Not 32/64x8\n");
+ constexpr size_t N = 8; // AVX2 or 64-bit AVX3
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ const size_t count = PopCount(code);
std::array<uint32_t, N> indices{0};
size_t pos = 0;
// All lanes where mask = true
@@ -254,6 +371,10 @@ void PrintCompress32x8Tables() {
uint64_t packed = 0;
for (size_t i = 0; i < N; ++i) {
HWY_ASSERT(indices[i] < N);
+ if (i < count) {
+ indices[i] |= N;
+ HWY_ASSERT(indices[i] < 0x10);
+ }
packed += indices[i] << (i * 4);
}
@@ -266,7 +387,7 @@ void PrintCompress32x8Tables() {
// Compressed to nibbles (for AVX3 64x4)
void PrintCompress64x4NibbleTables() {
printf("======================================= 64x4Nibble\n");
- constexpr size_t N = 4;
+ constexpr size_t N = 4; // AVX2
for (uint64_t code = 0; code < (1ull << N); ++code) {
std::array<uint32_t, N> indices{0};
size_t pos = 0;
@@ -297,12 +418,147 @@ void PrintCompress64x4NibbleTables() {
printf("\n");
}
-// Pairs of 32-bit lane indices
+void PrintCompressNot64x4NibbleTables() {
+ printf("======================================= Not 64x4Nibble\n");
+ constexpr size_t N = 4; // AVX2
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Convert to nibbles
+ uint64_t packed = 0;
+ for (size_t i = 0; i < N; ++i) {
+ HWY_ASSERT(indices[i] < N);
+ packed += indices[i] << (i * 4);
+ }
+
+ HWY_ASSERT(packed < (1ull << (N * 4)));
+ printf("0x%08x,", static_cast<uint32_t>(packed));
+ }
+ printf("\n");
+}
+
void PrintCompress64x4Tables() {
- printf("======================================= 64x4\n");
+ printf("======================================= 64x4 uncompressed\n");
+ constexpr size_t N = 4; // SVE_256
+ for (uint64_t code = 0; code < (1ull << N); ++code) {
+ std::array<size_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Store uncompressed indices because SVE TBL returns 0 if an index is out
+ // of bounds. On AVX3 we simply variable-shift because permute indices are
+ // interpreted modulo N. Compression is not worth the extra shift+AND
+ // because the table is anyway only 512 bytes.
+ for (size_t i = 0; i < N; ++i) {
+ printf("%d,", static_cast<int>(indices[i]));
+ }
+ }
+ printf("\n");
+}
+
+void PrintCompressNot64x4Tables() {
+ printf("======================================= Not 64x4 uncompressed\n");
+ constexpr size_t N = 4; // SVE_256
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<size_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Store uncompressed indices because SVE TBL returns 0 if an index is out
+ // of bounds. On AVX3 we simply variable-shift because permute indices are
+ // interpreted modulo N. Compression is not worth the extra shift+AND
+ // because the table is anyway only 512 bytes.
+ for (size_t i = 0; i < N; ++i) {
+ printf("%d,", static_cast<int>(indices[i]));
+ }
+ }
+ printf("\n");
+}
+
+// Same as above, but prints pairs of u32 indices (for AVX2). Also includes
+// FirstN bits in the nibble MSB.
+void PrintCompress64x4PairTables() {
+ printf("======================================= 64x4 u32 index\n");
constexpr size_t N = 4; // AVX2
for (uint64_t code = 0; code < (1ull << N); ++code) {
- std::array<uint32_t, N> indices{0};
+ const size_t count = PopCount(code);
+ std::array<size_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ // Store uncompressed indices because SVE TBL returns 0 if an index is out
+ // of bounds. On AVX3 we simply variable-shift because permute indices are
+ // interpreted modulo N. Compression is not worth the extra shift+AND
+ // because the table is anyway only 512 bytes.
+ for (size_t i = 0; i < N; ++i) {
+ const int first_n_bit = i < count ? 8 : 0;
+ const int low = static_cast<int>(2 * indices[i]) + first_n_bit;
+ HWY_ASSERT(low < 0x10);
+ printf("%d, %d, ", low, low + 1);
+ }
+ }
+ printf("\n");
+}
+
+void PrintCompressNot64x4PairTables() {
+ printf("======================================= Not 64x4 u32 index\n");
+ constexpr size_t N = 4; // AVX2
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ const size_t count = PopCount(code);
+ std::array<size_t, N> indices{0};
size_t pos = 0;
// All lanes where mask = true
for (size_t i = 0; i < N; ++i) {
@@ -318,8 +574,15 @@ void PrintCompress64x4Tables() {
}
HWY_ASSERT(pos == N);
+ // Store uncompressed indices because SVE TBL returns 0 if an index is out
+ // of bounds. On AVX3 we simply variable-shift because permute indices are
+ // interpreted modulo N. Compression is not worth the extra shift+AND
+ // because the table is anyway only 512 bytes.
for (size_t i = 0; i < N; ++i) {
- printf("%d,%d,", 2 * indices[i], 2 * indices[i] + 1);
+ const int first_n_bit = i < count ? 8 : 0;
+ const int low = static_cast<int>(2 * indices[i]) + first_n_bit;
+ HWY_ASSERT(low < 0x10);
+ printf("%d, %d, ", low, low + 1);
}
}
printf("\n");
@@ -349,8 +612,38 @@ void PrintCompress32x4Tables() {
for (size_t i = 0; i < N; ++i) {
for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
- printf("%" PRIu64 ",",
- static_cast<uint64_t>(sizeof(T) * indices[i] + idx_byte));
+ printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
+ }
+ }
+ }
+ printf("\n");
+}
+
+void PrintCompressNot32x4Tables() {
+ printf("======================================= Not 32x4\n");
+ using T = uint32_t;
+ constexpr size_t N = 4; // SSE4
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ for (size_t i = 0; i < N; ++i) {
+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+ printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
}
}
}
@@ -381,30 +674,68 @@ void PrintCompress64x2Tables() {
for (size_t i = 0; i < N; ++i) {
for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
- printf("%" PRIu64 ",",
- static_cast<uint64_t>(sizeof(T) * indices[i] + idx_byte));
+ printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
}
}
}
printf("\n");
}
+
+void PrintCompressNot64x2Tables() {
+ printf("======================================= Not 64x2\n");
+ using T = uint64_t;
+ constexpr size_t N = 2; // SSE4
+ for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
+ const uint64_t code = ~not_code;
+ std::array<uint32_t, N> indices{0};
+ size_t pos = 0;
+ // All lanes where mask = true
+ for (size_t i = 0; i < N; ++i) {
+ if (code & (1ull << i)) {
+ indices[pos++] = i;
+ }
+ }
+ // All lanes where mask = false
+ for (size_t i = 0; i < N; ++i) {
+ if (!(code & (1ull << i))) {
+ indices[pos++] = i;
+ }
+ }
+ HWY_ASSERT(pos == N);
+
+ for (size_t i = 0; i < N; ++i) {
+ for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+ printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
+ }
+ }
+ }
+ printf("\n");
+}
+
} // namespace detail
-#endif // HWY_PRINT_TABLES
-HWY_NOINLINE void TestAllCompress() {
-#if HWY_PRINT_TABLES
+HWY_NOINLINE void PrintTables() {
+ // Only print once.
+#if HWY_TARGET == HWY_STATIC_TARGET
detail::PrintCompress32x8Tables();
+ detail::PrintCompressNot32x8Tables();
detail::PrintCompress64x4NibbleTables();
+ detail::PrintCompressNot64x4NibbleTables();
detail::PrintCompress64x4Tables();
+ detail::PrintCompressNot64x4Tables();
detail::PrintCompress32x4Tables();
+ detail::PrintCompressNot32x4Tables();
detail::PrintCompress64x2Tables();
+ detail::PrintCompressNot64x2Tables();
+ detail::PrintCompress64x4PairTables();
+ detail::PrintCompressNot64x4PairTables();
detail::PrintCompress16x8Tables();
- detail::PrintCompress16x16HalfTables();
+ detail::PrintCompressNot16x8Tables();
#endif
-
- ForUIF163264(ForPartialVectors<TestCompress>());
}
+#endif // HWY_PRINT_TABLES
+
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
@@ -414,7 +745,13 @@ HWY_AFTER_NAMESPACE();
namespace hwy {
HWY_BEFORE_TEST(HwyCompressTest);
+#if HWY_PRINT_TABLES
+// Only print instead of running tests; this will be visible in the log.
+HWY_EXPORT_AND_TEST_P(HwyCompressTest, PrintTables);
+#else
HWY_EXPORT_AND_TEST_P(HwyCompressTest, TestAllCompress);
+HWY_EXPORT_AND_TEST_P(HwyCompressTest, TestAllCompressBlocks);
+#endif
} // namespace hwy
#endif
diff --git a/media/highway/src/hwy/tests/convert_test.cc b/media/highway/src/hwy/tests/convert_test.cc
index 8b7df7bdef..a7aea5fe9e 100644
--- a/media/highway/src/hwy/tests/convert_test.cc
+++ b/media/highway/src/hwy/tests/convert_test.cc
@@ -17,10 +17,13 @@
#include <stdint.h>
#include <string.h>
+#include <cmath> // std::isfinite
+
+#include "hwy/base.h"
+
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/convert_test.cc"
-#include "hwy/foreach_target.h"
-
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
@@ -155,7 +158,7 @@ struct TestPromoteTo {
for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
for (size_t i = 0; i < N; ++i) {
const uint64_t bits = rng();
- memcpy(&from[i], &bits, sizeof(T));
+ CopyBytes<sizeof(T)>(&bits, &from[i]); // not same size
expected[i] = from[i];
}
@@ -235,13 +238,19 @@ AlignedFreeUniquePtr<float[]> F16TestCases(D d, size_t& padded) {
-2.00390625f, -3.99609375f,
// No infinity/NaN - implementation-defined due to ARM.
};
- const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
+ constexpr size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
const size_t N = Lanes(d);
+ HWY_ASSERT(N != 0);
padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors
auto in = AllocateAligned<float>(padded);
auto expected = AllocateAligned<float>(padded);
- std::copy(test_cases, test_cases + kNumTestCases, in.get());
- std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
+ size_t i = 0;
+ for (; i < kNumTestCases; ++i) {
+ in[i] = test_cases[i];
+ }
+ for (; i < padded; ++i) {
+ in[i] = 0.0f;
+ }
return in;
}
@@ -250,10 +259,11 @@ struct TestF16 {
HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_HAVE_FLOAT16
size_t padded;
+ const size_t N = Lanes(d32); // same count for f16
+ HWY_ASSERT(N != 0);
auto in = F16TestCases(d32, padded);
using TF16 = float16_t;
const Rebind<TF16, DF32> d16;
- const size_t N = Lanes(d32); // same count for f16
auto temp16 = AllocateAligned<TF16>(N);
for (size_t i = 0; i < padded; i += N) {
@@ -289,13 +299,19 @@ AlignedFreeUniquePtr<float[]> BF16TestCases(D d, size_t& padded) {
// negative +/- delta
-2.015625f, -3.984375f,
};
- const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
+ constexpr size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
const size_t N = Lanes(d);
+ HWY_ASSERT(N != 0);
padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors
auto in = AllocateAligned<float>(padded);
auto expected = AllocateAligned<float>(padded);
- std::copy(test_cases, test_cases + kNumTestCases, in.get());
- std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
+ size_t i = 0;
+ for (; i < kNumTestCases; ++i) {
+ in[i] = test_cases[i];
+ }
+ for (; i < padded; ++i) {
+ in[i] = 0.0f;
+ }
return in;
}
@@ -335,8 +351,6 @@ struct TestConvertU8 {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, const D du32) {
const Rebind<uint8_t, D> du8;
- auto lanes8 = AllocateAligned<uint8_t>(Lanes(du8));
- Store(Iota(du8, 0), du8, lanes8.get());
const auto wrap = Set(du32, 0xFF);
HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0), U8FromU32(And(Iota(du32, 0), wrap)));
HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0x7F),
@@ -348,15 +362,54 @@ HWY_NOINLINE void TestAllConvertU8() {
ForDemoteVectors<TestConvertU8, 2>()(uint32_t());
}
+template <typename From, typename To, class D>
+constexpr bool IsSupportedTruncation() {
+ return (sizeof(To) < sizeof(From)) &&
+ (Pow2(Rebind<To, D>()) + 3 >= static_cast<int>(CeilLog2(sizeof(To))));
+}
+
+struct TestTruncateTo {
+ template <typename From, typename To, class D,
+ hwy::EnableIf<!IsSupportedTruncation<From, To, D>()>* = nullptr>
+ HWY_NOINLINE void testTo(From, To, const D) {
+ // do nothing
+ }
+
+ template <typename From, typename To, class D,
+ hwy::EnableIf<IsSupportedTruncation<From, To, D>()>* = nullptr>
+ HWY_NOINLINE void testTo(From, To, const D d) {
+ constexpr uint32_t base = 0xFA578D00;
+ const Rebind<To, D> dTo;
+ const auto src = Iota(d, static_cast<From>(base));
+ const auto expected = Iota(dTo, static_cast<To>(base));
+ const VFromD<decltype(dTo)> actual = TruncateTo(dTo, src);
+ HWY_ASSERT_VEC_EQ(dTo, expected, actual);
+ }
+
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T from, const D d) {
+ testTo<T, uint8_t, D>(from, uint8_t(), d);
+ testTo<T, uint16_t, D>(from, uint16_t(), d);
+ testTo<T, uint32_t, D>(from, uint32_t(), d);
+ }
+};
+
+HWY_NOINLINE void TestAllTruncate() {
+ ForUnsignedTypes(ForPartialVectors<TestTruncateTo>());
+}
+
// Separate function to attempt to work around a compiler bug on ARM: when this
// is merged with TestIntFromFloat, outputs match a previous Iota(-(N+1)) input.
struct TestIntFromFloatHuge {
template <typename TF, class DF>
HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
- // Still does not work, although ARMv7 manual says that float->int
- // saturates, i.e. chooses the nearest representable value. Also causes
- // out-of-memory for MSVC.
-#if HWY_TARGET != HWY_NEON && !HWY_COMPILER_MSVC
+ // The ARMv7 manual says that float->int saturates, i.e. chooses the
+ // nearest representable value. This works correctly on armhf with GCC, but
+ // not with clang. For reasons unknown, MSVC also runs into an out-of-memory
+ // error here.
+#if HWY_COMPILER_CLANG || HWY_COMPILER_MSVC
+ (void)df;
+#else
using TI = MakeSigned<TF>;
const Rebind<TI, DF> di;
@@ -372,8 +425,6 @@ struct TestIntFromFloatHuge {
// Huge negative
Store(Set(di, LimitsMin<TI>()), di, expected.get());
HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(-1E20))));
-#else
- (void)df;
#endif
}
};
@@ -390,7 +441,7 @@ class TestIntFromFloat {
for (int sign = 0; sign < 2; ++sign) {
for (size_t shift = 0; shift < kBits - 1; ++shift) {
for (int64_t ofs : ofs_table) {
- const int64_t mag = (int64_t(1) << shift) + ofs;
+ const int64_t mag = (int64_t{1} << shift) + ofs;
const int64_t val = sign ? mag : -mag;
HWY_ASSERT_VEC_EQ(di, Set(di, static_cast<TI>(val)),
ConvertTo(di, Set(df, static_cast<TF>(val))));
@@ -417,7 +468,7 @@ class TestIntFromFloat {
for (size_t i = 0; i < N; ++i) {
do {
const uint64_t bits = rng();
- memcpy(&from[i], &bits, sizeof(TF));
+ CopyBytes<sizeof(TF)>(&bits, &from[i]); // not same size
} while (!std::isfinite(from[i]));
if (from[i] >= max) {
expected[i] = LimitsMax<TI>();
@@ -498,6 +549,34 @@ HWY_NOINLINE void TestAllFloatFromInt() {
ForFloatTypes(ForPartialVectors<TestFloatFromInt>());
}
+struct TestFloatFromUint {
+ template <typename TF, class DF>
+ HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
+ using TU = MakeUnsigned<TF>;
+ const RebindToUnsigned<DF> du;
+
+ // Integer positive
+ HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), ConvertTo(df, Iota(du, TU(4))));
+ HWY_ASSERT_VEC_EQ(df, Iota(df, TF(65535.0)),
+ ConvertTo(df, Iota(du, 65535))); // 2^16-1
+ if (sizeof(TF) > 4) {
+ HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4294967295.0)),
+ ConvertTo(df, Iota(du, 4294967295ULL))); // 2^32-1
+ }
+
+ // Max positive
+ HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TU>())),
+ ConvertTo(df, Set(du, LimitsMax<TU>())));
+
+ // Zero
+ HWY_ASSERT_VEC_EQ(df, Zero(df), ConvertTo(df, Zero(du)));
+ }
+};
+
+HWY_NOINLINE void TestAllFloatFromUint() {
+ ForFloatTypes(ForPartialVectors<TestFloatFromUint>());
+}
+
struct TestI32F64 {
template <typename TF, class DF>
HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
@@ -554,8 +633,10 @@ HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBF16);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllTruncate);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromUint);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllI32F64);
} // namespace hwy
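The new TestTruncateTo above drives TruncateTo, which narrows each unsigned lane by keeping only its least-significant bytes; the IsSupportedTruncation guard merely skips type/descriptor combinations the op does not cover. A minimal scalar sketch of the per-lane behaviour the test expects (TruncateScalar is a hypothetical helper for illustration, not part of Highway):

// Hypothetical scalar reference: truncating an unsigned lane keeps only its
// least-significant bytes, i.e. a static_cast to the narrower unsigned type.
#include <cassert>
#include <cstdint>

template <typename To, typename From>
To TruncateScalar(From v) {
  static_assert(sizeof(To) < sizeof(From), "truncation must narrow");
  return static_cast<To>(v);  // keeps the low sizeof(To) bytes
}

int main() {
  const uint32_t base = 0xFA578D00u;  // same base as the test above
  assert(TruncateScalar<uint8_t>(base + 1) == 0x01u);     // low byte
  assert(TruncateScalar<uint16_t>(base + 2) == 0x8D02u);  // low two bytes
  return 0;
}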
diff --git a/media/highway/src/hwy/tests/crypto_test.cc b/media/highway/src/hwy/tests/crypto_test.cc
index 2ed9dcb9b4..b7dfb198a3 100644
--- a/media/highway/src/hwy/tests/crypto_test.cc
+++ b/media/highway/src/hwy/tests/crypto_test.cc
@@ -21,7 +21,7 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/crypto_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
@@ -492,8 +492,8 @@ struct TestCLMul {
const size_t padded = RoundUpTo(kCLMulNum, N);
auto expected_lower = AllocateAligned<T>(padded);
auto expected_upper = AllocateAligned<T>(padded);
- memcpy(expected_lower.get(), kCLMulLower, kCLMulNum * sizeof(T));
- memcpy(expected_upper.get(), kCLMulUpper, kCLMulNum * sizeof(T));
+ CopyBytes<kCLMulNum * sizeof(T)>(kCLMulLower, expected_lower.get());
+ CopyBytes<kCLMulNum * sizeof(T)>(kCLMulUpper, expected_upper.get());
const size_t padding_size = (padded - kCLMulNum) * sizeof(T);
memset(expected_lower.get() + kCLMulNum, 0, padding_size);
memset(expected_upper.get() + kCLMulNum, 0, padding_size);
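Several hunks in this patch swap raw memcpy for CopyBytes<kBytes>(): the byte count becomes a compile-time constant that is visible at the call site, which matters where source and destination types differ in size. A minimal sketch of that pattern (CopyBytesSketch and FillLane are illustrative names, not Highway's actual base.h code):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Illustrative stand-in for the CopyBytes<kBytes>() pattern: the size is a
// template parameter instead of a runtime argument.
template <size_t kBytes, typename From, typename To>
void CopyBytesSketch(const From* from, To* to) {
  std::memcpy(to, from, kBytes);
}

// Mirrors the test change: copy only sizeof(float) bytes of a 64-bit random
// value into a float lane ("not same size" as the source).
void FillLane(uint64_t bits, float* lane) {
  CopyBytesSketch<sizeof(float)>(&bits, lane);
}

int main() {
  uint64_t bits = 0x1234567890ABCDEFull;
  float lane = 0.0f;
  FillLane(bits, &lane);  // copies the first sizeof(float) bytes of `bits`
  return 0;
}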
diff --git a/media/highway/src/hwy/tests/demote_test.cc b/media/highway/src/hwy/tests/demote_test.cc
index 104ced09fb..4339a54375 100644
--- a/media/highway/src/hwy/tests/demote_test.cc
+++ b/media/highway/src/hwy/tests/demote_test.cc
@@ -15,12 +15,10 @@
#include <stddef.h>
#include <stdint.h>
-#include <string.h>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/demote_test.cc"
-#include "hwy/foreach_target.h"
-
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
@@ -67,7 +65,7 @@ struct TestDemoteTo {
for (size_t i = 0; i < N; ++i) {
do {
const uint64_t bits = rng();
- memcpy(&from[i], &bits, sizeof(T));
+ CopyBytes<sizeof(T)>(&bits, &from[i]); // not same size
} while (!value_ok(from[i]));
expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max));
}
@@ -117,7 +115,7 @@ struct TestDemoteToFloat {
for (size_t i = 0; i < N; ++i) {
do {
const uint64_t bits = rng();
- memcpy(&from[i], &bits, sizeof(T));
+ CopyBytes<sizeof(T)>(&bits, &from[i]); // not same size
} while (!IsFiniteT(from[i]));
const T magn = std::abs(from[i]);
const T max_abs = HighestValue<ToT>();
@@ -214,7 +212,6 @@ class TestReorderDemote2To {
template <typename TF32, class DF32>
HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_TARGET != HWY_SCALAR
-
size_t padded;
auto in = ReorderBF16TestCases(d32, padded);
@@ -235,11 +232,12 @@ class TestReorderDemote2To {
const auto promoted1 = PromoteTo(d32, Load(dbf16_half, temp16.get() + N));
// Smoke test: sum should be same (with tolerance for non-associativity)
- const auto sum_expected =
+ const auto sum_expected = GetLane(SumOfLanes(d32, Add(f0, f1)));
+ const auto sum_actual =
GetLane(SumOfLanes(d32, Add(promoted0, promoted1)));
- const auto sum_actual = GetLane(SumOfLanes(d32, Add(f0, f1)));
- HWY_ASSERT(sum_actual - 1E-4 <= sum_actual &&
- sum_expected <= sum_actual + 1E-4);
+
+ HWY_ASSERT(sum_expected - 1E-4 <= sum_actual &&
+ sum_actual <= sum_expected + 1E-4);
// Ensure values are the same after sorting to undo the Reorder
Store(f0, d32, expected.get() + 0);
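The demote_test hunk above also repairs a vacuous assertion: the first clause of the old bound compared sum_actual with itself and so could never fail. The corrected form brackets the actual sum around the expected one. A scalar sketch of that check (the tolerance value is the one used in the test):

#include <cassert>

// Symmetric absolute-tolerance comparison, as in the fixed assertion.
bool WithinTolerance(float expected, float actual, float eps = 1E-4f) {
  return expected - eps <= actual && actual <= expected + eps;
}

int main() {
  assert(WithinTolerance(1.0f, 1.00005f));
  assert(!WithinTolerance(1.0f, 1.1f));
  return 0;
}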
diff --git a/media/highway/src/hwy/tests/float_test.cc b/media/highway/src/hwy/tests/float_test.cc
index cc9b313ca4..05d7b7605a 100644
--- a/media/highway/src/hwy/tests/float_test.cc
+++ b/media/highway/src/hwy/tests/float_test.cc
@@ -15,7 +15,6 @@
// Tests some ops specific to floating-point types (Div, Round etc.)
-#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
@@ -24,7 +23,7 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/float_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
@@ -113,9 +112,8 @@ struct TestReciprocalSquareRoot {
float err = lanes[i] - 0.090166f;
if (err < 0.0f) err = -err;
if (err >= 4E-4f) {
- HWY_ABORT("Lane %" PRIu64 "(%" PRIu64 "): actual %f err %f\n",
- static_cast<uint64_t>(i), static_cast<uint64_t>(N), lanes[i],
- err);
+ HWY_ABORT("Lane %d (%d): actual %f err %f\n", static_cast<int>(i),
+ static_cast<int>(N), lanes[i], err);
}
}
}
@@ -222,7 +220,7 @@ struct TestNearestInt {
if (std::isnan(in[i])) {
// We replace NaN with 0 below (no_nan)
expected[i] = 0;
- } else if (std::isinf(in[i]) || double(std::abs(in[i])) >= max) {
+ } else if (std::isinf(in[i]) || double{std::abs(in[i])} >= max) {
// Avoid undefined result for lrintf
expected[i] = std::signbit(in[i]) ? LimitsMin<TI>() : LimitsMax<TI>();
} else {
diff --git a/media/highway/src/hwy/tests/hwy_gtest.h b/media/highway/src/hwy/tests/hwy_gtest.h
index ff29823f04..acecee8e30 100644
--- a/media/highway/src/hwy/tests/hwy_gtest.h
+++ b/media/highway/src/hwy/tests/hwy_gtest.h
@@ -44,7 +44,7 @@ namespace hwy {
// };
// HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
// TEST_P(MyTestSuite, MyTest) { ... }
-class TestWithParamTarget : public testing::TestWithParam<uint32_t> {
+class TestWithParamTarget : public testing::TestWithParam<int64_t> {
protected:
void SetUp() override { SetSupportedTargetsForTest(GetParam()); }
@@ -53,7 +53,7 @@ class TestWithParamTarget : public testing::TestWithParam<uint32_t> {
// was compiled with more than one target. In the single-target case only
// static dispatch will be used anyway.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
- EXPECT_TRUE(SupportedTargetsCalledForTest())
+ EXPECT_TRUE(GetChosenTarget().IsInitialized())
<< "This hwy target parametric test doesn't use dynamic-dispatch and "
"doesn't need to be parametric.";
#endif
@@ -64,7 +64,7 @@ class TestWithParamTarget : public testing::TestWithParam<uint32_t> {
// Function to convert the test parameter of a TestWithParamTarget for
// displaying it in the gtest test name.
static inline std::string TestParamTargetName(
- const testing::TestParamInfo<uint32_t>& info) {
+ const testing::TestParamInfo<int64_t>& info) {
return TargetName(info.param);
}
@@ -85,7 +85,7 @@ static inline std::string TestParamTargetName(
// TEST_P(MyTestSuite, MyTest) { ... GetParam() .... }
template <typename T>
class TestWithParamTargetAndT
- : public ::testing::TestWithParam<std::tuple<uint32_t, T>> {
+ : public ::testing::TestWithParam<std::tuple<int64_t, T>> {
public:
// Expose the parametric type here so it can be used by the
// HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T macro.
@@ -94,7 +94,7 @@ class TestWithParamTargetAndT
protected:
void SetUp() override {
SetSupportedTargetsForTest(std::get<0>(
- ::testing::TestWithParam<std::tuple<uint32_t, T>>::GetParam()));
+ ::testing::TestWithParam<std::tuple<int64_t, T>>::GetParam()));
}
void TearDown() override {
@@ -102,7 +102,7 @@ class TestWithParamTargetAndT
// was compiled with more than one target. In the single-target case only
// static dispatch will be used anyway.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
- EXPECT_TRUE(SupportedTargetsCalledForTest())
+ EXPECT_TRUE(GetChosenTarget().IsInitialized())
<< "This hwy target parametric test doesn't use dynamic-dispatch and "
"doesn't need to be parametric.";
#endif
@@ -111,13 +111,13 @@ class TestWithParamTargetAndT
T GetParam() {
return std::get<1>(
- ::testing::TestWithParam<std::tuple<uint32_t, T>>::GetParam());
+ ::testing::TestWithParam<std::tuple<int64_t, T>>::GetParam());
}
};
template <typename T>
std::string TestParamTargetNameAndT(
- const testing::TestParamInfo<std::tuple<uint32_t, T>>& info) {
+ const testing::TestParamInfo<std::tuple<int64_t, T>>& info) {
return std::string(TargetName(std::get<0>(info.param))) + "_" +
::testing::PrintToString(std::get<1>(info.param));
}
diff --git a/media/highway/src/hwy/tests/if_test.cc b/media/highway/src/hwy/tests/if_test.cc
new file mode 100644
index 0000000000..e44a878a0c
--- /dev/null
+++ b/media/highway/src/hwy/tests/if_test.cc
@@ -0,0 +1,175 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "hwy/aligned_allocator.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/if_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestIfThenElse {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ RandomState rng;
+
+ using TI = MakeSigned<T>; // For mask > 0 comparison
+ const Rebind<TI, D> di;
+ const size_t N = Lanes(d);
+ auto in1 = AllocateAligned<T>(N);
+ auto in2 = AllocateAligned<T>(N);
+ auto bool_lanes = AllocateAligned<TI>(N);
+ auto expected = AllocateAligned<T>(N);
+
+ // Each lane should have a chance of having mask=true.
+ for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ in1[i] = static_cast<T>(Random32(&rng));
+ in2[i] = static_cast<T>(Random32(&rng));
+ bool_lanes[i] = (Random32(&rng) & 16) ? TI(1) : TI(0);
+ }
+
+ const auto v1 = Load(d, in1.get());
+ const auto v2 = Load(d, in2.get());
+ const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
+
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = bool_lanes[i] ? in1[i] : in2[i];
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElse(mask, v1, v2));
+
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = bool_lanes[i] ? in1[i] : T(0);
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElseZero(mask, v1));
+
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = bool_lanes[i] ? T(0) : in2[i];
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenZeroElse(mask, v2));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllIfThenElse() {
+ ForAllTypes(ForPartialVectors<TestIfThenElse>());
+}
+
+struct TestIfVecThenElse {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ RandomState rng;
+
+ using TU = MakeUnsigned<T>; // For all-one mask
+ const Rebind<TU, D> du;
+ const size_t N = Lanes(d);
+ auto in1 = AllocateAligned<T>(N);
+ auto in2 = AllocateAligned<T>(N);
+ auto vec_lanes = AllocateAligned<TU>(N);
+ auto expected = AllocateAligned<T>(N);
+
+ // Each lane should have a chance of having mask=true.
+ for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ in1[i] = static_cast<T>(Random32(&rng));
+ in2[i] = static_cast<T>(Random32(&rng));
+ vec_lanes[i] = (Random32(&rng) & 16) ? static_cast<TU>(~TU(0)) : TU(0);
+ }
+
+ const auto v1 = Load(d, in1.get());
+ const auto v2 = Load(d, in2.get());
+ const auto vec = BitCast(d, Load(du, vec_lanes.get()));
+
+ for (size_t i = 0; i < N; ++i) {
+ expected[i] = vec_lanes[i] ? in1[i] : in2[i];
+ }
+ HWY_ASSERT_VEC_EQ(d, expected.get(), IfVecThenElse(vec, v1, v2));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllIfVecThenElse() {
+ ForAllTypes(ForPartialVectors<TestIfVecThenElse>());
+}
+
+struct TestZeroIfNegative {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v0 = Zero(d);
+ const auto vp = Iota(d, 1);
+ const auto vn = Iota(d, T(-1E5)); // assumes N < 10^5
+
+ // Zero and positive remain unchanged
+ HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(v0));
+ HWY_ASSERT_VEC_EQ(d, vp, ZeroIfNegative(vp));
+
+ // Negative are all replaced with zero
+ HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(vn));
+ }
+};
+
+HWY_NOINLINE void TestAllZeroIfNegative() {
+ ForFloatTypes(ForPartialVectors<TestZeroIfNegative>());
+}
+
+struct TestIfNegative {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const auto v0 = Zero(d);
+ const auto vp = Iota(d, 1);
+ const auto vn = Or(vp, SignBit(d));
+
+ // Zero and positive remain unchanged
+ HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(v0, vn, v0));
+ HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(v0, v0, vn));
+ HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vp, vn, vp));
+ HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vp, vp, vn));
+
+ // Negative are replaced with 2nd arg
+ HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(vn, v0, vp));
+ HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vn, vn, v0));
+ HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vn, vp, vn));
+ }
+};
+
+HWY_NOINLINE void TestAllIfNegative() {
+ ForFloatTypes(ForPartialVectors<TestIfNegative>());
+ ForSignedTypes(ForPartialVectors<TestIfNegative>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyIfTest);
+HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfThenElse);
+HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfVecThenElse);
+HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllZeroIfNegative);
+HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfNegative);
+} // namespace hwy
+
+#endif
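For reference, a hypothetical scalar model (not Highway code) of the selection the new if_test.cc exercises: in this sketch, lanes whose sign bit is set pick the second argument and all others pick the third, which is consistent with the assertions above.

#include <cassert>
#include <cmath>

// Per-lane model for IfNegativeThenElse-style selection. In this sketch,
// -0.0f counts as negative because the model keys off the sign bit.
template <typename T>
T IfNegativeThenElseScalar(T v, T yes, T no) {
  return std::signbit(static_cast<double>(v)) ? yes : no;
}

int main() {
  assert(IfNegativeThenElseScalar(-3, 1, 2) == 1);
  assert(IfNegativeThenElseScalar(0.0f, 1.0f, 2.0f) == 2.0f);
  assert(IfNegativeThenElseScalar(-0.0f, 1.0f, 2.0f) == 1.0f);
  return 0;
}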
diff --git a/media/highway/src/hwy/tests/interleaved_test.cc b/media/highway/src/hwy/tests/interleaved_test.cc
new file mode 100644
index 0000000000..4d1fbd5ac5
--- /dev/null
+++ b/media/highway/src/hwy/tests/interleaved_test.cc
@@ -0,0 +1,256 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/interleaved_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestLoadStoreInterleaved2 {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+
+ RandomState rng;
+
+ // Data to be interleaved
+ auto bytes = AllocateAligned<T>(2 * N);
+ for (size_t i = 0; i < 2 * N; ++i) {
+ bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
+ }
+ const auto in0 = Load(d, &bytes[0 * N]);
+ const auto in1 = Load(d, &bytes[1 * N]);
+
+ // Interleave here, ensure vector results match scalar
+ auto expected = AllocateAligned<T>(3 * N);
+ auto actual_aligned = AllocateAligned<T>(3 * N + 1);
+ T* actual = actual_aligned.get() + 1;
+
+ for (size_t rep = 0; rep < 100; ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ expected[2 * i + 0] = bytes[0 * N + i];
+ expected[2 * i + 1] = bytes[1 * N + i];
+ // Ensure we do not write more than 2*N bytes
+ expected[2 * N + i] = actual[2 * N + i] = 0;
+ }
+ StoreInterleaved2(in0, in1, d, actual);
+ size_t pos = 0;
+ if (!BytesEqual(expected.get(), actual, 3 * N * sizeof(T), &pos)) {
+ Print(d, "in0", in0, pos / 4);
+ Print(d, "in1", in1, pos / 4);
+ const size_t i = pos;
+ fprintf(stderr, "interleaved i=%d %f %f %f %f %f %f %f %f\n",
+ static_cast<int>(i), static_cast<double>(actual[i]),
+ static_cast<double>(actual[i + 1]),
+ static_cast<double>(actual[i + 2]),
+ static_cast<double>(actual[i + 3]),
+ static_cast<double>(actual[i + 4]),
+ static_cast<double>(actual[i + 5]),
+ static_cast<double>(actual[i + 6]),
+ static_cast<double>(actual[i + 7]));
+ HWY_ASSERT(false);
+ }
+
+ Vec<D> out0, out1;
+ LoadInterleaved2(d, actual, out0, out1);
+ HWY_ASSERT_VEC_EQ(d, in0, out0);
+ HWY_ASSERT_VEC_EQ(d, in1, out1);
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllLoadStoreInterleaved2() {
+#if HWY_TARGET == HWY_RVV
+ // Segments are limited to 8 registers, so we can only go up to LMUL=2.
+ const ForExtendableVectors<TestLoadStoreInterleaved2, 2> test;
+#else
+ const ForPartialVectors<TestLoadStoreInterleaved2> test;
+#endif
+ ForAllTypes(test);
+}
+
+// Workaround for build timeout on GCC 12 aarch64, see #776
+#if HWY_COMPILER_GCC_ACTUAL >= 1200 && HWY_ARCH_ARM_A64
+#define HWY_BROKEN_LOAD34 1
+#else
+#define HWY_BROKEN_LOAD34 0
+#endif
+
+#if !HWY_BROKEN_LOAD34
+
+struct TestLoadStoreInterleaved3 {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+
+ RandomState rng;
+
+ // Data to be interleaved
+ auto bytes = AllocateAligned<T>(3 * N);
+ for (size_t i = 0; i < 3 * N; ++i) {
+ bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
+ }
+ const auto in0 = Load(d, &bytes[0 * N]);
+ const auto in1 = Load(d, &bytes[1 * N]);
+ const auto in2 = Load(d, &bytes[2 * N]);
+
+ // Interleave here, ensure vector results match scalar
+ auto expected = AllocateAligned<T>(4 * N);
+ auto actual_aligned = AllocateAligned<T>(4 * N + 1);
+ T* actual = actual_aligned.get() + 1;
+
+ for (size_t rep = 0; rep < 100; ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ expected[3 * i + 0] = bytes[0 * N + i];
+ expected[3 * i + 1] = bytes[1 * N + i];
+ expected[3 * i + 2] = bytes[2 * N + i];
+ // Ensure we do not write more than 3*N bytes
+ expected[3 * N + i] = actual[3 * N + i] = 0;
+ }
+ StoreInterleaved3(in0, in1, in2, d, actual);
+ size_t pos = 0;
+ if (!BytesEqual(expected.get(), actual, 4 * N * sizeof(T), &pos)) {
+ Print(d, "in0", in0, pos / 3, N);
+ Print(d, "in1", in1, pos / 3, N);
+ Print(d, "in2", in2, pos / 3, N);
+ const size_t i = pos;
+ fprintf(stderr, "interleaved i=%d %f %f %f %f %f %f\n",
+ static_cast<int>(i), static_cast<double>(actual[i]),
+ static_cast<double>(actual[i + 1]),
+ static_cast<double>(actual[i + 2]),
+ static_cast<double>(actual[i + 3]),
+ static_cast<double>(actual[i + 4]),
+ static_cast<double>(actual[i + 5]));
+ HWY_ASSERT(false);
+ }
+
+ Vec<D> out0, out1, out2;
+ LoadInterleaved3(d, actual, out0, out1, out2);
+ HWY_ASSERT_VEC_EQ(d, in0, out0);
+ HWY_ASSERT_VEC_EQ(d, in1, out1);
+ HWY_ASSERT_VEC_EQ(d, in2, out2);
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllLoadStoreInterleaved3() {
+#if HWY_TARGET == HWY_RVV
+ // Segments are limited to 8 registers, so we can only go up to LMUL=2.
+ const ForExtendableVectors<TestLoadStoreInterleaved3, 2> test;
+#else
+ const ForPartialVectors<TestLoadStoreInterleaved3> test;
+#endif
+ ForAllTypes(test);
+}
+
+struct TestLoadStoreInterleaved4 {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+
+ RandomState rng;
+
+ // Data to be interleaved
+ auto bytes = AllocateAligned<T>(4 * N);
+
+ for (size_t i = 0; i < 4 * N; ++i) {
+ bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
+ }
+ const auto in0 = Load(d, &bytes[0 * N]);
+ const auto in1 = Load(d, &bytes[1 * N]);
+ const auto in2 = Load(d, &bytes[2 * N]);
+ const auto in3 = Load(d, &bytes[3 * N]);
+
+ // Interleave here, ensure vector results match scalar
+ auto expected = AllocateAligned<T>(5 * N);
+ auto actual_aligned = AllocateAligned<T>(5 * N + 1);
+ T* actual = actual_aligned.get() + 1;
+
+ for (size_t rep = 0; rep < 100; ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ expected[4 * i + 0] = bytes[0 * N + i];
+ expected[4 * i + 1] = bytes[1 * N + i];
+ expected[4 * i + 2] = bytes[2 * N + i];
+ expected[4 * i + 3] = bytes[3 * N + i];
+ // Ensure we do not write more than 4*N bytes
+ expected[4 * N + i] = actual[4 * N + i] = 0;
+ }
+ StoreInterleaved4(in0, in1, in2, in3, d, actual);
+ size_t pos = 0;
+ if (!BytesEqual(expected.get(), actual, 5 * N * sizeof(T), &pos)) {
+ Print(d, "in0", in0, pos / 4);
+ Print(d, "in1", in1, pos / 4);
+ Print(d, "in2", in2, pos / 4);
+ Print(d, "in3", in3, pos / 4);
+ const size_t i = pos;
+ fprintf(stderr, "interleaved i=%d %f %f %f %f %f %f %f %f\n",
+ static_cast<int>(i), static_cast<double>(actual[i]),
+ static_cast<double>(actual[i + 1]),
+ static_cast<double>(actual[i + 2]),
+ static_cast<double>(actual[i + 3]),
+ static_cast<double>(actual[i + 4]),
+ static_cast<double>(actual[i + 5]),
+ static_cast<double>(actual[i + 6]),
+ static_cast<double>(actual[i + 7]));
+ HWY_ASSERT(false);
+ }
+
+ Vec<D> out0, out1, out2, out3;
+ LoadInterleaved4(d, actual, out0, out1, out2, out3);
+ HWY_ASSERT_VEC_EQ(d, in0, out0);
+ HWY_ASSERT_VEC_EQ(d, in1, out1);
+ HWY_ASSERT_VEC_EQ(d, in2, out2);
+ HWY_ASSERT_VEC_EQ(d, in3, out3);
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllLoadStoreInterleaved4() {
+#if HWY_TARGET == HWY_RVV
+ // Segments are limited to 8 registers, so we can only go up to LMUL=2.
+ const ForExtendableVectors<TestLoadStoreInterleaved4, 2> test;
+#else
+ const ForPartialVectors<TestLoadStoreInterleaved4> test;
+#endif
+ ForAllTypes(test);
+}
+
+#endif // !HWY_BROKEN_LOAD34
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyInterleavedTest);
+HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved2);
+#if !HWY_BROKEN_LOAD34
+HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved3);
+HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved4);
+#endif
+} // namespace hwy
+
+#endif
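The expected[] arrays built in the new interleaved_test.cc follow the usual array-of-structures layout. A small scalar sketch of interleaving k planar streams, matching the index arithmetic used above (InterleaveScalar is a hypothetical helper):

#include <cassert>
#include <cstddef>
#include <vector>

// Interleave k planar streams of n elements each: output index k*i + s takes
// element i of stream s, mirroring expected[k*i + s] = bytes[s*N + i] above.
template <typename T>
std::vector<T> InterleaveScalar(const std::vector<T>& planar, size_t k) {
  const size_t n = planar.size() / k;
  std::vector<T> out(planar.size());
  for (size_t i = 0; i < n; ++i) {
    for (size_t s = 0; s < k; ++s) {
      out[k * i + s] = planar[s * n + i];
    }
  }
  return out;
}

int main() {
  const std::vector<int> planar = {1, 2, 3, 10, 20, 30};  // two streams of 3
  const std::vector<int> expected = {1, 10, 2, 20, 3, 30};
  assert(InterleaveScalar(planar, 2) == expected);
  return 0;
}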
diff --git a/media/highway/src/hwy/tests/list_targets.cc b/media/highway/src/hwy/tests/list_targets.cc
index 4bc947fca0..d09ee4fe86 100644
--- a/media/highway/src/hwy/tests/list_targets.cc
+++ b/media/highway/src/hwy/tests/list_targets.cc
@@ -20,10 +20,10 @@
#include "hwy/highway.h"
-void PrintTargets(const char* msg, uint32_t targets) {
+void PrintTargets(const char* msg, int64_t targets) {
fprintf(stderr, "%s", msg);
// For each bit:
- for (uint32_t x = targets; x != 0; x = x & (x - 1)) {
+ for (int64_t x = targets; x != 0; x = x & (x - 1)) {
// Extract value of least-significant bit.
fprintf(stderr, " %s", hwy::TargetName(x & (~x + 1)));
}
@@ -31,8 +31,41 @@ void PrintTargets(const char* msg, uint32_t targets) {
}
int main() {
- PrintTargets("Compiled HWY_TARGETS:", HWY_TARGETS);
- PrintTargets("HWY_BASELINE_TARGETS:", HWY_BASELINE_TARGETS);
- PrintTargets("Current CPU supports:", hwy::SupportedTargets());
+#ifdef HWY_COMPILE_ONLY_EMU128
+ const int only_emu128 = 1;
+#else
+ const int only_emu128 = 0;
+#endif
+#ifdef HWY_COMPILE_ONLY_SCALAR
+ const int only_scalar = 1;
+#else
+ const int only_scalar = 0;
+#endif
+#ifdef HWY_COMPILE_ONLY_STATIC
+ const int only_static = 1;
+#else
+ const int only_static = 0;
+#endif
+#ifdef HWY_COMPILE_ALL_ATTAINABLE
+ const int all_attain = 1;
+#else
+ const int all_attain = 0;
+#endif
+#ifdef HWY_IS_TEST
+ const int is_test = 1;
+#else
+ const int is_test = 0;
+#endif
+
+ fprintf(stderr,
+ "Config: emu128:%d scalar:%d static:%d all_attain:%d is_test:%d\n",
+ only_emu128, only_scalar, only_static, all_attain, is_test);
+ PrintTargets("Compiled HWY_TARGETS: ", HWY_TARGETS);
+ PrintTargets("HWY_ATTAINABLE_TARGETS:", HWY_ATTAINABLE_TARGETS);
+ PrintTargets("HWY_BASELINE_TARGETS: ", HWY_BASELINE_TARGETS);
+ PrintTargets("HWY_STATIC_TARGET: ", HWY_STATIC_TARGET);
+ PrintTargets("HWY_BROKEN_TARGETS: ", HWY_BROKEN_TARGETS);
+ PrintTargets("HWY_DISABLED_TARGETS: ", HWY_DISABLED_TARGETS);
+ PrintTargets("Current CPU supports: ", hwy::SupportedTargets());
return 0;
}
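PrintTargets walks the (now 64-bit) target bitmask one set bit at a time: x &= x - 1 clears the lowest set bit, and x & (~x + 1) isolates it. A standalone sketch of the same loop, shown with an unsigned mask to sidestep signed-arithmetic subtleties:

#include <cstdint>
#include <cstdio>

// Visit each set bit of a mask from least to most significant, the same
// technique PrintTargets uses above.
void VisitBits(uint64_t mask) {
  for (uint64_t x = mask; x != 0; x &= x - 1) {  // clear lowest set bit
    const uint64_t bit = x & (~x + 1);           // isolate lowest set bit
    std::printf("bit 0x%llx\n", static_cast<unsigned long long>(bit));
  }
}

int main() {
  VisitBits(0x29u);  // bits 0, 3, 5 -> prints 0x1, 0x8, 0x20
  return 0;
}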
diff --git a/media/highway/src/hwy/tests/logical_test.cc b/media/highway/src/hwy/tests/logical_test.cc
index 0cb0bb68a4..fa2b9b9adf 100644
--- a/media/highway/src/hwy/tests/logical_test.cc
+++ b/media/highway/src/hwy/tests/logical_test.cc
@@ -21,7 +21,7 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/logical_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
@@ -174,92 +174,10 @@ struct TestCopySign {
}
};
-struct TestIfVecThenElse {
- template <class T, class D>
- HWY_NOINLINE void operator()(T /*unused*/, D d) {
- RandomState rng;
-
- using TU = MakeUnsigned<T>; // For all-one mask
- const Rebind<TU, D> du;
- const size_t N = Lanes(d);
- auto in1 = AllocateAligned<T>(N);
- auto in2 = AllocateAligned<T>(N);
- auto vec_lanes = AllocateAligned<TU>(N);
- auto expected = AllocateAligned<T>(N);
-
- // Each lane should have a chance of having mask=true.
- for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
- for (size_t i = 0; i < N; ++i) {
- in1[i] = static_cast<T>(Random32(&rng));
- in2[i] = static_cast<T>(Random32(&rng));
- vec_lanes[i] = (Random32(&rng) & 16) ? static_cast<TU>(~TU(0)) : TU(0);
- }
-
- const auto v1 = Load(d, in1.get());
- const auto v2 = Load(d, in2.get());
- const auto vec = BitCast(d, Load(du, vec_lanes.get()));
-
- for (size_t i = 0; i < N; ++i) {
- expected[i] = vec_lanes[i] ? in1[i] : in2[i];
- }
- HWY_ASSERT_VEC_EQ(d, expected.get(), IfVecThenElse(vec, v1, v2));
- }
- }
-};
-
-HWY_NOINLINE void TestAllIfVecThenElse() {
- ForAllTypes(ForPartialVectors<TestIfVecThenElse>());
-}
-
HWY_NOINLINE void TestAllCopySign() {
ForFloatTypes(ForPartialVectors<TestCopySign>());
}
-struct TestZeroIfNegative {
- template <class T, class D>
- HWY_NOINLINE void operator()(T /*unused*/, D d) {
- const auto v0 = Zero(d);
- const auto vp = Iota(d, 1);
- const auto vn = Iota(d, T(-1E5)); // assumes N < 10^5
-
- // Zero and positive remain unchanged
- HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(v0));
- HWY_ASSERT_VEC_EQ(d, vp, ZeroIfNegative(vp));
-
- // Negative are all replaced with zero
- HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(vn));
- }
-};
-
-HWY_NOINLINE void TestAllZeroIfNegative() {
- ForFloatTypes(ForPartialVectors<TestZeroIfNegative>());
-}
-
-struct TestIfNegative {
- template <class T, class D>
- HWY_NOINLINE void operator()(T /*unused*/, D d) {
- const auto v0 = Zero(d);
- const auto vp = Iota(d, 1);
- const auto vn = Or(vp, SignBit(d));
-
- // Zero and positive remain unchanged
- HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(v0, vn, v0));
- HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(v0, v0, vn));
- HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vp, vn, vp));
- HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vp, vp, vn));
-
- // Negative are replaced with 2nd arg
- HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(vn, v0, vp));
- HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vn, vn, v0));
- HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vn, vp, vn));
- }
-};
-
-HWY_NOINLINE void TestAllIfNegative() {
- ForFloatTypes(ForPartialVectors<TestIfNegative>());
- ForSignedTypes(ForPartialVectors<TestIfNegative>());
-}
-
struct TestBroadcastSignBit {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -343,10 +261,7 @@ namespace hwy {
HWY_BEFORE_TEST(HwyLogicalTest);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalInteger);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalFloat);
-HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfVecThenElse);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign);
-HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllZeroIfNegative);
-HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfNegative);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllTestBit);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllPopulationCount);
diff --git a/media/highway/src/hwy/tests/mask_mem_test.cc b/media/highway/src/hwy/tests/mask_mem_test.cc
new file mode 100644
index 0000000000..c44119dcd7
--- /dev/null
+++ b/media/highway/src/hwy/tests/mask_mem_test.cc
@@ -0,0 +1,197 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS // before inttypes.h
+#endif
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h> // memcmp
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/mask_mem_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestMaskedLoad {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ RandomState rng;
+
+ using TI = MakeSigned<T>; // For mask > 0 comparison
+ const Rebind<TI, D> di;
+ const size_t N = Lanes(d);
+ auto bool_lanes = AllocateAligned<TI>(N);
+
+ auto lanes = AllocateAligned<T>(N);
+ Store(Iota(d, T{1}), d, lanes.get());
+
+ // Each lane should have a chance of having mask=true.
+ for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
+ }
+
+ const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
+ const auto expected = IfThenElseZero(mask, Load(d, lanes.get()));
+ const auto actual = MaskedLoad(mask, d, lanes.get());
+ HWY_ASSERT_VEC_EQ(d, expected, actual);
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllMaskedLoad() {
+ ForAllTypes(ForPartialVectors<TestMaskedLoad>());
+}
+
+struct TestBlendedStore {
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ RandomState rng;
+
+ using TI = MakeSigned<T>; // For mask > 0 comparison
+ const Rebind<TI, D> di;
+ const size_t N = Lanes(d);
+ auto bool_lanes = AllocateAligned<TI>(N);
+
+ const Vec<D> v = Iota(d, T{1});
+ auto actual = AllocateAligned<T>(N);
+ auto expected = AllocateAligned<T>(N);
+
+ // Each lane should have a chance of having mask=true.
+ for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
+ // Re-initialize to something distinct from v[i].
+ actual[i] = static_cast<T>(127 - (i & 127));
+ expected[i] = bool_lanes[i] ? static_cast<T>(i + 1) : actual[i];
+ }
+
+ const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
+ BlendedStore(v, mask, d, actual.get());
+ HWY_ASSERT_VEC_EQ(d, expected.get(), Load(d, actual.get()));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllBlendedStore() {
+ ForAllTypes(ForPartialVectors<TestBlendedStore>());
+}
+
+class TestStoreMaskBits {
+ public:
+ template <class T, class D>
+ HWY_NOINLINE void operator()(T /*t*/, D /*d*/) {
+ RandomState rng;
+ using TI = MakeSigned<T>; // For mask > 0 comparison
+ const Rebind<TI, D> di;
+ const size_t N = Lanes(di);
+ auto bool_lanes = AllocateAligned<TI>(N);
+
+ const ScalableTag<uint8_t, -3> d_bits;
+ const size_t expected_num_bytes = (N + 7) / 8;
+ auto expected = AllocateAligned<uint8_t>(expected_num_bytes);
+ auto actual = AllocateAligned<uint8_t>(HWY_MAX(8, expected_num_bytes));
+
+ for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
+ // Generate random mask pattern.
+ for (size_t i = 0; i < N; ++i) {
+ bool_lanes[i] = static_cast<TI>((rng() & 1024) ? 1 : 0);
+ }
+ const auto bools = Load(di, bool_lanes.get());
+ const auto mask = Gt(bools, Zero(di));
+
+ // Requires at least 8 bytes, ensured above.
+ const size_t bytes_written = StoreMaskBits(di, mask, actual.get());
+ if (bytes_written != expected_num_bytes) {
+ fprintf(stderr, "%s expected %" PRIu64 " bytes, actual %" PRIu64 "\n",
+ TypeName(T(), N).c_str(),
+ static_cast<uint64_t>(expected_num_bytes),
+ static_cast<uint64_t>(bytes_written));
+
+ HWY_ASSERT(false);
+ }
+
+ // Requires at least 8 bytes, ensured above.
+ const auto mask2 = LoadMaskBits(di, actual.get());
+ HWY_ASSERT_MASK_EQ(di, mask, mask2);
+
+ memset(expected.get(), 0, expected_num_bytes);
+ for (size_t i = 0; i < N; ++i) {
+ expected[i / 8] =
+ static_cast<uint8_t>(expected[i / 8] | (bool_lanes[i] << (i % 8)));
+ }
+
+ size_t i = 0;
+ // Stored bits must match original mask
+ for (; i < N; ++i) {
+ const TI is_set = (actual[i / 8] & (1 << (i % 8))) ? 1 : 0;
+ if (is_set != bool_lanes[i]) {
+ fprintf(stderr, "%s lane %" PRIu64 ": expected %d, actual %d\n",
+ TypeName(T(), N).c_str(), static_cast<uint64_t>(i),
+ static_cast<int>(bool_lanes[i]), static_cast<int>(is_set));
+ Print(di, "bools", bools, 0, N);
+ Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0,
+ expected_num_bytes);
+ Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0,
+ expected_num_bytes);
+
+ HWY_ASSERT(false);
+ }
+ }
+ // Any partial bits in the last byte must be zero
+ for (; i < 8 * bytes_written; ++i) {
+ const int bit = (actual[i / 8] & (1 << (i % 8)));
+ if (bit != 0) {
+ fprintf(stderr, "%s: bit #%" PRIu64 " should be zero\n",
+ TypeName(T(), N).c_str(), static_cast<uint64_t>(i));
+ Print(di, "bools", bools, 0, N);
+ Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0,
+ expected_num_bytes);
+ Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0,
+ expected_num_bytes);
+
+ HWY_ASSERT(false);
+ }
+ }
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllStoreMaskBits() {
+ ForAllTypes(ForPartialVectors<TestStoreMaskBits>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyMaskTest);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskedLoad);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllBlendedStore);
+HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllStoreMaskBits);
+} // namespace hwy
+
+#endif
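TestStoreMaskBits above rebuilds the expected byte image by packing one mask bit per lane, eight lanes per byte, and requires any bits past lane N-1 in the final byte to stay zero. A scalar sketch of that packing (PackMaskBits is a hypothetical helper):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Pack lane flags (0/1) into bytes: lane i -> bit (i % 8) of byte i / 8,
// mirroring expected[i / 8] |= bool_lanes[i] << (i % 8) above.
std::vector<uint8_t> PackMaskBits(const std::vector<int>& lanes) {
  std::vector<uint8_t> bytes((lanes.size() + 7) / 8, 0);
  for (size_t i = 0; i < lanes.size(); ++i) {
    bytes[i / 8] = static_cast<uint8_t>(bytes[i / 8] | (lanes[i] << (i % 8)));
  }
  return bytes;
}

int main() {
  // Lanes 0, 2 and 9 set -> bytes 0b00000101, 0b00000010.
  const std::vector<int> lanes = {1, 0, 1, 0, 0, 0, 0, 0, 0, 1};
  const std::vector<uint8_t> packed = PackMaskBits(lanes);
  assert(packed.size() == 2 && packed[0] == 0x05 && packed[1] == 0x02);
  return 0;
}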
diff --git a/media/highway/src/hwy/tests/mask_test.cc b/media/highway/src/hwy/tests/mask_test.cc
index d397c72536..f48b476be8 100644
--- a/media/highway/src/hwy/tests/mask_test.cc
+++ b/media/highway/src/hwy/tests/mask_test.cc
@@ -13,15 +13,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h> // memcmp
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/mask_test.cc"
-#include "hwy/foreach_target.h"
-
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
@@ -83,53 +81,6 @@ HWY_NOINLINE void TestAllFirstN() {
ForAllTypes(ForPartialVectors<TestFirstN>());
}
-struct TestIfThenElse {
- template <class T, class D>
- HWY_NOINLINE void operator()(T /*unused*/, D d) {
- RandomState rng;
-
- using TI = MakeSigned<T>; // For mask > 0 comparison
- const Rebind<TI, D> di;
- const size_t N = Lanes(d);
- auto in1 = AllocateAligned<T>(N);
- auto in2 = AllocateAligned<T>(N);
- auto bool_lanes = AllocateAligned<TI>(N);
- auto expected = AllocateAligned<T>(N);
-
- // Each lane should have a chance of having mask=true.
- for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
- for (size_t i = 0; i < N; ++i) {
- in1[i] = static_cast<T>(Random32(&rng));
- in2[i] = static_cast<T>(Random32(&rng));
- bool_lanes[i] = (Random32(&rng) & 16) ? TI(1) : TI(0);
- }
-
- const auto v1 = Load(d, in1.get());
- const auto v2 = Load(d, in2.get());
- const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
-
- for (size_t i = 0; i < N; ++i) {
- expected[i] = bool_lanes[i] ? in1[i] : in2[i];
- }
- HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElse(mask, v1, v2));
-
- for (size_t i = 0; i < N; ++i) {
- expected[i] = bool_lanes[i] ? in1[i] : T(0);
- }
- HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElseZero(mask, v1));
-
- for (size_t i = 0; i < N; ++i) {
- expected[i] = bool_lanes[i] ? T(0) : in2[i];
- }
- HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenZeroElse(mask, v2));
- }
- }
-};
-
-HWY_NOINLINE void TestAllIfThenElse() {
- ForAllTypes(ForPartialVectors<TestIfThenElse>());
-}
-
struct TestMaskVec {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -162,71 +113,6 @@ HWY_NOINLINE void TestAllMaskVec() {
ForUIF3264(test);
}
-struct TestMaskedLoad {
- template <class T, class D>
- HWY_NOINLINE void operator()(T /*unused*/, D d) {
- RandomState rng;
-
- using TI = MakeSigned<T>; // For mask > 0 comparison
- const Rebind<TI, D> di;
- const size_t N = Lanes(d);
- auto bool_lanes = AllocateAligned<TI>(N);
-
- auto lanes = AllocateAligned<T>(N);
- Store(Iota(d, T{1}), d, lanes.get());
-
- // Each lane should have a chance of having mask=true.
- for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
- for (size_t i = 0; i < N; ++i) {
- bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
- }
-
- const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
- const auto expected = IfThenElseZero(mask, Load(d, lanes.get()));
- const auto actual = MaskedLoad(mask, d, lanes.get());
- HWY_ASSERT_VEC_EQ(d, expected, actual);
- }
- }
-};
-
-HWY_NOINLINE void TestAllMaskedLoad() {
- ForAllTypes(ForPartialVectors<TestMaskedLoad>());
-}
-
-struct TestBlendedStore {
- template <class T, class D>
- HWY_NOINLINE void operator()(T /*unused*/, D d) {
- RandomState rng;
-
- using TI = MakeSigned<T>; // For mask > 0 comparison
- const Rebind<TI, D> di;
- const size_t N = Lanes(d);
- auto bool_lanes = AllocateAligned<TI>(N);
-
- const Vec<D> v = Iota(d, T{1});
- auto actual = AllocateAligned<T>(N);
- auto expected = AllocateAligned<T>(N);
-
- // Each lane should have a chance of having mask=true.
- for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
- for (size_t i = 0; i < N; ++i) {
- bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
- // Re-initialize to something distinct from v[i].
- actual[i] = static_cast<T>(127 - (i & 127));
- expected[i] = bool_lanes[i] ? static_cast<T>(i + 1) : actual[i];
- }
-
- const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
- BlendedStore(v, mask, d, actual.get());
- HWY_ASSERT_VEC_EQ(d, expected.get(), Load(d, actual.get()));
- }
- }
-};
-
-HWY_NOINLINE void TestAllBlendedStore() {
- ForAllTypes(ForPartialVectors<TestBlendedStore>());
-}
-
struct TestAllTrueFalse {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -237,8 +123,6 @@ struct TestAllTrueFalse {
auto lanes = AllocateAligned<T>(N);
std::fill(lanes.get(), lanes.get() + N, T(0));
- auto mask_lanes = AllocateAligned<T>(N);
-
HWY_ASSERT(AllTrue(d, Eq(v, zero)));
HWY_ASSERT(!AllFalse(d, Eq(v, zero)));
@@ -251,11 +135,7 @@ struct TestAllTrueFalse {
lanes[i] = T(1);
v = Load(d, lanes.get());
- // GCC 10.2.1 workaround: AllTrue(Eq(v, zero)) is true but should not be.
- // Assigning to an lvalue is insufficient but storing to memory prevents
- // the bug; so does Print of VecFromMask(d, Eq(v, zero)).
- Store(VecFromMask(d, Eq(v, zero)), d, mask_lanes.get());
- HWY_ASSERT(!AllTrue(d, MaskFromVec(Load(d, mask_lanes.get()))));
+ HWY_ASSERT(!AllTrue(d, Eq(v, zero)));
HWY_ASSERT(expected_all_false ^ AllFalse(d, Eq(v, zero)));
@@ -277,89 +157,6 @@ HWY_NOINLINE void TestAllAllTrueFalse() {
ForAllTypes(ForPartialVectors<TestAllTrueFalse>());
}
-class TestStoreMaskBits {
- public:
- template <class T, class D>
- HWY_NOINLINE void operator()(T /*t*/, D /*d*/) {
- RandomState rng;
- using TI = MakeSigned<T>; // For mask > 0 comparison
- const Rebind<TI, D> di;
- const size_t N = Lanes(di);
- auto bool_lanes = AllocateAligned<TI>(N);
-
- const ScalableTag<uint8_t, -3> d_bits;
- const size_t expected_num_bytes = (N + 7) / 8;
- auto expected = AllocateAligned<uint8_t>(expected_num_bytes);
- auto actual = AllocateAligned<uint8_t>(HWY_MAX(8, expected_num_bytes));
-
- for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
- // Generate random mask pattern.
- for (size_t i = 0; i < N; ++i) {
- bool_lanes[i] = static_cast<TI>((rng() & 1024) ? 1 : 0);
- }
- const auto bools = Load(di, bool_lanes.get());
- const auto mask = Gt(bools, Zero(di));
-
- // Requires at least 8 bytes, ensured above.
- const size_t bytes_written = StoreMaskBits(di, mask, actual.get());
- if (bytes_written != expected_num_bytes) {
- fprintf(stderr, "%s expected %" PRIu64 " bytes, actual %" PRIu64 "\n",
- TypeName(T(), N).c_str(),
- static_cast<uint64_t>(expected_num_bytes),
- static_cast<uint64_t>(bytes_written));
-
- HWY_ASSERT(false);
- }
-
- // Requires at least 8 bytes, ensured above.
- const auto mask2 = LoadMaskBits(di, actual.get());
- HWY_ASSERT_MASK_EQ(di, mask, mask2);
-
- memset(expected.get(), 0, expected_num_bytes);
- for (size_t i = 0; i < N; ++i) {
- expected[i / 8] = uint8_t(expected[i / 8] | (bool_lanes[i] << (i % 8)));
- }
-
- size_t i = 0;
- // Stored bits must match original mask
- for (; i < N; ++i) {
- const TI is_set = (actual[i / 8] & (1 << (i % 8))) ? 1 : 0;
- if (is_set != bool_lanes[i]) {
- fprintf(stderr, "%s lane %" PRIu64 ": expected %d, actual %d\n",
- TypeName(T(), N).c_str(), static_cast<uint64_t>(i),
- int(bool_lanes[i]), int(is_set));
- Print(di, "bools", bools, 0, N);
- Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0,
- expected_num_bytes);
- Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0,
- expected_num_bytes);
-
- HWY_ASSERT(false);
- }
- }
- // Any partial bits in the last byte must be zero
- for (; i < 8 * bytes_written; ++i) {
- const int bit = (actual[i / 8] & (1 << (i % 8)));
- if (bit != 0) {
- fprintf(stderr, "%s: bit #%" PRIu64 " should be zero\n",
- TypeName(T(), N).c_str(), static_cast<uint64_t>(i));
- Print(di, "bools", bools, 0, N);
- Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0,
- expected_num_bytes);
- Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0,
- expected_num_bytes);
-
- HWY_ASSERT(false);
- }
- }
- }
- }
-};
-
-HWY_NOINLINE void TestAllStoreMaskBits() {
- ForAllTypes(ForPartialVectors<TestStoreMaskBits>());
-}
-
struct TestCountTrue {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -392,7 +189,7 @@ HWY_NOINLINE void TestAllCountTrue() {
ForAllTypes(ForPartialVectors<TestCountTrue>());
}
-struct TestFindFirstTrue {
+struct TestFindFirstTrue { // Also FindKnownFirstTrue
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using TI = MakeSigned<T>; // For mask > 0 comparison
@@ -406,17 +203,18 @@ struct TestFindFirstTrue {
HWY_ASSERT_EQ(intptr_t(-1), FindFirstTrue(d, MaskFalse(d)));
HWY_ASSERT_EQ(intptr_t(0), FindFirstTrue(d, MaskTrue(d)));
+ HWY_ASSERT_EQ(size_t(0), FindKnownFirstTrue(d, MaskTrue(d)));
for (size_t code = 1; code < (1ull << max_lanes); ++code) {
for (size_t i = 0; i < max_lanes; ++i) {
bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0);
}
- const intptr_t expected =
- static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero32(uint32_t(code)));
+ const size_t expected =
+ Num0BitsBelowLS1Bit_Nonzero32(static_cast<uint32_t>(code));
const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
- const intptr_t actual = FindFirstTrue(d, mask);
- HWY_ASSERT_EQ(expected, actual);
+ HWY_ASSERT_EQ(static_cast<intptr_t>(expected), FindFirstTrue(d, mask));
+ HWY_ASSERT_EQ(expected, FindKnownFirstTrue(d, mask));
}
}
};
@@ -440,6 +238,11 @@ struct TestLogicalMask {
HWY_ASSERT_MASK_EQ(d, m0, Not(m_all));
HWY_ASSERT_MASK_EQ(d, m_all, Not(m0));
+ Print(d, ".", VecFromMask(d, ExclusiveNeither(m0, m0)));
+ HWY_ASSERT_MASK_EQ(d, m_all, ExclusiveNeither(m0, m0));
+ HWY_ASSERT_MASK_EQ(d, m0, ExclusiveNeither(m_all, m0));
+ HWY_ASSERT_MASK_EQ(d, m0, ExclusiveNeither(m0, m_all));
+
// For all combinations of zero/nonzero state of subset of lanes:
const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6)));
for (size_t code = 0; code < (1ull << max_lanes); ++code) {
@@ -480,12 +283,8 @@ namespace hwy {
HWY_BEFORE_TEST(HwyMaskTest);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFromVec);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFirstN);
-HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllIfThenElse);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskVec);
-HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskedLoad);
-HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllBlendedStore);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllAllTrueFalse);
-HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllStoreMaskBits);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllCountTrue);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFindFirstTrue);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllLogicalMask);
diff --git a/media/highway/src/hwy/tests/memory_test.cc b/media/highway/src/hwy/tests/memory_test.cc
index b6ac5bda04..b78be2bcee 100644
--- a/media/highway/src/hwy/tests/memory_test.cc
+++ b/media/highway/src/hwy/tests/memory_test.cc
@@ -26,7 +26,7 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/memory_test.cc"
#include "hwy/cache_control.h"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
@@ -132,206 +132,6 @@ HWY_NOINLINE void TestAllSafeCopyN() {
ForAllTypes(ForPartialVectors<TestSafeCopyN>());
}
-struct TestLoadStoreInterleaved2 {
- template <class T, class D>
- HWY_NOINLINE void operator()(T /*unused*/, D d) {
- const size_t N = Lanes(d);
-
- RandomState rng;
-
- // Data to be interleaved
- auto bytes = AllocateAligned<T>(2 * N);
- for (size_t i = 0; i < 2 * N; ++i) {
- bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
- }
- const auto in0 = Load(d, &bytes[0 * N]);
- const auto in1 = Load(d, &bytes[1 * N]);
-
- // Interleave here, ensure vector results match scalar
- auto expected = AllocateAligned<T>(3 * N);
- auto actual_aligned = AllocateAligned<T>(3 * N + 1);
- T* actual = actual_aligned.get() + 1;
-
- for (size_t rep = 0; rep < 100; ++rep) {
- for (size_t i = 0; i < N; ++i) {
- expected[2 * i + 0] = bytes[0 * N + i];
- expected[2 * i + 1] = bytes[1 * N + i];
- // Ensure we do not write more than 2*N bytes
- expected[2 * N + i] = actual[2 * N + i] = 0;
- }
- StoreInterleaved2(in0, in1, d, actual);
- size_t pos = 0;
- if (!BytesEqual(expected.get(), actual, 3 * N * sizeof(T), &pos)) {
- Print(d, "in0", in0, pos / 4);
- Print(d, "in1", in1, pos / 4);
- const size_t i = pos;
- fprintf(stderr, "interleaved i=%d %f %f %f %f %f %f %f %f\n",
- static_cast<int>(i), static_cast<double>(actual[i]),
- static_cast<double>(actual[i + 1]),
- static_cast<double>(actual[i + 2]),
- static_cast<double>(actual[i + 3]),
- static_cast<double>(actual[i + 4]),
- static_cast<double>(actual[i + 5]),
- static_cast<double>(actual[i + 6]),
- static_cast<double>(actual[i + 7]));
- HWY_ASSERT(false);
- }
-
- Vec<D> out0, out1;
- LoadInterleaved2(d, actual, out0, out1);
- HWY_ASSERT_VEC_EQ(d, in0, out0);
- HWY_ASSERT_VEC_EQ(d, in1, out1);
- }
- }
-};
-
-HWY_NOINLINE void TestAllLoadStoreInterleaved2() {
-#if HWY_TARGET == HWY_RVV
- // Segments are limited to 8 registers, so we can only go up to LMUL=2.
- const ForExtendableVectors<TestLoadStoreInterleaved2, 2> test;
-#else
- const ForPartialVectors<TestLoadStoreInterleaved2> test;
-#endif
- ForAllTypes(test);
-}
-
-struct TestLoadStoreInterleaved3 {
- template <class T, class D>
- HWY_NOINLINE void operator()(T /*unused*/, D d) {
- const size_t N = Lanes(d);
-
- RandomState rng;
-
- // Data to be interleaved
- auto bytes = AllocateAligned<T>(3 * N);
- for (size_t i = 0; i < 3 * N; ++i) {
- bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
- }
- const auto in0 = Load(d, &bytes[0 * N]);
- const auto in1 = Load(d, &bytes[1 * N]);
- const auto in2 = Load(d, &bytes[2 * N]);
-
- // Interleave here, ensure vector results match scalar
- auto expected = AllocateAligned<T>(4 * N);
- auto actual_aligned = AllocateAligned<T>(4 * N + 1);
- T* actual = actual_aligned.get() + 1;
-
- for (size_t rep = 0; rep < 100; ++rep) {
- for (size_t i = 0; i < N; ++i) {
- expected[3 * i + 0] = bytes[0 * N + i];
- expected[3 * i + 1] = bytes[1 * N + i];
- expected[3 * i + 2] = bytes[2 * N + i];
- // Ensure we do not write more than 3*N bytes
- expected[3 * N + i] = actual[3 * N + i] = 0;
- }
- StoreInterleaved3(in0, in1, in2, d, actual);
- size_t pos = 0;
- if (!BytesEqual(expected.get(), actual, 4 * N * sizeof(T), &pos)) {
- Print(d, "in0", in0, pos / 3, N);
- Print(d, "in1", in1, pos / 3, N);
- Print(d, "in2", in2, pos / 3, N);
- const size_t i = pos;
- fprintf(stderr, "interleaved i=%d %f %f %f %f %f %f\n",
- static_cast<int>(i), static_cast<double>(actual[i]),
- static_cast<double>(actual[i + 1]),
- static_cast<double>(actual[i + 2]),
- static_cast<double>(actual[i + 3]),
- static_cast<double>(actual[i + 4]),
- static_cast<double>(actual[i + 5]));
- HWY_ASSERT(false);
- }
-
- Vec<D> out0, out1, out2;
- LoadInterleaved3(d, actual, out0, out1, out2);
- HWY_ASSERT_VEC_EQ(d, in0, out0);
- HWY_ASSERT_VEC_EQ(d, in1, out1);
- HWY_ASSERT_VEC_EQ(d, in2, out2);
- }
- }
-};
-
-HWY_NOINLINE void TestAllLoadStoreInterleaved3() {
-#if HWY_TARGET == HWY_RVV
- // Segments are limited to 8 registers, so we can only go up to LMUL=2.
- const ForExtendableVectors<TestLoadStoreInterleaved3, 2> test;
-#else
- const ForPartialVectors<TestLoadStoreInterleaved3> test;
-#endif
- ForAllTypes(test);
-}
-
-struct TestLoadStoreInterleaved4 {
- template <class T, class D>
- HWY_NOINLINE void operator()(T /*unused*/, D d) {
- const size_t N = Lanes(d);
-
- RandomState rng;
-
- // Data to be interleaved
- auto bytes = AllocateAligned<T>(4 * N);
-
- for (size_t i = 0; i < 4 * N; ++i) {
- bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
- }
- const auto in0 = Load(d, &bytes[0 * N]);
- const auto in1 = Load(d, &bytes[1 * N]);
- const auto in2 = Load(d, &bytes[2 * N]);
- const auto in3 = Load(d, &bytes[3 * N]);
-
- // Interleave here, ensure vector results match scalar
- auto expected = AllocateAligned<T>(5 * N);
- auto actual_aligned = AllocateAligned<T>(5 * N + 1);
- T* actual = actual_aligned.get() + 1;
-
- for (size_t rep = 0; rep < 100; ++rep) {
- for (size_t i = 0; i < N; ++i) {
- expected[4 * i + 0] = bytes[0 * N + i];
- expected[4 * i + 1] = bytes[1 * N + i];
- expected[4 * i + 2] = bytes[2 * N + i];
- expected[4 * i + 3] = bytes[3 * N + i];
- // Ensure we do not write more than 4*N bytes
- expected[4 * N + i] = actual[4 * N + i] = 0;
- }
- StoreInterleaved4(in0, in1, in2, in3, d, actual);
- size_t pos = 0;
- if (!BytesEqual(expected.get(), actual, 5 * N * sizeof(T), &pos)) {
- Print(d, "in0", in0, pos / 4);
- Print(d, "in1", in1, pos / 4);
- Print(d, "in2", in2, pos / 4);
- Print(d, "in3", in3, pos / 4);
- const size_t i = pos;
- fprintf(stderr, "interleaved i=%d %f %f %f %f %f %f %f %f\n",
- static_cast<int>(i), static_cast<double>(actual[i]),
- static_cast<double>(actual[i + 1]),
- static_cast<double>(actual[i + 2]),
- static_cast<double>(actual[i + 3]),
- static_cast<double>(actual[i + 4]),
- static_cast<double>(actual[i + 5]),
- static_cast<double>(actual[i + 6]),
- static_cast<double>(actual[i + 7]));
- HWY_ASSERT(false);
- }
-
- Vec<D> out0, out1, out2, out3;
- LoadInterleaved4(d, actual, out0, out1, out2, out3);
- HWY_ASSERT_VEC_EQ(d, in0, out0);
- HWY_ASSERT_VEC_EQ(d, in1, out1);
- HWY_ASSERT_VEC_EQ(d, in2, out2);
- HWY_ASSERT_VEC_EQ(d, in3, out3);
- }
- }
-};
-
-HWY_NOINLINE void TestAllLoadStoreInterleaved4() {
-#if HWY_TARGET == HWY_RVV
- // Segments are limited to 8 registers, so we can only go up to LMUL=2.
- const ForExtendableVectors<TestLoadStoreInterleaved4, 2> test;
-#else
- const ForPartialVectors<TestLoadStoreInterleaved4> test;
-#endif
- ForAllTypes(test);
-}
-
struct TestLoadDup128 {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -531,9 +331,6 @@ namespace hwy {
HWY_BEFORE_TEST(HwyMemoryTest);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllSafeCopyN);
-HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStoreInterleaved2);
-HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStoreInterleaved3);
-HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStoreInterleaved4);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter);
diff --git a/media/highway/src/hwy/tests/mul_test.cc b/media/highway/src/hwy/tests/mul_test.cc
index fad2e9b1f8..fab4292d4b 100644
--- a/media/highway/src/hwy/tests/mul_test.cc
+++ b/media/highway/src/hwy/tests/mul_test.cc
@@ -13,13 +13,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/mul_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
@@ -27,6 +26,15 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
+template <size_t kBits>
+constexpr uint64_t FirstBits() {
+ return (1ull << kBits) - 1;
+}
+template <>
+constexpr uint64_t FirstBits<64>() {
+ return ~uint64_t{0};
+}
+
struct TestUnsignedMul {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -57,9 +65,8 @@ struct TestUnsignedMul {
HWY_ASSERT_VEC_EQ(d, vmax, Mul(vmax, v1));
HWY_ASSERT_VEC_EQ(d, vmax, Mul(v1, vmax));
- const size_t bits = sizeof(T) * 8;
- const uint64_t mask = (1ull << bits) - 1;
- const T max2 = (uint64_t(max) * max) & mask;
+ constexpr uint64_t kMask = FirstBits<sizeof(T) * 8>();
+ const T max2 = (static_cast<uint64_t>(max) * max) & kMask;
HWY_ASSERT_VEC_EQ(d, Set(d, max2), Mul(vmax, vmax));
}
};
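
The FirstBits helper added above exists because (1ull << 64) is undefined behavior, so the full 64-bit mask needs its own specialization; the test then uses it to truncate max*max to the lane width. A small standalone sketch of the same idea (the sample type and values are illustrative):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    // Mask with the lowest kBits bits set. (1ull << 64) is undefined behavior,
    // so the full-width case is a specialization, as in the patch above.
    template <size_t kBits>
    constexpr uint64_t FirstBits() {
      return (1ull << kBits) - 1;
    }
    template <>
    constexpr uint64_t FirstBits<64>() {
      return ~uint64_t{0};
    }

    int main() {
      static_assert(FirstBits<8>() == 0xFFull, "low byte");
      static_assert(FirstBits<32>() == 0xFFFFFFFFull, "low 32 bits");
      static_assert(FirstBits<64>() == 0xFFFFFFFFFFFFFFFFull, "all bits");
      // As in TestUnsignedMul: truncate max*max to the lane width.
      const uint16_t max = 0xFFFF;
      const uint16_t max2 = static_cast<uint16_t>(
          (static_cast<uint64_t>(max) * max) & FirstBits<16>());
      assert(max2 == 1);  // 0xFFFF * 0xFFFF = 0xFFFE0001; low 16 bits are 1
      return 0;
    }
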
@@ -97,13 +104,13 @@ HWY_NOINLINE void TestAllMul() {
// No u8.
test_unsigned(uint16_t());
test_unsigned(uint32_t());
- // No u64.
+ test_unsigned(uint64_t());
const ForPartialVectors<TestSignedMul> test_signed;
// No i8.
test_signed(int16_t());
test_signed(int32_t());
- // No i64.
+ test_signed(int64_t());
}
struct TestMulHigh {
@@ -115,7 +122,8 @@ struct TestMulHigh {
auto expected_lanes = AllocateAligned<T>(N);
const auto vi = Iota(d, 1);
- const auto vni = Iota(d, -T(N)); // no i8 supported, so no wraparound
+ // no i8 supported, so no wraparound
+ const auto vni = Iota(d, T(static_cast<T>(~N + 1)));
const auto v0 = Zero(d);
HWY_ASSERT_VEC_EQ(d, v0, MulHigh(v0, v0));
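
The expression ~N + 1 used above is the two's-complement negation of N written entirely in unsigned arithmetic, avoiding unary minus on an unsigned lane count. A short standalone check of the identity, assuming a 64-bit size_t and the 16/32-bit lane types the test exercises:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    int main() {
      const size_t N = 8;  // example lane count
      // In unsigned (modular) arithmetic, ~N + 1 wraps to 2^64 - N, i.e. "-N",
      // without ever applying unary minus to an unsigned value. Converting the
      // result to a narrower signed type yields -N (guaranteed modular since
      // C++20, two's complement on all supported compilers before that).
      const int16_t start16 = static_cast<int16_t>(~N + 1);
      const int32_t start32 = static_cast<int32_t>(~N + 1);
      assert(start16 == -8);
      assert(start32 == -8);
      return 0;
    }
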
@@ -349,64 +357,65 @@ struct TestReorderWidenMulAccumulate {
HWY_NOINLINE void operator()(TN /*unused*/, DN dn) {
using TW = MakeWide<TN>;
const RepartitionToWide<DN> dw;
- const auto f0 = Zero(dw);
- const auto f1 = Set(dw, 1.0f);
- const auto fi = Iota(dw, 1);
- const auto bf0 = ReorderDemote2To(dn, f0, f0);
- const auto bf1 = ReorderDemote2To(dn, f1, f1);
- const auto bfi = ReorderDemote2To(dn, fi, fi);
- const size_t NW = Lanes(dw);
- auto delta = AllocateAligned<TW>(2 * NW);
- for (size_t i = 0; i < 2 * NW; ++i) {
- delta[i] = 0.0f;
- }
+ const Half<DN> dnh;
+ using VW = Vec<decltype(dw)>;
+ using VN = Vec<decltype(dn)>;
+ const size_t NN = Lanes(dn);
+
+ const VW f0 = Zero(dw);
+ const VW f1 = Set(dw, TW{1});
+ const VN bf0 = Zero(dn);
+ // Cannot Set() bfloat16_t directly.
+ const VN bf1 = ReorderDemote2To(dn, f1, f1);
// Any input zero => both outputs zero
- auto sum1 = f0;
+ VW sum1 = f0;
HWY_ASSERT_VEC_EQ(dw, f0,
ReorderWidenMulAccumulate(dw, bf0, bf0, f0, sum1));
HWY_ASSERT_VEC_EQ(dw, f0, sum1);
HWY_ASSERT_VEC_EQ(dw, f0,
- ReorderWidenMulAccumulate(dw, bf0, bfi, f0, sum1));
+ ReorderWidenMulAccumulate(dw, bf0, bf1, f0, sum1));
HWY_ASSERT_VEC_EQ(dw, f0, sum1);
HWY_ASSERT_VEC_EQ(dw, f0,
- ReorderWidenMulAccumulate(dw, bfi, bf0, f0, sum1));
+ ReorderWidenMulAccumulate(dw, bf1, bf0, f0, sum1));
HWY_ASSERT_VEC_EQ(dw, f0, sum1);
- // delta[p] := 1.0, all others zero. For each p: Dot(delta, all-ones) == 1.
- for (size_t p = 0; p < 2 * NW; ++p) {
- delta[p] = 1.0f;
- const auto delta0 = Load(dw, delta.get() + 0);
- const auto delta1 = Load(dw, delta.get() + NW);
- delta[p] = 0.0f;
- const auto bf_delta = ReorderDemote2To(dn, delta0, delta1);
+ // delta[p] := 1, all others zero. For each p: Dot(delta, all-ones) == 1.
+ auto delta_w = AllocateAligned<TW>(NN);
+ for (size_t i = 0; i < NN; ++i) {
+ delta_w[i] = TW{0};
+ }
+ for (size_t p = 0; p < NN; ++p) {
+ delta_w[p] = TW{1};
+ const VW delta0 = Load(dw, delta_w.get());
+ const VW delta1 = Load(dw, delta_w.get() + NN / 2);
+ delta_w[p] = TW{0};
+ const VN delta = ReorderDemote2To(dn, delta0, delta1);
{
sum1 = f0;
- const auto sum0 =
- ReorderWidenMulAccumulate(dw, bf_delta, bf1, f0, sum1);
- HWY_ASSERT_EQ(1.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
+ const VW sum0 = ReorderWidenMulAccumulate(dw, delta, bf1, f0, sum1);
+ HWY_ASSERT_EQ(TW{1}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
// Swapped arg order
{
sum1 = f0;
- const auto sum0 =
- ReorderWidenMulAccumulate(dw, bf1, bf_delta, f0, sum1);
- HWY_ASSERT_EQ(1.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
+ const VW sum0 = ReorderWidenMulAccumulate(dw, bf1, delta, f0, sum1);
+ HWY_ASSERT_EQ(TW{1}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
// Start with nonzero sum0 or sum1
{
- sum1 = delta1;
- const auto sum0 =
- ReorderWidenMulAccumulate(dw, bf_delta, bf1, delta0, sum1);
- HWY_ASSERT_EQ(2.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
+ VW sum0 = PromoteTo(dw, LowerHalf(dnh, delta));
+ sum1 = PromoteTo(dw, UpperHalf(dnh, delta));
+ sum0 = ReorderWidenMulAccumulate(dw, delta, bf1, sum0, sum1);
+ HWY_ASSERT_EQ(TW{2}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
// Start with nonzero sum0 or sum1, and swap arg order
{
- sum1 = delta1;
- const auto sum0 =
- ReorderWidenMulAccumulate(dw, bf1, bf_delta, delta0, sum1);
- HWY_ASSERT_EQ(2.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
+ VW sum0 = PromoteTo(dw, LowerHalf(dnh, delta));
+ sum1 = PromoteTo(dw, UpperHalf(dnh, delta));
+ sum0 = ReorderWidenMulAccumulate(dw, bf1, delta, sum0, sum1);
+ HWY_ASSERT_EQ(TW{2}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
}
}
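
These assertions only ever inspect SumOfLanes(Add(sum0, sum1)) because ReorderWidenMulAccumulate is free to distribute the widened products between its two accumulators in any lane order. A scalar model of the quantity the test pins down, written for the int16_t instantiation added below (the helper name and array length are illustrative):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    // Scalar equivalent of the value the assertions check: the total of all
    // widened products a[i] * b[i], regardless of how the vector op splits
    // them between sum0 and sum1.
    int64_t WidenMulAccumulateTotal(const int16_t* a, const int16_t* b, size_t n) {
      int64_t total = 0;
      for (size_t i = 0; i < n; ++i) {
        total += static_cast<int32_t>(a[i]) * static_cast<int32_t>(b[i]);
      }
      return total;
    }

    int main() {
      // delta has a single 1; dotted with all-ones the total is 1, matching
      // the HWY_ASSERT_EQ(TW{1}, ...) checks.
      const int16_t delta[8] = {0, 0, 1, 0, 0, 0, 0, 0};
      const int16_t ones[8] = {1, 1, 1, 1, 1, 1, 1, 1};
      assert(WidenMulAccumulateTotal(delta, ones, 8) == 1);
      return 0;
    }
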
@@ -414,6 +423,7 @@ struct TestReorderWidenMulAccumulate {
HWY_NOINLINE void TestAllReorderWidenMulAccumulate() {
ForShrinkableVectors<TestReorderWidenMulAccumulate>()(bfloat16_t());
+ ForShrinkableVectors<TestReorderWidenMulAccumulate>()(int16_t());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
diff --git a/media/highway/src/hwy/tests/reduction_test.cc b/media/highway/src/hwy/tests/reduction_test.cc
new file mode 100644
index 0000000000..5e39abc55a
--- /dev/null
+++ b/media/highway/src/hwy/tests/reduction_test.cc
@@ -0,0 +1,227 @@
+// Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/reduction_test.cc"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestSumOfLanes {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ auto in_lanes = AllocateAligned<T>(N);
+
+ // Lane i = bit i, higher lanes 0
+ double sum = 0.0;
+ // Avoid setting sign bit and cap at double precision
+ constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
+ for (size_t i = 0; i < N; ++i) {
+ in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
+ sum += static_cast<double>(in_lanes[i]);
+ }
+ HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)),
+ SumOfLanes(d, Load(d, in_lanes.get())));
+
+ // Lane i = i (iota) to include upper lanes
+ sum = 0.0;
+ for (size_t i = 0; i < N; ++i) {
+ sum += static_cast<double>(i);
+ }
+ HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)), SumOfLanes(d, Iota(d, 0)));
+ }
+};
+
+HWY_NOINLINE void TestAllSumOfLanes() {
+ ForUIF3264(ForPartialVectors<TestSumOfLanes>());
+ ForUI16(ForPartialVectors<TestSumOfLanes>());
+}
+
+struct TestMinOfLanes {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ auto in_lanes = AllocateAligned<T>(N);
+
+ // Lane i = bit i, higher lanes = 2 (not the minimum)
+ T min = HighestValue<T>();
+ // Avoid setting sign bit and cap at double precision
+ constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
+ for (size_t i = 0; i < N; ++i) {
+ in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 2;
+ min = HWY_MIN(min, in_lanes[i]);
+ }
+ HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
+
+ // Lane i = N - i to include upper lanes
+ min = HighestValue<T>();
+ for (size_t i = 0; i < N; ++i) {
+ in_lanes[i] = static_cast<T>(N - i); // no 8-bit T so no wraparound
+ min = HWY_MIN(min, in_lanes[i]);
+ }
+ HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
+
+ // Bug #910: also check negative values
+ min = HighestValue<T>();
+ const T input_copy[] = {static_cast<T>(-1),
+ static_cast<T>(-2),
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 8,
+ 9,
+ 10,
+ 11,
+ 12,
+ 13,
+ 14};
+ size_t i = 0;
+ for (; i < HWY_MIN(N, sizeof(input_copy) / sizeof(T)); ++i) {
+ in_lanes[i] = input_copy[i];
+ min = HWY_MIN(min, input_copy[i]);
+ }
+ // Pad with neutral element to full vector (so we can load)
+ for (; i < N; ++i) {
+ in_lanes[i] = min;
+ }
+ HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
+ }
+};
+
+struct TestMaxOfLanes {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ const size_t N = Lanes(d);
+ auto in_lanes = AllocateAligned<T>(N);
+
+ T max = LowestValue<T>();
+ // Avoid setting sign bit and cap at double precision
+ constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
+ for (size_t i = 0; i < N; ++i) {
+ in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
+ max = HWY_MAX(max, in_lanes[i]);
+ }
+ HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
+
+ // Lane i = i to include upper lanes
+ max = LowestValue<T>();
+ for (size_t i = 0; i < N; ++i) {
+ in_lanes[i] = static_cast<T>(i); // no 8-bit T so no wraparound
+ max = HWY_MAX(max, in_lanes[i]);
+ }
+ HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
+
+ // Bug #910: also check negative values
+ max = LowestValue<T>();
+ const T input_copy[] = {static_cast<T>(-1),
+ static_cast<T>(-2),
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 8,
+ 9,
+ 10,
+ 11,
+ 12,
+ 13,
+ 14};
+ size_t i = 0;
+ for (; i < HWY_MIN(N, sizeof(input_copy) / sizeof(T)); ++i) {
+ in_lanes[i] = input_copy[i];
+ max = HWY_MAX(max, in_lanes[i]);
+ }
+ // Pad with neutral element to full vector (so we can load)
+ for (; i < N; ++i) {
+ in_lanes[i] = max;
+ }
+ HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
+ }
+};
+
+HWY_NOINLINE void TestAllMinMaxOfLanes() {
+ const ForPartialVectors<TestMinOfLanes> test_min;
+ const ForPartialVectors<TestMaxOfLanes> test_max;
+ ForUIF3264(test_min);
+ ForUIF3264(test_max);
+ ForUI16(test_min);
+ ForUI16(test_max);
+}
+
+struct TestSumsOf8 {
+ template <typename T, class D>
+ HWY_NOINLINE void operator()(T /*unused*/, D d) {
+ RandomState rng;
+
+ const size_t N = Lanes(d);
+ if (N < 8) return;
+ const Repartition<uint64_t, D> du64;
+
+ auto in_lanes = AllocateAligned<T>(N);
+ auto sum_lanes = AllocateAligned<uint64_t>(N / 8);
+
+ for (size_t rep = 0; rep < 100; ++rep) {
+ for (size_t i = 0; i < N; ++i) {
+ in_lanes[i] = Random64(&rng) & 0xFF;
+ }
+
+ for (size_t idx_sum = 0; idx_sum < N / 8; ++idx_sum) {
+ uint64_t sum = 0;
+ for (size_t i = 0; i < 8; ++i) {
+ sum += in_lanes[idx_sum * 8 + i];
+ }
+ sum_lanes[idx_sum] = sum;
+ }
+
+ const Vec<D> in = Load(d, in_lanes.get());
+ HWY_ASSERT_VEC_EQ(du64, sum_lanes.get(), SumsOf8(in));
+ }
+ }
+};
+
+HWY_NOINLINE void TestAllSumsOf8() {
+ ForGEVectors<64, TestSumsOf8>()(uint8_t());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+} // namespace HWY_NAMESPACE
+} // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace hwy {
+HWY_BEFORE_TEST(HwyReductionTest);
+HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumOfLanes);
+HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllMinMaxOfLanes);
+HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumsOf8);
+} // namespace hwy
+
+#endif
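
For reference, SumsOf8 reduces each group of eight consecutive uint8_t lanes into one uint64_t lane, which is exactly what the scalar loop in TestSumsOf8 recomputes. A small worked example of that mapping in plain C++ (input values chosen arbitrarily):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    // Scalar reference for SumsOf8: out[k] = in[8*k] + ... + in[8*k + 7].
    void SumsOf8Scalar(const uint8_t* in, size_t n, uint64_t* out) {
      for (size_t k = 0; k < n / 8; ++k) {
        uint64_t sum = 0;
        for (size_t i = 0; i < 8; ++i) sum += in[8 * k + i];
        out[k] = sum;
      }
    }

    int main() {
      const uint8_t in[16] = {1,   2,   3,   4,   5,   6,   7,   8,     // 36
                              255, 255, 255, 255, 255, 255, 255, 255};  // 2040
      uint64_t out[2];
      SumsOf8Scalar(in, 16, out);
      assert(out[0] == 36 && out[1] == 2040);
      return 0;
    }
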
diff --git a/media/highway/src/hwy/tests/reverse_test.cc b/media/highway/src/hwy/tests/reverse_test.cc
index a5206b93fc..fcbcb7fa1c 100644
--- a/media/highway/src/hwy/tests/reverse_test.cc
+++ b/media/highway/src/hwy/tests/reverse_test.cc
@@ -19,7 +19,7 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/reverse_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
diff --git a/media/highway/src/hwy/tests/shift_test.cc b/media/highway/src/hwy/tests/shift_test.cc
index e644add095..585eba761c 100644
--- a/media/highway/src/hwy/tests/shift_test.cc
+++ b/media/highway/src/hwy/tests/shift_test.cc
@@ -13,7 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
@@ -22,7 +21,7 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/shift_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
@@ -44,7 +43,8 @@ struct TestLeftShifts {
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
- const auto values = Iota(d, kSigned ? -TI(N) : TI(0)); // value to shift
+ // Values to shift
+ const auto values = Iota(d, static_cast<T>(kSigned ? -TI(N) : TI(0)));
constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
// 0
@@ -242,7 +242,7 @@ T RightShiftNegative(T val) {
// seen divisions replaced with shifts, so resort to bit operations.
using TU = hwy::MakeUnsigned<T>;
TU bits;
- CopyBytes<sizeof(T)>(&val, &bits);
+ CopySameSize(&val, &bits);
const TU shifted = TU(bits >> kAmount);
@@ -251,7 +251,7 @@ T RightShiftNegative(T val) {
const TU sign_extended = static_cast<TU>((all << num_zero) & LimitsMax<TU>());
bits = shifted | sign_extended;
- CopyBytes<sizeof(T)>(&bits, &val);
+ CopySameSize(&bits, &val);
return val;
}
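
RightShiftNegative, touched above, emulates an arithmetic right shift of a negative value using only unsigned operations, because right-shifting a negative signed value is implementation-defined before C++20. A standalone sketch of the same construction for int32_t, with memcpy standing in for the Highway copy helper (the shift amounts and test values are illustrative):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    // Arithmetic right shift of a possibly negative int32_t by kAmount, built
    // from an unsigned shift plus manual sign extension.
    template <int kAmount>
    int32_t ArithmeticShiftRight(int32_t val) {
      static_assert(0 < kAmount && kAmount < 32, "shift amount out of range");
      uint32_t bits;
      memcpy(&bits, &val, sizeof(bits));
      const uint32_t shifted = bits >> kAmount;
      // If the sign bit was set, refill the vacated top kAmount bits with ones.
      const uint32_t sign_extended =
          (bits >> 31) ? (~uint32_t{0} << (32 - kAmount)) : 0u;
      bits = shifted | sign_extended;
      memcpy(&val, &bits, sizeof(val));
      return val;
    }

    int main() {
      assert(ArithmeticShiftRight<4>(-32) == -2);
      assert(ArithmeticShiftRight<4>(32) == 2);
      assert(ArithmeticShiftRight<1>(-1) == -1);
      return 0;
    }
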
@@ -355,7 +355,7 @@ struct TestVariableSignedRightShifts {
for (size_t i = 0; i < N; ++i) {
const size_t amount = i & kMaxShift;
const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
- CopyBytes<sizeof(T)>(&shifted, &expected[i]);
+ CopySameSize(&shifted, &expected[i]);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), small_shifts));
@@ -363,7 +363,7 @@ struct TestVariableSignedRightShifts {
for (size_t i = 0; i < N; ++i) {
const size_t amount = kMaxShift - (i & kMaxShift);
const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
- CopyBytes<sizeof(T)>(&shifted, &expected[i]);
+ CopySameSize(&shifted, &expected[i]);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), large_shifts));
}
diff --git a/media/highway/src/hwy/tests/swizzle_test.cc b/media/highway/src/hwy/tests/swizzle_test.cc
index 5878d24dc5..f447f7a800 100644
--- a/media/highway/src/hwy/tests/swizzle_test.cc
+++ b/media/highway/src/hwy/tests/swizzle_test.cc
@@ -20,7 +20,7 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/swizzle_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
diff --git a/media/highway/src/hwy/tests/test_util-inl.h b/media/highway/src/hwy/tests/test_util-inl.h
index c9858df2e0..d9c1aebc31 100644
--- a/media/highway/src/hwy/tests/test_util-inl.h
+++ b/media/highway/src/hwy/tests/test_util-inl.h
@@ -15,14 +15,15 @@
// Target-specific helper functions for use by *_test.cc.
-#include <inttypes.h>
#include <stdint.h>
#include "hwy/base.h"
-#include "hwy/print-inl.h"
#include "hwy/tests/hwy_gtest.h"
#include "hwy/tests/test_util.h"
+// After test_util (also includes highway.h)
+#include "hwy/print-inl.h"
+
// Per-target include guard
#if defined(HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_) == \
defined(HWY_TARGET_TOGGLE)
@@ -95,8 +96,8 @@ HWY_NOINLINE void AssertMaskEqual(D d, VecArg<Mask<D>> a, VecArg<Mask<D>> b,
// First check whole bytes (if that many elements are still valid)
for (; i < N / 8; ++i) {
if (bits_a[i] != bits_b[i]) {
- fprintf(stderr, "Mismatch in byte %" PRIu64 ": %d != %d\n",
- static_cast<uint64_t>(i), bits_a[i], bits_b[i]);
+ fprintf(stderr, "Mismatch in byte %d: %d != %d\n", static_cast<int>(i),
+ bits_a[i], bits_b[i]);
Print(d8, "expect", Load(d8, bits_a.get()), 0, N8);
Print(d8, "actual", Load(d8, bits_b.get()), 0, N8);
hwy::Abort(filename, line, "Masks not equal");
@@ -109,8 +110,8 @@ HWY_NOINLINE void AssertMaskEqual(D d, VecArg<Mask<D>> a, VecArg<Mask<D>> b,
const int valid_a = bits_a[i] & mask;
const int valid_b = bits_b[i] & mask;
if (valid_a != valid_b) {
- fprintf(stderr, "Mismatch in last byte %" PRIu64 ": %d != %d\n",
- static_cast<uint64_t>(i), valid_a, valid_b);
+ fprintf(stderr, "Mismatch in last byte %d: %d != %d\n",
+ static_cast<int>(i), valid_a, valid_b);
Print(d8, "expect", Load(d8, bits_a.get()), 0, N8);
Print(d8, "actual", Load(d8, bits_b.get()), 0, N8);
hwy::Abort(filename, line, "Masks not equal");
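
The mask comparison above works on bits packed eight per byte: whole bytes are compared directly, then only the low N % 8 bits of the final byte are considered. A scalar sketch of that comparison for an LSB-first packing (a standalone function with illustrative inputs, not the Highway internals themselves):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    // Compare two LSB-first packed bit arrays of n valid bits: whole bytes
    // first, then only the low n % 8 bits of the final byte.
    bool PackedBitsEqual(const uint8_t* a, const uint8_t* b, size_t n) {
      size_t i = 0;
      for (; i < n / 8; ++i) {
        if (a[i] != b[i]) return false;
      }
      const size_t remainder = n % 8;
      if (remainder != 0) {
        const uint8_t mask = static_cast<uint8_t>((1u << remainder) - 1);
        if ((a[i] & mask) != (b[i] & mask)) return false;
      }
      return true;
    }

    int main() {
      const uint8_t a[2] = {0xFF, 0x05};  // bits 8..10 = 1,0,1
      const uint8_t b[2] = {0xFF, 0x0D};  // bits 8..10 = 1,0,1; bit 11 = 1
      assert(PackedBitsEqual(a, b, 11));   // only the low 3 bits of byte 1 count
      assert(!PackedBitsEqual(a, b, 12));  // bit 11 differs
      return 0;
    }
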
diff --git a/media/highway/src/hwy/tests/test_util.cc b/media/highway/src/hwy/tests/test_util.cc
index 878fd6f26e..a0796b15f9 100644
--- a/media/highway/src/hwy/tests/test_util.cc
+++ b/media/highway/src/hwy/tests/test_util.cc
@@ -15,7 +15,6 @@
#include "hwy/tests/test_util.h"
-#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>
@@ -71,8 +70,7 @@ HWY_TEST_DLLEXPORT bool IsEqual(const TypeInfo& info, const void* expected_ptr,
CopyBytes<8>(actual_ptr, &actual);
return ComputeUlpDelta(expected, actual) <= 1;
} else {
- HWY_ABORT("Unexpected float size %" PRIu64 "\n",
- static_cast<uint64_t>(info.sizeof_t));
+ HWY_ABORT("Unexpected float size %d\n", static_cast<int>(info.sizeof_t));
return false;
}
}
@@ -88,10 +86,9 @@ HWY_TEST_DLLEXPORT HWY_NORETURN void PrintMismatchAndAbort(
char actual_str[100];
ToString(info, actual_ptr, actual_str);
Abort(filename, line,
- "%s, %sx%" PRIu64 " lane %" PRIu64
- " mismatch: expected '%s', got '%s'.\n",
- target_name, type_name, static_cast<uint64_t>(num_lanes),
- static_cast<uint64_t>(lane), expected_str, actual_str);
+ "%s, %sx%d lane %d mismatch: expected '%s', got '%s'.\n", target_name,
+ type_name, static_cast<int>(num_lanes), static_cast<int>(lane),
+ expected_str, actual_str);
}
HWY_TEST_DLLEXPORT void AssertArrayEqual(const TypeInfo& info,
diff --git a/media/highway/src/hwy/tests/test_util.h b/media/highway/src/hwy/tests/test_util.h
index ab77f47951..459de961ce 100644
--- a/media/highway/src/hwy/tests/test_util.h
+++ b/media/highway/src/hwy/tests/test_util.h
@@ -105,8 +105,8 @@ TU ComputeUlpDelta(const T expected, const T actual) {
// Compute the difference in units of last place. We do not need to check for
// differing signs; they will result in large differences, which is fine.
TU ux, uy;
- CopyBytes<sizeof(T)>(&expected, &ux);
- CopyBytes<sizeof(T)>(&actual, &uy);
+ CopySameSize(&expected, &ux);
+ CopySameSize(&actual, &uy);
// Avoid unsigned->signed cast: 2's complement is only guaranteed by C++20.
const TU ulp = HWY_MAX(ux, uy) - HWY_MIN(ux, uy);
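
ComputeUlpDelta compares the raw bit patterns as unsigned integers and takes max minus min precisely because reinterpreting them as signed would only be guaranteed two's complement from C++20 onward. A small standalone check of the idea for float, again with memcpy in place of the Highway copy helper (sample values are arbitrary):

    #include <assert.h>
    #include <math.h>
    #include <stdint.h>
    #include <string.h>

    // Units-in-the-last-place distance between two floats of the same sign,
    // computed on the unsigned bit patterns to stay clear of signed casts.
    uint32_t UlpDelta(float expected, float actual) {
      uint32_t ux, uy;
      memcpy(&ux, &expected, sizeof(ux));
      memcpy(&uy, &actual, sizeof(uy));
      return (ux > uy) ? (ux - uy) : (uy - ux);
    }

    int main() {
      const float x = 1.0f;
      const float next = nextafterf(x, 2.0f);  // adjacent representable value
      assert(UlpDelta(x, x) == 0);
      assert(UlpDelta(x, next) == 1);  // neighbors differ by exactly one ULP
      return 0;
    }
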
diff --git a/media/highway/src/hwy/tests/test_util_test.cc b/media/highway/src/hwy/tests/test_util_test.cc
index 41f9f9ff7e..d55e2e8cb6 100644
--- a/media/highway/src/hwy/tests/test_util_test.cc
+++ b/media/highway/src/hwy/tests/test_util_test.cc
@@ -18,7 +18,7 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/test_util_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
diff --git a/media/highway/src/libhwy-test.pc.in b/media/highway/src/libhwy-test.pc.in
index ff91690fd0..0416b10df3 100644
--- a/media/highway/src/libhwy-test.pc.in
+++ b/media/highway/src/libhwy-test.pc.in
@@ -1,4 +1,5 @@
prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
diff --git a/media/highway/src/run_tests.sh b/media/highway/src/run_tests.sh
index 017e536acf..7f7d3447cc 100755
--- a/media/highway/src/run_tests.sh
+++ b/media/highway/src/run_tests.sh
@@ -19,11 +19,11 @@ cd ..
rm -rf build
#######################################
-echo DEBUG Clang 7
+echo DEBUG Clang 9
rm -rf build_dbg
mkdir build_dbg
cd build_dbg
-CXX=clang++-7 CC=clang-7 cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON -DCMAKE_BUILD_TYPE=Debug
+CXX=clang++-9 CC=clang-9 cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON -DCMAKE_BUILD_TYPE=Debug
make -j
ctest -j
cd ..
@@ -41,7 +41,7 @@ cd ..
rm -rf build_32
#######################################
-for VER in 8 9 10; do
+for VER in 10 11 12; do
echo GCC $VER
rm -rf build_g$VER
mkdir build_g$VER