author     Mario Preksavec <mario@slackware.hr>                2022-10-13 11:36:16 +0200
committer  Willy Sudiarto Raharjo <willysr@slackbuilds.org>    2022-10-15 10:47:42 +0700
commit     b75c769415e3718b1b934f26737ad17f9ed019ce
tree       7a6e3fed3a7a8f14befad896144c810f2e66a74b /system/xen
parent     aae988ef834b611712b5b7b4df0256aae61ff3ff
download   slackbuilds-b75c769415e3718b1b934f26737ad17f9ed019ce.tar.gz
system/xen: Updated for version 4.16.2.
Signed-off-by: Mario Preksavec <mario@slackware.hr>
Signed-off-by: Willy Sudiarto Raharjo <willysr@slackbuilds.org>
Diffstat (limited to 'system/xen')
-rw-r--r--  system/xen/xen.SlackBuild           |    4
-rw-r--r--  system/xen/xen.info                 |    6
-rw-r--r--  system/xen/xsa/xsa401-4.16-1.patch  |  170
-rw-r--r--  system/xen/xsa/xsa401-4.16-2.patch  |  191
-rw-r--r--  system/xen/xsa/xsa402-4.16-1.patch  |   43
-rw-r--r--  system/xen/xsa/xsa402-4.16-2.patch  |  213
-rw-r--r--  system/xen/xsa/xsa402-4.16-3.patch  |  284
-rw-r--r--  system/xen/xsa/xsa402-4.16-4.patch  |   83
-rw-r--r--  system/xen/xsa/xsa402-4.16-5.patch  |  148
-rw-r--r--  system/xen/xsa/xsa410-4.16-01.patch |   59
-rw-r--r--  system/xen/xsa/xsa410-4.16-02.patch |  165
-rw-r--r--  system/xen/xsa/xsa410-4.16-03.patch |  113
-rw-r--r--  system/xen/xsa/xsa410-4.16-04.patch |   62
-rw-r--r--  system/xen/xsa/xsa410-4.16-05.patch |   60
-rw-r--r--  system/xen/xsa/xsa410-4.16-06.patch |  255
-rw-r--r--  system/xen/xsa/xsa410-4.16-07.patch |   82
-rw-r--r--  system/xen/xsa/xsa410-4.16-08.patch |   96
-rw-r--r--  system/xen/xsa/xsa410-4.16-09.patch |  159
-rw-r--r--  system/xen/xsa/xsa410-4.16-10.patch |  171
-rw-r--r--  system/xen/xsa/xsa411.patch         |   55
20 files changed, 1282 insertions, 1137 deletions
diff --git a/system/xen/xen.SlackBuild b/system/xen/xen.SlackBuild
index e504dc97bf..2d326cc732 100644
--- a/system/xen/xen.SlackBuild
+++ b/system/xen/xen.SlackBuild
@@ -25,8 +25,8 @@
cd $(dirname $0) ; CWD=$(pwd)
PRGNAM=xen
-VERSION=${VERSION:-4.16.1}
-BUILD=${BUILD:-2}
+VERSION=${VERSION:-4.16.2}
+BUILD=${BUILD:-1}
TAG=${TAG:-_SBo}
PKGTYPE=${PKGTYPE:-tgz}
diff --git a/system/xen/xen.info b/system/xen/xen.info
index e6c8cb4da0..150bd770b5 100644
--- a/system/xen/xen.info
+++ b/system/xen/xen.info
@@ -1,9 +1,9 @@
PRGNAM="xen"
-VERSION="4.16.1"
+VERSION="4.16.2"
HOMEPAGE="http://www.xenproject.org/"
DOWNLOAD="UNSUPPORTED"
MD5SUM=""
-DOWNLOAD_x86_64="http://mirror.slackware.hr/sources/xen/xen-4.16.1.tar.gz \
+DOWNLOAD_x86_64="http://mirror.slackware.hr/sources/xen/xen-4.16.2.tar.gz \
http://mirror.slackware.hr/sources/xen-extfiles/ipxe-git-3c040ad387099483102708bb1839110bc788cefb.tar.gz \
http://mirror.slackware.hr/sources/xen-extfiles/lwip-1.3.0.tar.gz \
http://mirror.slackware.hr/sources/xen-extfiles/zlib-1.2.3.tar.gz \
@@ -15,7 +15,7 @@ DOWNLOAD_x86_64="http://mirror.slackware.hr/sources/xen/xen-4.16.1.tar.gz \
http://mirror.slackware.hr/sources/xen-extfiles/tpm_emulator-0.7.4.tar.gz \
http://mirror.slackware.hr/sources/xen-seabios/seabios-1.14.0.tar.gz \
http://mirror.slackware.hr/sources/xen-ovmf/xen-ovmf-20210824_7b4a99be8a.tar.bz2"
-MD5SUM_x86_64="1c2cd4f7f966c1d455aab630953e5fad \
+MD5SUM_x86_64="6bd720f53e3c34a35cb8a8897a561e18 \
23ba00d5e2c5b4343d12665af73e1cb5 \
36cc57650cffda9a0269493be2a169bb \
debc62758716a169df9f62e6ab2bc634 \
diff --git a/system/xen/xsa/xsa401-4.16-1.patch b/system/xen/xsa/xsa401-4.16-1.patch
deleted file mode 100644
index 5c8c50617a..0000000000
--- a/system/xen/xsa/xsa401-4.16-1.patch
+++ /dev/null
@@ -1,170 +0,0 @@
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Subject: x86/pv: Clean up _get_page_type()
-
-Various fixes for clarity, ahead of making complicated changes.
-
- * Split the overflow check out of the if/else chain for type handling, as
- it's somewhat unrelated.
- * Comment the main if/else chain to explain what is going on. Adjust one
- ASSERT() and state the bit layout for validate-locked and partial states.
- * Correct the comment about TLB flushing, as it's backwards. The problem
- case is when writeable mappings are retained to a page becoming read-only,
- as it allows the guest to bypass Xen's safety checks for updates.
- * Reduce the scope of 'y'. It is an artefact of the cmpxchg loop and not
- valid for use by subsequent logic. Switch to using ACCESS_ONCE() to treat
- all reads as explicitly volatile. The only thing preventing the validated
- wait-loop being infinite is the compiler barrier hidden in cpu_relax().
- * Replace one page_get_owner(page) with the already-calculated 'd' already in
- scope.
-
-No functional change.
-
-This is part of XSA-401 / CVE-2022-26362.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: George Dunlap <george.dunlap@citrix.com>
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index 796faca64103..ddd32f88c798 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -2935,16 +2935,17 @@ static int _put_page_type(struct page_info *page, unsigned int flags,
- static int _get_page_type(struct page_info *page, unsigned long type,
- bool preemptible)
- {
-- unsigned long nx, x, y = page->u.inuse.type_info;
-+ unsigned long nx, x;
- int rc = 0;
-
- ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
- ASSERT(!in_irq());
-
-- for ( ; ; )
-+ for ( unsigned long y = ACCESS_ONCE(page->u.inuse.type_info); ; )
- {
- x = y;
- nx = x + 1;
-+
- if ( unlikely((nx & PGT_count_mask) == 0) )
- {
- gdprintk(XENLOG_WARNING,
-@@ -2952,8 +2953,15 @@ static int _get_page_type(struct page_info *page, unsigned long type,
- mfn_x(page_to_mfn(page)));
- return -EINVAL;
- }
-- else if ( unlikely((x & PGT_count_mask) == 0) )
-+
-+ if ( unlikely((x & PGT_count_mask) == 0) )
- {
-+ /*
-+ * Typeref 0 -> 1.
-+ *
-+ * Type changes are permitted when the typeref is 0. If the type
-+ * actually changes, the page needs re-validating.
-+ */
- struct domain *d = page_get_owner(page);
-
- if ( d && shadow_mode_enabled(d) )
-@@ -2964,8 +2972,8 @@ static int _get_page_type(struct page_info *page, unsigned long type,
- {
- /*
- * On type change we check to flush stale TLB entries. It is
-- * vital that no other CPUs are left with mappings of a frame
-- * which is about to become writeable to the guest.
-+ * vital that no other CPUs are left with writeable mappings
-+ * to a frame which is intending to become pgtable/segdesc.
- */
- cpumask_t *mask = this_cpu(scratch_cpumask);
-
-@@ -2977,7 +2985,7 @@ static int _get_page_type(struct page_info *page, unsigned long type,
-
- if ( unlikely(!cpumask_empty(mask)) &&
- /* Shadow mode: track only writable pages. */
-- (!shadow_mode_enabled(page_get_owner(page)) ||
-+ (!shadow_mode_enabled(d) ||
- ((nx & PGT_type_mask) == PGT_writable_page)) )
- {
- perfc_incr(need_flush_tlb_flush);
-@@ -3008,7 +3016,14 @@ static int _get_page_type(struct page_info *page, unsigned long type,
- }
- else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
- {
-- /* Don't log failure if it could be a recursive-mapping attempt. */
-+ /*
-+ * else, we're trying to take a new reference, of the wrong type.
-+ *
-+ * This (being able to prohibit use of the wrong type) is what the
-+ * typeref system exists for, but skip printing the failure if it
-+ * looks like a recursive mapping, as subsequent logic might
-+ * ultimately permit the attempt.
-+ */
- if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
- (type == PGT_l1_page_table) )
- return -EINVAL;
-@@ -3027,18 +3042,46 @@ static int _get_page_type(struct page_info *page, unsigned long type,
- }
- else if ( unlikely(!(x & PGT_validated)) )
- {
-+ /*
-+ * else, the count is non-zero, and we're grabbing the right type;
-+ * but the page hasn't been validated yet.
-+ *
-+ * The page is in one of two states (depending on PGT_partial),
-+ * and should have exactly one reference.
-+ */
-+ ASSERT((x & (PGT_type_mask | PGT_count_mask)) == (type | 1));
-+
- if ( !(x & PGT_partial) )
- {
-- /* Someone else is updating validation of this page. Wait... */
-+ /*
-+ * The page has been left in the "validate locked" state
-+ * (i.e. PGT_[type] | 1) which means that a concurrent caller
-+ * of _get_page_type() is in the middle of validation.
-+ *
-+ * Spin waiting for the concurrent user to complete (partial
-+ * or fully validated), then restart our attempt to acquire a
-+ * type reference.
-+ */
- do {
- if ( preemptible && hypercall_preempt_check() )
- return -EINTR;
- cpu_relax();
-- } while ( (y = page->u.inuse.type_info) == x );
-+ } while ( (y = ACCESS_ONCE(page->u.inuse.type_info)) == x );
- continue;
- }
-- /* Type ref count was left at 1 when PGT_partial got set. */
-- ASSERT((x & PGT_count_mask) == 1);
-+
-+ /*
-+ * The page has been left in the "partial" state
-+ * (i.e., PGT_[type] | PGT_partial | 1).
-+ *
-+ * Rather than bumping the type count, we need to try to grab the
-+ * validation lock; if we succeed, we need to validate the page,
-+ * then drop the general ref associated with the PGT_partial bit.
-+ *
-+ * We grab the validation lock by setting nx to (PGT_[type] | 1)
-+ * (i.e., non-zero type count, neither PGT_validated nor
-+ * PGT_partial set).
-+ */
- nx = x & ~PGT_partial;
- }
-
-@@ -3087,6 +3130,13 @@ static int _get_page_type(struct page_info *page, unsigned long type,
- }
-
- out:
-+ /*
-+ * Did we drop the PGT_partial bit when acquiring the typeref? If so,
-+ * drop the general reference that went along with it.
-+ *
-+ * N.B. validate_page() may have have re-set PGT_partial, not reflected in
-+ * nx, but will have taken an extra ref when doing so.
-+ */
- if ( (x & PGT_partial) && !(nx & PGT_partial) )
- put_page(page);
-
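
The cleanup above re-reads page->u.inuse.type_info through ACCESS_ONCE() on every pass, so the compiler cannot cache the value across iterations of the cmpxchg loop. A minimal standalone sketch of that read/compute/cmpxchg/retry shape, using C11 atomics and an invented type/count bit layout (an illustration, not Xen's _get_page_type()):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define COUNT_MASK  ((1ull << 56) - 1)          /* low bits: reference count */
#define TYPE_MASK   (0xffull << 56)             /* high bits: page "type"    */

static _Atomic uint64_t type_info = 1ull << 56; /* type 1, count 0 */

static int get_type_ref(uint64_t type)
{
    uint64_t x = atomic_load_explicit(&type_info, memory_order_acquire);

    for ( ; ; )
    {
        if ( (x & COUNT_MASK) == COUNT_MASK )
            return -1;                          /* count would overflow       */
        if ( (x & TYPE_MASK) != type )
            return -2;                          /* wrong type for this sketch */

        /* On failure, x is refreshed from memory - the explicit re-read. */
        if ( atomic_compare_exchange_weak_explicit(&type_info, &x, x + 1,
                                                   memory_order_acq_rel,
                                                   memory_order_acquire) )
            return 0;                           /* typeref taken              */
    }
}

int main(void)
{
    int rc = get_type_ref(1ull << 56);

    printf("rc=%d word=%#llx\n", rc,
           (unsigned long long)atomic_load(&type_info));
    return 0;
}

The explicit volatile read is the property the ACCESS_ONCE() change is after: without it, the compiler is free to hoist the read out of a wait loop and spin forever on a stale value.
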
diff --git a/system/xen/xsa/xsa401-4.16-2.patch b/system/xen/xsa/xsa401-4.16-2.patch
deleted file mode 100644
index be58db59a5..0000000000
--- a/system/xen/xsa/xsa401-4.16-2.patch
+++ /dev/null
@@ -1,191 +0,0 @@
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Subject: x86/pv: Fix ABAC cmpxchg() race in _get_page_type()
-
-_get_page_type() suffers from a race condition where it incorrectly assumes
-that because 'x' was read and a subsequent a cmpxchg() succeeds, the type
-cannot have changed in-between. Consider:
-
-CPU A:
- 1. Creates an L2e referencing pg
- `-> _get_page_type(pg, PGT_l1_page_table), sees count 0, type PGT_writable_page
- 2. Issues flush_tlb_mask()
-CPU B:
- 3. Creates a writeable mapping of pg
- `-> _get_page_type(pg, PGT_writable_page), count increases to 1
- 4. Writes into new mapping, creating a TLB entry for pg
- 5. Removes the writeable mapping of pg
- `-> _put_page_type(pg), count goes back down to 0
-CPU A:
- 7. Issues cmpxchg(), setting count 1, type PGT_l1_page_table
-
-CPU B now has a writeable mapping to pg, which Xen believes is a pagetable and
-suitably protected (i.e. read-only). The TLB flush in step 2 must be deferred
-until after the guest is prohibited from creating new writeable mappings,
-which is after step 7.
-
-Defer all safety actions until after the cmpxchg() has successfully taken the
-intended typeref, because that is what prevents concurrent users from using
-the old type.
-
-Also remove the early validation for writeable and shared pages. This removes
-race conditions where one half of a parallel mapping attempt can return
-successfully before:
- * The IOMMU pagetables are in sync with the new page type
- * Writeable mappings to shared pages have been torn down
-
-This is part of XSA-401 / CVE-2022-26362.
-
-Reported-by: Jann Horn <jannh@google.com>
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: George Dunlap <george.dunlap@citrix.com>
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index ddd32f88c798..1693b580b152 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -2962,56 +2962,12 @@ static int _get_page_type(struct page_info *page, unsigned long type,
- * Type changes are permitted when the typeref is 0. If the type
- * actually changes, the page needs re-validating.
- */
-- struct domain *d = page_get_owner(page);
--
-- if ( d && shadow_mode_enabled(d) )
-- shadow_prepare_page_type_change(d, page, type);
-
- ASSERT(!(x & PGT_pae_xen_l2));
- if ( (x & PGT_type_mask) != type )
- {
-- /*
-- * On type change we check to flush stale TLB entries. It is
-- * vital that no other CPUs are left with writeable mappings
-- * to a frame which is intending to become pgtable/segdesc.
-- */
-- cpumask_t *mask = this_cpu(scratch_cpumask);
--
-- BUG_ON(in_irq());
-- cpumask_copy(mask, d->dirty_cpumask);
--
-- /* Don't flush if the timestamp is old enough */
-- tlbflush_filter(mask, page->tlbflush_timestamp);
--
-- if ( unlikely(!cpumask_empty(mask)) &&
-- /* Shadow mode: track only writable pages. */
-- (!shadow_mode_enabled(d) ||
-- ((nx & PGT_type_mask) == PGT_writable_page)) )
-- {
-- perfc_incr(need_flush_tlb_flush);
-- /*
-- * If page was a page table make sure the flush is
-- * performed using an IPI in order to avoid changing the
-- * type of a page table page under the feet of
-- * spurious_page_fault().
-- */
-- flush_mask(mask,
-- (x & PGT_type_mask) &&
-- (x & PGT_type_mask) <= PGT_root_page_table
-- ? FLUSH_TLB | FLUSH_FORCE_IPI
-- : FLUSH_TLB);
-- }
--
-- /* We lose existing type and validity. */
- nx &= ~(PGT_type_mask | PGT_validated);
- nx |= type;
--
-- /*
-- * No special validation needed for writable pages.
-- * Page tables and GDT/LDT need to be scanned for validity.
-- */
-- if ( type == PGT_writable_page || type == PGT_shared_page )
-- nx |= PGT_validated;
- }
- }
- else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
-@@ -3092,6 +3048,56 @@ static int _get_page_type(struct page_info *page, unsigned long type,
- return -EINTR;
- }
-
-+ /*
-+ * One typeref has been taken and is now globally visible.
-+ *
-+ * The page is either in the "validate locked" state (PGT_[type] | 1) or
-+ * fully validated (PGT_[type] | PGT_validated | >0).
-+ */
-+
-+ if ( unlikely((x & PGT_count_mask) == 0) )
-+ {
-+ struct domain *d = page_get_owner(page);
-+
-+ if ( d && shadow_mode_enabled(d) )
-+ shadow_prepare_page_type_change(d, page, type);
-+
-+ if ( (x & PGT_type_mask) != type )
-+ {
-+ /*
-+ * On type change we check to flush stale TLB entries. It is
-+ * vital that no other CPUs are left with writeable mappings
-+ * to a frame which is intending to become pgtable/segdesc.
-+ */
-+ cpumask_t *mask = this_cpu(scratch_cpumask);
-+
-+ BUG_ON(in_irq());
-+ cpumask_copy(mask, d->dirty_cpumask);
-+
-+ /* Don't flush if the timestamp is old enough */
-+ tlbflush_filter(mask, page->tlbflush_timestamp);
-+
-+ if ( unlikely(!cpumask_empty(mask)) &&
-+ /* Shadow mode: track only writable pages. */
-+ (!shadow_mode_enabled(d) ||
-+ ((nx & PGT_type_mask) == PGT_writable_page)) )
-+ {
-+ perfc_incr(need_flush_tlb_flush);
-+ /*
-+ * If page was a page table make sure the flush is
-+ * performed using an IPI in order to avoid changing the
-+ * type of a page table page under the feet of
-+ * spurious_page_fault().
-+ */
-+ flush_mask(mask,
-+ (x & PGT_type_mask) &&
-+ (x & PGT_type_mask) <= PGT_root_page_table
-+ ? FLUSH_TLB | FLUSH_FORCE_IPI
-+ : FLUSH_TLB);
-+ }
-+ }
-+ }
-+
- if ( unlikely(((x & PGT_type_mask) == PGT_writable_page) !=
- (type == PGT_writable_page)) )
- {
-@@ -3120,13 +3126,25 @@ static int _get_page_type(struct page_info *page, unsigned long type,
-
- if ( unlikely(!(nx & PGT_validated)) )
- {
-- if ( !(x & PGT_partial) )
-+ /*
-+ * No special validation needed for writable or shared pages. Page
-+ * tables and GDT/LDT need to have their contents audited.
-+ *
-+ * per validate_page(), non-atomic updates are fine here.
-+ */
-+ if ( type == PGT_writable_page || type == PGT_shared_page )
-+ page->u.inuse.type_info |= PGT_validated;
-+ else
- {
-- page->nr_validated_ptes = 0;
-- page->partial_flags = 0;
-- page->linear_pt_count = 0;
-+ if ( !(x & PGT_partial) )
-+ {
-+ page->nr_validated_ptes = 0;
-+ page->partial_flags = 0;
-+ page->linear_pt_count = 0;
-+ }
-+
-+ rc = validate_page(page, type, preemptible);
- }
-- rc = validate_page(page, type, preemptible);
- }
-
- out:
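
The race described above is a value-level A->B->A problem: cmpxchg() only compares the current value with the one read earlier, so it cannot tell that the count went 0 -> 1 -> 0 in between, which is why all safety actions have to be deferred until after the cmpxchg has actually taken the typeref. A small self-contained demonstration of that property (C11 atomics plus pthreads, compile with -pthread; the names are invented and this is not Xen code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t count;          /* stands in for the type count */

static void *guest(void *arg)
{
    (void)arg;
    atomic_fetch_add(&count, 1);        /* take a writeable ref (the B)...   */
    /* ...anything set up here (e.g. a TLB entry) outlives the reference...  */
    atomic_fetch_sub(&count, 1);        /* ...and drop it again (back to A)  */
    return NULL;
}

int main(void)
{
    pthread_t t;
    uint64_t x = atomic_load(&count);   /* observe A: count == 0 */

    /* Conclusions drawn from "count == 0" here can already be stale. */
    pthread_create(&t, NULL, guest, NULL);
    pthread_join(t, NULL);              /* A -> B -> A has happened */

    if ( atomic_compare_exchange_strong(&count, &x, x + 1) )
        printf("cmpxchg succeeded despite the intervening change\n");

    return 0;
}

The patch addresses this class of problem by performing the TLB-flush and validation work only on the path where the cmpxchg has already succeeded, i.e. after the point at which new writeable references can no longer appear.
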
diff --git a/system/xen/xsa/xsa402-4.16-1.patch b/system/xen/xsa/xsa402-4.16-1.patch
deleted file mode 100644
index b783383fc8..0000000000
--- a/system/xen/xsa/xsa402-4.16-1.patch
+++ /dev/null
@@ -1,43 +0,0 @@
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Subject: x86/page: Introduce _PAGE_* constants for memory types
-
-... rather than opencoding the PAT/PCD/PWT attributes in __PAGE_HYPERVISOR_*
-constants. These are going to be needed by forthcoming logic.
-
-No functional change.
-
-This is part of XSA-402.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-
-diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
-index 1d080cffbe84..2e542050f65a 100644
---- a/xen/include/asm-x86/page.h
-+++ b/xen/include/asm-x86/page.h
-@@ -331,6 +331,14 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);
-
- #define PAGE_CACHE_ATTRS (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
-
-+/* Memory types, encoded under Xen's choice of MSR_PAT. */
-+#define _PAGE_WB ( 0)
-+#define _PAGE_WT ( _PAGE_PWT)
-+#define _PAGE_UCM ( _PAGE_PCD )
-+#define _PAGE_UC ( _PAGE_PCD | _PAGE_PWT)
-+#define _PAGE_WC (_PAGE_PAT )
-+#define _PAGE_WP (_PAGE_PAT | _PAGE_PWT)
-+
- /*
- * Debug option: Ensure that granted mappings are not implicitly unmapped.
- * WARNING: This will need to be disabled to run OSes that use the spare PTE
-@@ -349,8 +357,8 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);
- #define __PAGE_HYPERVISOR_RX (_PAGE_PRESENT | _PAGE_ACCESSED)
- #define __PAGE_HYPERVISOR (__PAGE_HYPERVISOR_RX | \
- _PAGE_DIRTY | _PAGE_RW)
--#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_PCD)
--#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_PCD | _PAGE_PWT)
-+#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_UCM)
-+#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_UC)
- #define __PAGE_HYPERVISOR_SHSTK (__PAGE_HYPERVISOR_RO | _PAGE_DIRTY)
-
- #define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */
diff --git a/system/xen/xsa/xsa402-4.16-2.patch b/system/xen/xsa/xsa402-4.16-2.patch
deleted file mode 100644
index ebb2f5e221..0000000000
--- a/system/xen/xsa/xsa402-4.16-2.patch
+++ /dev/null
@@ -1,213 +0,0 @@
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Subject: x86: Don't change the cacheability of the directmap
-
-Changeset 55f97f49b7ce ("x86: Change cache attributes of Xen 1:1 page mappings
-in response to guest mapping requests") attempted to keep the cacheability
-consistent between different mappings of the same page.
-
-The reason wasn't described in the changelog, but it is understood to be in
-regards to a concern over machine check exceptions, owing to errata when using
-mixed cacheabilities. It did this primarily by updating Xen's mapping of the
-page in the direct map when the guest mapped a page with reduced cacheability.
-
-Unfortunately, the logic didn't actually prevent mixed cacheability from
-occurring:
- * A guest could map a page normally, and then map the same page with
- different cacheability; nothing prevented this.
- * The cacheability of the directmap was always latest-takes-precedence in
- terms of guest requests.
- * Grant-mapped frames with lesser cacheability didn't adjust the page's
- cacheattr settings.
- * The map_domain_page() function still unconditionally created WB mappings,
- irrespective of the page's cacheattr settings.
-
-Additionally, update_xen_mappings() had a bug where the alias calculation was
-wrong for mfn's which were .init content, which should have been treated as
-fully guest pages, not Xen pages.
-
-Worse yet, the logic introduced a vulnerability whereby necessary
-pagetable/segdesc adjustments made by Xen in the validation logic could become
-non-coherent between the cache and main memory. The CPU could subsequently
-operate on the stale value in the cache, rather than the safe value in main
-memory.
-
-The directmap contains primarily mappings of RAM. PAT/MTRR conflict
-resolution is asymmetric, and generally for MTRR=WB ranges, PAT of lesser
-cacheability resolves to being coherent. The special case is WC mappings,
-which are non-coherent against MTRR=WB regions (except for fully-coherent
-CPUs).
-
-Xen must not have any WC cacheability in the directmap, to prevent Xen's
-actions from creating non-coherency. (Guest actions creating non-coherency is
-dealt with in subsequent patches.) As all memory types for MTRR=WB ranges
-inter-operate coherently, so leave Xen's directmap mappings as WB.
-
-Only PV guests with access to devices can use reduced-cacheability mappings to
-begin with, and they're trusted not to mount DoSs against the system anyway.
-
-Drop PGC_cacheattr_{base,mask} entirely, and the logic to manipulate them.
-Shift the later PGC_* constants up, to gain 3 extra bits in the main reference
-count. Retain the check in get_page_from_l1e() for special_pages() because a
-guest has no business using reduced cacheability on these.
-
-This reverts changeset 55f97f49b7ce6c3520c555d19caac6cf3f9a5df0
-
-This is CVE-2022-26363, part of XSA-402.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: George Dunlap <george.dunlap@citrix.com>
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index c6429b0f749a..ab32d13a1a0d 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -783,28 +783,6 @@ bool is_iomem_page(mfn_t mfn)
- return (page_get_owner(page) == dom_io);
- }
-
--static int update_xen_mappings(unsigned long mfn, unsigned int cacheattr)
--{
-- int err = 0;
-- bool alias = mfn >= PFN_DOWN(xen_phys_start) &&
-- mfn < PFN_UP(xen_phys_start + xen_virt_end - XEN_VIRT_START);
-- unsigned long xen_va =
-- XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT);
--
-- if ( boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) )
-- return 0;
--
-- if ( unlikely(alias) && cacheattr )
-- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, 0);
-- if ( !err )
-- err = map_pages_to_xen((unsigned long)mfn_to_virt(mfn), _mfn(mfn), 1,
-- PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
-- if ( unlikely(alias) && !cacheattr && !err )
-- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, PAGE_HYPERVISOR);
--
-- return err;
--}
--
- #ifndef NDEBUG
- struct mmio_emul_range_ctxt {
- const struct domain *d;
-@@ -1009,47 +987,14 @@ get_page_from_l1e(
- goto could_not_pin;
- }
-
-- if ( pte_flags_to_cacheattr(l1f) !=
-- ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) )
-+ if ( (l1f & PAGE_CACHE_ATTRS) != _PAGE_WB && is_special_page(page) )
- {
-- unsigned long x, nx, y = page->count_info;
-- unsigned long cacheattr = pte_flags_to_cacheattr(l1f);
-- int err;
--
-- if ( is_special_page(page) )
-- {
-- if ( write )
-- put_page_type(page);
-- put_page(page);
-- gdprintk(XENLOG_WARNING,
-- "Attempt to change cache attributes of Xen heap page\n");
-- return -EACCES;
-- }
--
-- do {
-- x = y;
-- nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
-- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
--
-- err = update_xen_mappings(mfn, cacheattr);
-- if ( unlikely(err) )
-- {
-- cacheattr = y & PGC_cacheattr_mask;
-- do {
-- x = y;
-- nx = (x & ~PGC_cacheattr_mask) | cacheattr;
-- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
--
-- if ( write )
-- put_page_type(page);
-- put_page(page);
--
-- gdprintk(XENLOG_WARNING, "Error updating mappings for mfn %" PRI_mfn
-- " (pfn %" PRI_pfn ", from L1 entry %" PRIpte ") for d%d\n",
-- mfn, get_gpfn_from_mfn(mfn),
-- l1e_get_intpte(l1e), l1e_owner->domain_id);
-- return err;
-- }
-+ if ( write )
-+ put_page_type(page);
-+ put_page(page);
-+ gdprintk(XENLOG_WARNING,
-+ "Attempt to change cache attributes of Xen heap page\n");
-+ return -EACCES;
- }
-
- return 0;
-@@ -2467,25 +2412,10 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
- */
- static int cleanup_page_mappings(struct page_info *page)
- {
-- unsigned int cacheattr =
-- (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base;
- int rc = 0;
- unsigned long mfn = mfn_x(page_to_mfn(page));
-
- /*
-- * If we've modified xen mappings as a result of guest cache
-- * attributes, restore them to the "normal" state.
-- */
-- if ( unlikely(cacheattr) )
-- {
-- page->count_info &= ~PGC_cacheattr_mask;
--
-- BUG_ON(is_special_page(page));
--
-- rc = update_xen_mappings(mfn, 0);
-- }
--
-- /*
- * If this may be in a PV domain's IOMMU, remove it.
- *
- * NB that writable xenheap pages have their type set and cleared by
-diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
-index cb9052749963..8a9a43bb0a9d 100644
---- a/xen/include/asm-x86/mm.h
-+++ b/xen/include/asm-x86/mm.h
-@@ -69,25 +69,22 @@
- /* Set when is using a page as a page table */
- #define _PGC_page_table PG_shift(3)
- #define PGC_page_table PG_mask(1, 3)
-- /* 3-bit PAT/PCD/PWT cache-attribute hint. */
--#define PGC_cacheattr_base PG_shift(6)
--#define PGC_cacheattr_mask PG_mask(7, 6)
- /* Page is broken? */
--#define _PGC_broken PG_shift(7)
--#define PGC_broken PG_mask(1, 7)
-+#define _PGC_broken PG_shift(4)
-+#define PGC_broken PG_mask(1, 4)
- /* Mutually-exclusive page states: { inuse, offlining, offlined, free }. */
--#define PGC_state PG_mask(3, 9)
--#define PGC_state_inuse PG_mask(0, 9)
--#define PGC_state_offlining PG_mask(1, 9)
--#define PGC_state_offlined PG_mask(2, 9)
--#define PGC_state_free PG_mask(3, 9)
-+#define PGC_state PG_mask(3, 6)
-+#define PGC_state_inuse PG_mask(0, 6)
-+#define PGC_state_offlining PG_mask(1, 6)
-+#define PGC_state_offlined PG_mask(2, 6)
-+#define PGC_state_free PG_mask(3, 6)
- #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st)
- /* Page is not reference counted (see below for caveats) */
--#define _PGC_extra PG_shift(10)
--#define PGC_extra PG_mask(1, 10)
-+#define _PGC_extra PG_shift(7)
-+#define PGC_extra PG_mask(1, 7)
-
- /* Count of references to this frame. */
--#define PGC_count_width PG_shift(10)
-+#define PGC_count_width PG_shift(7)
- #define PGC_count_mask ((1UL<<PGC_count_width)-1)
-
- /*
diff --git a/system/xen/xsa/xsa402-4.16-3.patch b/system/xen/xsa/xsa402-4.16-3.patch
deleted file mode 100644
index b4d2a4c835..0000000000
--- a/system/xen/xsa/xsa402-4.16-3.patch
+++ /dev/null
@@ -1,284 +0,0 @@
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Subject: x86: Split cache_flush() out of cache_writeback()
-
-Subsequent changes will want a fully flushing version.
-
-Use the new helper rather than opencoding it in flush_area_local(). This
-resolves an outstanding issue where the conditional sfence is on the wrong
-side of the clflushopt loop. clflushopt is ordered with respect to older
-stores, not to younger stores.
-
-Rename gnttab_cache_flush()'s helper to avoid colliding in name.
-grant_table.c can see the prototype from cache.h so the build fails
-otherwise.
-
-This is part of XSA-402.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-
-Xen 4.16 and earlier:
- * Also backport half of c/s 3330013e67396 "VT-d / x86: re-arrange cache
- syncing" to split cache_writeback() out of the IOMMU logic, but without the
- associated hooks changes.
-
-diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
-index 25798df50f54..0c912b8669f8 100644
---- a/xen/arch/x86/flushtlb.c
-+++ b/xen/arch/x86/flushtlb.c
-@@ -234,7 +234,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
- if ( flags & FLUSH_CACHE )
- {
- const struct cpuinfo_x86 *c = &current_cpu_data;
-- unsigned long i, sz = 0;
-+ unsigned long sz = 0;
-
- if ( order < (BITS_PER_LONG - PAGE_SHIFT) )
- sz = 1UL << (order + PAGE_SHIFT);
-@@ -244,13 +244,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
- c->x86_clflush_size && c->x86_cache_size && sz &&
- ((sz >> 10) < c->x86_cache_size) )
- {
-- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
-- for ( i = 0; i < sz; i += c->x86_clflush_size )
-- alternative_input(".byte " __stringify(NOP_DS_PREFIX) ";"
-- " clflush %0",
-- "data16 clflush %0", /* clflushopt */
-- X86_FEATURE_CLFLUSHOPT,
-- "m" (((const char *)va)[i]));
-+ cache_flush(va, sz);
- flags &= ~FLUSH_CACHE;
- }
- else
-@@ -265,6 +259,80 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
- return flags;
- }
-
-+void cache_flush(const void *addr, unsigned int size)
-+{
-+ /*
-+ * This function may be called before current_cpu_data is established.
-+ * Hence a fallback is needed to prevent the loop below becoming infinite.
-+ */
-+ unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16;
-+ const void *end = addr + size;
-+
-+ addr -= (unsigned long)addr & (clflush_size - 1);
-+ for ( ; addr < end; addr += clflush_size )
-+ {
-+ /*
-+ * Note regarding the "ds" prefix use: it's faster to do a clflush
-+ * + prefix than a clflush + nop, and hence the prefix is added instead
-+ * of letting the alternative framework fill the gap by appending nops.
-+ */
-+ alternative_io("ds; clflush %[p]",
-+ "data16 clflush %[p]", /* clflushopt */
-+ X86_FEATURE_CLFLUSHOPT,
-+ /* no outputs */,
-+ [p] "m" (*(const char *)(addr)));
-+ }
-+
-+ alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
-+}
-+
-+void cache_writeback(const void *addr, unsigned int size)
-+{
-+ unsigned int clflush_size;
-+ const void *end = addr + size;
-+
-+ /* Fall back to CLFLUSH{,OPT} when CLWB isn't available. */
-+ if ( !boot_cpu_has(X86_FEATURE_CLWB) )
-+ return cache_flush(addr, size);
-+
-+ /*
-+ * This function may be called before current_cpu_data is established.
-+ * Hence a fallback is needed to prevent the loop below becoming infinite.
-+ */
-+ clflush_size = current_cpu_data.x86_clflush_size ?: 16;
-+ addr -= (unsigned long)addr & (clflush_size - 1);
-+ for ( ; addr < end; addr += clflush_size )
-+ {
-+/*
-+ * The arguments to a macro must not include preprocessor directives. Doing so
-+ * results in undefined behavior, so we have to create some defines here in
-+ * order to avoid it.
-+ */
-+#if defined(HAVE_AS_CLWB)
-+# define CLWB_ENCODING "clwb %[p]"
-+#elif defined(HAVE_AS_XSAVEOPT)
-+# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
-+#else
-+# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
-+#endif
-+
-+#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
-+#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
-+# define INPUT BASE_INPUT
-+#else
-+# define INPUT(addr) "a" (addr), BASE_INPUT(addr)
-+#endif
-+
-+ asm volatile (CLWB_ENCODING :: INPUT(addr));
-+
-+#undef INPUT
-+#undef BASE_INPUT
-+#undef CLWB_ENCODING
-+ }
-+
-+ asm volatile ("sfence" ::: "memory");
-+}
-+
- unsigned int guest_flush_tlb_flags(const struct domain *d)
- {
- bool shadow = paging_mode_shadow(d);
-diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
-index 66f8ce71741c..4c742cd8fe81 100644
---- a/xen/common/grant_table.c
-+++ b/xen/common/grant_table.c
-@@ -3431,7 +3431,7 @@ gnttab_swap_grant_ref(XEN_GUEST_HANDLE_PARAM(gnttab_swap_grant_ref_t) uop,
- return 0;
- }
-
--static int cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref)
-+static int _cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref)
- {
- struct domain *d, *owner;
- struct page_info *page;
-@@ -3525,7 +3525,7 @@ gnttab_cache_flush(XEN_GUEST_HANDLE_PARAM(gnttab_cache_flush_t) uop,
- return -EFAULT;
- for ( ; ; )
- {
-- int ret = cache_flush(&op, cur_ref);
-+ int ret = _cache_flush(&op, cur_ref);
-
- if ( ret < 0 )
- return ret;
-diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
-index 01e010a10d61..401079299725 100644
---- a/xen/drivers/passthrough/vtd/extern.h
-+++ b/xen/drivers/passthrough/vtd/extern.h
-@@ -76,7 +76,6 @@ int __must_check qinval_device_iotlb_sync(struct vtd_iommu *iommu,
- struct pci_dev *pdev,
- u16 did, u16 size, u64 addr);
-
--unsigned int get_cache_line_size(void);
- void flush_all_cache(void);
-
- uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node);
-diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
-index 8975c1de61bc..bc377c9bcfa4 100644
---- a/xen/drivers/passthrough/vtd/iommu.c
-+++ b/xen/drivers/passthrough/vtd/iommu.c
-@@ -31,6 +31,7 @@
- #include <xen/pci.h>
- #include <xen/pci_regs.h>
- #include <xen/keyhandler.h>
-+#include <asm/cache.h>
- #include <asm/msi.h>
- #include <asm/nops.h>
- #include <asm/irq.h>
-@@ -206,54 +207,6 @@ static void check_cleanup_domid_map(const struct domain *d,
- }
- }
-
--static void sync_cache(const void *addr, unsigned int size)
--{
-- static unsigned long clflush_size = 0;
-- const void *end = addr + size;
--
-- if ( clflush_size == 0 )
-- clflush_size = get_cache_line_size();
--
-- addr -= (unsigned long)addr & (clflush_size - 1);
-- for ( ; addr < end; addr += clflush_size )
--/*
-- * The arguments to a macro must not include preprocessor directives. Doing so
-- * results in undefined behavior, so we have to create some defines here in
-- * order to avoid it.
-- */
--#if defined(HAVE_AS_CLWB)
--# define CLWB_ENCODING "clwb %[p]"
--#elif defined(HAVE_AS_XSAVEOPT)
--# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
--#else
--# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
--#endif
--
--#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
--#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
--# define INPUT BASE_INPUT
--#else
--# define INPUT(addr) "a" (addr), BASE_INPUT(addr)
--#endif
-- /*
-- * Note regarding the use of NOP_DS_PREFIX: it's faster to do a clflush
-- * + prefix than a clflush + nop, and hence the prefix is added instead
-- * of letting the alternative framework fill the gap by appending nops.
-- */
-- alternative_io_2(".byte " __stringify(NOP_DS_PREFIX) "; clflush %[p]",
-- "data16 clflush %[p]", /* clflushopt */
-- X86_FEATURE_CLFLUSHOPT,
-- CLWB_ENCODING,
-- X86_FEATURE_CLWB, /* no outputs */,
-- INPUT(addr));
--#undef INPUT
--#undef BASE_INPUT
--#undef CLWB_ENCODING
--
-- alternative_2("", "sfence", X86_FEATURE_CLFLUSHOPT,
-- "sfence", X86_FEATURE_CLWB);
--}
--
- /* Allocate page table, return its machine address */
- uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node)
- {
-@@ -273,7 +226,7 @@ uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node)
- clear_page(vaddr);
-
- if ( (iommu_ops.init ? &iommu_ops : &vtd_ops)->sync_cache )
-- sync_cache(vaddr, PAGE_SIZE);
-+ cache_writeback(vaddr, PAGE_SIZE);
- unmap_domain_page(vaddr);
- cur_pg++;
- }
-@@ -1305,7 +1258,7 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd)
- iommu->nr_pt_levels = agaw_to_level(agaw);
-
- if ( !ecap_coherent(iommu->ecap) )
-- vtd_ops.sync_cache = sync_cache;
-+ vtd_ops.sync_cache = cache_writeback;
-
- /* allocate domain id bitmap */
- iommu->domid_bitmap = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_dom));
-diff --git a/xen/drivers/passthrough/vtd/x86/vtd.c b/xen/drivers/passthrough/vtd/x86/vtd.c
-index 6681dccd6970..55f0faa521cb 100644
---- a/xen/drivers/passthrough/vtd/x86/vtd.c
-+++ b/xen/drivers/passthrough/vtd/x86/vtd.c
-@@ -47,11 +47,6 @@ void unmap_vtd_domain_page(const void *va)
- unmap_domain_page(va);
- }
-
--unsigned int get_cache_line_size(void)
--{
-- return ((cpuid_ebx(1) >> 8) & 0xff) * 8;
--}
--
- void flush_all_cache()
- {
- wbinvd();
-diff --git a/xen/include/asm-x86/cache.h b/xen/include/asm-x86/cache.h
-index 1f7173d8c72c..e4770efb22b9 100644
---- a/xen/include/asm-x86/cache.h
-+++ b/xen/include/asm-x86/cache.h
-@@ -11,4 +11,11 @@
-
- #define __read_mostly __section(".data.read_mostly")
-
-+#ifndef __ASSEMBLY__
-+
-+void cache_flush(const void *addr, unsigned int size);
-+void cache_writeback(const void *addr, unsigned int size);
-+
-+#endif
-+
- #endif
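
The cache_flush() introduced above aligns the start address down to the cache-line size, issues one CLFLUSH/CLFLUSHOPT per line, and only then executes SFENCE, because CLFLUSHOPT is ordered with respect to older stores but not younger ones. A userspace sketch of the same shape using compiler intrinsics (assumes a 64-byte line and a toolchain/CPU with CLFLUSHOPT, e.g. gcc -mclflushopt; an illustration, not Xen's implementation):

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

#define CACHE_LINE 64u      /* assumed; Xen reads x86_clflush_size at runtime */

static void flush_range(const void *addr, size_t size)
{
    uintptr_t p = (uintptr_t)addr & ~(uintptr_t)(CACHE_LINE - 1);
    uintptr_t end = (uintptr_t)addr + size;

    for ( ; p < end; p += CACHE_LINE )
        _mm_clflushopt((void *)p);

    _mm_sfence();           /* the fence belongs after the loop, not before */
}

int main(void)
{
    static char buf[4096];

    buf[0] = 1;
    flush_range(buf, sizeof(buf));
    return 0;
}
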
diff --git a/system/xen/xsa/xsa402-4.16-4.patch b/system/xen/xsa/xsa402-4.16-4.patch
deleted file mode 100644
index 21109225d7..0000000000
--- a/system/xen/xsa/xsa402-4.16-4.patch
+++ /dev/null
@@ -1,83 +0,0 @@
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Subject: x86/amd: Work around CLFLUSH ordering on older parts
-
-On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakely ordered with everything,
-including reads and writes to the address, and LFENCE/SFENCE instructions.
-
-This creates a multitude of problematic corner cases, laid out in the manual.
-Arrange to use MFENCE on both sides of the CLFLUSH to force proper ordering.
-
-This is part of XSA-402.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-
-diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
-index a8e37dbb1f5c..b3b9a0df5fed 100644
---- a/xen/arch/x86/cpu/amd.c
-+++ b/xen/arch/x86/cpu/amd.c
-@@ -812,6 +812,14 @@ static void init_amd(struct cpuinfo_x86 *c)
- if (!cpu_has_lfence_dispatch)
- __set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability);
-
-+ /*
-+ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with
-+ * everything, including reads and writes to address, and
-+ * LFENCE/SFENCE instructions.
-+ */
-+ if (!cpu_has_clflushopt)
-+ setup_force_cpu_cap(X86_BUG_CLFLUSH_MFENCE);
-+
- switch(c->x86)
- {
- case 0xf ... 0x11:
-diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
-index 0c912b8669f8..dcbb4064012e 100644
---- a/xen/arch/x86/flushtlb.c
-+++ b/xen/arch/x86/flushtlb.c
-@@ -259,6 +259,13 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
- return flags;
- }
-
-+/*
-+ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything,
-+ * including reads and writes to address, and LFENCE/SFENCE instructions.
-+ *
-+ * This function only works safely after alternatives have run. Luckily, at
-+ * the time of writing, we don't flush the caches that early.
-+ */
- void cache_flush(const void *addr, unsigned int size)
- {
- /*
-@@ -268,6 +275,8 @@ void cache_flush(const void *addr, unsigned int size)
- unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16;
- const void *end = addr + size;
-
-+ alternative("", "mfence", X86_BUG_CLFLUSH_MFENCE);
-+
- addr -= (unsigned long)addr & (clflush_size - 1);
- for ( ; addr < end; addr += clflush_size )
- {
-@@ -283,7 +292,9 @@ void cache_flush(const void *addr, unsigned int size)
- [p] "m" (*(const char *)(addr)));
- }
-
-- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
-+ alternative_2("",
-+ "sfence", X86_FEATURE_CLFLUSHOPT,
-+ "mfence", X86_BUG_CLFLUSH_MFENCE);
- }
-
- void cache_writeback(const void *addr, unsigned int size)
-diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h
-index 7413febd7ad8..ff3157d52d13 100644
---- a/xen/include/asm-x86/cpufeatures.h
-+++ b/xen/include/asm-x86/cpufeatures.h
-@@ -47,6 +47,7 @@ XEN_CPUFEATURE(XEN_IBT, X86_SYNTH(27)) /* Xen uses CET Indirect Branch
-
- #define X86_BUG_FPU_PTRS X86_BUG( 0) /* (F)X{SAVE,RSTOR} doesn't save/restore FOP/FIP/FDP. */
- #define X86_BUG_NULL_SEG X86_BUG( 1) /* NULL-ing a selector preserves the base and limit. */
-+#define X86_BUG_CLFLUSH_MFENCE X86_BUG( 2) /* MFENCE needed to serialise CLFLUSH */
-
- /* Total number of capability words, inc synth and bug words. */
- #define NCAPINTS (FSCAPINTS + X86_NR_SYNTH + X86_NR_BUG) /* N 32-bit words worth of info */
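
As a variant of the previous sketch for the pre-CLFLUSHOPT AMD case described above, the plain CLFLUSH loop is bracketed by MFENCE on both sides (again only an illustration, not Xen's code):

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

#define CACHE_LINE 64u                  /* assumed line size */

static void flush_range_legacy(const void *addr, size_t size)
{
    uintptr_t p = (uintptr_t)addr & ~(uintptr_t)(CACHE_LINE - 1);
    uintptr_t end = (uintptr_t)addr + size;

    _mm_mfence();                       /* order CLFLUSH after older accesses */
    for ( ; p < end; p += CACHE_LINE )
        _mm_clflush((const void *)p);
    _mm_mfence();                       /* ...and before anything younger     */
}

int main(void)
{
    static char buf[4096];

    buf[0] = 1;
    flush_range_legacy(buf, sizeof(buf));
    return 0;
}
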
diff --git a/system/xen/xsa/xsa402-4.16-5.patch b/system/xen/xsa/xsa402-4.16-5.patch
deleted file mode 100644
index 4806d25c6f..0000000000
--- a/system/xen/xsa/xsa402-4.16-5.patch
+++ /dev/null
@@ -1,148 +0,0 @@
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Subject: x86/pv: Track and flush non-coherent mappings of RAM
-
-There are legitimate uses of WC mappings of RAM, e.g. for DMA buffers with
-devices that make non-coherent writes. The Linux sound subsystem makes
-extensive use of this technique.
-
-For such usecases, the guest's DMA buffer is mapped and consistently used as
-WC, and Xen doesn't interact with the buffer.
-
-However, a mischevious guest can use WC mappings to deliberately create
-non-coherency between the cache and RAM, and use this to trick Xen into
-validating a pagetable which isn't actually safe.
-
-Allocate a new PGT_non_coherent to track the non-coherency of mappings. Set
-it whenever a non-coherent writeable mapping is created. If the page is used
-as anything other than PGT_writable_page, force a cache flush before
-validation. Also force a cache flush before the page is returned to the heap.
-
-This is CVE-2022-26364, part of XSA-402.
-
-Reported-by: Jann Horn <jannh@google.com>
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: George Dunlap <george.dunlap@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index ab32d13a1a0d..bab9624fabb7 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -997,6 +997,15 @@ get_page_from_l1e(
- return -EACCES;
- }
-
-+ /*
-+ * Track writeable non-coherent mappings to RAM pages, to trigger a cache
-+ * flush later if the target is used as anything but a PGT_writeable page.
-+ * We care about all writeable mappings, including foreign mappings.
-+ */
-+ if ( !boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) &&
-+ (l1f & (PAGE_CACHE_ATTRS | _PAGE_RW)) == (_PAGE_WC | _PAGE_RW) )
-+ set_bit(_PGT_non_coherent, &page->u.inuse.type_info);
-+
- return 0;
-
- could_not_pin:
-@@ -2454,6 +2463,19 @@ static int cleanup_page_mappings(struct page_info *page)
- }
- }
-
-+ /*
-+ * Flush the cache if there were previously non-coherent writeable
-+ * mappings of this page. This forces the page to be coherent before it
-+ * is freed back to the heap.
-+ */
-+ if ( __test_and_clear_bit(_PGT_non_coherent, &page->u.inuse.type_info) )
-+ {
-+ void *addr = __map_domain_page(page);
-+
-+ cache_flush(addr, PAGE_SIZE);
-+ unmap_domain_page(addr);
-+ }
-+
- return rc;
- }
-
-@@ -3028,6 +3050,22 @@ static int _get_page_type(struct page_info *page, unsigned long type,
- if ( unlikely(!(nx & PGT_validated)) )
- {
- /*
-+ * Flush the cache if there were previously non-coherent mappings of
-+ * this page, and we're trying to use it as anything other than a
-+ * writeable page. This forces the page to be coherent before we
-+ * validate its contents for safety.
-+ */
-+ if ( (nx & PGT_non_coherent) && type != PGT_writable_page )
-+ {
-+ void *addr = __map_domain_page(page);
-+
-+ cache_flush(addr, PAGE_SIZE);
-+ unmap_domain_page(addr);
-+
-+ page->u.inuse.type_info &= ~PGT_non_coherent;
-+ }
-+
-+ /*
- * No special validation needed for writable or shared pages. Page
- * tables and GDT/LDT need to have their contents audited.
- *
-diff --git a/xen/arch/x86/pv/grant_table.c b/xen/arch/x86/pv/grant_table.c
-index 0325618c9883..81c72e61ed55 100644
---- a/xen/arch/x86/pv/grant_table.c
-+++ b/xen/arch/x86/pv/grant_table.c
-@@ -109,7 +109,17 @@ int create_grant_pv_mapping(uint64_t addr, mfn_t frame,
-
- ol1e = *pl1e;
- if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) )
-+ {
-+ /*
-+ * We always create mappings in this path. However, our caller,
-+ * map_grant_ref(), only passes potentially non-zero cache_flags for
-+ * MMIO frames, so this path doesn't create non-coherent mappings of
-+ * RAM frames and there's no need to calculate PGT_non_coherent.
-+ */
-+ ASSERT(!cache_flags || is_iomem_page(frame));
-+
- rc = GNTST_okay;
-+ }
-
- out_unlock:
- page_unlock(page);
-@@ -294,7 +304,18 @@ int replace_grant_pv_mapping(uint64_t addr, mfn_t frame,
- l1e_get_flags(ol1e), addr, grant_pte_flags);
-
- if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) )
-+ {
-+ /*
-+ * Generally, replace_grant_pv_mapping() is used to destroy mappings
-+ * (n1le = l1e_empty()), but it can be a present mapping on the
-+ * GNTABOP_unmap_and_replace path.
-+ *
-+ * In such cases, the PTE is fully transplanted from its old location
-+ * via steal_linear_addr(), so we need not perform PGT_non_coherent
-+ * checking here.
-+ */
- rc = GNTST_okay;
-+ }
-
- out_unlock:
- page_unlock(page);
-diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
-index 8a9a43bb0a9d..7464167ae192 100644
---- a/xen/include/asm-x86/mm.h
-+++ b/xen/include/asm-x86/mm.h
-@@ -53,8 +53,12 @@
- #define _PGT_partial PG_shift(8)
- #define PGT_partial PG_mask(1, 8)
-
-+/* Has this page been mapped writeable with a non-coherent memory type? */
-+#define _PGT_non_coherent PG_shift(9)
-+#define PGT_non_coherent PG_mask(1, 9)
-+
- /* Count of uses of this frame as its current type. */
--#define PGT_count_width PG_shift(8)
-+#define PGT_count_width PG_shift(9)
- #define PGT_count_mask ((1UL<<PGT_count_width)-1)
-
- /* Are the 'type mask' bits identical? */
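
The fix above records writeable non-coherent (e.g. WC) mappings in a sticky PGT_non_coherent bit and forces a cache flush the first time the page is used as anything other than a writable page, or before it is freed. A minimal sketch of that set / test-and-clear-and-flush pattern (the structure, flag name and flush stub below are invented for the illustration):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NON_COHERENT (1u << 0)          /* sticky "may have WC writes" flag */

struct page {
    _Atomic uint32_t type_info;
};

static void cache_flush_page(struct page *pg)
{
    printf("flushing cache for page %p\n", (void *)pg);
}

/* Called whenever a writeable, non-WB mapping of the page is created. */
static void mark_non_coherent(struct page *pg)
{
    atomic_fetch_or(&pg->type_info, NON_COHERENT);
}

/* Called before validating the page's contents, or before freeing it. */
static void make_coherent(struct page *pg)
{
    uint32_t old = atomic_fetch_and(&pg->type_info, ~NON_COHERENT);

    if ( old & NON_COHERENT )           /* flush only if the bit was set */
        cache_flush_page(pg);
}

int main(void)
{
    struct page pg = { 0 };

    mark_non_coherent(&pg);
    make_coherent(&pg);                 /* flushes once */
    make_coherent(&pg);                 /* no-op        */
    return 0;
}
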
diff --git a/system/xen/xsa/xsa410-4.16-01.patch b/system/xen/xsa/xsa410-4.16-01.patch
new file mode 100644
index 0000000000..8f352ee86d
--- /dev/null
+++ b/system/xen/xsa/xsa410-4.16-01.patch
@@ -0,0 +1,59 @@
+From 4b4359122a414cc15156e13e3805988b71ff9da0 Mon Sep 17 00:00:00 2001
+From: Julien Grall <jgrall@amazon.com>
+Date: Mon, 6 Jun 2022 06:17:25 +0000
+Subject: [PATCH 1/2] xen/arm: p2m: Prevent adding mapping when domain is dying
+
+During the domain destroy process, the domain will still be accessible
+until it is fully destroyed. So does the P2M because we don't bail
+out early if is_dying is non-zero. If a domain has permission to
+modify the other domain's P2M (i.e. dom0, or a stubdomain), then
+foreign mapping can be added past relinquish_p2m_mapping().
+
+Therefore, we need to prevent mapping to be added when the domain
+is dying. This commit prevents such adding of mapping by adding the
+d->is_dying check to p2m_set_entry(). Also this commit enhances the
+check in relinquish_p2m_mapping() to make sure that no mappings can
+be added in the P2M after the P2M lock is released.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Signed-off-by: Julien Grall <jgrall@amazon.com>
+Signed-off-by: Henry Wang <Henry.Wang@arm.com>
+Tested-by: Henry Wang <Henry.Wang@arm.com>
+Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
+---
+ xen/arch/arm/p2m.c | 11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
+index fb71fa4c1c90..cbeff90f4371 100644
+--- a/xen/arch/arm/p2m.c
++++ b/xen/arch/arm/p2m.c
+@@ -1093,6 +1093,15 @@ int p2m_set_entry(struct p2m_domain *p2m,
+ {
+ int rc = 0;
+
++ /*
++ * Any reference taken by the P2M mappings (e.g. foreign mapping) will
++ * be dropped in relinquish_p2m_mapping(). As the P2M will still
++ * be accessible after, we need to prevent mapping to be added when the
++ * domain is dying.
++ */
++ if ( unlikely(p2m->domain->is_dying) )
++ return -ENOMEM;
++
+ while ( nr )
+ {
+ unsigned long mask;
+@@ -1610,6 +1619,8 @@ int relinquish_p2m_mapping(struct domain *d)
+ unsigned int order;
+ gfn_t start, end;
+
++ BUG_ON(!d->is_dying);
++ /* No mappings can be added in the P2M after the P2M lock is released. */
+ p2m_write_lock(p2m);
+
+ start = p2m->lowest_mapped_gfn;
+--
+2.37.1
+
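
The patch closes the race by checking is_dying inside p2m_set_entry(), under the same P2M lock that relinquish_p2m_mapping() holds, so once that lock is released no further mappings can be added. A minimal standalone sketch of that check-under-the-same-lock pattern (compile with -pthread; the structure and function names are stand-ins, not Xen code):

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

struct p2m {
    pthread_mutex_t lock;
    bool dying;
    unsigned long nr_mappings;
};

static int add_mapping(struct p2m *p2m)
{
    int rc = 0;

    pthread_mutex_lock(&p2m->lock);
    if ( p2m->dying )                /* refuse mappings once teardown began */
        rc = -ENOMEM;
    else
        p2m->nr_mappings++;
    pthread_mutex_unlock(&p2m->lock);

    return rc;
}

static void teardown(struct p2m *p2m)
{
    pthread_mutex_lock(&p2m->lock);
    p2m->dying = true;               /* add_mapping() can only fail from now */
    p2m->nr_mappings = 0;            /* drop everything that was mapped      */
    pthread_mutex_unlock(&p2m->lock);
}

int main(void)
{
    struct p2m p2m = { .lock = PTHREAD_MUTEX_INITIALIZER };

    add_mapping(&p2m);               /* succeeds */
    teardown(&p2m);
    return add_mapping(&p2m) == -ENOMEM ? 0 : 1;   /* now refused */
}
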
diff --git a/system/xen/xsa/xsa410-4.16-02.patch b/system/xen/xsa/xsa410-4.16-02.patch
new file mode 100644
index 0000000000..7599a475f9
--- /dev/null
+++ b/system/xen/xsa/xsa410-4.16-02.patch
@@ -0,0 +1,165 @@
+From 0d5846490348fa09a0d0915d7c795685a016ce10 Mon Sep 17 00:00:00 2001
+From: Julien Grall <jgrall@amazon.com>
+Date: Mon, 6 Jun 2022 06:17:26 +0000
+Subject: [PATCH 2/2] xen/arm: p2m: Handle preemption when freeing intermediate
+ page tables
+
+At the moment the P2M page tables will be freed when the domain structure
+is freed without any preemption. As the P2M is quite large, iterating
+through this may take more time than it is reasonable without intermediate
+preemption (to run softirqs and perhaps scheduler).
+
+Split p2m_teardown() in two parts: one preemptible and called when
+relinquishing the resources, the other one non-preemptible and called
+when freeing the domain structure.
+
+As we are now freeing the P2M pages early, we also need to prevent
+further allocation if someone call p2m_set_entry() past p2m_teardown()
+(I wasn't able to prove this will never happen). This is done by
+the checking domain->is_dying from previous patch in p2m_set_entry().
+
+Similarly, we want to make sure that no-one can accessed the free
+pages. Therefore the root is cleared before freeing pages.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Signed-off-by: Julien Grall <jgrall@amazon.com>
+Signed-off-by: Henry Wang <Henry.Wang@arm.com>
+Tested-by: Henry Wang <Henry.Wang@arm.com>
+Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
+---
+ xen/arch/arm/domain.c | 10 +++++++--
+ xen/arch/arm/p2m.c | 47 ++++++++++++++++++++++++++++++++++++---
+ xen/include/asm-arm/p2m.h | 13 +++++++++--
+ 3 files changed, 63 insertions(+), 7 deletions(-)
+
+diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
+index 96e1b235501d..2694c39127c5 100644
+--- a/xen/arch/arm/domain.c
++++ b/xen/arch/arm/domain.c
+@@ -789,10 +789,10 @@ fail:
+ void arch_domain_destroy(struct domain *d)
+ {
+ /* IOMMU page table is shared with P2M, always call
+- * iommu_domain_destroy() before p2m_teardown().
++ * iommu_domain_destroy() before p2m_final_teardown().
+ */
+ iommu_domain_destroy(d);
+- p2m_teardown(d);
++ p2m_final_teardown(d);
+ domain_vgic_free(d);
+ domain_vuart_free(d);
+ free_xenheap_page(d->shared_info);
+@@ -996,6 +996,7 @@ enum {
+ PROG_xen,
+ PROG_page,
+ PROG_mapping,
++ PROG_p2m,
+ PROG_done,
+ };
+
+@@ -1056,6 +1057,11 @@ int domain_relinquish_resources(struct domain *d)
+ if ( ret )
+ return ret;
+
++ PROGRESS(p2m):
++ ret = p2m_teardown(d);
++ if ( ret )
++ return ret;
++
+ PROGRESS(done):
+ break;
+
+diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
+index cbeff90f4371..3bcd1e897e88 100644
+--- a/xen/arch/arm/p2m.c
++++ b/xen/arch/arm/p2m.c
+@@ -1527,17 +1527,58 @@ static void p2m_free_vmid(struct domain *d)
+ spin_unlock(&vmid_alloc_lock);
+ }
+
+-void p2m_teardown(struct domain *d)
++int p2m_teardown(struct domain *d)
+ {
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
++ unsigned long count = 0;
+ struct page_info *pg;
++ unsigned int i;
++ int rc = 0;
++
++ p2m_write_lock(p2m);
++
++ /*
++ * We are about to free the intermediate page-tables, so clear the
++ * root to prevent any walk to use them.
++ */
++ for ( i = 0; i < P2M_ROOT_PAGES; i++ )
++ clear_and_clean_page(p2m->root + i);
++
++ /*
++ * The domain will not be scheduled anymore, so in theory we should
++ * not need to flush the TLBs. Do it for safety purpose.
++ *
++ * Note that all the devices have already been de-assigned. So we don't
++ * need to flush the IOMMU TLB here.
++ */
++ p2m_force_tlb_flush_sync(p2m);
++
++ while ( (pg = page_list_remove_head(&p2m->pages)) )
++ {
++ free_domheap_page(pg);
++ count++;
++ /* Arbitrarily preempt every 512 iterations */
++ if ( !(count % 512) && hypercall_preempt_check() )
++ {
++ rc = -ERESTART;
++ break;
++ }
++ }
++
++ p2m_write_unlock(p2m);
++
++ return rc;
++}
++
++void p2m_final_teardown(struct domain *d)
++{
++ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+ /* p2m not actually initialized */
+ if ( !p2m->domain )
+ return;
+
+- while ( (pg = page_list_remove_head(&p2m->pages)) )
+- free_domheap_page(pg);
++ ASSERT(page_list_empty(&p2m->pages));
+
+ if ( p2m->root )
+ free_domheap_pages(p2m->root, P2M_ROOT_ORDER);
+diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
+index 8f11d9c97b5d..b3ba83283e11 100644
+--- a/xen/include/asm-arm/p2m.h
++++ b/xen/include/asm-arm/p2m.h
+@@ -192,8 +192,17 @@ void setup_virt_paging(void);
+ /* Init the datastructures for later use by the p2m code */
+ int p2m_init(struct domain *d);
+
+-/* Return all the p2m resources to Xen. */
+-void p2m_teardown(struct domain *d);
++/*
++ * The P2M resources are freed in two parts:
++ * - p2m_teardown() will be called when relinquish the resources. It
++ * will free large resources (e.g. intermediate page-tables) that
++ * requires preemption.
++ * - p2m_final_teardown() will be called when domain struct is been
++ * freed. This *cannot* be preempted and therefore one small
++ * resources should be freed here.
++ */
++int p2m_teardown(struct domain *d);
++void p2m_final_teardown(struct domain *d);
+
+ /*
+ * Remove mapping refcount on each mapping page in the p2m
+--
+2.37.1
+
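
The preemptible half of the teardown above walks the page list freeing entries, and every 512 iterations checks whether it should yield, returning -ERESTART so the operation can be restarted and the loop resumes where it left off. The same shape in a standalone sketch (the list type, preempt check and errno value are placeholders):

#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>

#ifndef ERESTART
#define ERESTART 85                  /* placeholder if <errno.h> lacks it */
#endif

struct page_node {
    struct page_node *next;
};

static bool preempt_check(void)
{
    return false;                    /* pretend nothing else is pending */
}

/* Free the list head by head; return -ERESTART when we ought to yield. */
static int teardown(struct page_node **head)
{
    unsigned long count = 0;

    while ( *head )
    {
        struct page_node *pg = *head;

        *head = pg->next;
        free(pg);

        /* Arbitrarily preempt every 512 iterations. */
        if ( !(++count % 512) && preempt_check() )
            return -ERESTART;
    }

    return 0;
}

int main(void)
{
    struct page_node *head = NULL;

    for ( int i = 0; i < 2048; i++ )
    {
        struct page_node *pg = malloc(sizeof(*pg));

        if ( !pg )
            break;
        pg->next = head;
        head = pg;
    }

    while ( teardown(&head) == -ERESTART )
        ;                            /* the caller simply restarts */

    return 0;
}
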
diff --git a/system/xen/xsa/xsa410-4.16-03.patch b/system/xen/xsa/xsa410-4.16-03.patch
new file mode 100644
index 0000000000..7411d900ea
--- /dev/null
+++ b/system/xen/xsa/xsa410-4.16-03.patch
@@ -0,0 +1,113 @@
+From: Roger Pau Monné <roger.pau@citrix.com>
+Subject: x86/p2m: add option to skip root pagetable removal in p2m_teardown()
+
+Add a new parameter to p2m_teardown() in order to select whether the
+root page table should also be freed. Note that all users are
+adjusted to pass the parameter to remove the root page tables, so
+behavior is not modified.
+
+No functional change intended.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Suggested-by: Julien Grall <julien@xen.org>
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Tim Deegan <tim@xen.org>
+
+--- a/xen/include/asm-x86/p2m.h
++++ b/xen/include/asm-x86/p2m.h
+@@ -574,7 +574,7 @@ int p2m_init(struct domain *d);
+ int p2m_alloc_table(struct p2m_domain *p2m);
+
+ /* Return all the p2m resources to Xen. */
+-void p2m_teardown(struct p2m_domain *p2m);
++void p2m_teardown(struct p2m_domain *p2m, bool remove_root);
+ void p2m_final_teardown(struct domain *d);
+
+ /* Add a page to a domain's p2m table */
+--- a/xen/arch/x86/mm/hap/hap.c
++++ b/xen/arch/x86/mm/hap/hap.c
+@@ -541,18 +541,18 @@ void hap_final_teardown(struct domain *d
+ }
+
+ for ( i = 0; i < MAX_ALTP2M; i++ )
+- p2m_teardown(d->arch.altp2m_p2m[i]);
++ p2m_teardown(d->arch.altp2m_p2m[i], true);
+ }
+
+ /* Destroy nestedp2m's first */
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+- p2m_teardown(d->arch.nested_p2m[i]);
++ p2m_teardown(d->arch.nested_p2m[i], true);
+ }
+
+ if ( d->arch.paging.hap.total_pages != 0 )
+ hap_teardown(d, NULL);
+
+- p2m_teardown(p2m_get_hostp2m(d));
++ p2m_teardown(p2m_get_hostp2m(d), true);
+ /* Free any memory that the p2m teardown released */
+ paging_lock(d);
+ hap_set_allocation(d, 0, NULL);
+--- a/xen/arch/x86/mm/p2m.c
++++ b/xen/arch/x86/mm/p2m.c
+@@ -749,11 +749,11 @@ int p2m_alloc_table(struct p2m_domain *p
+ * hvm fixme: when adding support for pvh non-hardware domains, this path must
+ * cleanup any foreign p2m types (release refcnts on them).
+ */
+-void p2m_teardown(struct p2m_domain *p2m)
++void p2m_teardown(struct p2m_domain *p2m, bool remove_root)
+ /* Return all the p2m pages to Xen.
+ * We know we don't have any extra mappings to these pages */
+ {
+- struct page_info *pg;
++ struct page_info *pg, *root_pg = NULL;
+ struct domain *d;
+
+ if (p2m == NULL)
+@@ -763,10 +763,22 @@ void p2m_teardown(struct p2m_domain *p2m
+
+ p2m_lock(p2m);
+ ASSERT(atomic_read(&d->shr_pages) == 0);
+- p2m->phys_table = pagetable_null();
++
++ if ( remove_root )
++ p2m->phys_table = pagetable_null();
++ else if ( !pagetable_is_null(p2m->phys_table) )
++ {
++ root_pg = pagetable_get_page(p2m->phys_table);
++ clear_domain_page(pagetable_get_mfn(p2m->phys_table));
++ }
+
+ while ( (pg = page_list_remove_head(&p2m->pages)) )
+- d->arch.paging.free_page(d, pg);
++ if ( pg != root_pg )
++ d->arch.paging.free_page(d, pg);
++
++ if ( root_pg )
++ page_list_add(root_pg, &p2m->pages);
++
+ p2m_unlock(p2m);
+ }
+
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -2701,7 +2701,7 @@ int shadow_enable(struct domain *d, u32
+ paging_unlock(d);
+ out_unlocked:
+ if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) )
+- p2m_teardown(p2m);
++ p2m_teardown(p2m, true);
+ if ( rv != 0 && pg != NULL )
+ {
+ pg->count_info &= ~PGC_count_mask;
+@@ -2866,7 +2866,7 @@ void shadow_final_teardown(struct domain
+ shadow_teardown(d, NULL);
+
+ /* It is now safe to pull down the p2m map. */
+- p2m_teardown(p2m_get_hostp2m(d));
++ p2m_teardown(p2m_get_hostp2m(d), true);
+ /* Free any shadow memory that the p2m teardown released */
+ paging_lock(d);
+ shadow_set_allocation(d, 0, NULL);
diff --git a/system/xen/xsa/xsa410-4.16-04.patch b/system/xen/xsa/xsa410-4.16-04.patch
new file mode 100644
index 0000000000..2170b8e1a3
--- /dev/null
+++ b/system/xen/xsa/xsa410-4.16-04.patch
@@ -0,0 +1,62 @@
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/HAP: adjust monitor table related error handling
+
+hap_make_monitor_table() will return INVALID_MFN if it encounters an
+error condition, but hap_update_paging_modes() wasn’t handling this
+value, resulting in an inappropriate value being stored in
+monitor_table. This would subsequently misguide at least
+hap_vcpu_teardown(). Avoid this by bailing early.
+
+Further, when a domain has/was already crashed or (perhaps less
+important as there's no such path known to lead here) is already dying,
+avoid calling domain_crash() on it again - that's at best confusing.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+
+--- a/xen/arch/x86/mm/hap/hap.c
++++ b/xen/arch/x86/mm/hap/hap.c
+@@ -39,6 +39,7 @@
+ #include <asm/domain.h>
+ #include <xen/numa.h>
+ #include <asm/hvm/nestedhvm.h>
++#include <public/sched.h>
+
+ #include "private.h"
+
+@@ -405,8 +406,13 @@ static mfn_t hap_make_monitor_table(stru
+ return m4mfn;
+
+ oom:
+- printk(XENLOG_G_ERR "out of memory building monitor pagetable\n");
+- domain_crash(d);
++ if ( !d->is_dying &&
++ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
++ {
++ printk(XENLOG_G_ERR "%pd: out of memory building monitor pagetable\n",
++ d);
++ domain_crash(d);
++ }
+ return INVALID_MFN;
+ }
+
+@@ -766,6 +772,9 @@ static void hap_update_paging_modes(stru
+ if ( pagetable_is_null(v->arch.hvm.monitor_table) )
+ {
+ mfn_t mmfn = hap_make_monitor_table(v);
++
++ if ( mfn_eq(mmfn, INVALID_MFN) )
++ goto unlock;
+ v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn);
+ make_cr3(v, mmfn);
+ hvm_update_host_cr3(v);
+@@ -774,6 +783,7 @@ static void hap_update_paging_modes(stru
+ /* CR3 is effectively updated by a mode change. Flush ASIDs, etc. */
+ hap_update_cr3(v, 0, false);
+
++ unlock:
+ paging_unlock(d);
+ put_gfn(d, cr3_gfn);
+ }
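
The patch above has hap_update_paging_modes() bail out as soon as the table builder reports failure, rather than storing INVALID_MFN as if it were a usable table. Below is a standalone sketch of that error-handling shape; it is not Xen code, and the names (INVALID_HANDLE, build_table, update_modes) are invented for illustration.

/* Sketch: a builder returns a sentinel on failure; the caller bails early. */
#include <stdint.h>
#include <stdlib.h>

#define INVALID_HANDLE UINT64_MAX

static uint64_t current_table = INVALID_HANDLE;

static uint64_t build_table(void)
{
    void *p = malloc(4096);   /* stands in for allocating a page table */

    if ( !p )
        return INVALID_HANDLE;   /* report failure instead of crashing */

    return (uintptr_t)p;
}

static void update_modes(void)
{
    if ( current_table == INVALID_HANDLE )
    {
        uint64_t t = build_table();

        /* Bail early: never store the sentinel as though it were valid. */
        if ( t == INVALID_HANDLE )
            return;

        current_table = t;
    }

    /* ... continue, with a guaranteed-valid table ... */
}
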
diff --git a/system/xen/xsa/xsa410-4.16-05.patch b/system/xen/xsa/xsa410-4.16-05.patch
new file mode 100644
index 0000000000..dc626c7f54
--- /dev/null
+++ b/system/xen/xsa/xsa410-4.16-05.patch
@@ -0,0 +1,60 @@
+From: Jan Beulich <jbeulich@suse.com>
+Subject: x86/shadow: tolerate failure of sh_set_toplevel_shadow()
+
+Subsequently sh_set_toplevel_shadow() will be adjusted to install a
+blank entry in case prealloc fails. There are, in fact, pre-existing
+error paths which would put in place a blank entry. The 4- and 2-level
+code in sh_update_cr3(), however, assume the top level entry to be
+valid.
+
+Hence bail from the function in the unlikely event that it's not. Note
+that 3-level logic works differently: In particular a guest is free to
+supply a PDPTR pointing at 4 non-present (or otherwise deemed invalid)
+entries. The guest will crash, but we already cope with that.
+
+Really mfn_valid() is likely wrong to use in sh_set_toplevel_shadow(),
+and it should instead be !mfn_eq(gmfn, INVALID_MFN). Avoid such a change
+in security context, but add a respective assertion.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Tim Deegan <tim@xen.org>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -2516,6 +2516,7 @@ void sh_set_toplevel_shadow(struct vcpu
+ /* Now figure out the new contents: is this a valid guest MFN? */
+ if ( !mfn_valid(gmfn) )
+ {
++ ASSERT(mfn_eq(gmfn, INVALID_MFN));
+ new_entry = pagetable_null();
+ goto install_new_entry;
+ }
+--- a/xen/arch/x86/mm/shadow/multi.c
++++ b/xen/arch/x86/mm/shadow/multi.c
+@@ -3312,6 +3312,11 @@ sh_update_cr3(struct vcpu *v, int do_loc
+ if ( sh_remove_write_access(d, gmfn, 4, 0) != 0 )
+ guest_flush_tlb_mask(d, d->dirty_cpumask);
+ sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow, sh_make_shadow);
++ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) )
++ {
++ ASSERT(d->is_dying || d->is_shutting_down);
++ return;
++ }
+ if ( !shadow_mode_external(d) && !is_pv_32bit_domain(d) )
+ {
+ mfn_t smfn = pagetable_get_mfn(v->arch.paging.shadow.shadow_table[0]);
+@@ -3370,6 +3375,11 @@ sh_update_cr3(struct vcpu *v, int do_loc
+ if ( sh_remove_write_access(d, gmfn, 2, 0) != 0 )
+ guest_flush_tlb_mask(d, d->dirty_cpumask);
+ sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow, sh_make_shadow);
++ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) )
++ {
++ ASSERT(d->is_dying || d->is_shutting_down);
++ return;
++ }
+ #else
+ #error This should never happen
+ #endif
diff --git a/system/xen/xsa/xsa410-4.16-06.patch b/system/xen/xsa/xsa410-4.16-06.patch
new file mode 100644
index 0000000000..8c89eb32c7
--- /dev/null
+++ b/system/xen/xsa/xsa410-4.16-06.patch
@@ -0,0 +1,255 @@
+From: Roger Pau Monné <roger.pau@citrix.com>
+Subject: x86/shadow: tolerate failure in shadow_prealloc()
+
+Prevent _shadow_prealloc() from calling BUG() when unable to fulfill
+the pre-allocation and instead return true/false. Modify
+shadow_prealloc() to crash the domain on allocation failure (if the
+domain is not already dying), as shadow cannot operate normally after
+that. Modify callers to also gracefully handle {_,}shadow_prealloc()
+failing to fulfill the request.
+
+Note this in turn requires adjusting the callers of
+sh_make_monitor_table() also to handle it returning INVALID_MFN.
+sh_update_paging_modes() is also modified to add additional error
+paths in case of allocation failure; some of those will return with
+null monitor page tables (and the domain likely crashed). This is no
+different from the current error paths, but the newly introduced ones are
+more likely to trigger.
+
+The now added failure points in sh_update_paging_modes() also require
+that on some error return paths the previous structures are cleared,
+and thus the monitor table is null.
+
+While there, adjust the 'type' parameter type of shadow_prealloc() to
+unsigned int rather than u32.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Tim Deegan <tim@xen.org>
+
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -36,6 +36,7 @@
+ #include <asm/flushtlb.h>
+ #include <asm/shadow.h>
+ #include <xen/numa.h>
++#include <public/sched.h>
+ #include "private.h"
+
+ DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags);
+@@ -928,14 +929,15 @@ static inline void trace_shadow_prealloc
+
+ /* Make sure there are at least count order-sized pages
+ * available in the shadow page pool. */
+-static void _shadow_prealloc(struct domain *d, unsigned int pages)
++static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages)
+ {
+ struct vcpu *v;
+ struct page_info *sp, *t;
+ mfn_t smfn;
+ int i;
+
+- if ( d->arch.paging.shadow.free_pages >= pages ) return;
++ if ( d->arch.paging.shadow.free_pages >= pages )
++ return true;
+
+ /* Shouldn't have enabled shadows if we've no vcpus. */
+ ASSERT(d->vcpu && d->vcpu[0]);
+@@ -951,7 +953,8 @@ static void _shadow_prealloc(struct doma
+ sh_unpin(d, smfn);
+
+ /* See if that freed up enough space */
+- if ( d->arch.paging.shadow.free_pages >= pages ) return;
++ if ( d->arch.paging.shadow.free_pages >= pages )
++ return true;
+ }
+
+ /* Stage two: all shadow pages are in use in hierarchies that are
+@@ -974,7 +977,7 @@ static void _shadow_prealloc(struct doma
+ if ( d->arch.paging.shadow.free_pages >= pages )
+ {
+ guest_flush_tlb_mask(d, d->dirty_cpumask);
+- return;
++ return true;
+ }
+ }
+ }
+@@ -987,7 +990,12 @@ static void _shadow_prealloc(struct doma
+ d->arch.paging.shadow.total_pages,
+ d->arch.paging.shadow.free_pages,
+ d->arch.paging.shadow.p2m_pages);
+- BUG();
++
++ ASSERT(d->is_dying);
++
++ guest_flush_tlb_mask(d, d->dirty_cpumask);
++
++ return false;
+ }
+
+ /* Make sure there are at least count pages of the order according to
+@@ -995,9 +1003,19 @@ static void _shadow_prealloc(struct doma
+ * This must be called before any calls to shadow_alloc(). Since this
+ * will free existing shadows to make room, it must be called early enough
+ * to avoid freeing shadows that the caller is currently working on. */
+-void shadow_prealloc(struct domain *d, u32 type, unsigned int count)
++bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count)
+ {
+- return _shadow_prealloc(d, shadow_size(type) * count);
++ bool ret = _shadow_prealloc(d, shadow_size(type) * count);
++
++ if ( !ret && !d->is_dying &&
++ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
++ /*
++ * Failing to allocate memory required for shadow usage can only result in
++ * a domain crash, do it here rather than relying on every caller to do it.
++ */
++ domain_crash(d);
++
++ return ret;
+ }
+
+ /* Deliberately free all the memory we can: this will tear down all of
+@@ -1218,7 +1236,7 @@ void shadow_free(struct domain *d, mfn_t
+ static struct page_info *
+ shadow_alloc_p2m_page(struct domain *d)
+ {
+- struct page_info *pg;
++ struct page_info *pg = NULL;
+
+ /* This is called both from the p2m code (which never holds the
+ * paging lock) and the log-dirty code (which always does). */
+@@ -1236,16 +1254,18 @@ shadow_alloc_p2m_page(struct domain *d)
+ d->arch.paging.shadow.p2m_pages,
+ shadow_min_acceptable_pages(d));
+ }
+- paging_unlock(d);
+- return NULL;
++ goto out;
+ }
+
+- shadow_prealloc(d, SH_type_p2m_table, 1);
++ if ( !shadow_prealloc(d, SH_type_p2m_table, 1) )
++ goto out;
++
+ pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0));
+ d->arch.paging.shadow.p2m_pages++;
+ d->arch.paging.shadow.total_pages--;
+ ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask));
+
++ out:
+ paging_unlock(d);
+
+ return pg;
+@@ -1336,7 +1356,9 @@ int shadow_set_allocation(struct domain
+ else if ( d->arch.paging.shadow.total_pages > pages )
+ {
+ /* Need to return memory to domheap */
+- _shadow_prealloc(d, 1);
++ if ( !_shadow_prealloc(d, 1) )
++ return -ENOMEM;
++
+ sp = page_list_remove_head(&d->arch.paging.shadow.freelist);
+ ASSERT(sp);
+ /*
+@@ -2334,12 +2356,13 @@ static void sh_update_paging_modes(struc
+ if ( mfn_eq(v->arch.paging.shadow.oos_snapshot[0], INVALID_MFN) )
+ {
+ int i;
++
++ if ( !shadow_prealloc(d, SH_type_oos_snapshot, SHADOW_OOS_PAGES) )
++ return;
++
+ for(i = 0; i < SHADOW_OOS_PAGES; i++)
+- {
+- shadow_prealloc(d, SH_type_oos_snapshot, 1);
+ v->arch.paging.shadow.oos_snapshot[i] =
+ shadow_alloc(d, SH_type_oos_snapshot, 0);
+- }
+ }
+ #endif /* OOS */
+
+@@ -2403,6 +2426,9 @@ static void sh_update_paging_modes(struc
+ mfn_t mmfn = sh_make_monitor_table(
+ v, v->arch.paging.mode->shadow.shadow_levels);
+
++ if ( mfn_eq(mmfn, INVALID_MFN) )
++ return;
++
+ v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn);
+ make_cr3(v, mmfn);
+ hvm_update_host_cr3(v);
+@@ -2441,6 +2467,12 @@ static void sh_update_paging_modes(struc
+ v->arch.hvm.monitor_table = pagetable_null();
+ new_mfn = sh_make_monitor_table(
+ v, v->arch.paging.mode->shadow.shadow_levels);
++ if ( mfn_eq(new_mfn, INVALID_MFN) )
++ {
++ sh_destroy_monitor_table(v, old_mfn,
++ old_mode->shadow.shadow_levels);
++ return;
++ }
+ v->arch.hvm.monitor_table = pagetable_from_mfn(new_mfn);
+ SHADOW_PRINTK("new monitor table %"PRI_mfn "\n",
+ mfn_x(new_mfn));
+@@ -2526,7 +2558,12 @@ void sh_set_toplevel_shadow(struct vcpu
+ if ( !mfn_valid(smfn) )
+ {
+ /* Make sure there's enough free shadow memory. */
+- shadow_prealloc(d, root_type, 1);
++ if ( !shadow_prealloc(d, root_type, 1) )
++ {
++ new_entry = pagetable_null();
++ goto install_new_entry;
++ }
++
+ /* Shadow the page. */
+ smfn = make_shadow(v, gmfn, root_type);
+ }
+--- a/xen/arch/x86/mm/shadow/hvm.c
++++ b/xen/arch/x86/mm/shadow/hvm.c
+@@ -700,7 +700,9 @@ mfn_t sh_make_monitor_table(const struct
+ ASSERT(!pagetable_get_pfn(v->arch.hvm.monitor_table));
+
+ /* Guarantee we can get the memory we need */
+- shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS);
++ if ( !shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS) )
++ return INVALID_MFN;
++
+ m4mfn = shadow_alloc(d, SH_type_monitor_table, 0);
+ mfn_to_page(m4mfn)->shadow_flags = 4;
+
+--- a/xen/arch/x86/mm/shadow/multi.c
++++ b/xen/arch/x86/mm/shadow/multi.c
+@@ -2440,9 +2440,14 @@ static int sh_page_fault(struct vcpu *v,
+ * Preallocate shadow pages *before* removing writable accesses
+ * otherwhise an OOS L1 might be demoted and promoted again with
+ * writable mappings. */
+- shadow_prealloc(d,
+- SH_type_l1_shadow,
+- GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1);
++ if ( !shadow_prealloc(d, SH_type_l1_shadow,
++ GUEST_PAGING_LEVELS < 4
++ ? 1 : GUEST_PAGING_LEVELS - 1) )
++ {
++ paging_unlock(d);
++ put_gfn(d, gfn_x(gfn));
++ return 0;
++ }
+
+ rc = gw_remove_write_accesses(v, va, &gw);
+
+--- a/xen/arch/x86/mm/shadow/private.h
++++ b/xen/arch/x86/mm/shadow/private.h
+@@ -383,7 +383,8 @@ void shadow_promote(struct domain *d, mf
+ void shadow_demote(struct domain *d, mfn_t gmfn, u32 type);
+
+ /* Shadow page allocation functions */
+-void shadow_prealloc(struct domain *d, u32 shadow_type, unsigned int count);
++bool __must_check shadow_prealloc(struct domain *d, unsigned int shadow_type,
++ unsigned int count);
+ mfn_t shadow_alloc(struct domain *d,
+ u32 shadow_type,
+ unsigned long backpointer);
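
The core of the patch above is turning a BUG()-on-failure preallocation into a __must_check bool, with the domain crash handled once in the public wrapper so callers only need to check the result and unwind. The sketch below shows that conversion in miniature; it is not Xen code, and struct dom, reclaim_some(), crash_dom(), and handle_fault() are invented stand-ins.

/* Sketch: fallible preallocation with a centralized crash on failure. */
#include <stdbool.h>

struct dom {
    unsigned int free_pages;
    bool dying;
};

static bool reclaim_some(struct dom *d) { (void)d; return false; }  /* stub */
static void crash_dom(struct dom *d)    { d->dying = true; }        /* stub */

static bool __attribute__((warn_unused_result))
_prealloc(struct dom *d, unsigned int pages)
{
    if ( d->free_pages >= pages )
        return true;

    while ( reclaim_some(d) )
        if ( d->free_pages >= pages )
            return true;

    return false;                    /* previously: BUG() */
}

bool prealloc(struct dom *d, unsigned int pages)
{
    bool ret = _prealloc(d, pages);

    /* Crash here once, rather than relying on every caller to do it. */
    if ( !ret && !d->dying )
        crash_dom(d);

    return ret;
}

/* Caller pattern: check the result and unwind instead of assuming success. */
int handle_fault(struct dom *d)
{
    if ( !prealloc(d, 2) )
        return 0;                    /* bail; the domain was crashed above */

    /* ... allocate and install shadows ... */
    return 1;
}
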
diff --git a/system/xen/xsa/xsa410-4.16-07.patch b/system/xen/xsa/xsa410-4.16-07.patch
new file mode 100644
index 0000000000..9eea91c42d
--- /dev/null
+++ b/system/xen/xsa/xsa410-4.16-07.patch
@@ -0,0 +1,82 @@
+From: Roger Pau Monné <roger.pau@citrix.com>
+Subject: x86/p2m: refuse new allocations for dying domains
+
+This will in particular prevent any attempts to add entries to the p2m,
+once - in a subsequent change - non-root entries have been removed.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Tim Deegan <tim@xen.org>
+
+--- a/xen/arch/x86/mm/hap/hap.c
++++ b/xen/arch/x86/mm/hap/hap.c
+@@ -245,6 +245,9 @@ static struct page_info *hap_alloc(struc
+
+ ASSERT(paging_locked_by_me(d));
+
++ if ( unlikely(d->is_dying) )
++ return NULL;
++
+ pg = page_list_remove_head(&d->arch.paging.hap.freelist);
+ if ( unlikely(!pg) )
+ return NULL;
+@@ -281,7 +284,7 @@ static struct page_info *hap_alloc_p2m_p
+ d->arch.paging.hap.p2m_pages++;
+ ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask));
+ }
+- else if ( !d->arch.paging.p2m_alloc_failed )
++ else if ( !d->arch.paging.p2m_alloc_failed && !d->is_dying )
+ {
+ d->arch.paging.p2m_alloc_failed = 1;
+ dprintk(XENLOG_ERR, "d%i failed to allocate from HAP pool\n",
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -939,6 +939,10 @@ static bool __must_check _shadow_preallo
+ if ( d->arch.paging.shadow.free_pages >= pages )
+ return true;
+
++ if ( unlikely(d->is_dying) )
++ /* No reclaim when the domain is dying, teardown will take care of it. */
++ return false;
++
+ /* Shouldn't have enabled shadows if we've no vcpus. */
+ ASSERT(d->vcpu && d->vcpu[0]);
+
+@@ -991,7 +995,7 @@ static bool __must_check _shadow_preallo
+ d->arch.paging.shadow.free_pages,
+ d->arch.paging.shadow.p2m_pages);
+
+- ASSERT(d->is_dying);
++ ASSERT_UNREACHABLE();
+
+ guest_flush_tlb_mask(d, d->dirty_cpumask);
+
+@@ -1005,10 +1009,13 @@ static bool __must_check _shadow_preallo
+ * to avoid freeing shadows that the caller is currently working on. */
+ bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count)
+ {
+- bool ret = _shadow_prealloc(d, shadow_size(type) * count);
++ bool ret;
++
++ if ( unlikely(d->is_dying) )
++ return false;
+
+- if ( !ret && !d->is_dying &&
+- (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
++ ret = _shadow_prealloc(d, shadow_size(type) * count);
++ if ( !ret && (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) )
+ /*
+ * Failing to allocate memory required for shadow usage can only result in
+ * a domain crash, do it here rather than relying on every caller to do it.
+@@ -1238,6 +1245,9 @@ shadow_alloc_p2m_page(struct domain *d)
+ {
+ struct page_info *pg = NULL;
+
++ if ( unlikely(d->is_dying) )
++ return NULL;
++
+ /* This is called both from the p2m code (which never holds the
+ * paging lock) and the log-dirty code (which always does). */
+ paging_lock_recursive(d);
diff --git a/system/xen/xsa/xsa410-4.16-08.patch b/system/xen/xsa/xsa410-4.16-08.patch
new file mode 100644
index 0000000000..92cdb4942b
--- /dev/null
+++ b/system/xen/xsa/xsa410-4.16-08.patch
@@ -0,0 +1,96 @@
+From: Roger Pau Monné <roger.pau@citrix.com>
+Subject: x86/p2m: truly free paging pool memory for dying domains
+
+Modify {hap,shadow}_free to free the page immediately if the domain is
+dying, so that pages don't accumulate in the pool when
+{shadow,hap}_final_teardown() get called. This is to limit the amount of
+work which needs to be done there (in a non-preemptable manner).
+
+Note the call to shadow_free() in shadow_free_p2m_page() is moved after
+increasing total_pages, so that the decrease done in shadow_free() in
+case the domain is dying doesn't underflow the counter, even if just for
+a short interval.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Tim Deegan <tim@xen.org>
+
+--- a/xen/arch/x86/mm/hap/hap.c
++++ b/xen/arch/x86/mm/hap/hap.c
+@@ -265,6 +265,18 @@ static void hap_free(struct domain *d, m
+
+ ASSERT(paging_locked_by_me(d));
+
++ /*
++ * For dying domains, actually free the memory here. This way less work is
++ * left to hap_final_teardown(), which cannot easily have preemption checks
++ * added.
++ */
++ if ( unlikely(d->is_dying) )
++ {
++ free_domheap_page(pg);
++ d->arch.paging.hap.total_pages--;
++ return;
++ }
++
+ d->arch.paging.hap.free_pages++;
+ page_list_add_tail(pg, &d->arch.paging.hap.freelist);
+ }
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -1187,6 +1187,7 @@ mfn_t shadow_alloc(struct domain *d,
+ void shadow_free(struct domain *d, mfn_t smfn)
+ {
+ struct page_info *next = NULL, *sp = mfn_to_page(smfn);
++ bool dying = ACCESS_ONCE(d->is_dying);
+ struct page_list_head *pin_list;
+ unsigned int pages;
+ u32 shadow_type;
+@@ -1229,11 +1230,32 @@ void shadow_free(struct domain *d, mfn_t
+ * just before the allocator hands the page out again. */
+ page_set_tlbflush_timestamp(sp);
+ perfc_decr(shadow_alloc_count);
+- page_list_add_tail(sp, &d->arch.paging.shadow.freelist);
++
++ /*
++ * For dying domains, actually free the memory here. This way less
++ * work is left to shadow_final_teardown(), which cannot easily have
++ * preemption checks added.
++ */
++ if ( unlikely(dying) )
++ {
++ /*
++ * The backpointer field (sh.back) used by shadow code aliases the
++ * domain owner field, unconditionally clear it here to avoid
++ * free_domheap_page() attempting to parse it.
++ */
++ page_set_owner(sp, NULL);
++ free_domheap_page(sp);
++ }
++ else
++ page_list_add_tail(sp, &d->arch.paging.shadow.freelist);
++
+ sp = next;
+ }
+
+- d->arch.paging.shadow.free_pages += pages;
++ if ( unlikely(dying) )
++ d->arch.paging.shadow.total_pages -= pages;
++ else
++ d->arch.paging.shadow.free_pages += pages;
+ }
+
+ /* Divert a page from the pool to be used by the p2m mapping.
+@@ -1303,9 +1325,9 @@ shadow_free_p2m_page(struct domain *d, s
+ * paging lock) and the log-dirty code (which always does). */
+ paging_lock_recursive(d);
+
+- shadow_free(d, page_to_mfn(pg));
+ d->arch.paging.shadow.p2m_pages--;
+ d->arch.paging.shadow.total_pages++;
++ shadow_free(d, page_to_mfn(pg));
+
+ paging_unlock(d);
+ }
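
The patch above makes the per-domain pool free path hand pages straight back to the allocator once the domain is dying, and reorders the p2m accounting so the dying-path decrement cannot underflow. Here is a standalone sketch of both points; it is not Xen code, and struct pool, pool_free(), and pool_return_p2m_page() are invented for illustration.

/* Sketch: return to a freelist normally, free immediately when dying. */
#include <stdbool.h>
#include <stdlib.h>

struct pool_page {
    struct pool_page *next;
};

struct pool {
    struct pool_page *freelist;
    unsigned int total_pages;
    unsigned int free_pages;
    bool owner_dying;
};

void pool_free(struct pool *p, struct pool_page *pg)
{
    if ( p->owner_dying )
    {
        /* Truly release the memory; don't let it pile up in the pool. */
        free(pg);
        p->total_pages--;
        return;
    }

    pg->next = p->freelist;
    p->freelist = pg;
    p->free_pages++;
}

/*
 * Analogue of the shadow_free_p2m_page() reordering: account the page back
 * into the pool *before* calling pool_free(), so the dying-domain decrement
 * above cannot underflow total_pages, even briefly.
 */
void pool_return_p2m_page(struct pool *p, struct pool_page *pg)
{
    p->total_pages++;
    pool_free(p, pg);
}
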
diff --git a/system/xen/xsa/xsa410-4.16-09.patch b/system/xen/xsa/xsa410-4.16-09.patch
new file mode 100644
index 0000000000..60259e1c09
--- /dev/null
+++ b/system/xen/xsa/xsa410-4.16-09.patch
@@ -0,0 +1,159 @@
+From: Roger Pau Monné <roger.pau@citrix.com>
+Subject: x86/p2m: free the paging memory pool preemptively
+
+The paging memory pool is currently freed in two different places:
+from {shadow,hap}_teardown() via domain_relinquish_resources() and
+from {shadow,hap}_final_teardown() via complete_domain_destroy().
+While the former does handle preemption, the latter doesn't.
+
+Attempt to move as much p2m related freeing as possible to happen
+before the call to {shadow,hap}_teardown(), so that most memory can be
+freed in a preemptive way. In order to avoid causing issues to
+existing callers leave the root p2m page tables set and free them in
+{hap,shadow}_final_teardown(). Also modify {hap,shadow}_free to free
+the page immediately if the domain is dying, so that pages don't
+accumulate in the pool when {shadow,hap}_final_teardown() get called.
+
+Move altp2m_vcpu_disable_ve() to be done in hap_teardown(), as that's
+the place where altp2m_active gets disabled now.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Tim Deegan <tim@xen.org>
+
+--- a/xen/arch/x86/domain.c
++++ b/xen/arch/x86/domain.c
+@@ -38,7 +38,6 @@
+ #include <xen/livepatch.h>
+ #include <public/sysctl.h>
+ #include <public/hvm/hvm_vcpu.h>
+-#include <asm/altp2m.h>
+ #include <asm/regs.h>
+ #include <asm/mc146818rtc.h>
+ #include <asm/system.h>
+@@ -2381,12 +2380,6 @@ int domain_relinquish_resources(struct d
+ vpmu_destroy(v);
+ }
+
+- if ( altp2m_active(d) )
+- {
+- for_each_vcpu ( d, v )
+- altp2m_vcpu_disable_ve(v);
+- }
+-
+ if ( is_pv_domain(d) )
+ {
+ for_each_vcpu ( d, v )
+--- a/xen/arch/x86/mm/hap/hap.c
++++ b/xen/arch/x86/mm/hap/hap.c
+@@ -28,6 +28,7 @@
+ #include <xen/domain_page.h>
+ #include <xen/guest_access.h>
+ #include <xen/keyhandler.h>
++#include <asm/altp2m.h>
+ #include <asm/event.h>
+ #include <asm/page.h>
+ #include <asm/current.h>
+@@ -546,24 +547,8 @@ void hap_final_teardown(struct domain *d
+ unsigned int i;
+
+ if ( hvm_altp2m_supported() )
+- {
+- d->arch.altp2m_active = 0;
+-
+- if ( d->arch.altp2m_eptp )
+- {
+- free_xenheap_page(d->arch.altp2m_eptp);
+- d->arch.altp2m_eptp = NULL;
+- }
+-
+- if ( d->arch.altp2m_visible_eptp )
+- {
+- free_xenheap_page(d->arch.altp2m_visible_eptp);
+- d->arch.altp2m_visible_eptp = NULL;
+- }
+-
+ for ( i = 0; i < MAX_ALTP2M; i++ )
+ p2m_teardown(d->arch.altp2m_p2m[i], true);
+- }
+
+ /* Destroy nestedp2m's first */
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+@@ -578,6 +563,8 @@ void hap_final_teardown(struct domain *d
+ paging_lock(d);
+ hap_set_allocation(d, 0, NULL);
+ ASSERT(d->arch.paging.hap.p2m_pages == 0);
++ ASSERT(d->arch.paging.hap.free_pages == 0);
++ ASSERT(d->arch.paging.hap.total_pages == 0);
+ paging_unlock(d);
+ }
+
+@@ -603,6 +590,7 @@ void hap_vcpu_teardown(struct vcpu *v)
+ void hap_teardown(struct domain *d, bool *preempted)
+ {
+ struct vcpu *v;
++ unsigned int i;
+
+ ASSERT(d->is_dying);
+ ASSERT(d != current->domain);
+@@ -611,6 +599,28 @@ void hap_teardown(struct domain *d, bool
+ for_each_vcpu ( d, v )
+ hap_vcpu_teardown(v);
+
++ /* Leave the root pt in case we get further attempts to modify the p2m. */
++ if ( hvm_altp2m_supported() )
++ {
++ if ( altp2m_active(d) )
++ for_each_vcpu ( d, v )
++ altp2m_vcpu_disable_ve(v);
++
++ d->arch.altp2m_active = 0;
++
++ FREE_XENHEAP_PAGE(d->arch.altp2m_eptp);
++ FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp);
++
++ for ( i = 0; i < MAX_ALTP2M; i++ )
++ p2m_teardown(d->arch.altp2m_p2m[i], false);
++ }
++
++ /* Destroy nestedp2m's after altp2m. */
++ for ( i = 0; i < MAX_NESTEDP2M; i++ )
++ p2m_teardown(d->arch.nested_p2m[i], false);
++
++ p2m_teardown(p2m_get_hostp2m(d), false);
++
+ paging_lock(d); /* Keep various asserts happy */
+
+ if ( d->arch.paging.hap.total_pages != 0 )
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -2824,8 +2824,17 @@ void shadow_teardown(struct domain *d, b
+ for_each_vcpu ( d, v )
+ shadow_vcpu_teardown(v);
+
++ p2m_teardown(p2m_get_hostp2m(d), false);
++
+ paging_lock(d);
+
++ /*
++ * Reclaim all shadow memory so that shadow_set_allocation() doesn't find
++ * in-use pages, as _shadow_prealloc() will no longer try to reclaim pages
++ * because the domain is dying.
++ */
++ shadow_blow_tables(d);
++
+ #if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC))
+ /* Free the virtual-TLB array attached to each vcpu */
+ for_each_vcpu(d, v)
+@@ -2946,6 +2955,9 @@ void shadow_final_teardown(struct domain
+ d->arch.paging.shadow.total_pages,
+ d->arch.paging.shadow.free_pages,
+ d->arch.paging.shadow.p2m_pages);
++ ASSERT(!d->arch.paging.shadow.total_pages);
++ ASSERT(!d->arch.paging.shadow.free_pages);
++ ASSERT(!d->arch.paging.shadow.p2m_pages);
+ paging_unlock(d);
+ }
+
diff --git a/system/xen/xsa/xsa410-4.16-10.patch b/system/xen/xsa/xsa410-4.16-10.patch
new file mode 100644
index 0000000000..258b7fff31
--- /dev/null
+++ b/system/xen/xsa/xsa410-4.16-10.patch
@@ -0,0 +1,171 @@
+From: Julien Grall <jgrall@amazon.com>
+Subject: xen/x86: p2m: Add preemption in p2m_teardown()
+
+The list p2m->pages contains all the pages used by the P2M. On large
+instances this list can be quite long, and the time spent calling
+d->arch.paging.free_page() will take more than 1ms for an 80GB guest
+on a Xen running in a nested environment on a c5.metal.
+
+By extrapolation, it would take > 100ms for an 8TB guest (what we
+currently security support). So add some preemption in p2m_teardown()
+and propagate it to the callers. Note there are 3 places where
+the preemption is not enabled:
+ - hap_final_teardown()/shadow_final_teardown(): We are
+ preventing updates to the P2M once the domain is dying (so
+ no more pages could be allocated) and most of the P2M pages
+ will be freed in a preemptive manner when relinquishing the
+ resources. So it is fine to disable preemption.
+ - shadow_enable(): This is fine because it will undo the allocation
+ that may have been made by p2m_alloc_table() (so only the root
+ page table).
+
+The preemption is arbitrarily checked every 1024 iterations.
+
+Note that with the current approach, Xen doesn't keep track of whether
+the alt/nested P2Ms have been cleared. So there is some redundant work.
+However, this is not expected to incur too much overhead (the P2M lock
+shouldn't be contended during teardown). So this optimization is
+left outside of the security event.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Signed-off-by: Julien Grall <jgrall@amazon.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+
+--- a/xen/include/asm-x86/p2m.h
++++ b/xen/include/asm-x86/p2m.h
+@@ -574,7 +574,7 @@ int p2m_init(struct domain *d);
+ int p2m_alloc_table(struct p2m_domain *p2m);
+
+ /* Return all the p2m resources to Xen. */
+-void p2m_teardown(struct p2m_domain *p2m, bool remove_root);
++void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted);
+ void p2m_final_teardown(struct domain *d);
+
+ /* Add a page to a domain's p2m table */
+--- a/xen/arch/x86/mm/hap/hap.c
++++ b/xen/arch/x86/mm/hap/hap.c
+@@ -548,17 +548,17 @@ void hap_final_teardown(struct domain *d
+
+ if ( hvm_altp2m_supported() )
+ for ( i = 0; i < MAX_ALTP2M; i++ )
+- p2m_teardown(d->arch.altp2m_p2m[i], true);
++ p2m_teardown(d->arch.altp2m_p2m[i], true, NULL);
+
+ /* Destroy nestedp2m's first */
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+- p2m_teardown(d->arch.nested_p2m[i], true);
++ p2m_teardown(d->arch.nested_p2m[i], true, NULL);
+ }
+
+ if ( d->arch.paging.hap.total_pages != 0 )
+ hap_teardown(d, NULL);
+
+- p2m_teardown(p2m_get_hostp2m(d), true);
++ p2m_teardown(p2m_get_hostp2m(d), true, NULL);
+ /* Free any memory that the p2m teardown released */
+ paging_lock(d);
+ hap_set_allocation(d, 0, NULL);
+@@ -612,14 +612,24 @@ void hap_teardown(struct domain *d, bool
+ FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp);
+
+ for ( i = 0; i < MAX_ALTP2M; i++ )
+- p2m_teardown(d->arch.altp2m_p2m[i], false);
++ {
++ p2m_teardown(d->arch.altp2m_p2m[i], false, preempted);
++ if ( preempted && *preempted )
++ return;
++ }
+ }
+
+ /* Destroy nestedp2m's after altp2m. */
+ for ( i = 0; i < MAX_NESTEDP2M; i++ )
+- p2m_teardown(d->arch.nested_p2m[i], false);
++ {
++ p2m_teardown(d->arch.nested_p2m[i], false, preempted);
++ if ( preempted && *preempted )
++ return;
++ }
+
+- p2m_teardown(p2m_get_hostp2m(d), false);
++ p2m_teardown(p2m_get_hostp2m(d), false, preempted);
++ if ( preempted && *preempted )
++ return;
+
+ paging_lock(d); /* Keep various asserts happy */
+
+--- a/xen/arch/x86/mm/p2m.c
++++ b/xen/arch/x86/mm/p2m.c
+@@ -749,12 +749,13 @@ int p2m_alloc_table(struct p2m_domain *p
+ * hvm fixme: when adding support for pvh non-hardware domains, this path must
+ * cleanup any foreign p2m types (release refcnts on them).
+ */
+-void p2m_teardown(struct p2m_domain *p2m, bool remove_root)
++void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted)
+ /* Return all the p2m pages to Xen.
+ * We know we don't have any extra mappings to these pages */
+ {
+ struct page_info *pg, *root_pg = NULL;
+ struct domain *d;
++ unsigned int i = 0;
+
+ if (p2m == NULL)
+ return;
+@@ -773,8 +774,19 @@ void p2m_teardown(struct p2m_domain *p2m
+ }
+
+ while ( (pg = page_list_remove_head(&p2m->pages)) )
+- if ( pg != root_pg )
+- d->arch.paging.free_page(d, pg);
++ {
++ if ( pg == root_pg )
++ continue;
++
++ d->arch.paging.free_page(d, pg);
++
++ /* Arbitrarily check preemption every 1024 iterations */
++ if ( preempted && !(++i % 1024) && general_preempt_check() )
++ {
++ *preempted = true;
++ break;
++ }
++ }
+
+ if ( root_pg )
+ page_list_add(root_pg, &p2m->pages);
+--- a/xen/arch/x86/mm/shadow/common.c
++++ b/xen/arch/x86/mm/shadow/common.c
+@@ -2770,8 +2770,12 @@ int shadow_enable(struct domain *d, u32
+ out_locked:
+ paging_unlock(d);
+ out_unlocked:
++ /*
++ * This is fine to ignore the preemption here because only the root
++ * will be allocated by p2m_alloc_table().
++ */
+ if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) )
+- p2m_teardown(p2m, true);
++ p2m_teardown(p2m, true, NULL);
+ if ( rv != 0 && pg != NULL )
+ {
+ pg->count_info &= ~PGC_count_mask;
+@@ -2824,7 +2828,9 @@ void shadow_teardown(struct domain *d, b
+ for_each_vcpu ( d, v )
+ shadow_vcpu_teardown(v);
+
+- p2m_teardown(p2m_get_hostp2m(d), false);
++ p2m_teardown(p2m_get_hostp2m(d), false, preempted);
++ if ( preempted && *preempted )
++ return;
+
+ paging_lock(d);
+
+@@ -2945,7 +2951,7 @@ void shadow_final_teardown(struct domain
+ shadow_teardown(d, NULL);
+
+ /* It is now safe to pull down the p2m map. */
+- p2m_teardown(p2m_get_hostp2m(d), true);
++ p2m_teardown(p2m_get_hostp2m(d), true, NULL);
+ /* Free any shadow memory that the p2m teardown released */
+ paging_lock(d);
+ shadow_set_allocation(d, 0, NULL);
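
The preemption added to p2m_teardown() above boils down to a bulk-free loop that polls for pending work every 1024 iterations and reports back through a caller-supplied flag. The following standalone sketch shows the same loop shape; it is not Xen code, and need_resched() merely stands in for Xen's general_preempt_check().

/* Sketch: a preemptible teardown loop with periodic preemption checks. */
#include <stdbool.h>
#include <stdlib.h>

struct page {
    struct page *next;
};

static bool need_resched(void)
{
    return false;   /* stub: a real hypervisor checks for pending softirqs */
}

void teardown(struct page **pages, bool *preempted)
{
    struct page *pg;
    unsigned int i = 0;

    while ( (pg = *pages) != NULL )
    {
        *pages = pg->next;
        free(pg);

        /* Arbitrarily check for preemption every 1024 iterations. */
        if ( preempted && !(++i % 1024) && need_resched() )
        {
            *preempted = true;   /* the caller will come back and continue */
            return;
        }
    }
}

/* Caller pattern: keep invoking teardown() until it runs to completion. */
void teardown_all(struct page **pages)
{
    bool preempted;

    do {
        preempted = false;
        teardown(pages, &preempted);
        /* A real caller would return to the scheduler here if preempted. */
    } while ( preempted );
}
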
diff --git a/system/xen/xsa/xsa411.patch b/system/xen/xsa/xsa411.patch
new file mode 100644
index 0000000000..50dcae4a81
--- /dev/null
+++ b/system/xen/xsa/xsa411.patch
@@ -0,0 +1,55 @@
+From: Jan Beulich <jbeulich@suse.com>
+Subject: gnttab: correct locking on transitive grant copy error path
+
+While the comment next to the lock dropping in preparation of
+recursively calling acquire_grant_for_copy() mistakenly talks about the
+rd == td case (excluded a few lines further up), the same concerns apply
+to the calling of release_grant_for_copy() on a subsequent error path.
+
+This is CVE-2022-33748 / XSA-411.
+
+Fixes: ad48fb963dbf ("gnttab: fix transitive grant handling")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+---
+v2: Extend code comment.
+
+--- a/xen/common/grant_table.c
++++ b/xen/common/grant_table.c
+@@ -2622,9 +2622,8 @@ acquire_grant_for_copy(
+ trans_domid);
+
+ /*
+- * acquire_grant_for_copy() could take the lock on the
+- * remote table (if rd == td), so we have to drop the lock
+- * here and reacquire.
++ * acquire_grant_for_copy() will take the lock on the remote table,
++ * so we have to drop the lock here and reacquire.
+ */
+ active_entry_release(act);
+ grant_read_unlock(rgt);
+@@ -2661,11 +2660,25 @@ acquire_grant_for_copy(
+ act->trans_gref != trans_gref ||
+ !act->is_sub_page)) )
+ {
++ /*
++ * Like above for acquire_grant_for_copy() we need to drop and then
++ * re-acquire the locks here to prevent lock order inversion issues.
++ * Unlike for acquire_grant_for_copy() we don't need to re-check
++ * anything, as release_grant_for_copy() doesn't depend on the grant
++ * table entry: It only updates internal state and the status flags.
++ */
++ active_entry_release(act);
++ grant_read_unlock(rgt);
++
+ release_grant_for_copy(td, trans_gref, readonly);
+ rcu_unlock_domain(td);
++
++ grant_read_lock(rgt);
++ act = active_entry_acquire(rgt, gref);
+ reduce_status_for_pin(rd, act, status, readonly);
+ active_entry_release(act);
+ grant_read_unlock(rgt);
++
+ put_page(*page);
+ *page = NULL;
+ return ERESTART;
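
The XSA-411 fix above follows a common pattern: before calling into another object that takes its own lock, drop the local lock, make the call, then reacquire and redo the small amount of purely local bookkeeping. The sketch below illustrates that shape with pthread mutexes; it is not Xen code, and struct table, release_remote(), and error_path() are invented for illustration.

/* Sketch: drop-and-reacquire to avoid holding two locks at once. */
#include <pthread.h>

struct table {
    pthread_mutex_t lock;
    unsigned int pin_count;
};

/* Takes its own lock internally, hence must not be called with ours held. */
static void release_remote(struct table *remote)
{
    pthread_mutex_lock(&remote->lock);
    remote->pin_count--;
    pthread_mutex_unlock(&remote->lock);
}

void error_path(struct table *local, struct table *remote)
{
    /* The caller enters with local->lock held. */

    /*
     * Drop the local lock before calling into the remote table, so the two
     * locks are never held at the same time and no ordering issue can arise.
     */
    pthread_mutex_unlock(&local->lock);

    release_remote(remote);

    /* Reacquire and redo the purely local bookkeeping. */
    pthread_mutex_lock(&local->lock);
    local->pin_count--;
    pthread_mutex_unlock(&local->lock);
}
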