path: root/system/xen/xsa/xsa401-4.16-1.patch
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: x86/pv: Clean up _get_page_type()

Various fixes for clarity, ahead of making complicated changes.

 * Split the overflow check out of the if/else chain for type handling, as
   it's somewhat unrelated.
 * Comment the main if/else chain to explain what is going on.  Adjust one
   ASSERT() and state the bit layout for validate-locked and partial states
   (a reference sketch of that layout follows the sign-off block below).
 * Correct the comment about TLB flushing, as it's backwards.  The problem
   case is when writeable mappings to a page are retained as it becomes
   read-only, as that allows the guest to bypass Xen's safety checks for
   updates.
 * Reduce the scope of 'y'.  It is an artefact of the cmpxchg loop and not
   valid for use by subsequent logic.  Switch to using ACCESS_ONCE() to treat
   all reads as explicitly volatile.  The only thing preventing the validated
   wait-loop from being infinite is the compiler barrier hidden in
   cpu_relax().  (A sketch of this pattern follows the list.)
 * Replace one page_get_owner(page) with 'd', which has already been
   calculated and is in scope.
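
To make the ACCESS_ONCE() point concrete, below is a minimal sketch of the
read/cmpxchg pattern the function uses.  It is an illustration only: it
assumes ACCESS_ONCE() is the usual volatile-cast macro, and it elides all of
the real function's type handling.

    /* Plausible ACCESS_ONCE() definition: force a fresh load through a
     * volatile-qualified lvalue so the compiler cannot cache the value
     * across iterations. */
    #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

    for ( unsigned long y = ACCESS_ONCE(page->u.inuse.type_info); ; )
    {
        unsigned long x = y, nx = x + 1;

        /* ... inspect x, adjust nx, or bail out early ... */

        if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
            break;  /* nx was installed; y must not be used past here */
    }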

No functional change.

This is part of XSA-401 / CVE-2022-26362.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
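
For reference, a sketch of the type_info fields and states the new comments
refer to.  Constant names are as used in the patch; the exact bit positions
live in Xen's mm headers and are not restated here.

    /*
     * PGT_type_mask   - what the page is being used as (lN pagetable,
     *                   segdesc, writable, ...)
     * PGT_pae_xen_l2  - qualifier for the L2 type
     * PGT_validated   - the page's contents have passed validation
     * PGT_partial     - validation was started but preempted part-way
     * PGT_count_mask  - outstanding type reference count
     *
     * The two "count == 1, not yet validated" states:
     *   validate-locked:  PGT_[type] | 1
     *   partial:          PGT_[type] | PGT_partial | 1
     */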

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 796faca64103..ddd32f88c798 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2935,16 +2935,17 @@ static int _put_page_type(struct page_info *page, unsigned int flags,
 static int _get_page_type(struct page_info *page, unsigned long type,
                           bool preemptible)
 {
-    unsigned long nx, x, y = page->u.inuse.type_info;
+    unsigned long nx, x;
     int rc = 0;
 
     ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
     ASSERT(!in_irq());
 
-    for ( ; ; )
+    for ( unsigned long y = ACCESS_ONCE(page->u.inuse.type_info); ; )
     {
         x  = y;
         nx = x + 1;
+
         if ( unlikely((nx & PGT_count_mask) == 0) )
         {
             gdprintk(XENLOG_WARNING,
@@ -2952,8 +2953,15 @@ static int _get_page_type(struct page_info *page, unsigned long type,
                      mfn_x(page_to_mfn(page)));
             return -EINVAL;
         }
-        else if ( unlikely((x & PGT_count_mask) == 0) )
+
+        if ( unlikely((x & PGT_count_mask) == 0) )
         {
+            /*
+             * Typeref 0 -> 1.
+             *
+             * Type changes are permitted when the typeref is 0.  If the type
+             * actually changes, the page needs re-validating.
+             */
             struct domain *d = page_get_owner(page);
 
             if ( d && shadow_mode_enabled(d) )
@@ -2964,8 +2972,8 @@ static int _get_page_type(struct page_info *page, unsigned long type,
             {
                 /*
                  * On type change we check to flush stale TLB entries. It is
-                 * vital that no other CPUs are left with mappings of a frame
-                 * which is about to become writeable to the guest.
+                 * vital that no other CPUs are left with writeable mappings
+                 * to a frame which is intending to become pgtable/segdesc.
                  */
                 cpumask_t *mask = this_cpu(scratch_cpumask);
 
@@ -2977,7 +2985,7 @@ static int _get_page_type(struct page_info *page, unsigned long type,
 
                 if ( unlikely(!cpumask_empty(mask)) &&
                      /* Shadow mode: track only writable pages. */
-                     (!shadow_mode_enabled(page_get_owner(page)) ||
+                     (!shadow_mode_enabled(d) ||
                       ((nx & PGT_type_mask) == PGT_writable_page)) )
                 {
                     perfc_incr(need_flush_tlb_flush);
@@ -3008,7 +3016,14 @@ static int _get_page_type(struct page_info *page, unsigned long type,
         }
         else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
         {
-            /* Don't log failure if it could be a recursive-mapping attempt. */
+            /*
+             * else, we're trying to take a new reference, of the wrong type.
+             *
+             * This (being able to prohibit use of the wrong type) is what the
+             * typeref system exists for, but skip printing the failure if it
+             * looks like a recursive mapping, as subsequent logic might
+             * ultimately permit the attempt.
+             */
             if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
                  (type == PGT_l1_page_table) )
                 return -EINVAL;
@@ -3027,18 +3042,46 @@ static int _get_page_type(struct page_info *page, unsigned long type,
         }
         else if ( unlikely(!(x & PGT_validated)) )
         {
+            /*
+             * else, the count is non-zero, and we're grabbing the right type;
+             * but the page hasn't been validated yet.
+             *
+             * The page is in one of two states (depending on PGT_partial),
+             * and should have exactly one reference.
+             */
+            ASSERT((x & (PGT_type_mask | PGT_count_mask)) == (type | 1));
+
             if ( !(x & PGT_partial) )
             {
-                /* Someone else is updating validation of this page. Wait... */
+                /*
+                 * The page has been left in the "validate locked" state
+                 * (i.e. PGT_[type] | 1) which means that a concurrent caller
+                 * of _get_page_type() is in the middle of validation.
+                 *
+                 * Spin waiting for the concurrent user to complete (partial
+                 * or fully validated), then restart our attempt to acquire a
+                 * type reference.
+                 */
                 do {
                     if ( preemptible && hypercall_preempt_check() )
                         return -EINTR;
                     cpu_relax();
-                } while ( (y = page->u.inuse.type_info) == x );
+                } while ( (y = ACCESS_ONCE(page->u.inuse.type_info)) == x );
                 continue;
             }
-            /* Type ref count was left at 1 when PGT_partial got set. */
-            ASSERT((x & PGT_count_mask) == 1);
+
+            /*
+             * The page has been left in the "partial" state
+             * (i.e., PGT_[type] | PGT_partial | 1).
+             *
+             * Rather than bumping the type count, we need to try to grab the
+             * validation lock; if we succeed, we need to validate the page,
+             * then drop the general ref associated with the PGT_partial bit.
+             *
+             * We grab the validation lock by setting nx to (PGT_[type] | 1)
+             * (i.e., non-zero type count, neither PGT_validated nor
+             * PGT_partial set).
+             */
             nx = x & ~PGT_partial;
         }
 
@@ -3087,6 +3130,13 @@ static int _get_page_type(struct page_info *page, unsigned long type,
     }
 
  out:
+    /*
+     * Did we drop the PGT_partial bit when acquiring the typeref?  If so,
+     * drop the general reference that went along with it.
+     *
+     * N.B. validate_page() may have re-set PGT_partial, not reflected in
+     * nx, but will have taken an extra ref when doing so.
+     */
     if ( (x & PGT_partial) && !(nx & PGT_partial) )
         put_page(page);
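
Taken together, the not-yet-validated cases walk the states as follows.  This
is a simplified sketch using the same names as the patch, with preemption and
error paths omitted; see the hunks above for the real logic.

    if ( unlikely(!(x & PGT_validated)) )
    {
        if ( !(x & PGT_partial) )
        {
            /* Validate-locked (PGT_[type] | 1) by a concurrent caller:
             * spin until type_info changes, then retry the outer loop. */
            do {
                cpu_relax();
            } while ( (y = ACCESS_ONCE(page->u.inuse.type_info)) == x );
        }
        else
        {
            /* Partial (PGT_[type] | PGT_partial | 1): take over validation
             * by clearing PGT_partial, i.e. move to the validate-locked
             * state.  The general ref that came with PGT_partial is dropped
             * at 'out' once the bit is seen to have been cleared. */
            nx = x & ~PGT_partial;
        }
    }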