summaryrefslogtreecommitdiff
path: root/system/xen/xsa/xsa240-4.9-0001-x86-limit-linear-page-table-use-to-a-single-level.patch
blob: 515ad22b66bb3ac88ca584b28c0b6a0a2cec40bf (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
From 867988237d3e472fe2c99e81ae733e103422566c Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@suse.com>
Date: Thu, 28 Sep 2017 15:17:25 +0100
Subject: [PATCH 1/2] x86: limit linear page table use to a single level

That's the only way that they're meant to be used. Without such a
restriction arbitrarily long chains of same-level page tables can be
built, tearing down of which may then cause arbitrarily deep recursion,
causing a stack overflow. To facilitate this restriction, a counter is
being introduced to track both the number of same-level entries in a
page table as well as the number of uses of a page table in another
same-level one (counting into positive and negative direction
respectively, utilizing the fact that both counts can't be non-zero at
the same time).

Note that the added accounting introduces a restriction on the number
of times a page can be used in other same-level page tables - more than
32k of such uses are no longer possible.

Note also that some put_page_and_type[_preemptible]() calls are
replaced with open-coded equivalents.  This seemed preferrable to
adding "parent_table" to the matrix of functions.

Note further that cross-domain same-level page table references are no
longer permitted (they probably never should have been).

This is XSA-240.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
---
 xen/arch/x86/domain.c        |   1 +
 xen/arch/x86/mm.c            | 171 ++++++++++++++++++++++++++++++++++++++-----
 xen/include/asm-x86/domain.h |   2 +
 xen/include/asm-x86/mm.h     |  25 +++++--
 4 files changed, 175 insertions(+), 24 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index d7e699228c..d7ed72c246 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1226,6 +1226,7 @@ int arch_set_info_guest(
                     rc = -ERESTART;
                     /* Fallthrough */
                 case -ERESTART:
+                    v->arch.old_guest_ptpg = NULL;
                     v->arch.old_guest_table =
                         pagetable_get_page(v->arch.guest_table);
                     v->arch.guest_table = pagetable_null();
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 86f5eda52d..1e469bd354 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -747,6 +747,61 @@ static void put_data_page(
         put_page(page);
 }
 
+static bool inc_linear_entries(struct page_info *pg)
+{
+    typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
+
+    do {
+        /*
+         * The check below checks for the "linear use" count being non-zero
+         * as well as overflow.  Signed integer overflow is undefined behavior
+         * according to the C spec.  However, as long as linear_pt_count is
+         * smaller in size than 'int', the arithmetic operation of the
+         * increment below won't overflow; rather the result will be truncated
+         * when stored.  Ensure that this is always true.
+         */
+        BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
+        oc = nc++;
+        if ( nc <= 0 )
+            return false;
+        nc = cmpxchg(&pg->linear_pt_count, oc, nc);
+    } while ( oc != nc );
+
+    return true;
+}
+
+static void dec_linear_entries(struct page_info *pg)
+{
+    typeof(pg->linear_pt_count) oc;
+
+    oc = arch_fetch_and_add(&pg->linear_pt_count, -1);
+    ASSERT(oc > 0);
+}
+
+static bool inc_linear_uses(struct page_info *pg)
+{
+    typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
+
+    do {
+        /* See the respective comment in inc_linear_entries(). */
+        BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
+        oc = nc--;
+        if ( nc >= 0 )
+            return false;
+        nc = cmpxchg(&pg->linear_pt_count, oc, nc);
+    } while ( oc != nc );
+
+    return true;
+}
+
+static void dec_linear_uses(struct page_info *pg)
+{
+    typeof(pg->linear_pt_count) oc;
+
+    oc = arch_fetch_and_add(&pg->linear_pt_count, 1);
+    ASSERT(oc < 0);
+}
+
 /*
  * We allow root tables to map each other (a.k.a. linear page tables). It
  * needs some special care with reference counts and access permissions:
@@ -777,15 +832,35 @@ get_##level##_linear_pagetable(                                             \
                                                                             \
     if ( (pfn = level##e_get_pfn(pde)) != pde_pfn )                         \
     {                                                                       \
+        struct page_info *ptpg = mfn_to_page(pde_pfn);                      \
+                                                                            \
+        /* Make sure the page table belongs to the correct domain. */       \
+        if ( unlikely(page_get_owner(ptpg) != d) )                          \
+            return 0;                                                       \
+                                                                            \
         /* Make sure the mapped frame belongs to the correct domain. */     \
         if ( unlikely(!get_page_from_pagenr(pfn, d)) )                      \
             return 0;                                                       \
                                                                             \
         /*                                                                  \
-         * Ensure that the mapped frame is an already-validated page table. \
+         * Ensure that the mapped frame is an already-validated page table  \
+         * and is not itself having linear entries, as well as that the     \
+         * containing page table is not iself in use as a linear page table \
+         * elsewhere.                                                       \
          * If so, atomically increment the count (checking for overflow).   \
          */                                                                 \
         page = mfn_to_page(pfn);                                            \
+        if ( !inc_linear_entries(ptpg) )                                    \
+        {                                                                   \
+            put_page(page);                                                 \
+            return 0;                                                       \
+        }                                                                   \
+        if ( !inc_linear_uses(page) )                                       \
+        {                                                                   \
+            dec_linear_entries(ptpg);                                       \
+            put_page(page);                                                 \
+            return 0;                                                       \
+        }                                                                   \
         y = page->u.inuse.type_info;                                        \
         do {                                                                \
             x = y;                                                          \
@@ -793,6 +868,8 @@ get_##level##_linear_pagetable(                                             \
                  unlikely((x & (PGT_type_mask|PGT_validated)) !=            \
                           (PGT_##level##_page_table|PGT_validated)) )       \
             {                                                               \
+                dec_linear_uses(page);                                      \
+                dec_linear_entries(ptpg);                                   \
                 put_page(page);                                             \
                 return 0;                                                   \
             }                                                               \
@@ -1226,6 +1303,9 @@ get_page_from_l4e(
             l3e_remove_flags((pl3e), _PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);   \
     } while ( 0 )
 
+static int _put_page_type(struct page_info *page, bool preemptible,
+                          struct page_info *ptpg);
+
 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
 {
     unsigned long     pfn = l1e_get_pfn(l1e);
@@ -1296,17 +1376,22 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
     if ( l2e_get_flags(l2e) & _PAGE_PSE )
         put_superpage(l2e_get_pfn(l2e));
     else
-        put_page_and_type(l2e_get_page(l2e));
+    {
+        struct page_info *pg = l2e_get_page(l2e);
+        int rc = _put_page_type(pg, false, mfn_to_page(pfn));
+
+        ASSERT(!rc);
+        put_page(pg);
+    }
 
     return 0;
 }
 
-static int __put_page_type(struct page_info *, int preemptible);
-
 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
                              int partial, bool_t defer)
 {
     struct page_info *pg;
+    int rc;
 
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
         return 1;
@@ -1329,21 +1414,28 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
     if ( unlikely(partial > 0) )
     {
         ASSERT(!defer);
-        return __put_page_type(pg, 1);
+        return _put_page_type(pg, true, mfn_to_page(pfn));
     }
 
     if ( defer )
     {
+        current->arch.old_guest_ptpg = mfn_to_page(pfn);
         current->arch.old_guest_table = pg;
         return 0;
     }
 
-    return put_page_and_type_preemptible(pg);
+    rc = _put_page_type(pg, true, mfn_to_page(pfn));
+    if ( likely(!rc) )
+        put_page(pg);
+
+    return rc;
 }
 
 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
                              int partial, bool_t defer)
 {
+    int rc = 1;
+
     if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && 
          (l4e_get_pfn(l4e) != pfn) )
     {
@@ -1352,18 +1444,22 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
         if ( unlikely(partial > 0) )
         {
             ASSERT(!defer);
-            return __put_page_type(pg, 1);
+            return _put_page_type(pg, true, mfn_to_page(pfn));
         }
 
         if ( defer )
         {
+            current->arch.old_guest_ptpg = mfn_to_page(pfn);
             current->arch.old_guest_table = pg;
             return 0;
         }
 
-        return put_page_and_type_preemptible(pg);
+        rc = _put_page_type(pg, true, mfn_to_page(pfn));
+        if ( likely(!rc) )
+            put_page(pg);
     }
-    return 1;
+
+    return rc;
 }
 
 static int alloc_l1_table(struct page_info *page)
@@ -1561,6 +1657,7 @@ static int alloc_l3_table(struct page_info *page)
         {
             page->nr_validated_ptes = i;
             page->partial_pte = 0;
+            current->arch.old_guest_ptpg = NULL;
             current->arch.old_guest_table = page;
         }
         while ( i-- > 0 )
@@ -1654,6 +1751,7 @@ static int alloc_l4_table(struct page_info *page)
                 {
                     if ( current->arch.old_guest_table )
                         page->nr_validated_ptes++;
+                    current->arch.old_guest_ptpg = NULL;
                     current->arch.old_guest_table = page;
                 }
             }
@@ -2403,14 +2501,20 @@ int free_page_type(struct page_info *pag
 }
 
 
-static int __put_final_page_type(
-    struct page_info *page, unsigned long type, int preemptible)
+static int _put_final_page_type(struct page_info *page, unsigned long type,
+                                bool preemptible, struct page_info *ptpg)
 {
     int rc = free_page_type(page, type, preemptible);
 
     /* No need for atomic update of type_info here: noone else updates it. */
     if ( rc == 0 )
     {
+        if ( ptpg && PGT_type_equal(type, ptpg->u.inuse.type_info) )
+        {
+            dec_linear_uses(page);
+            dec_linear_entries(ptpg);
+        }
+        ASSERT(!page->linear_pt_count || page_get_owner(page)->is_dying);
         /*
          * Record TLB information for flush later. We do not stamp page tables
          * when running in shadow mode:
@@ -2446,8 +2550,8 @@ static int __put_final_page_type(
 }
 
 
-static int __put_page_type(struct page_info *page,
-                           int preemptible)
+static int _put_page_type(struct page_info *page, bool preemptible,
+                          struct page_info *ptpg)
 {
     unsigned long nx, x, y = page->u.inuse.type_info;
     int rc = 0;
@@ -2474,12 +2578,28 @@ static int __put_page_type(struct page_info *page,
                                            x, nx)) != x) )
                     continue;
                 /* We cleared the 'valid bit' so we do the clean up. */
-                rc = __put_final_page_type(page, x, preemptible);
+                rc = _put_final_page_type(page, x, preemptible, ptpg);
+                ptpg = NULL;
                 if ( x & PGT_partial )
                     put_page(page);
                 break;
             }
 
+            if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) )
+            {
+                /*
+                 * page_set_tlbflush_timestamp() accesses the same union
+                 * linear_pt_count lives in. Unvalidated page table pages,
+                 * however, should occur during domain destruction only
+                 * anyway.  Updating of linear_pt_count luckily is not
+                 * necessary anymore for a dying domain.
+                 */
+                ASSERT(page_get_owner(page)->is_dying);
+                ASSERT(page->linear_pt_count < 0);
+                ASSERT(ptpg->linear_pt_count > 0);
+                ptpg = NULL;
+            }
+
             /*
              * Record TLB information for flush later. We do not stamp page
              * tables when running in shadow mode:
@@ -2499,6 +2619,13 @@ static int __put_page_type(struct page_info *page,
             return -EINTR;
     }
 
+    if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) )
+    {
+        ASSERT(!rc);
+        dec_linear_uses(page);
+        dec_linear_entries(ptpg);
+    }
+
     return rc;
 }
 
@@ -2638,6 +2765,7 @@ static int __get_page_type(struct page_info *page, unsigned long type,
             page->nr_validated_ptes = 0;
             page->partial_pte = 0;
         }
+        page->linear_pt_count = 0;
         rc = alloc_page_type(page, type, preemptible);
     }
 
@@ -2652,7 +2780,7 @@ static int __get_page_type(struct page_info *page, unsigned long type,
 
 void put_page_type(struct page_info *page)
 {
-    int rc = __put_page_type(page, 0);
+    int rc = _put_page_type(page, false, NULL);
     ASSERT(rc == 0);
     (void)rc;
 }
@@ -2668,7 +2796,7 @@ int get_page_type(struct page_info *page, unsigned long type)
 
 int put_page_type_preemptible(struct page_info *page)
 {
-    return __put_page_type(page, 1);
+    return _put_page_type(page, true, NULL);
 }
 
 int get_page_type_preemptible(struct page_info *page, unsigned long type)
@@ -2878,11 +3006,14 @@ int put_old_guest_table(struct vcpu *v)
     if ( !v->arch.old_guest_table )
         return 0;
 
-    switch ( rc = put_page_and_type_preemptible(v->arch.old_guest_table) )
+    switch ( rc = _put_page_type(v->arch.old_guest_table, true,
+                                 v->arch.old_guest_ptpg) )
     {
     case -EINTR:
     case -ERESTART:
         return -ERESTART;
+    case 0:
+        put_page(v->arch.old_guest_table);
     }
 
     v->arch.old_guest_table = NULL;
@@ -3042,6 +3173,7 @@ int new_guest_cr3(unsigned long mfn)
                 rc = -ERESTART;
                 /* fallthrough */
             case -ERESTART:
+                curr->arch.old_guest_ptpg = NULL;
                 curr->arch.old_guest_table = page;
                 break;
             default:
@@ -3310,7 +3442,10 @@ long do_mmuext_op(
                     if ( type == PGT_l1_page_table )
                         put_page_and_type(page);
                     else
+                    {
+                        curr->arch.old_guest_ptpg = NULL;
                         curr->arch.old_guest_table = page;
+                    }
                 }
             }
 
@@ -3346,6 +3481,7 @@ long do_mmuext_op(
             {
             case -EINTR:
             case -ERESTART:
+                curr->arch.old_guest_ptpg = NULL;
                 curr->arch.old_guest_table = page;
                 rc = 0;
                 break;
@@ -3425,6 +3561,7 @@ long do_mmuext_op(
                         rc = -ERESTART;
                         /* fallthrough */
                     case -ERESTART:
+                        curr->arch.old_guest_ptpg = NULL;
                         curr->arch.old_guest_table = page;
                         break;
                     default:
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 924caac834..5a512918cc 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -527,6 +527,8 @@ struct arch_vcpu
     pagetable_t guest_table_user;       /* (MFN) x86/64 user-space pagetable */
     pagetable_t guest_table;            /* (MFN) guest notion of cr3 */
     struct page_info *old_guest_table;  /* partially destructed pagetable */
+    struct page_info *old_guest_ptpg;   /* containing page table of the */
+                                        /* former, if any */
     /* guest_table holds a ref to the page, and also a type-count unless
      * shadow refcounts are in use */
     pagetable_t shadow_table[4];        /* (MFN) shadow(s) of guest */
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 119d7dec6b..445da50d47 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -124,11 +124,11 @@ struct page_info
         u32 tlbflush_timestamp;
 
         /*
-         * When PGT_partial is true then this field is valid and indicates
-         * that PTEs in the range [0, @nr_validated_ptes) have been validated.
-         * An extra page reference must be acquired (or not dropped) whenever
-         * PGT_partial gets set, and it must be dropped when the flag gets
-         * cleared. This is so that a get() leaving a page in partially
+         * When PGT_partial is true then the first two fields are valid and
+         * indicate that PTEs in the range [0, @nr_validated_ptes) have been
+         * validated. An extra page reference must be acquired (or not dropped)
+         * whenever PGT_partial gets set, and it must be dropped when the flag
+         * gets cleared. This is so that a get() leaving a page in partially
          * validated state (where the caller would drop the reference acquired
          * due to the getting of the type [apparently] failing [-ERESTART])
          * would not accidentally result in a page left with zero general
@@ -152,10 +152,18 @@ struct page_info
          * put_page_from_lNe() (due to the apparent failure), and hence it
          * must be dropped when the put operation is resumed (and completes),
          * but it must not be acquired if picking up the page for validation.
+         *
+         * The 3rd field, @linear_pt_count, indicates
+         * - by a positive value, how many same-level page table entries a page
+         *   table has,
+         * - by a negative value, in how many same-level page tables a page is
+         *   in use.
          */
         struct {
-            u16 nr_validated_ptes;
-            s8 partial_pte;
+            u16 nr_validated_ptes:PAGETABLE_ORDER + 1;
+            u16 :16 - PAGETABLE_ORDER - 1 - 2;
+            s16 partial_pte:2;
+            s16 linear_pt_count;
         };
 
         /*
@@ -206,6 +214,9 @@ struct page_info
 #define PGT_count_width   PG_shift(9)
 #define PGT_count_mask    ((1UL<<PGT_count_width)-1)
 
+/* Are the 'type mask' bits identical? */
+#define PGT_type_equal(x, y) (!(((x) ^ (y)) & PGT_type_mask))
+
  /* Cleared when the owning guest 'frees' this page. */
 #define _PGC_allocated    PG_shift(1)
 #define PGC_allocated     PG_mask(1, 1)
-- 
2.14.1