diff options
Diffstat (limited to 'system/xen/xsa')
-rw-r--r-- | system/xen/xsa/xsa252.patch | 27 | ||||
-rw-r--r-- | system/xen/xsa/xsa253.patch | 26 | ||||
-rw-r--r-- | system/xen/xsa/xsa255-1.patch | 133 | ||||
-rw-r--r-- | system/xen/xsa/xsa255-2.patch | 167 | ||||
-rw-r--r-- | system/xen/xsa/xsa256.patch | 40 | ||||
-rw-r--r-- | system/xen/xsa/xsa260-1.patch | 72 | ||||
-rw-r--r-- | system/xen/xsa/xsa260-2.patch | 110 | ||||
-rw-r--r-- | system/xen/xsa/xsa260-3.patch | 138 | ||||
-rw-r--r-- | system/xen/xsa/xsa260-4.patch | 72 | ||||
-rw-r--r-- | system/xen/xsa/xsa261.patch | 279 | ||||
-rw-r--r-- | system/xen/xsa/xsa262-4.10.patch | 76 |
11 files changed, 747 insertions, 393 deletions
diff --git a/system/xen/xsa/xsa252.patch b/system/xen/xsa/xsa252.patch deleted file mode 100644 index 8615928142..0000000000 --- a/system/xen/xsa/xsa252.patch +++ /dev/null @@ -1,27 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: memory: don't implicitly unpin for decrease-reservation - -It very likely was a mistake (copy-and-paste from domain cleanup code) -to implicitly unpin here: The caller should really unpin itself before -(or after, if they so wish) requesting the page to be removed. - -This is XSA-252. - -Reported-by: Jann Horn <jannh@google.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> - ---- a/xen/common/memory.c -+++ b/xen/common/memory.c -@@ -357,11 +357,6 @@ int guest_remove_page(struct domain *d, - - rc = guest_physmap_remove_page(d, _gfn(gmfn), mfn, 0); - --#ifdef _PGT_pinned -- if ( !rc && test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) ) -- put_page_and_type(page); --#endif -- - /* - * With the lack of an IOMMU on some platforms, domains with DMA-capable - * device must retrieve the same pfn when the hypercall populate_physmap diff --git a/system/xen/xsa/xsa253.patch b/system/xen/xsa/xsa253.patch deleted file mode 100644 index 19e4269358..0000000000 --- a/system/xen/xsa/xsa253.patch +++ /dev/null @@ -1,26 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/msr: Free msr_vcpu_policy during vcpu destruction - -c/s 4187f79dc7 "x86/msr: introduce struct msr_vcpu_policy" introduced a -per-vcpu memory allocation, but failed to free it in the clean vcpu -destruction case. - -This is XSA-253 - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> - -diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c -index b17468c..0ae715d 100644 ---- a/xen/arch/x86/domain.c -+++ b/xen/arch/x86/domain.c -@@ -382,6 +382,9 @@ void vcpu_destroy(struct vcpu *v) - - vcpu_destroy_fpu(v); - -+ xfree(v->arch.msr); -+ v->arch.msr = NULL; -+ - if ( !is_idle_domain(v->domain) ) - vpmu_destroy(v); - diff --git a/system/xen/xsa/xsa255-1.patch b/system/xen/xsa/xsa255-1.patch deleted file mode 100644 index f8bba9e516..0000000000 --- a/system/xen/xsa/xsa255-1.patch +++ /dev/null @@ -1,133 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: gnttab/ARM: don't corrupt shared GFN array - -... by writing status GFNs to it. Introduce a second array instead. -Also implement gnttab_status_gmfn() properly now that the information is -suitably being tracked. - -While touching it anyway, remove a misguided (but luckily benign) upper -bound check from gnttab_shared_gmfn(): We should never access beyond the -bounds of that array. - -This is part of XSA-255. - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> ---- -v3: Don't init the ARM GFN arrays to zero anymore, use INVALID_GFN. -v2: New. - ---- a/xen/common/grant_table.c -+++ b/xen/common/grant_table.c -@@ -3775,6 +3775,7 @@ int gnttab_map_frame(struct domain *d, u - { - int rc = 0; - struct grant_table *gt = d->grant_table; -+ bool status = false; - - grant_write_lock(gt); - -@@ -3785,6 +3786,7 @@ int gnttab_map_frame(struct domain *d, u - (idx & XENMAPIDX_grant_table_status) ) - { - idx &= ~XENMAPIDX_grant_table_status; -+ status = true; - if ( idx < nr_status_frames(gt) ) - *mfn = _mfn(virt_to_mfn(gt->status[idx])); - else -@@ -3802,7 +3804,7 @@ int gnttab_map_frame(struct domain *d, u - } - - if ( !rc ) -- gnttab_set_frame_gfn(gt, idx, gfn); -+ gnttab_set_frame_gfn(gt, status, idx, gfn); - - grant_write_unlock(gt); - ---- a/xen/include/asm-arm/grant_table.h -+++ b/xen/include/asm-arm/grant_table.h -@@ -9,7 +9,8 @@ - #define INITIAL_NR_GRANT_FRAMES 1U - - struct grant_table_arch { -- gfn_t *gfn; -+ gfn_t *shared_gfn; -+ gfn_t *status_gfn; - }; - - void gnttab_clear_flag(unsigned long nr, uint16_t *addr); -@@ -21,7 +22,6 @@ int replace_grant_host_mapping(unsigned - unsigned long new_gpaddr, unsigned int flags); - void gnttab_mark_dirty(struct domain *d, unsigned long l); - #define gnttab_create_status_page(d, t, i) do {} while (0) --#define gnttab_status_gmfn(d, t, i) (0) - #define gnttab_release_host_mappings(domain) 1 - static inline int replace_grant_supported(void) - { -@@ -42,19 +42,35 @@ static inline unsigned int gnttab_dom0_m - - #define gnttab_init_arch(gt) \ - ({ \ -- (gt)->arch.gfn = xzalloc_array(gfn_t, (gt)->max_grant_frames); \ -- ( (gt)->arch.gfn ? 0 : -ENOMEM ); \ -+ unsigned int ngf_ = (gt)->max_grant_frames; \ -+ unsigned int nsf_ = grant_to_status_frames(ngf_); \ -+ \ -+ (gt)->arch.shared_gfn = xmalloc_array(gfn_t, ngf_); \ -+ (gt)->arch.status_gfn = xmalloc_array(gfn_t, nsf_); \ -+ if ( (gt)->arch.shared_gfn && (gt)->arch.status_gfn ) \ -+ { \ -+ while ( ngf_-- ) \ -+ (gt)->arch.shared_gfn[ngf_] = INVALID_GFN; \ -+ while ( nsf_-- ) \ -+ (gt)->arch.status_gfn[nsf_] = INVALID_GFN; \ -+ } \ -+ else \ -+ gnttab_destroy_arch(gt); \ -+ (gt)->arch.shared_gfn ? 0 : -ENOMEM; \ - }) - - #define gnttab_destroy_arch(gt) \ - do { \ -- xfree((gt)->arch.gfn); \ -- (gt)->arch.gfn = NULL; \ -+ xfree((gt)->arch.shared_gfn); \ -+ (gt)->arch.shared_gfn = NULL; \ -+ xfree((gt)->arch.status_gfn); \ -+ (gt)->arch.status_gfn = NULL; \ - } while ( 0 ) - --#define gnttab_set_frame_gfn(gt, idx, gfn) \ -+#define gnttab_set_frame_gfn(gt, st, idx, gfn) \ - do { \ -- (gt)->arch.gfn[idx] = gfn; \ -+ ((st) ? (gt)->arch.status_gfn : (gt)->arch.shared_gfn)[idx] = \ -+ (gfn); \ - } while ( 0 ) - - #define gnttab_create_shared_page(d, t, i) \ -@@ -65,8 +81,10 @@ static inline unsigned int gnttab_dom0_m - } while ( 0 ) - - #define gnttab_shared_gmfn(d, t, i) \ -- ( ((i >= nr_grant_frames(t)) && \ -- (i < (t)->max_grant_frames))? 0 : gfn_x((t)->arch.gfn[i])) -+ gfn_x(((i) >= nr_grant_frames(t)) ? INVALID_GFN : (t)->arch.shared_gfn[i]) -+ -+#define gnttab_status_gmfn(d, t, i) \ -+ gfn_x(((i) >= nr_status_frames(t)) ? INVALID_GFN : (t)->arch.status_gfn[i]) - - #define gnttab_need_iommu_mapping(d) \ - (is_domain_direct_mapped(d) && need_iommu(d)) ---- a/xen/include/asm-x86/grant_table.h -+++ b/xen/include/asm-x86/grant_table.h -@@ -46,7 +46,7 @@ static inline unsigned int gnttab_dom0_m - - #define gnttab_init_arch(gt) 0 - #define gnttab_destroy_arch(gt) do {} while ( 0 ) --#define gnttab_set_frame_gfn(gt, idx, gfn) do {} while ( 0 ) -+#define gnttab_set_frame_gfn(gt, st, idx, gfn) do {} while ( 0 ) - - #define gnttab_create_shared_page(d, t, i) \ - do { \ diff --git a/system/xen/xsa/xsa255-2.patch b/system/xen/xsa/xsa255-2.patch deleted file mode 100644 index 402b6efe98..0000000000 --- a/system/xen/xsa/xsa255-2.patch +++ /dev/null @@ -1,167 +0,0 @@ -From: Jan Beulich <jbeulich@suse.com> -Subject: gnttab: don't blindly free status pages upon version change - -There may still be active mappings, which would trigger the respective -BUG_ON(). Split the loop into one dealing with the page attributes and -the second (when the first fully passed) freeing the pages. Return an -error if any pages still have pending references. - -This is part of XSA-255. - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> ---- -v4: Add gprintk(XENLOG_ERR, ...) to domain_crash() invocations. -v3: Call guest_physmap_remove_page() from gnttab_map_frame(), making the - code unconditional at the same time. Re-base over changes to first - patch. -v2: Also deal with translated guests. - ---- a/xen/common/grant_table.c -+++ b/xen/common/grant_table.c -@@ -1636,23 +1636,74 @@ status_alloc_failed: - return -ENOMEM; - } - --static void -+static int - gnttab_unpopulate_status_frames(struct domain *d, struct grant_table *gt) - { -- int i; -+ unsigned int i; - - for ( i = 0; i < nr_status_frames(gt); i++ ) - { - struct page_info *pg = virt_to_page(gt->status[i]); -+ gfn_t gfn = gnttab_get_frame_gfn(gt, true, i); -+ -+ /* -+ * For translated domains, recovering from failure after partial -+ * changes were made is more complicated than it seems worth -+ * implementing at this time. Hence respective error paths below -+ * crash the domain in such a case. -+ */ -+ if ( paging_mode_translate(d) ) -+ { -+ int rc = gfn_eq(gfn, INVALID_GFN) -+ ? 0 -+ : guest_physmap_remove_page(d, gfn, -+ _mfn(page_to_mfn(pg)), 0); -+ -+ if ( rc ) -+ { -+ gprintk(XENLOG_ERR, -+ "Could not remove status frame %u (GFN %#lx) from P2M\n", -+ i, gfn_x(gfn)); -+ domain_crash(d); -+ return rc; -+ } -+ gnttab_set_frame_gfn(gt, true, i, INVALID_GFN); -+ } - - BUG_ON(page_get_owner(pg) != d); - if ( test_and_clear_bit(_PGC_allocated, &pg->count_info) ) - put_page(pg); -- BUG_ON(pg->count_info & ~PGC_xen_heap); -+ -+ if ( pg->count_info & ~PGC_xen_heap ) -+ { -+ if ( paging_mode_translate(d) ) -+ { -+ gprintk(XENLOG_ERR, -+ "Wrong page state %#lx of status frame %u (GFN %#lx)\n", -+ pg->count_info, i, gfn_x(gfn)); -+ domain_crash(d); -+ } -+ else -+ { -+ if ( get_page(pg, d) ) -+ set_bit(_PGC_allocated, &pg->count_info); -+ while ( i-- ) -+ gnttab_create_status_page(d, gt, i); -+ } -+ return -EBUSY; -+ } -+ -+ page_set_owner(pg, NULL); -+ } -+ -+ for ( i = 0; i < nr_status_frames(gt); i++ ) -+ { - free_xenheap_page(gt->status[i]); - gt->status[i] = NULL; - } - gt->nr_status_frames = 0; -+ -+ return 0; - } - - /* -@@ -2962,8 +3013,9 @@ gnttab_set_version(XEN_GUEST_HANDLE_PARA - break; - } - -- if ( op.version < 2 && gt->gt_version == 2 ) -- gnttab_unpopulate_status_frames(currd, gt); -+ if ( op.version < 2 && gt->gt_version == 2 && -+ (res = gnttab_unpopulate_status_frames(currd, gt)) != 0 ) -+ goto out_unlock; - - /* Make sure there's no crud left over from the old version. */ - for ( i = 0; i < nr_grant_frames(gt); i++ ) -@@ -3803,6 +3855,11 @@ int gnttab_map_frame(struct domain *d, u - rc = -EINVAL; - } - -+ if ( !rc && paging_mode_translate(d) && -+ !gfn_eq(gnttab_get_frame_gfn(gt, status, idx), INVALID_GFN) ) -+ rc = guest_physmap_remove_page(d, gnttab_get_frame_gfn(gt, status, idx), -+ *mfn, 0); -+ - if ( !rc ) - gnttab_set_frame_gfn(gt, status, idx, gfn); - ---- a/xen/include/asm-arm/grant_table.h -+++ b/xen/include/asm-arm/grant_table.h -@@ -73,6 +73,11 @@ static inline unsigned int gnttab_dom0_m - (gfn); \ - } while ( 0 ) - -+#define gnttab_get_frame_gfn(gt, st, idx) ({ \ -+ _gfn((st) ? gnttab_status_gmfn(NULL, gt, idx) \ -+ : gnttab_shared_gmfn(NULL, gt, idx)); \ -+}) -+ - #define gnttab_create_shared_page(d, t, i) \ - do { \ - share_xen_page_with_guest( \ ---- a/xen/include/asm-x86/grant_table.h -+++ b/xen/include/asm-x86/grant_table.h -@@ -47,6 +47,12 @@ static inline unsigned int gnttab_dom0_m - #define gnttab_init_arch(gt) 0 - #define gnttab_destroy_arch(gt) do {} while ( 0 ) - #define gnttab_set_frame_gfn(gt, st, idx, gfn) do {} while ( 0 ) -+#define gnttab_get_frame_gfn(gt, st, idx) ({ \ -+ unsigned long mfn_ = (st) ? gnttab_status_mfn(gt, idx) \ -+ : gnttab_shared_mfn(gt, idx); \ -+ unsigned long gpfn_ = get_gpfn_from_mfn(mfn_); \ -+ VALID_M2P(gpfn_) ? _gfn(gpfn_) : INVALID_GFN; \ -+}) - - #define gnttab_create_shared_page(d, t, i) \ - do { \ -@@ -63,11 +69,11 @@ static inline unsigned int gnttab_dom0_m - } while ( 0 ) - - --#define gnttab_shared_mfn(d, t, i) \ -+#define gnttab_shared_mfn(t, i) \ - ((virt_to_maddr((t)->shared_raw[i]) >> PAGE_SHIFT)) - - #define gnttab_shared_gmfn(d, t, i) \ -- (mfn_to_gmfn(d, gnttab_shared_mfn(d, t, i))) -+ (mfn_to_gmfn(d, gnttab_shared_mfn(t, i))) - - - #define gnttab_status_mfn(t, i) \ diff --git a/system/xen/xsa/xsa256.patch b/system/xen/xsa/xsa256.patch deleted file mode 100644 index 50ff24e17b..0000000000 --- a/system/xen/xsa/xsa256.patch +++ /dev/null @@ -1,40 +0,0 @@ -From: Andrew Cooper <andrew.cooper3@citrix.com> -Subject: x86/hvm: Disallow the creation of HVM domains without Local APIC emulation - -There are multiple problems, not necesserily limited to: - - * Guests which configure event channels via hvmop_set_evtchn_upcall_vector(), - or which hit %cr8 emulation will cause Xen to fall over a NULL vlapic->regs - pointer. - - * On Intel hardware, disabling the TPR_SHADOW execution control without - reenabling CR8_{LOAD,STORE} interception means that the guests %cr8 - accesses interact with the real TPR. Amongst other things, setting the - real TPR to 0xf blocks even IPIs from interrupting this CPU. - - * On hardware which sets up the use of Interrupt Posting, including - IOMMU-Posting, guests run without the appropriate non-root configuration, - which at a minimum will result in dropped interrupts. - -Whether no-LAPIC mode is of any use at all remains to be seen. - -This is XSA-256. - -Reported-by: Ian Jackson <ian.jackson@eu.citrix.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> - -diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c -index f93327b..f65fc12 100644 ---- a/xen/arch/x86/domain.c -+++ b/xen/arch/x86/domain.c -@@ -413,7 +413,7 @@ static bool emulation_flags_ok(const struct domain *d, uint32_t emflags) - if ( is_hardware_domain(d) && - emflags != (XEN_X86_EMU_LAPIC|XEN_X86_EMU_IOAPIC) ) - return false; -- if ( !is_hardware_domain(d) && emflags && -+ if ( !is_hardware_domain(d) && - emflags != XEN_X86_EMU_ALL && emflags != XEN_X86_EMU_LAPIC ) - return false; - } diff --git a/system/xen/xsa/xsa260-1.patch b/system/xen/xsa/xsa260-1.patch new file mode 100644 index 0000000000..21da59cddd --- /dev/null +++ b/system/xen/xsa/xsa260-1.patch @@ -0,0 +1,72 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/traps: Fix %dr6 handing in #DB handler + +Most bits in %dr6 accumulate, rather than being set directly based on the +current source of #DB. Have the handler follow the manuals guidance, which +avoids leaking hypervisor debugging activities into guest context. + +This is part of XSA-260 / CVE-2018-8897. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +--- a/xen/arch/x86/traps.c 2018-04-13 15:29:36.006747135 +0200 ++++ b/xen/arch/x86/traps.c 2018-04-13 15:44:57.015516185 +0200 +@@ -1761,11 +1761,36 @@ static void ler_enable(void) + + void do_debug(struct cpu_user_regs *regs) + { ++ unsigned long dr6; + struct vcpu *v = current; + ++ /* Stash dr6 as early as possible. */ ++ dr6 = read_debugreg(6); ++ + if ( debugger_trap_entry(TRAP_debug, regs) ) + return; + ++ /* ++ * At the time of writing (March 2018), on the subject of %dr6: ++ * ++ * The Intel manual says: ++ * Certain debug exceptions may clear bits 0-3. The remaining contents ++ * of the DR6 register are never cleared by the processor. To avoid ++ * confusion in identifying debug exceptions, debug handlers should ++ * clear the register (except bit 16, which they should set) before ++ * returning to the interrupted task. ++ * ++ * The AMD manual says: ++ * Bits 15:13 of the DR6 register are not cleared by the processor and ++ * must be cleared by software after the contents have been read. ++ * ++ * Some bits are reserved set, some are reserved clear, and some bits ++ * which were previously reserved set are reused and cleared by hardware. ++ * For future compatibility, reset to the default value, which will allow ++ * us to spot any bit being changed by hardware to its non-default value. ++ */ ++ write_debugreg(6, X86_DR6_DEFAULT); ++ + if ( !guest_mode(regs) ) + { + if ( regs->eflags & X86_EFLAGS_TF ) +@@ -1798,7 +1823,8 @@ void do_debug(struct cpu_user_regs *regs + } + + /* Save debug status register where guest OS can peek at it */ +- v->arch.debugreg[6] = read_debugreg(6); ++ v->arch.debugreg[6] |= (dr6 & ~X86_DR6_DEFAULT); ++ v->arch.debugreg[6] &= (dr6 | ~X86_DR6_DEFAULT); + + ler_enable(); + pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC); +--- a/xen/include/asm-x86/debugreg.h 2015-02-11 09:36:29.000000000 +0100 ++++ b/xen/include/asm-x86/debugreg.h 2018-04-13 15:44:57.015516185 +0200 +@@ -24,6 +24,8 @@ + #define DR_STATUS_RESERVED_ZERO (~0xffffeffful) /* Reserved, read as zero */ + #define DR_STATUS_RESERVED_ONE 0xffff0ff0ul /* Reserved, read as one */ + ++#define X86_DR6_DEFAULT 0xffff0ff0ul /* Default %dr6 value. */ ++ + /* Now define a bunch of things for manipulating the control register. + The top two bytes of the control register consist of 4 fields of 4 + bits - each field corresponds to one of the four debug registers, diff --git a/system/xen/xsa/xsa260-2.patch b/system/xen/xsa/xsa260-2.patch new file mode 100644 index 0000000000..be71b2438f --- /dev/null +++ b/system/xen/xsa/xsa260-2.patch @@ -0,0 +1,110 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/pv: Move exception injection into {,compat_}test_all_events() + +This allows paths to jump straight to {,compat_}test_all_events() and have +injection of pending exceptions happen automatically, rather than requiring +all calling paths to handle exceptions themselves. + +The normal exception path is simplified as a result, and +compat_post_handle_exception() is removed entirely. + +This is part of XSA-260 / CVE-2018-8897. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +--- a/xen/arch/x86/x86_64/compat/entry.S ++++ b/xen/arch/x86/x86_64/compat/entry.S +@@ -39,6 +39,12 @@ ENTRY(compat_test_all_events) + leaq irq_stat+IRQSTAT_softirq_pending(%rip),%rcx + cmpl $0,(%rcx,%rax,1) + jne compat_process_softirqs ++ ++ /* Inject exception if pending. */ ++ lea VCPU_trap_bounce(%rbx), %rdx ++ testb $TBF_EXCEPTION, TRAPBOUNCE_flags(%rdx) ++ jnz .Lcompat_process_trapbounce ++ + testb $1,VCPU_mce_pending(%rbx) + jnz compat_process_mce + .Lcompat_test_guest_nmi: +@@ -68,6 +74,15 @@ compat_process_softirqs: + call do_softirq + jmp compat_test_all_events + ++ ALIGN ++/* %rbx: struct vcpu, %rdx: struct trap_bounce */ ++.Lcompat_process_trapbounce: ++ sti ++.Lcompat_bounce_exception: ++ call compat_create_bounce_frame ++ movb $0, TRAPBOUNCE_flags(%rdx) ++ jmp compat_test_all_events ++ + ALIGN + /* %rbx: struct vcpu */ + compat_process_mce: +@@ -189,15 +204,6 @@ ENTRY(cr4_pv32_restore) + xor %eax, %eax + ret + +-/* %rdx: trap_bounce, %rbx: struct vcpu */ +-ENTRY(compat_post_handle_exception) +- testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx) +- jz compat_test_all_events +-.Lcompat_bounce_exception: +- call compat_create_bounce_frame +- movb $0,TRAPBOUNCE_flags(%rdx) +- jmp compat_test_all_events +- + .section .text.entry, "ax", @progbits + + /* See lstar_enter for entry register state. */ +--- a/xen/arch/x86/x86_64/entry.S ++++ b/xen/arch/x86/x86_64/entry.S +@@ -42,6 +42,12 @@ test_all_events: + leaq irq_stat+IRQSTAT_softirq_pending(%rip), %rcx + cmpl $0, (%rcx, %rax, 1) + jne process_softirqs ++ ++ /* Inject exception if pending. */ ++ lea VCPU_trap_bounce(%rbx), %rdx ++ testb $TBF_EXCEPTION, TRAPBOUNCE_flags(%rdx) ++ jnz .Lprocess_trapbounce ++ + cmpb $0, VCPU_mce_pending(%rbx) + jne process_mce + .Ltest_guest_nmi: +@@ -70,6 +76,15 @@ process_softirqs: + jmp test_all_events + + ALIGN ++/* %rbx: struct vcpu, %rdx struct trap_bounce */ ++.Lprocess_trapbounce: ++ sti ++.Lbounce_exception: ++ call create_bounce_frame ++ movb $0, TRAPBOUNCE_flags(%rdx) ++ jmp test_all_events ++ ++ ALIGN + /* %rbx: struct vcpu */ + process_mce: + testb $1 << VCPU_TRAP_MCE, VCPU_async_exception_mask(%rbx) +@@ -667,15 +682,9 @@ handle_exception_saved: + mov %r15, STACK_CPUINFO_FIELD(xen_cr3)(%r14) + testb $3,UREGS_cs(%rsp) + jz restore_all_xen +- leaq VCPU_trap_bounce(%rbx),%rdx + movq VCPU_domain(%rbx),%rax + testb $1,DOMAIN_is_32bit_pv(%rax) +- jnz compat_post_handle_exception +- testb $TBF_EXCEPTION,TRAPBOUNCE_flags(%rdx) +- jz test_all_events +-.Lbounce_exception: +- call create_bounce_frame +- movb $0,TRAPBOUNCE_flags(%rdx) ++ jnz compat_test_all_events + jmp test_all_events + + /* No special register assumptions. */ diff --git a/system/xen/xsa/xsa260-3.patch b/system/xen/xsa/xsa260-3.patch new file mode 100644 index 0000000000..f0a0a5687d --- /dev/null +++ b/system/xen/xsa/xsa260-3.patch @@ -0,0 +1,138 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/traps: Use an Interrupt Stack Table for #DB + +PV guests can use architectural corner cases to cause #DB to be raised after +transitioning into supervisor mode. + +Use an interrupt stack table for #DB to prevent the exception being taken with +a guest controlled stack pointer. + +This is part of XSA-260 / CVE-2018-8897. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +--- a/xen/arch/x86/cpu/common.c ++++ b/xen/arch/x86/cpu/common.c +@@ -679,6 +679,7 @@ void load_system_tables(void) + [IST_MCE - 1] = stack_top + IST_MCE * PAGE_SIZE, + [IST_DF - 1] = stack_top + IST_DF * PAGE_SIZE, + [IST_NMI - 1] = stack_top + IST_NMI * PAGE_SIZE, ++ [IST_DB - 1] = stack_top + IST_DB * PAGE_SIZE, + + [IST_MAX ... ARRAY_SIZE(tss->ist) - 1] = + 0x8600111111111111ul, +@@ -706,6 +707,7 @@ void load_system_tables(void) + set_ist(&idt_tables[cpu][TRAP_double_fault], IST_DF); + set_ist(&idt_tables[cpu][TRAP_nmi], IST_NMI); + set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE); ++ set_ist(&idt_tables[cpu][TRAP_debug], IST_DB); + + /* + * Bottom-of-stack must be 16-byte aligned! +--- a/xen/arch/x86/hvm/svm/svm.c ++++ b/xen/arch/x86/hvm/svm/svm.c +@@ -1046,6 +1046,7 @@ static void svm_ctxt_switch_from(struct + set_ist(&idt_tables[cpu][TRAP_double_fault], IST_DF); + set_ist(&idt_tables[cpu][TRAP_nmi], IST_NMI); + set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE); ++ set_ist(&idt_tables[cpu][TRAP_debug], IST_DB); + } + + static void svm_ctxt_switch_to(struct vcpu *v) +@@ -1067,6 +1068,7 @@ static void svm_ctxt_switch_to(struct vc + set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE); + set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE); + set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE); ++ set_ist(&idt_tables[cpu][TRAP_debug], IST_NONE); + + svm_restore_dr(v); + +--- a/xen/arch/x86/smpboot.c ++++ b/xen/arch/x86/smpboot.c +@@ -964,6 +964,7 @@ static int cpu_smpboot_alloc(unsigned in + set_ist(&idt_tables[cpu][TRAP_double_fault], IST_NONE); + set_ist(&idt_tables[cpu][TRAP_nmi], IST_NONE); + set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE); ++ set_ist(&idt_tables[cpu][TRAP_debug], IST_NONE); + + for ( stub_page = 0, i = cpu & ~(STUBS_PER_PAGE - 1); + i < nr_cpu_ids && i <= (cpu | (STUBS_PER_PAGE - 1)); ++i ) +--- a/xen/arch/x86/traps.c ++++ b/xen/arch/x86/traps.c +@@ -325,13 +325,13 @@ static void show_guest_stack(struct vcpu + /* + * Notes for get_stack_trace_bottom() and get_stack_dump_bottom() + * +- * Stack pages 0, 1 and 2: ++ * Stack pages 0 - 3: + * These are all 1-page IST stacks. Each of these stacks have an exception + * frame and saved register state at the top. The interesting bound for a + * trace is the word adjacent to this, while the bound for a dump is the + * very top, including the exception frame. + * +- * Stack pages 3, 4 and 5: ++ * Stack pages 4 and 5: + * None of these are particularly interesting. With MEMORY_GUARD, page 5 is + * explicitly not present, so attempting to dump or trace it is + * counterproductive. Without MEMORY_GUARD, it is possible for a call chain +@@ -352,12 +352,12 @@ unsigned long get_stack_trace_bottom(uns + { + switch ( get_stack_page(sp) ) + { +- case 0 ... 2: ++ case 0 ... 3: + return ROUNDUP(sp, PAGE_SIZE) - + offsetof(struct cpu_user_regs, es) - sizeof(unsigned long); + + #ifndef MEMORY_GUARD +- case 3 ... 5: ++ case 4 ... 5: + #endif + case 6 ... 7: + return ROUNDUP(sp, STACK_SIZE) - +@@ -372,11 +372,11 @@ unsigned long get_stack_dump_bottom(unsi + { + switch ( get_stack_page(sp) ) + { +- case 0 ... 2: ++ case 0 ... 3: + return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long); + + #ifndef MEMORY_GUARD +- case 3 ... 5: ++ case 4 ... 5: + #endif + case 6 ... 7: + return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long); +@@ -1943,6 +1943,7 @@ void __init init_idt_traps(void) + set_ist(&idt_table[TRAP_double_fault], IST_DF); + set_ist(&idt_table[TRAP_nmi], IST_NMI); + set_ist(&idt_table[TRAP_machine_check], IST_MCE); ++ set_ist(&idt_table[TRAP_debug], IST_DB); + + /* CPU0 uses the master IDT. */ + idt_tables[0] = idt_table; +--- a/xen/arch/x86/x86_64/entry.S ++++ b/xen/arch/x86/x86_64/entry.S +@@ -739,7 +739,7 @@ ENTRY(device_not_available) + ENTRY(debug) + pushq $0 + movl $TRAP_debug,4(%rsp) +- jmp handle_exception ++ jmp handle_ist_exception + + ENTRY(int3) + pushq $0 +--- a/xen/include/asm-x86/processor.h ++++ b/xen/include/asm-x86/processor.h +@@ -443,7 +443,8 @@ struct __packed __cacheline_aligned tss_ + #define IST_DF 1UL + #define IST_NMI 2UL + #define IST_MCE 3UL +-#define IST_MAX 3UL ++#define IST_DB 4UL ++#define IST_MAX 4UL + + /* Set the interrupt stack table used by a particular interrupt + * descriptor table entry. */ diff --git a/system/xen/xsa/xsa260-4.patch b/system/xen/xsa/xsa260-4.patch new file mode 100644 index 0000000000..c2fa02d6e1 --- /dev/null +++ b/system/xen/xsa/xsa260-4.patch @@ -0,0 +1,72 @@ +From: Andrew Cooper <andrew.cooper3@citrix.com> +Subject: x86/traps: Fix handling of #DB exceptions in hypervisor context + +The WARN_ON() can be triggered by guest activities, and emits a full stack +trace without rate limiting. Swap it out for a ratelimited printk with just +enough information to work out what is going on. + +Not all #DB exceptions are traps, so blindly continuing is not a safe action +to take. We don't let PV guests select these settings in the real %dr7 to +begin with, but for added safety against unexpected situations, detect the +fault cases and crash in an obvious manner. + +This is part of XSA-260 / CVE-2018-8897. + +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> + +--- a/xen/arch/x86/traps.c ++++ b/xen/arch/x86/traps.c +@@ -1809,16 +1809,44 @@ void do_debug(struct cpu_user_regs *regs + regs->eflags &= ~X86_EFLAGS_TF; + } + } +- else ++ ++ /* ++ * Check for fault conditions. General Detect, and instruction ++ * breakpoints are faults rather than traps, at which point attempting ++ * to ignore and continue will result in a livelock. ++ */ ++ if ( dr6 & DR_GENERAL_DETECT ) ++ { ++ printk(XENLOG_ERR "Hit General Detect in Xen context\n"); ++ fatal_trap(regs, 0); ++ } ++ ++ if ( dr6 & (DR_TRAP3 | DR_TRAP2 | DR_TRAP1 | DR_TRAP0) ) + { +- /* +- * We ignore watchpoints when they trigger within Xen. This may +- * happen when a buffer is passed to us which previously had a +- * watchpoint set on it. No need to bump EIP; the only faulting +- * trap is an instruction breakpoint, which can't happen to us. +- */ +- WARN_ON(!search_exception_table(regs)); ++ unsigned int bp, dr7 = read_debugreg(7) >> DR_CONTROL_SHIFT; ++ ++ for ( bp = 0; bp < 4; ++bp ) ++ { ++ if ( (dr6 & (1u << bp)) && /* Breakpoint triggered? */ ++ ((dr7 & (3u << (bp * DR_CONTROL_SIZE))) == 0) /* Insn? */ ) ++ { ++ printk(XENLOG_ERR ++ "Hit instruction breakpoint in Xen context\n"); ++ fatal_trap(regs, 0); ++ } ++ } + } ++ ++ /* ++ * Whatever caused this #DB should be a trap. Note it and continue. ++ * Guests can trigger this in certain corner cases, so ensure the ++ * message is ratelimited. ++ */ ++ gprintk(XENLOG_WARNING, ++ "Hit #DB in Xen context: %04x:%p [%ps], stk %04x:%p, dr6 %lx\n", ++ regs->cs, _p(regs->rip), _p(regs->rip), ++ regs->ss, _p(regs->rsp), dr6); ++ + goto out; + } + diff --git a/system/xen/xsa/xsa261.patch b/system/xen/xsa/xsa261.patch new file mode 100644 index 0000000000..a51744b8d0 --- /dev/null +++ b/system/xen/xsa/xsa261.patch @@ -0,0 +1,279 @@ +From: Xen Project Security Team <security@xenproject.org> +Subject: x86/vpt: add support for IO-APIC routed interrupts + +And modify the HPET code to make use of it. Currently HPET interrupts +are always treated as ISA and thus injected through the vPIC. This is +wrong because HPET interrupts when not in legacy mode should be +injected from the IO-APIC. + +To make things worse, the supported interrupt routing values are set +to [20..23], which clearly falls outside of the ISA range, thus +leading to an ASSERT in debug builds or memory corruption in non-debug +builds because the interrupt injection code will write out of the +bounds of the arch.hvm_domain.vpic array. + +Since the HPET interrupt source can change between ISA and IO-APIC +always destroy the timer before changing the mode, or else Xen risks +changing it while the timer is active. + +Note that vpt interrupt injection is racy in the sense that the +vIO-APIC RTE entry can be written by the guest in between the call to +pt_irq_masked and hvm_ioapic_assert, or the call to pt_update_irq and +pt_intr_post. Those are not deemed to be security issues, but rather +quirks of the current implementation. In the worse case the guest +might lose interrupts or get multiple interrupt vectors injected for +the same timer source. + +This is part of XSA-261. + +Address actual and potential compiler warnings. Fix formatting. + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +--- +Changes since v2: + - Move fallthrough comment to be just above the case label. + - Fix now stale comment in pt_update_irq. + - Use NR_ISAIRQS instead of 16. + - Expand commit message to mention the quirkiness of vpt interrupt + injection. + +Changes since v1: + - Simply usage of gsi in pt_irq_masked. + - Introduce hvm_ioapic_assert. + - Fix pt->source == PTSRC_isa in create_periodic_time. + +--- a/xen/arch/x86/hvm/hpet.c ++++ b/xen/arch/x86/hvm/hpet.c +@@ -264,13 +264,20 @@ static void hpet_set_timer(HPETState *h, + diff = (timer_is_32bit(h, tn) && (-diff > HPET_TINY_TIME_SPAN)) + ? (uint32_t)diff : 0; + ++ destroy_periodic_time(&h->pt[tn]); + if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) ) ++ { + /* if LegacyReplacementRoute bit is set, HPET specification requires + timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC, + timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */ + irq = (tn == 0) ? 0 : 8; ++ h->pt[tn].source = PTSRC_isa; ++ } + else ++ { + irq = timer_int_route(h, tn); ++ h->pt[tn].source = PTSRC_ioapic; ++ } + + /* + * diff is the time from now when the timer should fire, for a periodic +--- a/xen/arch/x86/hvm/irq.c ++++ b/xen/arch/x86/hvm/irq.c +@@ -41,6 +41,26 @@ static void assert_gsi(struct domain *d, + vioapic_irq_positive_edge(d, ioapic_gsi); + } + ++int hvm_ioapic_assert(struct domain *d, unsigned int gsi, bool level) ++{ ++ struct hvm_irq *hvm_irq = hvm_domain_irq(d); ++ int vector; ++ ++ if ( gsi >= hvm_irq->nr_gsis ) ++ { ++ ASSERT_UNREACHABLE(); ++ return -1; ++ } ++ ++ spin_lock(&d->arch.hvm_domain.irq_lock); ++ if ( !level || hvm_irq->gsi_assert_count[gsi]++ == 0 ) ++ assert_gsi(d, gsi); ++ vector = vioapic_get_vector(d, gsi); ++ spin_unlock(&d->arch.hvm_domain.irq_lock); ++ ++ return vector; ++} ++ + static void assert_irq(struct domain *d, unsigned ioapic_gsi, unsigned pic_irq) + { + assert_gsi(d, ioapic_gsi); +--- a/xen/arch/x86/hvm/vpt.c ++++ b/xen/arch/x86/hvm/vpt.c +@@ -107,31 +107,49 @@ static int pt_irq_vector(struct periodic + static int pt_irq_masked(struct periodic_time *pt) + { + struct vcpu *v = pt->vcpu; +- unsigned int gsi, isa_irq; +- int mask; +- uint8_t pic_imr; ++ unsigned int gsi = pt->irq; + +- if ( pt->source == PTSRC_lapic ) ++ switch ( pt->source ) ++ { ++ case PTSRC_lapic: + { + struct vlapic *vlapic = vcpu_vlapic(v); ++ + return (!vlapic_enabled(vlapic) || + (vlapic_get_reg(vlapic, APIC_LVTT) & APIC_LVT_MASKED)); + } + +- isa_irq = pt->irq; +- gsi = hvm_isa_irq_to_gsi(isa_irq); +- pic_imr = v->domain->arch.hvm_domain.vpic[isa_irq >> 3].imr; +- mask = vioapic_get_mask(v->domain, gsi); +- if ( mask < 0 ) +- { +- dprintk(XENLOG_WARNING, "d%u: invalid GSI (%u) for platform timer\n", +- v->domain->domain_id, gsi); +- domain_crash(v->domain); +- return -1; ++ case PTSRC_isa: ++ { ++ uint8_t pic_imr = v->domain->arch.hvm_domain.vpic[pt->irq >> 3].imr; ++ ++ /* Check if the interrupt is unmasked in the PIC. */ ++ if ( !(pic_imr & (1 << (pt->irq & 7))) && vlapic_accept_pic_intr(v) ) ++ return 0; ++ ++ gsi = hvm_isa_irq_to_gsi(pt->irq); ++ } ++ ++ /* Fallthrough to check if the interrupt is masked on the IO APIC. */ ++ case PTSRC_ioapic: ++ { ++ int mask = vioapic_get_mask(v->domain, gsi); ++ ++ if ( mask < 0 ) ++ { ++ dprintk(XENLOG_WARNING, ++ "d%d: invalid GSI (%u) for platform timer\n", ++ v->domain->domain_id, gsi); ++ domain_crash(v->domain); ++ return -1; ++ } ++ ++ return mask; ++ } + } + +- return (((pic_imr & (1 << (isa_irq & 7))) || !vlapic_accept_pic_intr(v)) && +- mask); ++ ASSERT_UNREACHABLE(); ++ return 1; + } + + static void pt_lock(struct periodic_time *pt) +@@ -252,7 +270,7 @@ int pt_update_irq(struct vcpu *v) + struct list_head *head = &v->arch.hvm_vcpu.tm_list; + struct periodic_time *pt, *temp, *earliest_pt; + uint64_t max_lag; +- int irq, is_lapic, pt_vector; ++ int irq, pt_vector = -1; + + spin_lock(&v->arch.hvm_vcpu.tm_lock); + +@@ -288,29 +306,26 @@ int pt_update_irq(struct vcpu *v) + + earliest_pt->irq_issued = 1; + irq = earliest_pt->irq; +- is_lapic = (earliest_pt->source == PTSRC_lapic); + + spin_unlock(&v->arch.hvm_vcpu.tm_lock); + +- /* +- * If periodic timer interrut is handled by lapic, its vector in +- * IRR is returned and used to set eoi_exit_bitmap for virtual +- * interrupt delivery case. Otherwise return -1 to do nothing. +- */ +- if ( is_lapic ) ++ switch ( earliest_pt->source ) + { ++ case PTSRC_lapic: ++ /* ++ * If periodic timer interrupt is handled by lapic, its vector in ++ * IRR is returned and used to set eoi_exit_bitmap for virtual ++ * interrupt delivery case. Otherwise return -1 to do nothing. ++ */ + vlapic_set_irq(vcpu_vlapic(v), irq, 0); + pt_vector = irq; +- } +- else +- { ++ break; ++ ++ case PTSRC_isa: + hvm_isa_irq_deassert(v->domain, irq); + if ( platform_legacy_irq(irq) && vlapic_accept_pic_intr(v) && + v->domain->arch.hvm_domain.vpic[irq >> 3].int_output ) +- { + hvm_isa_irq_assert(v->domain, irq, NULL); +- pt_vector = -1; +- } + else + { + pt_vector = hvm_isa_irq_assert(v->domain, irq, vioapic_get_vector); +@@ -321,6 +336,17 @@ int pt_update_irq(struct vcpu *v) + if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) ) + pt_vector = -1; + } ++ break; ++ ++ case PTSRC_ioapic: ++ /* ++ * NB: At the moment IO-APIC routed interrupts generated by vpt devices ++ * (HPET) are edge-triggered. ++ */ ++ pt_vector = hvm_ioapic_assert(v->domain, irq, false); ++ if ( pt_vector < 0 || !vlapic_test_irq(vcpu_vlapic(v), pt_vector) ) ++ pt_vector = -1; ++ break; + } + + return pt_vector; +@@ -418,7 +444,14 @@ void create_periodic_time( + struct vcpu *v, struct periodic_time *pt, uint64_t delta, + uint64_t period, uint8_t irq, time_cb *cb, void *data) + { +- ASSERT(pt->source != 0); ++ if ( !pt->source || ++ (pt->irq >= NR_ISAIRQS && pt->source == PTSRC_isa) || ++ (pt->irq >= hvm_domain_irq(v->domain)->nr_gsis && ++ pt->source == PTSRC_ioapic) ) ++ { ++ ASSERT_UNREACHABLE(); ++ return; ++ } + + destroy_periodic_time(pt); + +@@ -498,7 +531,7 @@ static void pt_adjust_vcpu(struct period + { + int on_list; + +- ASSERT(pt->source == PTSRC_isa); ++ ASSERT(pt->source == PTSRC_isa || pt->source == PTSRC_ioapic); + + if ( pt->vcpu == NULL ) + return; +--- a/xen/include/asm-x86/hvm/irq.h ++++ b/xen/include/asm-x86/hvm/irq.h +@@ -207,6 +207,9 @@ int hvm_set_pci_link_route(struct domain + + int hvm_inject_msi(struct domain *d, uint64_t addr, uint32_t data); + ++/* Assert an IO APIC pin. */ ++int hvm_ioapic_assert(struct domain *d, unsigned int gsi, bool level); ++ + void hvm_maybe_deassert_evtchn_irq(void); + void hvm_assert_evtchn_irq(struct vcpu *v); + void hvm_set_callback_via(struct domain *d, uint64_t via); +--- a/xen/include/asm-x86/hvm/vpt.h ++++ b/xen/include/asm-x86/hvm/vpt.h +@@ -44,6 +44,7 @@ struct periodic_time { + bool_t warned_timeout_too_short; + #define PTSRC_isa 1 /* ISA time source */ + #define PTSRC_lapic 2 /* LAPIC time source */ ++#define PTSRC_ioapic 3 /* IOAPIC time source */ + u8 source; /* PTSRC_ */ + u8 irq; + struct vcpu *vcpu; /* vcpu timer interrupt delivers to */ diff --git a/system/xen/xsa/xsa262-4.10.patch b/system/xen/xsa/xsa262-4.10.patch new file mode 100644 index 0000000000..ba9a8ffa22 --- /dev/null +++ b/system/xen/xsa/xsa262-4.10.patch @@ -0,0 +1,76 @@ +From: Jan Beulich <jbeulich@suse.com> +Subject: x86/HVM: guard against emulator driving ioreq state in weird ways + +In the case where hvm_wait_for_io() calls wait_on_xen_event_channel(), +p->state ends up being read twice in succession: once to determine that +state != p->state, and then again at the top of the loop. This gives a +compromised emulator a chance to change the state back between the two +reads, potentially keeping Xen in a loop indefinitely. + +Instead: +* Read p->state once in each of the wait_on_xen_event_channel() tests, +* re-use that value the next time around, +* and insist that the states continue to transition "forward" (with the + exception of the transition to STATE_IOREQ_NONE). + +This is XSA-262. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: George Dunlap <george.dunlap@citrix.com> + +--- a/xen/arch/x86/hvm/ioreq.c ++++ b/xen/arch/x86/hvm/ioreq.c +@@ -87,14 +87,17 @@ static void hvm_io_assist(struct hvm_ior + + static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p) + { ++ unsigned int prev_state = STATE_IOREQ_NONE; ++ + while ( sv->pending ) + { + unsigned int state = p->state; + +- rmb(); +- switch ( state ) ++ smp_rmb(); ++ ++ recheck: ++ if ( unlikely(state == STATE_IOREQ_NONE) ) + { +- case STATE_IOREQ_NONE: + /* + * The only reason we should see this case is when an + * emulator is dying and it races with an I/O being +@@ -102,14 +105,30 @@ static bool hvm_wait_for_io(struct hvm_i + */ + hvm_io_assist(sv, ~0ul); + break; ++ } ++ ++ if ( unlikely(state < prev_state) ) ++ { ++ gdprintk(XENLOG_ERR, "Weird HVM ioreq state transition %u -> %u\n", ++ prev_state, state); ++ sv->pending = false; ++ domain_crash(sv->vcpu->domain); ++ return false; /* bail */ ++ } ++ ++ switch ( prev_state = state ) ++ { + case STATE_IORESP_READY: /* IORESP_READY -> NONE */ + p->state = STATE_IOREQ_NONE; + hvm_io_assist(sv, p->data); + break; + case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */ + case STATE_IOREQ_INPROCESS: +- wait_on_xen_event_channel(sv->ioreq_evtchn, p->state != state); +- break; ++ wait_on_xen_event_channel(sv->ioreq_evtchn, ++ ({ state = p->state; ++ smp_rmb(); ++ state != prev_state; })); ++ goto recheck; + default: + gdprintk(XENLOG_ERR, "Weird HVM iorequest state %u\n", state); + sv->pending = false; |