77 files changed, 3510 insertions, 5304 deletions
diff --git a/0001-update-Xen-version-to-4.16.2-pre.patch b/0001-update-Xen-version-to-4.16.3-pre.patch index 2e62c21..6ae690c 100644 --- a/0001-update-Xen-version-to-4.16.2-pre.patch +++ b/0001-update-Xen-version-to-4.16.3-pre.patch @@ -1,25 +1,25 @@ -From 5be9edb482ab20cf3e7acb05b511465294d1e19b Mon Sep 17 00:00:00 2001 +From 4aa32912ebeda8cb94d1c3941e7f1f0a2d4f921b Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 7 Jun 2022 13:55:17 +0200 -Subject: [PATCH 01/51] update Xen version to 4.16.2-pre +Date: Tue, 11 Oct 2022 14:49:41 +0200 +Subject: [PATCH 01/26] update Xen version to 4.16.3-pre --- xen/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xen/Makefile b/xen/Makefile -index 8abc71cf73aa..90a29782dbf4 100644 +index 76d0a3ff253f..8a403ee896cd 100644 --- a/xen/Makefile +++ b/xen/Makefile @@ -2,7 +2,7 @@ # All other places this is stored (eg. compile.h) should be autogenerated. export XEN_VERSION = 4 export XEN_SUBVERSION = 16 --export XEN_EXTRAVERSION ?= .1$(XEN_VENDORVERSION) -+export XEN_EXTRAVERSION ?= .2-pre$(XEN_VENDORVERSION) +-export XEN_EXTRAVERSION ?= .2$(XEN_VENDORVERSION) ++export XEN_EXTRAVERSION ?= .3-pre$(XEN_VENDORVERSION) export XEN_FULLVERSION = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION) -include xen-version -- -2.35.1 +2.37.3 diff --git a/0002-x86-irq-skip-unmap_domain_pirq-XSM-during-destructio.patch b/0002-x86-irq-skip-unmap_domain_pirq-XSM-during-destructio.patch deleted file mode 100644 index 0ba090e..0000000 --- a/0002-x86-irq-skip-unmap_domain_pirq-XSM-during-destructio.patch +++ /dev/null @@ -1,50 +0,0 @@ -From b58fb6e81bd55b6bd946abc3070770f7994c9ef9 Mon Sep 17 00:00:00 2001 -From: Jason Andryuk <jandryuk@gmail.com> -Date: Tue, 7 Jun 2022 13:55:39 +0200 -Subject: [PATCH 02/51] x86/irq: skip unmap_domain_pirq XSM during destruction -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -xsm_unmap_domain_irq was seen denying unmap_domain_pirq when called from -complete_domain_destroy as an RCU callback. The source context was an -unexpected, random domain. Since this is a xen-internal operation, -going through the XSM hook is inapproriate. - -Check d->is_dying and skip the XSM hook when set since this is a cleanup -operation for a domain being destroyed. - -Suggested-by: Roger Pau Monné <roger.pau@citrix.com> -Signed-off-by: Jason Andryuk <jandryuk@gmail.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -master commit: 2e6f95a942d1927a53f077c301db0b799c54c05a -master date: 2022-04-08 14:51:52 +0200 ---- - xen/arch/x86/irq.c | 10 ++++++++-- - 1 file changed, 8 insertions(+), 2 deletions(-) - -diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c -index 67cbf6b979dc..47b86af5dce9 100644 ---- a/xen/arch/x86/irq.c -+++ b/xen/arch/x86/irq.c -@@ -2342,8 +2342,14 @@ int unmap_domain_pirq(struct domain *d, int pirq) - nr = msi_desc->msi.nvec; - } - -- ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq, -- msi_desc ? msi_desc->dev : NULL); -+ /* -+ * When called by complete_domain_destroy via RCU, current is a random -+ * domain. Skip the XSM check since this is a Xen-initiated action. -+ */ -+ if ( !d->is_dying ) -+ ret = xsm_unmap_domain_irq(XSM_HOOK, d, irq, -+ msi_desc ? 
msi_desc->dev : NULL); -+ - if ( ret ) - goto done; - --- -2.35.1 - diff --git a/0002-xen-arm-p2m-Prevent-adding-mapping-when-domain-is-dy.patch b/0002-xen-arm-p2m-Prevent-adding-mapping-when-domain-is-dy.patch new file mode 100644 index 0000000..fecc260 --- /dev/null +++ b/0002-xen-arm-p2m-Prevent-adding-mapping-when-domain-is-dy.patch @@ -0,0 +1,62 @@ +From 8d9531a3421dad2b0012e09e6f41d5274e162064 Mon Sep 17 00:00:00 2001 +From: Julien Grall <jgrall@amazon.com> +Date: Tue, 11 Oct 2022 14:52:13 +0200 +Subject: [PATCH 02/26] xen/arm: p2m: Prevent adding mapping when domain is + dying + +During the domain destroy process, the domain will still be accessible +until it is fully destroyed. The same is true of the P2M, because we don't bail +out early if is_dying is non-zero. If a domain has permission to +modify another domain's P2M (i.e. dom0, or a stubdomain), then +foreign mappings can be added past relinquish_p2m_mapping(). + +Therefore, we need to prevent mappings from being added while the domain +is dying. This commit does so by adding a +d->is_dying check to p2m_set_entry(). It also enhances the +check in relinquish_p2m_mapping() to make sure that no mappings can +be added in the P2M after the P2M lock is released. + +This is part of CVE-2022-33746 / XSA-410. + +Signed-off-by: Julien Grall <jgrall@amazon.com> +Signed-off-by: Henry Wang <Henry.Wang@arm.com> +Tested-by: Henry Wang <Henry.Wang@arm.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +master commit: 3ebe773293e3b945460a3d6f54f3b91915397bab +master date: 2022-10-11 14:20:18 +0200 +--- + xen/arch/arm/p2m.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c +index 3349b464a39e..1affdafadbeb 100644 +--- a/xen/arch/arm/p2m.c ++++ b/xen/arch/arm/p2m.c +@@ -1093,6 +1093,15 @@ int p2m_set_entry(struct p2m_domain *p2m, + { + int rc = 0; + ++ /* ++ * Any reference taken by the P2M mappings (e.g. foreign mapping) will ++ * be dropped in relinquish_p2m_mapping(). As the P2M will still ++ * be accessible afterwards, we need to prevent mappings from being ++ * added while the domain is dying. ++ */ ++ if ( unlikely(p2m->domain->is_dying) ) ++ return -ENOMEM; ++ + while ( nr ) + { + unsigned long mask; +@@ -1610,6 +1619,8 @@ int relinquish_p2m_mapping(struct domain *d) + unsigned int order; + gfn_t start, end; + ++ BUG_ON(!d->is_dying); ++ /* No mappings can be added in the P2M after the P2M lock is released. */ + p2m_write_lock(p2m); + + start = p2m->lowest_mapped_gfn; +-- +2.37.3 + diff --git a/0003-xen-arm-p2m-Handle-preemption-when-freeing-intermedi.patch b/0003-xen-arm-p2m-Handle-preemption-when-freeing-intermedi.patch new file mode 100644 index 0000000..3190db8 --- /dev/null +++ b/0003-xen-arm-p2m-Handle-preemption-when-freeing-intermedi.patch @@ -0,0 +1,167 @@ +From 937fdbad5180440888f1fcee46299103327efa90 Mon Sep 17 00:00:00 2001 +From: Julien Grall <jgrall@amazon.com> +Date: Tue, 11 Oct 2022 14:52:27 +0200 +Subject: [PATCH 03/26] xen/arm: p2m: Handle preemption when freeing + intermediate page tables + +At the moment the P2M page tables will be freed when the domain structure +is freed without any preemption. As the P2M is quite large, iterating +through this may take more time than is reasonable without intermediate +preemption (to run softirqs and perhaps the scheduler). + +Split p2m_teardown() into two parts: one preemptible and called when +relinquishing the resources, the other one non-preemptible and called +when freeing the domain structure. 
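In outline, the preemptible half added here follows the usual Xen teardown shape: free in bounded batches and let the toplevel hypercall restart when a preemption check fires. A minimal sketch of that shape (the function name here is illustrative; the real hunk below keeps the p2m_teardown() name and also handles locking and root clearing):

    int p2m_teardown_preemptible(struct p2m_domain *p2m)
    {
        struct page_info *pg;
        unsigned long count = 0;

        /* Drain the list of intermediate page tables in batches. */
        while ( (pg = page_list_remove_head(&p2m->pages)) )
        {
            free_domheap_page(pg);
            /* Arbitrary batch size: offer to preempt every 512 pages. */
            if ( !(++count % 512) && hypercall_preempt_check() )
                return -ERESTART; /* caller re-invokes the teardown later */
        }

        return 0;
    }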
+ +As we are now freeing the P2M pages early, we also need to prevent +further allocation if someone calls p2m_set_entry() past p2m_teardown() +(I wasn't able to prove this will never happen). This is done by +checking domain->is_dying, added to p2m_set_entry() by the previous patch. + +Similarly, we want to make sure that no one can access the freed +pages. Therefore the root is cleared before the pages are freed. + +This is part of CVE-2022-33746 / XSA-410. + +Signed-off-by: Julien Grall <jgrall@amazon.com> +Signed-off-by: Henry Wang <Henry.Wang@arm.com> +Tested-by: Henry Wang <Henry.Wang@arm.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +master commit: 3202084566bba0ef0c45caf8c24302f83d92f9c8 +master date: 2022-10-11 14:20:56 +0200 +--- + xen/arch/arm/domain.c | 10 +++++++-- + xen/arch/arm/p2m.c | 47 ++++++++++++++++++++++++++++++++++++--- + xen/include/asm-arm/p2m.h | 13 +++++++++-- + 3 files changed, 63 insertions(+), 7 deletions(-) + +diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c +index 96e1b235501d..2694c39127c5 100644 +--- a/xen/arch/arm/domain.c ++++ b/xen/arch/arm/domain.c +@@ -789,10 +789,10 @@ fail: + void arch_domain_destroy(struct domain *d) + { + /* IOMMU page table is shared with P2M, always call +- * iommu_domain_destroy() before p2m_teardown(). ++ * iommu_domain_destroy() before p2m_final_teardown(). + */ + iommu_domain_destroy(d); +- p2m_teardown(d); ++ p2m_final_teardown(d); + domain_vgic_free(d); + domain_vuart_free(d); + free_xenheap_page(d->shared_info); +@@ -996,6 +996,7 @@ enum { + PROG_xen, + PROG_page, + PROG_mapping, ++ PROG_p2m, + PROG_done, + }; + +@@ -1056,6 +1057,11 @@ int domain_relinquish_resources(struct domain *d) + if ( ret ) + return ret; + ++ PROGRESS(p2m): ++ ret = p2m_teardown(d); ++ if ( ret ) ++ return ret; ++ + PROGRESS(done): + break; + +diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c +index 1affdafadbeb..27418ee5ee98 100644 +--- a/xen/arch/arm/p2m.c ++++ b/xen/arch/arm/p2m.c +@@ -1527,17 +1527,58 @@ static void p2m_free_vmid(struct domain *d) + spin_unlock(&vmid_alloc_lock); + } + +-void p2m_teardown(struct domain *d) ++int p2m_teardown(struct domain *d) + { + struct p2m_domain *p2m = p2m_get_hostp2m(d); ++ unsigned long count = 0; + struct page_info *pg; ++ unsigned int i; ++ int rc = 0; ++ ++ p2m_write_lock(p2m); ++ ++ /* ++ * We are about to free the intermediate page-tables, so clear the ++ * root to prevent any walk from using them. ++ */ ++ for ( i = 0; i < P2M_ROOT_PAGES; i++ ) ++ clear_and_clean_page(p2m->root + i); ++ ++ /* ++ * The domain will not be scheduled anymore, so in theory we should ++ * not need to flush the TLBs. Do it for safety purposes. ++ * ++ * Note that all the devices have already been de-assigned. So we don't ++ * need to flush the IOMMU TLB here. 
++ */ ++ p2m_force_tlb_flush_sync(p2m); ++ ++ while ( (pg = page_list_remove_head(&p2m->pages)) ) ++ { ++ free_domheap_page(pg); ++ count++; ++ /* Arbitrarily preempt every 512 iterations */ ++ if ( !(count % 512) && hypercall_preempt_check() ) ++ { ++ rc = -ERESTART; ++ break; ++ } ++ } ++ ++ p2m_write_unlock(p2m); ++ ++ return rc; ++} ++ ++void p2m_final_teardown(struct domain *d) ++{ ++ struct p2m_domain *p2m = p2m_get_hostp2m(d); + + /* p2m not actually initialized */ + if ( !p2m->domain ) + return; + +- while ( (pg = page_list_remove_head(&p2m->pages)) ) +- free_domheap_page(pg); ++ ASSERT(page_list_empty(&p2m->pages)); + + if ( p2m->root ) + free_domheap_pages(p2m->root, P2M_ROOT_ORDER); +diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h +index 8f11d9c97b5d..b3ba83283e11 100644 +--- a/xen/include/asm-arm/p2m.h ++++ b/xen/include/asm-arm/p2m.h +@@ -192,8 +192,17 @@ void setup_virt_paging(void); + /* Init the datastructures for later use by the p2m code */ + int p2m_init(struct domain *d); + +-/* Return all the p2m resources to Xen. */ +-void p2m_teardown(struct domain *d); ++/* ++ * The P2M resources are freed in two parts: ++ * - p2m_teardown() will be called when relinquishing the resources. It ++ * will free large resources (e.g. intermediate page-tables) that ++ * require preemption. ++ * - p2m_final_teardown() will be called when the domain struct is being ++ * freed. This *cannot* be preempted and therefore only small ++ * resources should be freed here. ++ */ ++int p2m_teardown(struct domain *d); ++void p2m_final_teardown(struct domain *d); + + /* + * Remove mapping refcount on each mapping page in the p2m +-- +2.37.3 + diff --git a/0003-xen-fix-XEN_DOMCTL_gdbsx_guestmemio-crash.patch b/0003-xen-fix-XEN_DOMCTL_gdbsx_guestmemio-crash.patch deleted file mode 100644 index fa1443c..0000000 --- a/0003-xen-fix-XEN_DOMCTL_gdbsx_guestmemio-crash.patch +++ /dev/null @@ -1,63 +0,0 @@ -From 6c6bbfdff9374ef41f84c4ebed7b8a7a40767ef6 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 7 Jun 2022 13:56:54 +0200 -Subject: [PATCH 03/51] xen: fix XEN_DOMCTL_gdbsx_guestmemio crash - -A hypervisor built without CONFIG_GDBSX will crash in case the -XEN_DOMCTL_gdbsx_guestmemio domctl is being called, as the call will -end up in iommu_do_domctl() with d == NULL: - - (XEN) CPU: 6 - (XEN) RIP: e008:[<ffff82d040269984>] iommu_do_domctl+0x4/0x30 - (XEN) RFLAGS: 0000000000010202 CONTEXT: hypervisor (d0v0) - (XEN) rax: 00000000000003e8 rbx: ffff830856277ef8 rcx: ffff830856277fff - ... - (XEN) Xen call trace: - (XEN) [<ffff82d040269984>] R iommu_do_domctl+0x4/0x30 - (XEN) [<ffff82d04035cd5f>] S arch_do_domctl+0x7f/0x2330 - (XEN) [<ffff82d040239e46>] S do_domctl+0xe56/0x1930 - (XEN) [<ffff82d040238ff0>] S do_domctl+0/0x1930 - (XEN) [<ffff82d0402f8c59>] S pv_hypercall+0x99/0x110 - (XEN) [<ffff82d0402f5161>] S arch/x86/pv/domain.c#_toggle_guest_pt+0x11/0x90 - (XEN) [<ffff82d040366288>] S lstar_enter+0x128/0x130 - (XEN) - (XEN) Pagetable walk from 0000000000000144: - (XEN) L4[0x000] = 0000000000000000 ffffffffffffffff - (XEN) - (XEN) **************************************** - (XEN) Panic on CPU 6: - (XEN) FATAL PAGE FAULT - (XEN) [error_code=0000] - (XEN) Faulting linear address: 0000000000000144 - (XEN) **************************************** - -It used to be permitted to pass DOMID_IDLE to dbg_rw_mem(), which is why the -special case skipping the domid checks exists. Now that it is only permitted -to pass proper domids, remove the special case, making 'd' always valid. 
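The crash being fixed is a plain NULL dereference: the special case mentioned above leaves d == NULL for XEN_DOMCTL_gdbsx_guestmemio, and when CONFIG_GDBSX is off nothing consumes the sub-op before it reaches the IOMMU handler, which expects a real domain. Schematically (a sketch of the failure path consistent with the call trace above, not the actual code):

    struct domain *d = NULL;   /* special case: domid lookup was skipped */

    /* ... without CONFIG_GDBSX no handler claims the sub-op, so it falls
     * through to the default path in arch_do_domctl() ... */
    ret = iommu_do_domctl(op, d, u_domctl);  /* dereferences d: page fault */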
- -Reported-by: Cheyenne Wills <cheyenne.wills@gmail.com> -Fixes: e726a82ca0dc ("xen: make gdbsx support configurable") -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: f00daf1fb3213a9b0335d9dcd90fe9cb5c02b7a9 -master date: 2022-04-19 17:07:08 +0100 ---- - xen/common/domctl.c | 1 - - 1 file changed, 1 deletion(-) - -diff --git a/xen/common/domctl.c b/xen/common/domctl.c -index 271862ae587f..419e4070f59d 100644 ---- a/xen/common/domctl.c -+++ b/xen/common/domctl.c -@@ -304,7 +304,6 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) - if ( op->domain == DOMID_INVALID ) - { - case XEN_DOMCTL_createdomain: -- case XEN_DOMCTL_gdbsx_guestmemio: - d = NULL; - break; - } --- -2.35.1 - diff --git a/0004-VT-d-refuse-to-use-IOMMU-with-reserved-CAP.ND-value.patch b/0004-VT-d-refuse-to-use-IOMMU-with-reserved-CAP.ND-value.patch deleted file mode 100644 index a4d229a..0000000 --- a/0004-VT-d-refuse-to-use-IOMMU-with-reserved-CAP.ND-value.patch +++ /dev/null @@ -1,49 +0,0 @@ -From b378ee56c7e0bb5eeb35dcc55b3d29e5f50eb566 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 7 Jun 2022 13:58:16 +0200 -Subject: [PATCH 04/51] VT-d: refuse to use IOMMU with reserved CAP.ND value -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The field taking the value 7 (resulting in 18-bit DIDs when using the -calculation in cap_ndoms(), when the DID fields are only 16 bits wide) -is reserved. Instead of misbehaving in case we would encounter such an -IOMMU, refuse to use it. - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Kevin Tian <kevin.tian@intel.com> -master commit: a1545fbf45c689aff39ce76a6eaa609d32ef72a7 -master date: 2022-04-20 10:54:26 +0200 ---- - xen/drivers/passthrough/vtd/iommu.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c -index 93dd8aa643aa..8975c1de61bc 100644 ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -1279,8 +1279,11 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd) - - quirk_iommu_caps(iommu); - -+ nr_dom = cap_ndoms(iommu->cap); -+ - if ( cap_fault_reg_offset(iommu->cap) + - cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE || -+ ((nr_dom - 1) >> 16) /* I.e. 
cap.nd > 6 */ || - ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE ) - { - printk(XENLOG_ERR VTDPREFIX "IOMMU: unsupported\n"); -@@ -1305,7 +1308,6 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd) - vtd_ops.sync_cache = sync_cache; - - /* allocate domain id bitmap */ -- nr_dom = cap_ndoms(iommu->cap); - iommu->domid_bitmap = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_dom)); - if ( !iommu->domid_bitmap ) - return -ENOMEM; --- -2.35.1 - diff --git a/0004-x86-p2m-add-option-to-skip-root-pagetable-removal-in.patch b/0004-x86-p2m-add-option-to-skip-root-pagetable-removal-in.patch new file mode 100644 index 0000000..b3edbd9 --- /dev/null +++ b/0004-x86-p2m-add-option-to-skip-root-pagetable-removal-in.patch @@ -0,0 +1,138 @@ +From 8fc19c143b8aa563077f3d5c46fcc0a54dc04f35 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Date: Tue, 11 Oct 2022 14:52:39 +0200 +Subject: [PATCH 04/26] x86/p2m: add option to skip root pagetable removal in + p2m_teardown() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Add a new parameter to p2m_teardown() in order to select whether the +root page table should also be freed. Note that all users are +adjusted to pass the parameter to remove the root page tables, so +behavior is not modified. + +No functional change intended. + +This is part of CVE-2022-33746 / XSA-410. + +Suggested-by: Julien Grall <julien@xen.org> +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Tim Deegan <tim@xen.org> +master commit: 1df52a270225527ae27bfa2fc40347bf93b78357 +master date: 2022-10-11 14:21:23 +0200 +--- + xen/arch/x86/mm/hap/hap.c | 6 +++--- + xen/arch/x86/mm/p2m.c | 20 ++++++++++++++++---- + xen/arch/x86/mm/shadow/common.c | 4 ++-- + xen/include/asm-x86/p2m.h | 2 +- + 4 files changed, 22 insertions(+), 10 deletions(-) + +diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c +index 47a7487fa7a3..a8f5a19da917 100644 +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -541,18 +541,18 @@ void hap_final_teardown(struct domain *d) + } + + for ( i = 0; i < MAX_ALTP2M; i++ ) +- p2m_teardown(d->arch.altp2m_p2m[i]); ++ p2m_teardown(d->arch.altp2m_p2m[i], true); + } + + /* Destroy nestedp2m's first */ + for (i = 0; i < MAX_NESTEDP2M; i++) { +- p2m_teardown(d->arch.nested_p2m[i]); ++ p2m_teardown(d->arch.nested_p2m[i], true); + } + + if ( d->arch.paging.hap.total_pages != 0 ) + hap_teardown(d, NULL); + +- p2m_teardown(p2m_get_hostp2m(d)); ++ p2m_teardown(p2m_get_hostp2m(d), true); + /* Free any memory that the p2m teardown released */ + paging_lock(d); + hap_set_allocation(d, 0, NULL); +diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c +index def1695cf00b..aba4f17cbe12 100644 +--- a/xen/arch/x86/mm/p2m.c ++++ b/xen/arch/x86/mm/p2m.c +@@ -749,11 +749,11 @@ int p2m_alloc_table(struct p2m_domain *p2m) + * hvm fixme: when adding support for pvh non-hardware domains, this path must + * cleanup any foreign p2m types (release refcnts on them). + */ +-void p2m_teardown(struct p2m_domain *p2m) ++void p2m_teardown(struct p2m_domain *p2m, bool remove_root) + /* Return all the p2m pages to Xen. 
+ * We know we don't have any extra mappings to these pages */ + { +- struct page_info *pg; ++ struct page_info *pg, *root_pg = NULL; + struct domain *d; + + if (p2m == NULL) +@@ -763,10 +763,22 @@ void p2m_teardown(struct p2m_domain *p2m) + + p2m_lock(p2m); + ASSERT(atomic_read(&d->shr_pages) == 0); +- p2m->phys_table = pagetable_null(); ++ ++ if ( remove_root ) ++ p2m->phys_table = pagetable_null(); ++ else if ( !pagetable_is_null(p2m->phys_table) ) ++ { ++ root_pg = pagetable_get_page(p2m->phys_table); ++ clear_domain_page(pagetable_get_mfn(p2m->phys_table)); ++ } + + while ( (pg = page_list_remove_head(&p2m->pages)) ) +- d->arch.paging.free_page(d, pg); ++ if ( pg != root_pg ) ++ d->arch.paging.free_page(d, pg); ++ ++ if ( root_pg ) ++ page_list_add(root_pg, &p2m->pages); ++ + p2m_unlock(p2m); + } + +diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c +index 8c1b041f7135..8c5baba9544d 100644 +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -2701,7 +2701,7 @@ int shadow_enable(struct domain *d, u32 mode) + paging_unlock(d); + out_unlocked: + if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) ) +- p2m_teardown(p2m); ++ p2m_teardown(p2m, true); + if ( rv != 0 && pg != NULL ) + { + pg->count_info &= ~PGC_count_mask; +@@ -2866,7 +2866,7 @@ void shadow_final_teardown(struct domain *d) + shadow_teardown(d, NULL); + + /* It is now safe to pull down the p2m map. */ +- p2m_teardown(p2m_get_hostp2m(d)); ++ p2m_teardown(p2m_get_hostp2m(d), true); + /* Free any shadow memory that the p2m teardown released */ + paging_lock(d); + shadow_set_allocation(d, 0, NULL); +diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h +index f2af7a746ced..c3c16748e7d5 100644 +--- a/xen/include/asm-x86/p2m.h ++++ b/xen/include/asm-x86/p2m.h +@@ -574,7 +574,7 @@ int p2m_init(struct domain *d); + int p2m_alloc_table(struct p2m_domain *p2m); + + /* Return all the p2m resources to Xen. */ +-void p2m_teardown(struct p2m_domain *p2m); ++void p2m_teardown(struct p2m_domain *p2m, bool remove_root); + void p2m_final_teardown(struct domain *d); + + /* Add a page to a domain's p2m table */ +-- +2.37.3 + diff --git a/0005-x86-HAP-adjust-monitor-table-related-error-handling.patch b/0005-x86-HAP-adjust-monitor-table-related-error-handling.patch new file mode 100644 index 0000000..33ab1ad --- /dev/null +++ b/0005-x86-HAP-adjust-monitor-table-related-error-handling.patch @@ -0,0 +1,77 @@ +From 3422c19d85a3d23a9d798eafb739ffb8865522d2 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 11 Oct 2022 14:52:59 +0200 +Subject: [PATCH 05/26] x86/HAP: adjust monitor table related error handling +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +hap_make_monitor_table() will return INVALID_MFN if it encounters an +error condition, but hap_update_paging_modes() wasn’t handling this +value, resulting in an inappropriate value being stored in +monitor_table. This would subsequently misguide at least +hap_vcpu_teardown(). Avoid this by bailing early. + +Further, when a domain has/was already crashed or (perhaps less +important as there's no such path known to lead here) is already dying, +avoid calling domain_crash() on it again - that's at best confusing. + +This is part of CVE-2022-33746 / XSA-410. 
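The fix treats INVALID_MFN as a first-class return value at each caller, before it can be converted into a pagetable reference. Distilled from the hunk below (labels simplified):

    mfn_t mmfn = hap_make_monitor_table(v);

    if ( mfn_eq(mmfn, INVALID_MFN) )
        goto unlock;                 /* bail before storing a bogus table */

    v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn);
    make_cr3(v, mmfn);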
+ +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: 5b44a61180f4f2e4f490a28400c884dd357ff45d +master date: 2022-10-11 14:21:56 +0200 +--- + xen/arch/x86/mm/hap/hap.c | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c +index a8f5a19da917..d75dc2b9ed3d 100644 +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -39,6 +39,7 @@ + #include <asm/domain.h> + #include <xen/numa.h> + #include <asm/hvm/nestedhvm.h> ++#include <public/sched.h> + + #include "private.h" + +@@ -405,8 +406,13 @@ static mfn_t hap_make_monitor_table(struct vcpu *v) + return m4mfn; + + oom: +- printk(XENLOG_G_ERR "out of memory building monitor pagetable\n"); +- domain_crash(d); ++ if ( !d->is_dying && ++ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) ++ { ++ printk(XENLOG_G_ERR "%pd: out of memory building monitor pagetable\n", ++ d); ++ domain_crash(d); ++ } + return INVALID_MFN; + } + +@@ -766,6 +772,9 @@ static void hap_update_paging_modes(struct vcpu *v) + if ( pagetable_is_null(v->arch.hvm.monitor_table) ) + { + mfn_t mmfn = hap_make_monitor_table(v); ++ ++ if ( mfn_eq(mmfn, INVALID_MFN) ) ++ goto unlock; + v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn); + make_cr3(v, mmfn); + hvm_update_host_cr3(v); +@@ -774,6 +783,7 @@ static void hap_update_paging_modes(struct vcpu *v) + /* CR3 is effectively updated by a mode change. Flush ASIDs, etc. */ + hap_update_cr3(v, 0, false); + ++ unlock: + paging_unlock(d); + put_gfn(d, cr3_gfn); + } +-- +2.37.3 + diff --git a/0005-x86-mm-avoid-inadvertently-degrading-a-TLB-flush-to-.patch b/0005-x86-mm-avoid-inadvertently-degrading-a-TLB-flush-to-.patch deleted file mode 100644 index 45a1825..0000000 --- a/0005-x86-mm-avoid-inadvertently-degrading-a-TLB-flush-to-.patch +++ /dev/null @@ -1,116 +0,0 @@ -From 7c003ab4a398ff4ddd54d15d4158cffb463134cc Mon Sep 17 00:00:00 2001 -From: David Vrabel <dvrabel@amazon.co.uk> -Date: Tue, 7 Jun 2022 13:59:31 +0200 -Subject: [PATCH 05/51] x86/mm: avoid inadvertently degrading a TLB flush to - local only -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -If the direct map is incorrectly modified with interrupts disabled, -the required TLB flushes are degraded to flushing the local CPU only. - -This could lead to very hard to diagnose problems as different CPUs will -end up with different views of memory. Although, no such issues have yet -been identified. - -Change the check in the flush_area() macro to look at system_state -instead. This defers the switch from local to all later in the boot -(see xen/arch/x86/setup.c:__start_xen()). This is fine because -additional PCPUs are not brought up until after the system state is -SYS_STATE_smp_boot. - -Signed-off-by: David Vrabel <dvrabel@amazon.co.uk> -Reviewed-by: Jan Beulich <jbeulich@suse.com> - -x86/flushtlb: remove flush_area check on system state - -Booting with Shadow Stacks leads to the following assert on a debug -hypervisor: - -Assertion 'local_irq_is_enabled()' failed at arch/x86/smp.c:265 -----[ Xen-4.17.0-10.24-d x86_64 debug=y Not tainted ]---- -CPU: 0 -RIP: e008:[<ffff82d040345300>] flush_area_mask+0x40/0x13e -[...] 
-Xen call trace: - [<ffff82d040345300>] R flush_area_mask+0x40/0x13e - [<ffff82d040338a40>] F modify_xen_mappings+0xc5/0x958 - [<ffff82d0404474f9>] F arch/x86/alternative.c#_alternative_instructions+0xb7/0xb9 - [<ffff82d0404476cc>] F alternative_branches+0xf/0x12 - [<ffff82d04044e37d>] F __start_xen+0x1ef4/0x2776 - [<ffff82d040203344>] F __high_start+0x94/0xa0 - -This is due to SYS_STATE_smp_boot being set before calling -alternative_branches(), and the flush in modify_xen_mappings() then -using flush_area_all() with interrupts disabled. Note that -alternative_branches() is called before APs are started, so the flush -must be a local one (and indeed the cpumask passed to -flush_area_mask() just contains one CPU). - -Take the opportunity to simplify a bit the logic and make flush_area() -an alias of flush_area_all() in mm.c, taking into account that -cpu_online_map just contains the BSP before APs are started. This -requires widening the assert in flush_area_mask() to allow being -called with interrupts disabled as long as it's strictly a local only -flush. - -The overall result is that a conditional can be removed from -flush_area(). - -While there also introduce an ASSERT to check that a vCPU state flush -is not issued for the local CPU only. - -Fixes: 78e072bc37 ('x86/mm: avoid inadvertently degrading a TLB flush to local only') -Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com> -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 78e072bc375043e81691a59454e09f0b38241ddd -master date: 2022-04-20 10:55:01 +0200 -master commit: 9f735ee4903f1b9f1966bb4ba5b5616b03ae08b5 -master date: 2022-05-25 11:09:46 +0200 ---- - xen/arch/x86/mm.c | 10 ++-------- - xen/arch/x86/smp.c | 5 ++++- - 2 files changed, 6 insertions(+), 9 deletions(-) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 4d799032dc82..e222d9aa98ee 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -5051,14 +5051,8 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v) - #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f)) - #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f)) - --/* -- * map_pages_to_xen() can be called with interrupts disabled during -- * early bootstrap. In this case it is safe to use flush_area_local() -- * and avoid locking because only the local CPU is online. -- */ --#define flush_area(v,f) (!local_irq_is_enabled() ? \ -- flush_area_local((const void *)v, f) : \ -- flush_area_all((const void *)v, f)) -+/* flush_area_all() can be used prior to any other CPU being online. */ -+#define flush_area(v, f) flush_area_all((const void *)(v), f) - - #define L3T_INIT(page) (page) = ZERO_BLOCK_PTR - -diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c -index eef0f9c6cbf4..3556ec116608 100644 ---- a/xen/arch/x86/smp.c -+++ b/xen/arch/x86/smp.c -@@ -262,7 +262,10 @@ void flush_area_mask(const cpumask_t *mask, const void *va, unsigned int flags) - { - unsigned int cpu = smp_processor_id(); - -- ASSERT(local_irq_is_enabled()); -+ /* Local flushes can be performed with interrupts disabled. */ -+ ASSERT(local_irq_is_enabled() || cpumask_subset(mask, cpumask_of(cpu))); -+ /* Exclude use of FLUSH_VCPU_STATE for the local CPU. 
*/ -+ ASSERT(!cpumask_test_cpu(cpu, mask) || !(flags & FLUSH_VCPU_STATE)); - - if ( (flags & ~(FLUSH_VCPU_STATE | FLUSH_ORDER_MASK)) && - cpumask_test_cpu(cpu, mask) ) --- -2.35.1 - diff --git a/0006-x86-shadow-tolerate-failure-of-sh_set_toplevel_shado.patch b/0006-x86-shadow-tolerate-failure-of-sh_set_toplevel_shado.patch new file mode 100644 index 0000000..bbae48b --- /dev/null +++ b/0006-x86-shadow-tolerate-failure-of-sh_set_toplevel_shado.patch @@ -0,0 +1,76 @@ +From 40e9daf6b56ae49bda3ba4e254ccf0e998e52a8c Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 11 Oct 2022 14:53:12 +0200 +Subject: [PATCH 06/26] x86/shadow: tolerate failure of + sh_set_toplevel_shadow() + +Subsequently sh_set_toplevel_shadow() will be adjusted to install a +blank entry in case prealloc fails. There are, in fact, pre-existing +error paths which would put in place a blank entry. The 4- and 2-level +code in sh_update_cr3(), however, assume the top level entry to be +valid. + +Hence bail from the function in the unlikely event that it's not. Note +that 3-level logic works differently: In particular a guest is free to +supply a PDPTR pointing at 4 non-present (or otherwise deemed invalid) +entries. The guest will crash, but we already cope with that. + +Really mfn_valid() is likely wrong to use in sh_set_toplevel_shadow(), +and it should instead be !mfn_eq(gmfn, INVALID_MFN). Avoid such a change +in security context, but add a respective assertion. + +This is part of CVE-2022-33746 / XSA-410. + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Tim Deegan <tim@xen.org> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: eac000978c1feb5a9ee3236ab0c0da9a477e5336 +master date: 2022-10-11 14:22:24 +0200 +--- + xen/arch/x86/mm/shadow/common.c | 1 + + xen/arch/x86/mm/shadow/multi.c | 10 ++++++++++ + 2 files changed, 11 insertions(+) + +diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c +index 8c5baba9544d..00e520cbd05b 100644 +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -2516,6 +2516,7 @@ void sh_set_toplevel_shadow(struct vcpu *v, + /* Now figure out the new contents: is this a valid guest MFN? 
*/ + if ( !mfn_valid(gmfn) ) + { ++ ASSERT(mfn_eq(gmfn, INVALID_MFN)); + new_entry = pagetable_null(); + goto install_new_entry; + } +diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c +index 7b8f4dd13b03..2ff78fe3362c 100644 +--- a/xen/arch/x86/mm/shadow/multi.c ++++ b/xen/arch/x86/mm/shadow/multi.c +@@ -3312,6 +3312,11 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool noflush) + if ( sh_remove_write_access(d, gmfn, 4, 0) != 0 ) + guest_flush_tlb_mask(d, d->dirty_cpumask); + sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow, sh_make_shadow); ++ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) ) ++ { ++ ASSERT(d->is_dying || d->is_shutting_down); ++ return; ++ } + if ( !shadow_mode_external(d) && !is_pv_32bit_domain(d) ) + { + mfn_t smfn = pagetable_get_mfn(v->arch.paging.shadow.shadow_table[0]); +@@ -3370,6 +3375,11 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool noflush) + if ( sh_remove_write_access(d, gmfn, 2, 0) != 0 ) + guest_flush_tlb_mask(d, d->dirty_cpumask); + sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow, sh_make_shadow); ++ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) ) ++ { ++ ASSERT(d->is_dying || d->is_shutting_down); ++ return; ++ } + #else + #error This should never happen + #endif +-- +2.37.3 + diff --git a/0006-xen-build-Fix-dependency-for-the-MAP-rule.patch b/0006-xen-build-Fix-dependency-for-the-MAP-rule.patch deleted file mode 100644 index 7eb13cd..0000000 --- a/0006-xen-build-Fix-dependency-for-the-MAP-rule.patch +++ /dev/null @@ -1,29 +0,0 @@ -From 4bb8c34ba4241c2bf7845cd8b80c17530dbfb085 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 7 Jun 2022 14:00:09 +0200 -Subject: [PATCH 06/51] xen/build: Fix dependency for the MAP rule - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Jan Beulich <jbeulich@suse.com> -master commit: e1e72198213b80b7a82bdc90f96ed05ae4f53e20 -master date: 2022-04-20 19:10:59 +0100 ---- - xen/Makefile | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/Makefile b/xen/Makefile -index 90a29782dbf4..ce4eca3ee4d7 100644 ---- a/xen/Makefile -+++ b/xen/Makefile -@@ -507,7 +507,7 @@ cscope: - cscope -k -b -q - - .PHONY: _MAP --_MAP: -+_MAP: $(TARGET) - $(NM) -n $(TARGET)-syms | grep -v '\(compiled\)\|\(\.o$$\)\|\( [aUw] \)\|\(\.\.ng$$\)\|\(LASH[RL]DI\)' > System.map - - %.o %.i %.s: %.c FORCE --- -2.35.1 - diff --git a/0007-tools-libs-evtchn-don-t-set-errno-to-negative-values.patch b/0007-tools-libs-evtchn-don-t-set-errno-to-negative-values.patch deleted file mode 100644 index ed98922..0000000 --- a/0007-tools-libs-evtchn-don-t-set-errno-to-negative-values.patch +++ /dev/null @@ -1,74 +0,0 @@ -From 13a29f3756bc4cab96c59f46c3875b483553fb8f Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 7 Jun 2022 14:00:31 +0200 -Subject: [PATCH 07/51] tools/libs/evtchn: don't set errno to negative values - -Setting errno to a negative value makes no sense. 
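The errno convention matters because every consumer assumes positive E* constants; strerror() and perror() turn a negated value into noise. A small illustration of the difference (hypothetical user-space test, not part of the patch; output shown for glibc on Linux, where EOPNOTSUPP is 95):

    #include <errno.h>
    #include <stdio.h>

    int main(void)
    {
        errno = EOPNOTSUPP;   /* correct usage */
        perror("restrict");   /* restrict: Operation not supported */

        errno = -EOPNOTSUPP;  /* the bug being removed */
        perror("restrict");   /* restrict: Unknown error -95 */
        return 0;
    }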
- -Fixes: 6b6500b3cbaa ("tools/libs/evtchn: Add support for restricting a handle") -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: 60245b71c1cd001686fa7b7a26869cbcb80d074c -master date: 2022-04-22 20:39:34 +0100 ---- - tools/libs/evtchn/freebsd.c | 2 +- - tools/libs/evtchn/minios.c | 2 +- - tools/libs/evtchn/netbsd.c | 2 +- - tools/libs/evtchn/solaris.c | 2 +- - 4 files changed, 4 insertions(+), 4 deletions(-) - -diff --git a/tools/libs/evtchn/freebsd.c b/tools/libs/evtchn/freebsd.c -index 7427ab240860..fa17a0f8dbb5 100644 ---- a/tools/libs/evtchn/freebsd.c -+++ b/tools/libs/evtchn/freebsd.c -@@ -58,7 +58,7 @@ int osdep_evtchn_close(xenevtchn_handle *xce) - - int osdep_evtchn_restrict(xenevtchn_handle *xce, domid_t domid) - { -- errno = -EOPNOTSUPP; -+ errno = EOPNOTSUPP; - - return -1; - } -diff --git a/tools/libs/evtchn/minios.c b/tools/libs/evtchn/minios.c -index e5dfdc5ef52e..c0bd5429eea2 100644 ---- a/tools/libs/evtchn/minios.c -+++ b/tools/libs/evtchn/minios.c -@@ -97,7 +97,7 @@ int osdep_evtchn_close(xenevtchn_handle *xce) - - int osdep_evtchn_restrict(xenevtchn_handle *xce, domid_t domid) - { -- errno = -EOPNOTSUPP; -+ errno = EOPNOTSUPP; - - return -1; - } -diff --git a/tools/libs/evtchn/netbsd.c b/tools/libs/evtchn/netbsd.c -index 1cebc21ffce0..56409513bc23 100644 ---- a/tools/libs/evtchn/netbsd.c -+++ b/tools/libs/evtchn/netbsd.c -@@ -53,7 +53,7 @@ int osdep_evtchn_close(xenevtchn_handle *xce) - - int osdep_evtchn_restrict(xenevtchn_handle *xce, domid_t domid) - { -- errno = -EOPNOTSUPP; -+ errno = EOPNOTSUPP; - - return -1; - } -diff --git a/tools/libs/evtchn/solaris.c b/tools/libs/evtchn/solaris.c -index df9579df1778..beaa7721425f 100644 ---- a/tools/libs/evtchn/solaris.c -+++ b/tools/libs/evtchn/solaris.c -@@ -53,7 +53,7 @@ int osdep_evtchn_close(xenevtchn_handle *xce) - - int osdep_evtchn_restrict(xenevtchn_handle *xce, domid_t domid) - { -- errno = -EOPNOTSUPP; -+ errno = EOPNOTSUPP; - return -1; - } - --- -2.35.1 - diff --git a/0007-x86-shadow-tolerate-failure-in-shadow_prealloc.patch b/0007-x86-shadow-tolerate-failure-in-shadow_prealloc.patch new file mode 100644 index 0000000..5e2f8ab --- /dev/null +++ b/0007-x86-shadow-tolerate-failure-in-shadow_prealloc.patch @@ -0,0 +1,279 @@ +From 28d3f677ec97c98154311f64871ac48762cf980a Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Date: Tue, 11 Oct 2022 14:53:27 +0200 +Subject: [PATCH 07/26] x86/shadow: tolerate failure in shadow_prealloc() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Prevent _shadow_prealloc() from calling BUG() when unable to fulfill +the pre-allocation and instead return true/false. Modify +shadow_prealloc() to crash the domain on allocation failure (if the +domain is not already dying), as shadow cannot operate normally after +that. Modify callers to also gracefully handle {_,}shadow_prealloc() +failing to fulfill the request. + +Note this in turn requires adjusting the callers of +sh_make_monitor_table() also to handle it returning INVALID_MFN. +sh_update_paging_modes() is also modified to add additional error +paths in case of allocation failure; some of those will return with +null monitor page tables (and the domain likely crashed). This is no +different from current error paths, but the newly introduced ones are +more likely to trigger. 
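The conversion uses a common hardening pattern: an infallible void helper becomes a __must_check bool, and each caller unwinds its own locks and references on failure. In outline (condensed from the hunks below; the domain_crash() on failure lives in the shadow_prealloc() wrapper itself):

    bool __must_check shadow_prealloc(struct domain *d, unsigned int type,
                                      unsigned int count);

    /* e.g. in the page-fault handler: */
    if ( !shadow_prealloc(d, SH_type_l1_shadow, count) )
    {
        paging_unlock(d);
        put_gfn(d, gfn_x(gfn));
        return 0;
    }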
+ +The now added failure points in sh_update_paging_modes() also require +that on some error return paths the previous structures are cleared, +and thus monitor table is null. + +While there adjust the 'type' parameter type of shadow_prealloc() to +unsigned int rather than u32. + +This is part of CVE-2022-33746 / XSA-410. + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Tim Deegan <tim@xen.org> +master commit: b7f93c6afb12b6061e2d19de2f39ea09b569ac68 +master date: 2022-10-11 14:22:53 +0200 +--- + xen/arch/x86/mm/shadow/common.c | 69 ++++++++++++++++++++++++-------- + xen/arch/x86/mm/shadow/hvm.c | 4 +- + xen/arch/x86/mm/shadow/multi.c | 11 +++-- + xen/arch/x86/mm/shadow/private.h | 3 +- + 4 files changed, 66 insertions(+), 21 deletions(-) + +diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c +index 00e520cbd05b..2067c7d16bb4 100644 +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -36,6 +36,7 @@ + #include <asm/flushtlb.h> + #include <asm/shadow.h> + #include <xen/numa.h> ++#include <public/sched.h> + #include "private.h" + + DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags); +@@ -928,14 +929,15 @@ static inline void trace_shadow_prealloc_unpin(struct domain *d, mfn_t smfn) + + /* Make sure there are at least count order-sized pages + * available in the shadow page pool. */ +-static void _shadow_prealloc(struct domain *d, unsigned int pages) ++static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) + { + struct vcpu *v; + struct page_info *sp, *t; + mfn_t smfn; + int i; + +- if ( d->arch.paging.shadow.free_pages >= pages ) return; ++ if ( d->arch.paging.shadow.free_pages >= pages ) ++ return true; + + /* Shouldn't have enabled shadows if we've no vcpus. */ + ASSERT(d->vcpu && d->vcpu[0]); +@@ -951,7 +953,8 @@ static void _shadow_prealloc(struct domain *d, unsigned int pages) + sh_unpin(d, smfn); + + /* See if that freed up enough space */ +- if ( d->arch.paging.shadow.free_pages >= pages ) return; ++ if ( d->arch.paging.shadow.free_pages >= pages ) ++ return true; + } + + /* Stage two: all shadow pages are in use in hierarchies that are +@@ -974,7 +977,7 @@ static void _shadow_prealloc(struct domain *d, unsigned int pages) + if ( d->arch.paging.shadow.free_pages >= pages ) + { + guest_flush_tlb_mask(d, d->dirty_cpumask); +- return; ++ return true; + } + } + } +@@ -987,7 +990,12 @@ static void _shadow_prealloc(struct domain *d, unsigned int pages) + d->arch.paging.shadow.total_pages, + d->arch.paging.shadow.free_pages, + d->arch.paging.shadow.p2m_pages); +- BUG(); ++ ++ ASSERT(d->is_dying); ++ ++ guest_flush_tlb_mask(d, d->dirty_cpumask); ++ ++ return false; + } + + /* Make sure there are at least count pages of the order according to +@@ -995,9 +1003,19 @@ static void _shadow_prealloc(struct domain *d, unsigned int pages) + * This must be called before any calls to shadow_alloc(). Since this + * will free existing shadows to make room, it must be called early enough + * to avoid freeing shadows that the caller is currently working on. 
*/ +-void shadow_prealloc(struct domain *d, u32 type, unsigned int count) ++bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count) + { +- return _shadow_prealloc(d, shadow_size(type) * count); ++ bool ret = _shadow_prealloc(d, shadow_size(type) * count); ++ ++ if ( !ret && !d->is_dying && ++ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) ++ /* ++ * Failing to allocate memory required for shadow usage can only result in ++ * a domain crash, do it here rather that relying on every caller to do it. ++ */ ++ domain_crash(d); ++ ++ return ret; + } + + /* Deliberately free all the memory we can: this will tear down all of +@@ -1218,7 +1236,7 @@ void shadow_free(struct domain *d, mfn_t smfn) + static struct page_info * + shadow_alloc_p2m_page(struct domain *d) + { +- struct page_info *pg; ++ struct page_info *pg = NULL; + + /* This is called both from the p2m code (which never holds the + * paging lock) and the log-dirty code (which always does). */ +@@ -1236,16 +1254,18 @@ shadow_alloc_p2m_page(struct domain *d) + d->arch.paging.shadow.p2m_pages, + shadow_min_acceptable_pages(d)); + } +- paging_unlock(d); +- return NULL; ++ goto out; + } + +- shadow_prealloc(d, SH_type_p2m_table, 1); ++ if ( !shadow_prealloc(d, SH_type_p2m_table, 1) ) ++ goto out; ++ + pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0)); + d->arch.paging.shadow.p2m_pages++; + d->arch.paging.shadow.total_pages--; + ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask)); + ++ out: + paging_unlock(d); + + return pg; +@@ -1336,7 +1356,9 @@ int shadow_set_allocation(struct domain *d, unsigned int pages, bool *preempted) + else if ( d->arch.paging.shadow.total_pages > pages ) + { + /* Need to return memory to domheap */ +- _shadow_prealloc(d, 1); ++ if ( !_shadow_prealloc(d, 1) ) ++ return -ENOMEM; ++ + sp = page_list_remove_head(&d->arch.paging.shadow.freelist); + ASSERT(sp); + /* +@@ -2334,12 +2356,13 @@ static void sh_update_paging_modes(struct vcpu *v) + if ( mfn_eq(v->arch.paging.shadow.oos_snapshot[0], INVALID_MFN) ) + { + int i; ++ ++ if ( !shadow_prealloc(d, SH_type_oos_snapshot, SHADOW_OOS_PAGES) ) ++ return; ++ + for(i = 0; i < SHADOW_OOS_PAGES; i++) +- { +- shadow_prealloc(d, SH_type_oos_snapshot, 1); + v->arch.paging.shadow.oos_snapshot[i] = + shadow_alloc(d, SH_type_oos_snapshot, 0); +- } + } + #endif /* OOS */ + +@@ -2403,6 +2426,9 @@ static void sh_update_paging_modes(struct vcpu *v) + mfn_t mmfn = sh_make_monitor_table( + v, v->arch.paging.mode->shadow.shadow_levels); + ++ if ( mfn_eq(mmfn, INVALID_MFN) ) ++ return; ++ + v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn); + make_cr3(v, mmfn); + hvm_update_host_cr3(v); +@@ -2441,6 +2467,12 @@ static void sh_update_paging_modes(struct vcpu *v) + v->arch.hvm.monitor_table = pagetable_null(); + new_mfn = sh_make_monitor_table( + v, v->arch.paging.mode->shadow.shadow_levels); ++ if ( mfn_eq(new_mfn, INVALID_MFN) ) ++ { ++ sh_destroy_monitor_table(v, old_mfn, ++ old_mode->shadow.shadow_levels); ++ return; ++ } + v->arch.hvm.monitor_table = pagetable_from_mfn(new_mfn); + SHADOW_PRINTK("new monitor table %"PRI_mfn "\n", + mfn_x(new_mfn)); +@@ -2526,7 +2558,12 @@ void sh_set_toplevel_shadow(struct vcpu *v, + if ( !mfn_valid(smfn) ) + { + /* Make sure there's enough free shadow memory. */ +- shadow_prealloc(d, root_type, 1); ++ if ( !shadow_prealloc(d, root_type, 1) ) ++ { ++ new_entry = pagetable_null(); ++ goto install_new_entry; ++ } ++ + /* Shadow the page. 
*/ + smfn = make_shadow(v, gmfn, root_type); + } +diff --git a/xen/arch/x86/mm/shadow/hvm.c b/xen/arch/x86/mm/shadow/hvm.c +index d5f42102a0bd..a0878d9ad71a 100644 +--- a/xen/arch/x86/mm/shadow/hvm.c ++++ b/xen/arch/x86/mm/shadow/hvm.c +@@ -700,7 +700,9 @@ mfn_t sh_make_monitor_table(const struct vcpu *v, unsigned int shadow_levels) + ASSERT(!pagetable_get_pfn(v->arch.hvm.monitor_table)); + + /* Guarantee we can get the memory we need */ +- shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS); ++ if ( !shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS) ) ++ return INVALID_MFN; ++ + m4mfn = shadow_alloc(d, SH_type_monitor_table, 0); + mfn_to_page(m4mfn)->shadow_flags = 4; + +diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c +index 2ff78fe3362c..c07af0bd99da 100644 +--- a/xen/arch/x86/mm/shadow/multi.c ++++ b/xen/arch/x86/mm/shadow/multi.c +@@ -2440,9 +2440,14 @@ static int sh_page_fault(struct vcpu *v, + * Preallocate shadow pages *before* removing writable accesses + * otherwhise an OOS L1 might be demoted and promoted again with + * writable mappings. */ +- shadow_prealloc(d, +- SH_type_l1_shadow, +- GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1); ++ if ( !shadow_prealloc(d, SH_type_l1_shadow, ++ GUEST_PAGING_LEVELS < 4 ++ ? 1 : GUEST_PAGING_LEVELS - 1) ) ++ { ++ paging_unlock(d); ++ put_gfn(d, gfn_x(gfn)); ++ return 0; ++ } + + rc = gw_remove_write_accesses(v, va, &gw); + +diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h +index 35efb1b984fb..738214f75e8d 100644 +--- a/xen/arch/x86/mm/shadow/private.h ++++ b/xen/arch/x86/mm/shadow/private.h +@@ -383,7 +383,8 @@ void shadow_promote(struct domain *d, mfn_t gmfn, u32 type); + void shadow_demote(struct domain *d, mfn_t gmfn, u32 type); + + /* Shadow page allocation functions */ +-void shadow_prealloc(struct domain *d, u32 shadow_type, unsigned int count); ++bool __must_check shadow_prealloc(struct domain *d, unsigned int shadow_type, ++ unsigned int count); + mfn_t shadow_alloc(struct domain *d, + u32 shadow_type, + unsigned long backpointer); +-- +2.37.3 + diff --git a/0008-tools-libs-ctrl-don-t-set-errno-to-a-negative-value.patch b/0008-tools-libs-ctrl-don-t-set-errno-to-a-negative-value.patch deleted file mode 100644 index 166f0ff..0000000 --- a/0008-tools-libs-ctrl-don-t-set-errno-to-a-negative-value.patch +++ /dev/null @@ -1,36 +0,0 @@ -From ba62afdbc31a8cfe897191efd25ed4449d9acd94 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 7 Jun 2022 14:01:03 +0200 -Subject: [PATCH 08/51] tools/libs/ctrl: don't set errno to a negative value - -The claimed reason for setting errno to -1 is wrong. On x86 -xc_domain_pod_target() will set errno to a sane value in the error -case. 
- -Fixes: ff1745d5882b ("tools: libxl: do not set the PoD target on ARM") -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: a0fb7e0e73483ed042d5ca34861a891a51ad337b -master date: 2022-04-22 20:39:34 +0100 ---- - tools/libs/ctrl/xc_domain.c | 4 +--- - 1 file changed, 1 insertion(+), 3 deletions(-) - -diff --git a/tools/libs/ctrl/xc_domain.c b/tools/libs/ctrl/xc_domain.c -index b155d6afd2ef..9d675c8f21e1 100644 ---- a/tools/libs/ctrl/xc_domain.c -+++ b/tools/libs/ctrl/xc_domain.c -@@ -1297,9 +1297,7 @@ int xc_domain_get_pod_target(xc_interface *xch, - uint64_t *pod_cache_pages, - uint64_t *pod_entries) - { -- /* On x86 (above) xc_domain_pod_target will incorrectly return -1 -- * with errno==-1 on error. Do the same for least surprise. */ -- errno = -1; -+ errno = EOPNOTSUPP; - return -1; - } - #endif --- -2.35.1 - diff --git a/0008-x86-p2m-refuse-new-allocations-for-dying-domains.patch b/0008-x86-p2m-refuse-new-allocations-for-dying-domains.patch new file mode 100644 index 0000000..70b5cc9 --- /dev/null +++ b/0008-x86-p2m-refuse-new-allocations-for-dying-domains.patch @@ -0,0 +1,100 @@ +From 745e0b300dc3f5000e6d48c273b405d4bcc29ba7 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Date: Tue, 11 Oct 2022 14:53:41 +0200 +Subject: [PATCH 08/26] x86/p2m: refuse new allocations for dying domains +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This will in particular prevent any attempts to add entries to the p2m, +once - in a subsequent change - non-root entries have been removed. + +This is part of CVE-2022-33746 / XSA-410. + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Tim Deegan <tim@xen.org> +master commit: ff600a8cf8e36f8ecbffecf96a035952e022ab87 +master date: 2022-10-11 14:23:22 +0200 +--- + xen/arch/x86/mm/hap/hap.c | 5 ++++- + xen/arch/x86/mm/shadow/common.c | 18 ++++++++++++++---- + 2 files changed, 18 insertions(+), 5 deletions(-) + +diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c +index d75dc2b9ed3d..787991233e53 100644 +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -245,6 +245,9 @@ static struct page_info *hap_alloc(struct domain *d) + + ASSERT(paging_locked_by_me(d)); + ++ if ( unlikely(d->is_dying) ) ++ return NULL; ++ + pg = page_list_remove_head(&d->arch.paging.hap.freelist); + if ( unlikely(!pg) ) + return NULL; +@@ -281,7 +284,7 @@ static struct page_info *hap_alloc_p2m_page(struct domain *d) + d->arch.paging.hap.p2m_pages++; + ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask)); + } +- else if ( !d->arch.paging.p2m_alloc_failed ) ++ else if ( !d->arch.paging.p2m_alloc_failed && !d->is_dying ) + { + d->arch.paging.p2m_alloc_failed = 1; + dprintk(XENLOG_ERR, "d%i failed to allocate from HAP pool\n", +diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c +index 2067c7d16bb4..9807f6ec6c00 100644 +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -939,6 +939,10 @@ static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) + if ( d->arch.paging.shadow.free_pages >= pages ) + return true; + ++ if ( unlikely(d->is_dying) ) ++ /* No reclaim when the domain is dying, teardown will take care of it. */ ++ return false; ++ + /* Shouldn't have enabled shadows if we've no vcpus. 
*/ + ASSERT(d->vcpu && d->vcpu[0]); + +@@ -991,7 +995,7 @@ static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) + d->arch.paging.shadow.free_pages, + d->arch.paging.shadow.p2m_pages); + +- ASSERT(d->is_dying); ++ ASSERT_UNREACHABLE(); + + guest_flush_tlb_mask(d, d->dirty_cpumask); + +@@ -1005,10 +1009,13 @@ static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) + * to avoid freeing shadows that the caller is currently working on. */ + bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count) + { +- bool ret = _shadow_prealloc(d, shadow_size(type) * count); ++ bool ret; + +- if ( !ret && !d->is_dying && +- (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) ++ if ( unlikely(d->is_dying) ) ++ return false; ++ ++ ret = _shadow_prealloc(d, shadow_size(type) * count); ++ if ( !ret && (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) + /* + * Failing to allocate memory required for shadow usage can only result in + * a domain crash, do it here rather that relying on every caller to do it. +@@ -1238,6 +1245,9 @@ shadow_alloc_p2m_page(struct domain *d) + { + struct page_info *pg = NULL; + ++ if ( unlikely(d->is_dying) ) ++ return NULL; ++ + /* This is called both from the p2m code (which never holds the + * paging lock) and the log-dirty code (which always does). */ + paging_lock_recursive(d); +-- +2.37.3 + diff --git a/0009-tools-libs-guest-don-t-set-errno-to-a-negative-value.patch b/0009-tools-libs-guest-don-t-set-errno-to-a-negative-value.patch deleted file mode 100644 index 5d035f6..0000000 --- a/0009-tools-libs-guest-don-t-set-errno-to-a-negative-value.patch +++ /dev/null @@ -1,32 +0,0 @@ -From a2cf30eec08db5df974a9e8bb7366fee8fc7fcd9 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 7 Jun 2022 14:01:27 +0200 -Subject: [PATCH 09/51] tools/libs/guest: don't set errno to a negative value - -Setting errno to a negative error value makes no sense. 
- -Fixes: cb99a64029c9 ("libxc: arm: allow passing a device tree blob to the guest") -Signed-off-by: Juergen Gross <jgross@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: 438e96ab479495a932391a22e219ee62fa8c4f47 -master date: 2022-04-22 20:39:34 +0100 ---- - tools/libs/guest/xg_dom_core.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tools/libs/guest/xg_dom_core.c b/tools/libs/guest/xg_dom_core.c -index 2e4c1330ea6b..65975a75da37 100644 ---- a/tools/libs/guest/xg_dom_core.c -+++ b/tools/libs/guest/xg_dom_core.c -@@ -856,7 +856,7 @@ int xc_dom_devicetree_file(struct xc_dom_image *dom, const char *filename) - return -1; - return 0; - #else -- errno = -EINVAL; -+ errno = EINVAL; - return -1; - #endif - } --- -2.35.1 - diff --git a/0009-x86-p2m-truly-free-paging-pool-memory-for-dying-doma.patch b/0009-x86-p2m-truly-free-paging-pool-memory-for-dying-doma.patch new file mode 100644 index 0000000..07e63ac --- /dev/null +++ b/0009-x86-p2m-truly-free-paging-pool-memory-for-dying-doma.patch @@ -0,0 +1,115 @@ +From 943635d8f8486209e4e48966507ad57963e96284 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> +Date: Tue, 11 Oct 2022 14:54:00 +0200 +Subject: [PATCH 09/26] x86/p2m: truly free paging pool memory for dying + domains +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Modify {hap,shadow}_free to free the page immediately if the domain is +dying, so that pages don't accumulate in the pool when +{shadow,hap}_final_teardown() get called. This is to limit the amount of +work which needs to be done there (in a non-preemptable manner). + +Note the call to shadow_free() in shadow_free_p2m_page() is moved after +increasing total_pages, so that the decrease done in shadow_free() in +case the domain is dying doesn't underflow the counter, even if just for +a short interval. + +This is part of CVE-2022-33746 / XSA-410. + +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Tim Deegan <tim@xen.org> +master commit: f50a2c0e1d057c00d6061f40ae24d068226052ad +master date: 2022-10-11 14:23:51 +0200 +--- + xen/arch/x86/mm/hap/hap.c | 12 ++++++++++++ + xen/arch/x86/mm/shadow/common.c | 28 +++++++++++++++++++++++++--- + 2 files changed, 37 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c +index 787991233e53..aef2297450e1 100644 +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -265,6 +265,18 @@ static void hap_free(struct domain *d, mfn_t mfn) + + ASSERT(paging_locked_by_me(d)); + ++ /* ++ * For dying domains, actually free the memory here. This way less work is ++ * left to hap_final_teardown(), which cannot easily have preemption checks ++ * added. 
++ */ ++ if ( unlikely(d->is_dying) ) ++ { ++ free_domheap_page(pg); ++ d->arch.paging.hap.total_pages--; ++ return; ++ } ++ + d->arch.paging.hap.free_pages++; + page_list_add_tail(pg, &d->arch.paging.hap.freelist); + } +diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c +index 9807f6ec6c00..9eb33eafc7f7 100644 +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -1187,6 +1187,7 @@ mfn_t shadow_alloc(struct domain *d, + void shadow_free(struct domain *d, mfn_t smfn) + { + struct page_info *next = NULL, *sp = mfn_to_page(smfn); ++ bool dying = ACCESS_ONCE(d->is_dying); + struct page_list_head *pin_list; + unsigned int pages; + u32 shadow_type; +@@ -1229,11 +1230,32 @@ void shadow_free(struct domain *d, mfn_t smfn) + * just before the allocator hands the page out again. */ + page_set_tlbflush_timestamp(sp); + perfc_decr(shadow_alloc_count); +- page_list_add_tail(sp, &d->arch.paging.shadow.freelist); ++ ++ /* ++ * For dying domains, actually free the memory here. This way less ++ * work is left to shadow_final_teardown(), which cannot easily have ++ * preemption checks added. ++ */ ++ if ( unlikely(dying) ) ++ { ++ /* ++ * The backpointer field (sh.back) used by shadow code aliases the ++ * domain owner field, unconditionally clear it here to avoid ++ * free_domheap_page() attempting to parse it. ++ */ ++ page_set_owner(sp, NULL); ++ free_domheap_page(sp); ++ } ++ else ++ page_list_add_tail(sp, &d->arch.paging.shadow.freelist); ++ + sp = next; + } + +- d->arch.paging.shadow.free_pages += pages; ++ if ( unlikely(dying) ) ++ d->arch.paging.shadow.total_pages -= pages; ++ else ++ d->arch.paging.shadow.free_pages += pages; + } + + /* Divert a page from the pool to be used by the p2m mapping. +@@ -1303,9 +1325,9 @@ shadow_free_p2m_page(struct domain *d, struct page_info *pg) + * paging lock) and the log-dirty code (which always does). */ + paging_lock_recursive(d); + +- shadow_free(d, page_to_mfn(pg)); + d->arch.paging.shadow.p2m_pages--; + d->arch.paging.shadow.total_pages++; ++ shadow_free(d, page_to_mfn(pg)); + + paging_unlock(d); + } +-- +2.37.3 + diff --git a/0010-tools-libs-light-don-t-set-errno-to-a-negative-value.patch b/0010-tools-libs-light-don-t-set-errno-to-a-negative-value.patch deleted file mode 100644 index ac900ae..0000000 --- a/0010-tools-libs-light-don-t-set-errno-to-a-negative-value.patch +++ /dev/null @@ -1,32 +0,0 @@ -From 15391de8e2bb6153eadd483154c53044ab53d98d Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 7 Jun 2022 14:01:44 +0200 -Subject: [PATCH 10/51] tools/libs/light: don't set errno to a negative value - -Setting errno to a negative value makes no sense. 
-
-Fixes: e78e8b9bb649 ("libxl: Add interface for querying hypervisor about PCI topology")
-Signed-off-by: Juergen Gross <jgross@suse.com>
-Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
-master commit: 2419a159fb943c24a6f2439604b9fdb1478fcd08
-master date: 2022-04-22 20:39:34 +0100
----
- tools/libs/light/libxl_linux.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/tools/libs/light/libxl_linux.c b/tools/libs/light/libxl_linux.c
-index 8d62dfd255cb..27f2bce71837 100644
---- a/tools/libs/light/libxl_linux.c
-+++ b/tools/libs/light/libxl_linux.c
-@@ -288,7 +288,7 @@ int libxl__pci_topology_init(libxl__gc *gc,
-         if (i == num_devs) {
-             LOG(ERROR, "Too many devices");
-             err = ERROR_FAIL;
--            errno = -ENOSPC;
-+            errno = ENOSPC;
-             goto out;
-         }
- 
---
-2.35.1
-
diff --git a/0010-x86-p2m-free-the-paging-memory-pool-preemptively.patch b/0010-x86-p2m-free-the-paging-memory-pool-preemptively.patch
new file mode 100644
index 0000000..59c6940
--- /dev/null
+++ b/0010-x86-p2m-free-the-paging-memory-pool-preemptively.patch
@@ -0,0 +1,181 @@
+From f5959ed715e19cf2844656477dbf74c2f576c9d4 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 11 Oct 2022 14:54:21 +0200
+Subject: [PATCH 10/26] x86/p2m: free the paging memory pool preemptively
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The paging memory pool is currently freed in two different places:
+from {shadow,hap}_teardown() via domain_relinquish_resources() and
+from {shadow,hap}_final_teardown() via complete_domain_destroy().
+While the former does handle preemption, the latter doesn't.
+
+Attempt to move as much p2m-related freeing as possible to happen
+before the call to {shadow,hap}_teardown(), so that most memory can be
+freed in a preemptive way. In order to avoid causing issues to
+existing callers, leave the root p2m page tables set and free them in
+{hap,shadow}_final_teardown(). Also modify {hap,shadow}_free to free
+the page immediately if the domain is dying, so that pages don't
+accumulate in the pool when {shadow,hap}_final_teardown() get called.
+
+Move altp2m_vcpu_disable_ve() to be done in hap_teardown(), as that's
+the place where altp2m_active gets disabled now.
+
+This is part of CVE-2022-33746 / XSA-410.
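+
+As a rough editor's sketch (not part of the change; the helper name and
+call site are made up for illustration), the preemptible path is driven
+by returning -ERESTART so that the operation is retried via a hypercall
+continuation:
+
+    /* Illustration only: hypothetical caller of hap_teardown(). */
+    static int relinquish_paging(struct domain *d)
+    {
+        bool preempted = false;
+
+        hap_teardown(d, &preempted);
+
+        return preempted ? -ERESTART : 0; /* retried later if preempted */
+    }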
+ +Reported-by: Julien Grall <jgrall@amazon.com> +Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Tim Deegan <tim@xen.org> +master commit: e7aa55c0aab36d994bf627c92bd5386ae167e16e +master date: 2022-10-11 14:24:21 +0200 +--- + xen/arch/x86/domain.c | 7 ------ + xen/arch/x86/mm/hap/hap.c | 42 ++++++++++++++++++++------------- + xen/arch/x86/mm/shadow/common.c | 12 ++++++++++ + 3 files changed, 38 insertions(+), 23 deletions(-) + +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index 0d39981550ca..a4356893bdbc 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -38,7 +38,6 @@ + #include <xen/livepatch.h> + #include <public/sysctl.h> + #include <public/hvm/hvm_vcpu.h> +-#include <asm/altp2m.h> + #include <asm/regs.h> + #include <asm/mc146818rtc.h> + #include <asm/system.h> +@@ -2381,12 +2380,6 @@ int domain_relinquish_resources(struct domain *d) + vpmu_destroy(v); + } + +- if ( altp2m_active(d) ) +- { +- for_each_vcpu ( d, v ) +- altp2m_vcpu_disable_ve(v); +- } +- + if ( is_pv_domain(d) ) + { + for_each_vcpu ( d, v ) +diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c +index aef2297450e1..a44fcfd95e1e 100644 +--- a/xen/arch/x86/mm/hap/hap.c ++++ b/xen/arch/x86/mm/hap/hap.c +@@ -28,6 +28,7 @@ + #include <xen/domain_page.h> + #include <xen/guest_access.h> + #include <xen/keyhandler.h> ++#include <asm/altp2m.h> + #include <asm/event.h> + #include <asm/page.h> + #include <asm/current.h> +@@ -546,24 +547,8 @@ void hap_final_teardown(struct domain *d) + unsigned int i; + + if ( hvm_altp2m_supported() ) +- { +- d->arch.altp2m_active = 0; +- +- if ( d->arch.altp2m_eptp ) +- { +- free_xenheap_page(d->arch.altp2m_eptp); +- d->arch.altp2m_eptp = NULL; +- } +- +- if ( d->arch.altp2m_visible_eptp ) +- { +- free_xenheap_page(d->arch.altp2m_visible_eptp); +- d->arch.altp2m_visible_eptp = NULL; +- } +- + for ( i = 0; i < MAX_ALTP2M; i++ ) + p2m_teardown(d->arch.altp2m_p2m[i], true); +- } + + /* Destroy nestedp2m's first */ + for (i = 0; i < MAX_NESTEDP2M; i++) { +@@ -578,6 +563,8 @@ void hap_final_teardown(struct domain *d) + paging_lock(d); + hap_set_allocation(d, 0, NULL); + ASSERT(d->arch.paging.hap.p2m_pages == 0); ++ ASSERT(d->arch.paging.hap.free_pages == 0); ++ ASSERT(d->arch.paging.hap.total_pages == 0); + paging_unlock(d); + } + +@@ -603,6 +590,7 @@ void hap_vcpu_teardown(struct vcpu *v) + void hap_teardown(struct domain *d, bool *preempted) + { + struct vcpu *v; ++ unsigned int i; + + ASSERT(d->is_dying); + ASSERT(d != current->domain); +@@ -611,6 +599,28 @@ void hap_teardown(struct domain *d, bool *preempted) + for_each_vcpu ( d, v ) + hap_vcpu_teardown(v); + ++ /* Leave the root pt in case we get further attempts to modify the p2m. */ ++ if ( hvm_altp2m_supported() ) ++ { ++ if ( altp2m_active(d) ) ++ for_each_vcpu ( d, v ) ++ altp2m_vcpu_disable_ve(v); ++ ++ d->arch.altp2m_active = 0; ++ ++ FREE_XENHEAP_PAGE(d->arch.altp2m_eptp); ++ FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp); ++ ++ for ( i = 0; i < MAX_ALTP2M; i++ ) ++ p2m_teardown(d->arch.altp2m_p2m[i], false); ++ } ++ ++ /* Destroy nestedp2m's after altp2m. 
*/ ++ for ( i = 0; i < MAX_NESTEDP2M; i++ ) ++ p2m_teardown(d->arch.nested_p2m[i], false); ++ ++ p2m_teardown(p2m_get_hostp2m(d), false); ++ + paging_lock(d); /* Keep various asserts happy */ + + if ( d->arch.paging.hap.total_pages != 0 ) +diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c +index 9eb33eafc7f7..ac9a1ae07808 100644 +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -2824,8 +2824,17 @@ void shadow_teardown(struct domain *d, bool *preempted) + for_each_vcpu ( d, v ) + shadow_vcpu_teardown(v); + ++ p2m_teardown(p2m_get_hostp2m(d), false); ++ + paging_lock(d); + ++ /* ++ * Reclaim all shadow memory so that shadow_set_allocation() doesn't find ++ * in-use pages, as _shadow_prealloc() will no longer try to reclaim pages ++ * because the domain is dying. ++ */ ++ shadow_blow_tables(d); ++ + #if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) + /* Free the virtual-TLB array attached to each vcpu */ + for_each_vcpu(d, v) +@@ -2946,6 +2955,9 @@ void shadow_final_teardown(struct domain *d) + d->arch.paging.shadow.total_pages, + d->arch.paging.shadow.free_pages, + d->arch.paging.shadow.p2m_pages); ++ ASSERT(!d->arch.paging.shadow.total_pages); ++ ASSERT(!d->arch.paging.shadow.free_pages); ++ ASSERT(!d->arch.paging.shadow.p2m_pages); + paging_unlock(d); + } + +-- +2.37.3 + diff --git a/0011-xen-iommu-cleanup-iommu-related-domctl-handling.patch b/0011-xen-iommu-cleanup-iommu-related-domctl-handling.patch deleted file mode 100644 index 3c60de4..0000000 --- a/0011-xen-iommu-cleanup-iommu-related-domctl-handling.patch +++ /dev/null @@ -1,112 +0,0 @@ -From a6c32abd144ec6443c6a433b5a2ac00e2615aa86 Mon Sep 17 00:00:00 2001 -From: Juergen Gross <jgross@suse.com> -Date: Tue, 7 Jun 2022 14:02:08 +0200 -Subject: [PATCH 11/51] xen/iommu: cleanup iommu related domctl handling - -Today iommu_do_domctl() is being called from arch_do_domctl() in the -"default:" case of a switch statement. This has led already to crashes -due to unvalidated parameters. - -Fix that by moving the call of iommu_do_domctl() to the main switch -statement of do_domctl(). 
- -Signed-off-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> # Arm -master commit: 9cd7e31b3f584e97a138a770cfb031a91a867936 -master date: 2022-04-26 10:23:58 +0200 ---- - xen/arch/arm/domctl.c | 11 +---------- - xen/arch/x86/domctl.c | 2 +- - xen/common/domctl.c | 7 +++++++ - xen/include/xen/iommu.h | 12 +++++++++--- - 4 files changed, 18 insertions(+), 14 deletions(-) - -diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c -index 6245af6d0bab..1baf25c3d98b 100644 ---- a/xen/arch/arm/domctl.c -+++ b/xen/arch/arm/domctl.c -@@ -176,16 +176,7 @@ long arch_do_domctl(struct xen_domctl *domctl, struct domain *d, - return rc; - } - default: -- { -- int rc; -- -- rc = subarch_do_domctl(domctl, d, u_domctl); -- -- if ( rc == -ENOSYS ) -- rc = iommu_do_domctl(domctl, d, u_domctl); -- -- return rc; -- } -+ return subarch_do_domctl(domctl, d, u_domctl); - } - } - -diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c -index 7d102e0647ec..0fa51f2ebd10 100644 ---- a/xen/arch/x86/domctl.c -+++ b/xen/arch/x86/domctl.c -@@ -1380,7 +1380,7 @@ long arch_do_domctl( - break; - - default: -- ret = iommu_do_domctl(domctl, d, u_domctl); -+ ret = -ENOSYS; - break; - } - -diff --git a/xen/common/domctl.c b/xen/common/domctl.c -index 419e4070f59d..65d2a4588b71 100644 ---- a/xen/common/domctl.c -+++ b/xen/common/domctl.c -@@ -870,6 +870,13 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) - copyback = 1; - break; - -+ case XEN_DOMCTL_assign_device: -+ case XEN_DOMCTL_test_assign_device: -+ case XEN_DOMCTL_deassign_device: -+ case XEN_DOMCTL_get_device_group: -+ ret = iommu_do_domctl(op, d, u_domctl); -+ break; -+ - default: - ret = arch_do_domctl(op, d, u_domctl); - break; -diff --git a/xen/include/xen/iommu.h b/xen/include/xen/iommu.h -index 92b2d23f0ba2..861579562e8a 100644 ---- a/xen/include/xen/iommu.h -+++ b/xen/include/xen/iommu.h -@@ -342,8 +342,17 @@ struct domain_iommu { - /* Does the IOMMU pagetable need to be kept synchronized with the P2M */ - #ifdef CONFIG_HAS_PASSTHROUGH - #define need_iommu_pt_sync(d) (dom_iommu(d)->need_sync) -+ -+int iommu_do_domctl(struct xen_domctl *domctl, struct domain *d, -+ XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl); - #else - #define need_iommu_pt_sync(d) ({ (void)(d); false; }) -+ -+static inline int iommu_do_domctl(struct xen_domctl *domctl, struct domain *d, -+ XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) -+{ -+ return -ENOSYS; -+} - #endif - - int __must_check iommu_suspend(void); -@@ -357,9 +366,6 @@ int iommu_do_pci_domctl(struct xen_domctl *, struct domain *d, - XEN_GUEST_HANDLE_PARAM(xen_domctl_t)); - #endif - --int iommu_do_domctl(struct xen_domctl *, struct domain *d, -- XEN_GUEST_HANDLE_PARAM(xen_domctl_t)); -- - void iommu_dev_iotlb_flush_timeout(struct domain *d, struct pci_dev *pdev); - - /* --- -2.35.1 - diff --git a/0011-xen-x86-p2m-Add-preemption-in-p2m_teardown.patch b/0011-xen-x86-p2m-Add-preemption-in-p2m_teardown.patch new file mode 100644 index 0000000..5520627 --- /dev/null +++ b/0011-xen-x86-p2m-Add-preemption-in-p2m_teardown.patch @@ -0,0 +1,197 @@ +From a603386b422f5cb4c5e2639a7e20a1d99dba2175 Mon Sep 17 00:00:00 2001 +From: Julien Grall <jgrall@amazon.com> +Date: Tue, 11 Oct 2022 14:54:44 +0200 +Subject: [PATCH 11/26] xen/x86: p2m: Add preemption in p2m_teardown() + +The list p2m->pages contain all the pages used by the P2M. 
On large
+instances this list can be quite long and the time spent calling
+d->arch.paging.free_page() can take more than 1ms for an 80GB guest
+on a Xen running in a nested environment on a c5.metal.
+
+By extrapolation, it would take > 100ms for an 8TB guest (what we
+currently security support). So add some preemption in p2m_teardown()
+and propagate it to the callers. Note there are 3 places where
+the preemption is not enabled:
+ - hap_final_teardown()/shadow_final_teardown(): We are
+   preventing updates to the P2M once the domain is dying (so
+   no more pages could be allocated) and most of the P2M pages
+   will be freed in a preemptive manner when relinquishing the
+   resources. So it is fine to disable preemption.
+ - shadow_enable(): This is fine because it will undo the allocation
+   that may have been made by p2m_alloc_table() (so only the root
+   page table).
+
+The preemption is arbitrarily checked every 1024 iterations.
+
+We now need to include <xen/event.h> in p2m-basic in order to
+import the definition for local_events_need_delivery() used by
+general_preempt_check(). Ideally, the inclusion should happen in
+xen/sched.h but it opened a can of worms.
+
+Note that with the current approach, Xen doesn't keep track of whether
+the alt/nested P2Ms have been cleared. So there is some redundant work.
+However, this is not expected to incur too much overhead (the P2M lock
+shouldn't be contended during teardown). So this optimization is
+left outside of the security event.
+
+This is part of CVE-2022-33746 / XSA-410.
+
+Signed-off-by: Julien Grall <jgrall@amazon.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+master commit: 8a2111250b424edc49c65c4d41b276766d30635c
+master date: 2022-10-11 14:24:48 +0200
+---
+ xen/arch/x86/mm/hap/hap.c       | 22 ++++++++++++++++------
+ xen/arch/x86/mm/p2m.c           | 18 +++++++++++++++---
+ xen/arch/x86/mm/shadow/common.c | 12 +++++++++---
+ xen/include/asm-x86/p2m.h       |  2 +-
+ 4 files changed, 41 insertions(+), 13 deletions(-)
+
+diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
+index a44fcfd95e1e..1f9a157a0c34 100644
+--- a/xen/arch/x86/mm/hap/hap.c
++++ b/xen/arch/x86/mm/hap/hap.c
+@@ -548,17 +548,17 @@ void hap_final_teardown(struct domain *d)
+ 
+     if ( hvm_altp2m_supported() )
+         for ( i = 0; i < MAX_ALTP2M; i++ )
+-            p2m_teardown(d->arch.altp2m_p2m[i], true);
++            p2m_teardown(d->arch.altp2m_p2m[i], true, NULL);
+ 
+     /* Destroy nestedp2m's first */
+     for (i = 0; i < MAX_NESTEDP2M; i++) {
+-        p2m_teardown(d->arch.nested_p2m[i], true);
++        p2m_teardown(d->arch.nested_p2m[i], true, NULL);
+     }
+ 
+     if ( d->arch.paging.hap.total_pages != 0 )
+         hap_teardown(d, NULL);
+ 
+-    p2m_teardown(p2m_get_hostp2m(d), true);
++    p2m_teardown(p2m_get_hostp2m(d), true, NULL);
+     /* Free any memory that the p2m teardown released */
+     paging_lock(d);
+     hap_set_allocation(d, 0, NULL);
+@@ -612,14 +612,24 @@ void hap_teardown(struct domain *d, bool *preempted)
+         FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp);
+ 
+         for ( i = 0; i < MAX_ALTP2M; i++ )
+-            p2m_teardown(d->arch.altp2m_p2m[i], false);
++        {
++            p2m_teardown(d->arch.altp2m_p2m[i], false, preempted);
++            if ( preempted && *preempted )
++                return;
++        }
+     }
+ 
+     /* Destroy nestedp2m's after altp2m. 
*/ + for ( i = 0; i < MAX_NESTEDP2M; i++ ) +- p2m_teardown(d->arch.nested_p2m[i], false); ++ { ++ p2m_teardown(d->arch.nested_p2m[i], false, preempted); ++ if ( preempted && *preempted ) ++ return; ++ } + +- p2m_teardown(p2m_get_hostp2m(d), false); ++ p2m_teardown(p2m_get_hostp2m(d), false, preempted); ++ if ( preempted && *preempted ) ++ return; + + paging_lock(d); /* Keep various asserts happy */ + +diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c +index aba4f17cbe12..8781df9dda8d 100644 +--- a/xen/arch/x86/mm/p2m.c ++++ b/xen/arch/x86/mm/p2m.c +@@ -749,12 +749,13 @@ int p2m_alloc_table(struct p2m_domain *p2m) + * hvm fixme: when adding support for pvh non-hardware domains, this path must + * cleanup any foreign p2m types (release refcnts on them). + */ +-void p2m_teardown(struct p2m_domain *p2m, bool remove_root) ++void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted) + /* Return all the p2m pages to Xen. + * We know we don't have any extra mappings to these pages */ + { + struct page_info *pg, *root_pg = NULL; + struct domain *d; ++ unsigned int i = 0; + + if (p2m == NULL) + return; +@@ -773,8 +774,19 @@ void p2m_teardown(struct p2m_domain *p2m, bool remove_root) + } + + while ( (pg = page_list_remove_head(&p2m->pages)) ) +- if ( pg != root_pg ) +- d->arch.paging.free_page(d, pg); ++ { ++ if ( pg == root_pg ) ++ continue; ++ ++ d->arch.paging.free_page(d, pg); ++ ++ /* Arbitrarily check preemption every 1024 iterations */ ++ if ( preempted && !(++i % 1024) && general_preempt_check() ) ++ { ++ *preempted = true; ++ break; ++ } ++ } + + if ( root_pg ) + page_list_add(root_pg, &p2m->pages); +diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c +index ac9a1ae07808..3b0d781991b5 100644 +--- a/xen/arch/x86/mm/shadow/common.c ++++ b/xen/arch/x86/mm/shadow/common.c +@@ -2770,8 +2770,12 @@ int shadow_enable(struct domain *d, u32 mode) + out_locked: + paging_unlock(d); + out_unlocked: ++ /* ++ * This is fine to ignore the preemption here because only the root ++ * will be allocated by p2m_alloc_table(). ++ */ + if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) ) +- p2m_teardown(p2m, true); ++ p2m_teardown(p2m, true, NULL); + if ( rv != 0 && pg != NULL ) + { + pg->count_info &= ~PGC_count_mask; +@@ -2824,7 +2828,9 @@ void shadow_teardown(struct domain *d, bool *preempted) + for_each_vcpu ( d, v ) + shadow_vcpu_teardown(v); + +- p2m_teardown(p2m_get_hostp2m(d), false); ++ p2m_teardown(p2m_get_hostp2m(d), false, preempted); ++ if ( preempted && *preempted ) ++ return; + + paging_lock(d); + +@@ -2945,7 +2951,7 @@ void shadow_final_teardown(struct domain *d) + shadow_teardown(d, NULL); + + /* It is now safe to pull down the p2m map. */ +- p2m_teardown(p2m_get_hostp2m(d), true); ++ p2m_teardown(p2m_get_hostp2m(d), true, NULL); + /* Free any shadow memory that the p2m teardown released */ + paging_lock(d); + shadow_set_allocation(d, 0, NULL); +diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h +index c3c16748e7d5..2db9ab0122f2 100644 +--- a/xen/include/asm-x86/p2m.h ++++ b/xen/include/asm-x86/p2m.h +@@ -574,7 +574,7 @@ int p2m_init(struct domain *d); + int p2m_alloc_table(struct p2m_domain *p2m); + + /* Return all the p2m resources to Xen. 
*/
+-void p2m_teardown(struct p2m_domain *p2m, bool remove_root);
++void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted);
+ void p2m_final_teardown(struct domain *d);
+ 
+ /* Add a page to a domain's p2m table */
+-- 
+2.37.3
+ 
diff --git a/0012-IOMMU-make-domctl-handler-tolerate-NULL-domain.patch b/0012-IOMMU-make-domctl-handler-tolerate-NULL-domain.patch
deleted file mode 100644
index 37b9005..0000000
--- a/0012-IOMMU-make-domctl-handler-tolerate-NULL-domain.patch
+++ /dev/null
@@ -1,36 +0,0 @@
-From 4cf9a7c7bdb9d544fbac81105bbc1059ba3dd932 Mon Sep 17 00:00:00 2001
-From: Jan Beulich <jbeulich@suse.com>
-Date: Tue, 7 Jun 2022 14:02:30 +0200
-Subject: [PATCH 12/51] IOMMU: make domctl handler tolerate NULL domain
-
-Besides the reporter's issue of hitting a NULL deref when !CONFIG_GDBSX,
-XEN_DOMCTL_test_assign_device can legitimately end up having NULL passed
-here, when the domctl was passed DOMID_INVALID.
-
-Fixes: 71e617a6b8f6 ("use is_iommu_enabled() where appropriate...")
-Reported-by: Cheyenne Wills <cheyenne.wills@gmail.com>
-Signed-off-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: Paul Durrant <paul@xen.org>
-Reviewed-by: Juergen Gross <jgross@suse.com>
-master commit: fa4d84e6dd3c3bfd23a525b75a5483d4ce15adbb
-master date: 2022-04-26 10:25:54 +0200
----
- xen/drivers/passthrough/iommu.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c
-index caaba62c8865..287f63fc736f 100644
---- a/xen/drivers/passthrough/iommu.c
-+++ b/xen/drivers/passthrough/iommu.c
-@@ -535,7 +535,7 @@ int iommu_do_domctl(
- {
-     int ret = -ENODEV;
- 
--    if ( !is_iommu_enabled(d) )
-+    if ( !(d ? is_iommu_enabled(d) : iommu_enabled) )
-         return -EOPNOTSUPP;
- 
- #ifdef CONFIG_HAS_PCI
---
-2.35.1
-
diff --git a/0012-libxl-docs-Use-arch-specific-default-paging-memory.patch b/0012-libxl-docs-Use-arch-specific-default-paging-memory.patch
new file mode 100644
index 0000000..9390500
--- /dev/null
+++ b/0012-libxl-docs-Use-arch-specific-default-paging-memory.patch
@@ -0,0 +1,149 @@
+From 755a9b52844de3e1e47aa1fc9991a4240ccfbf35 Mon Sep 17 00:00:00 2001
+From: Henry Wang <Henry.Wang@arm.com>
+Date: Tue, 11 Oct 2022 14:55:08 +0200
+Subject: [PATCH 12/26] libxl, docs: Use arch-specific default paging memory
+
+The default paging memory (described in the `shadow_memory` entry in xl
+config) in libxl is used to determine the memory pool size for xl
+guests. Currently this size is only used for x86, and contains a part
+of RAM to shadow the resident processes. Since there are no shadow mode
+guests on Arm, the part of RAM to shadow the resident processes is not
+necessary. Therefore, this commit splits the function
+`libxl_get_required_shadow_memory()` into arch-specific helpers and
+renames the helper to `libxl__arch_get_required_paging_memory()`.
+
+On x86, this helper returns the original value from
+`libxl_get_required_shadow_memory()`, so no functional change is
+intended.
+
+On Arm, this helper returns 1MB per vcpu plus 4KB per MiB of RAM
+for the P2M map and an additional 512KB.
+
+Also update the xl.cfg documentation to add the Arm documentation
+according to the code changes, and correct the comment style to follow
+the Xen coding style.
+
+This is part of CVE-2022-33747 / XSA-409.
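+
+As a worked example (editor's illustration, not part of the change):
+with the Arm helper introduced below, a guest with 4 vCPUs and 4 GiB
+(4194304 KiB) of RAM gets, in KiB:
+
+    4 * (256 * 4 + 4194304 / 1024 + 128)
+        = 4 * (1024 + 4096 + 128)
+        = 20992 KiB (20.5 MiB)
+
+i.e. 1 MiB per vCPU, 16 MiB for the P2M map and 512 KiB for the
+extended regions.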
+ +Suggested-by: Julien Grall <jgrall@amazon.com> +Signed-off-by: Henry Wang <Henry.Wang@arm.com> +Reviewed-by: Anthony PERARD <anthony.perard@citrix.com> +master commit: 156a239ea288972425f967ac807b3cb5b5e14874 +master date: 2022-10-11 14:28:37 +0200 +--- + docs/man/xl.cfg.5.pod.in | 5 +++++ + tools/libs/light/libxl_arch.h | 4 ++++ + tools/libs/light/libxl_arm.c | 14 ++++++++++++++ + tools/libs/light/libxl_utils.c | 9 ++------- + tools/libs/light/libxl_x86.c | 13 +++++++++++++ + 5 files changed, 38 insertions(+), 7 deletions(-) + +diff --git a/docs/man/xl.cfg.5.pod.in b/docs/man/xl.cfg.5.pod.in +index b98d1613987e..eda1e77ebd06 100644 +--- a/docs/man/xl.cfg.5.pod.in ++++ b/docs/man/xl.cfg.5.pod.in +@@ -1768,6 +1768,11 @@ are not using hardware assisted paging (i.e. you are using shadow + mode) and your guest workload consists of a very large number of + similar processes then increasing this value may improve performance. + ++On Arm, this field is used to determine the size of the guest P2M pages ++pool, and the default value is 1MB per vCPU plus 4KB per MB of RAM for ++the P2M map and additional 512KB for extended regions. Users should ++adjust this value if bigger P2M pool size is needed. ++ + =back + + =head3 Processor and Platform Features +diff --git a/tools/libs/light/libxl_arch.h b/tools/libs/light/libxl_arch.h +index 1522ecb97f72..5a060c2c3033 100644 +--- a/tools/libs/light/libxl_arch.h ++++ b/tools/libs/light/libxl_arch.h +@@ -90,6 +90,10 @@ void libxl__arch_update_domain_config(libxl__gc *gc, + libxl_domain_config *dst, + const libxl_domain_config *src); + ++_hidden ++unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb, ++ unsigned int smp_cpus); ++ + #if defined(__i386__) || defined(__x86_64__) + + #define LAPIC_BASE_ADDRESS 0xfee00000 +diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c +index eef1de093914..73a95e83af24 100644 +--- a/tools/libs/light/libxl_arm.c ++++ b/tools/libs/light/libxl_arm.c +@@ -154,6 +154,20 @@ out: + return rc; + } + ++unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb, ++ unsigned int smp_cpus) ++{ ++ /* ++ * 256 pages (1MB) per vcpu, ++ * plus 1 page per MiB of RAM for the P2M map, ++ * plus 1 page per MiB of extended region. This default value is 128 MiB ++ * which should be enough for domains that are not running backend. ++ * This is higher than the minimum that Xen would allocate if no value ++ * were given (but the Xen minimum is for safety, not performance). ++ */ ++ return 4 * (256 * smp_cpus + maxmem_kb / 1024 + 128); ++} ++ + static struct arch_info { + const char *guest_type; + const char *timer_compat; +diff --git a/tools/libs/light/libxl_utils.c b/tools/libs/light/libxl_utils.c +index 4699c4a0a36f..e276c0ee9cc3 100644 +--- a/tools/libs/light/libxl_utils.c ++++ b/tools/libs/light/libxl_utils.c +@@ -18,6 +18,7 @@ + #include <ctype.h> + + #include "libxl_internal.h" ++#include "libxl_arch.h" + #include "_paths.h" + + #ifndef LIBXL_HAVE_NONCONST_LIBXL_BASENAME_RETURN_VALUE +@@ -39,13 +40,7 @@ char *libxl_basename(const char *name) + + unsigned long libxl_get_required_shadow_memory(unsigned long maxmem_kb, unsigned int smp_cpus) + { +- /* 256 pages (1MB) per vcpu, +- plus 1 page per MiB of RAM for the P2M map, +- plus 1 page per MiB of RAM to shadow the resident processes. +- This is higher than the minimum that Xen would allocate if no value +- were given (but the Xen minimum is for safety, not performance). 
+- */ +- return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024)); ++ return libxl__arch_get_required_paging_memory(maxmem_kb, smp_cpus); + } + + char *libxl_domid_to_name(libxl_ctx *ctx, uint32_t domid) +diff --git a/tools/libs/light/libxl_x86.c b/tools/libs/light/libxl_x86.c +index 1feadebb1852..51362893cf98 100644 +--- a/tools/libs/light/libxl_x86.c ++++ b/tools/libs/light/libxl_x86.c +@@ -882,6 +882,19 @@ void libxl__arch_update_domain_config(libxl__gc *gc, + libxl_defbool_val(src->b_info.arch_x86.msr_relaxed)); + } + ++unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb, ++ unsigned int smp_cpus) ++{ ++ /* ++ * 256 pages (1MB) per vcpu, ++ * plus 1 page per MiB of RAM for the P2M map, ++ * plus 1 page per MiB of RAM to shadow the resident processes. ++ * This is higher than the minimum that Xen would allocate if no value ++ * were given (but the Xen minimum is for safety, not performance). ++ */ ++ return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024)); ++} ++ + /* + * Local variables: + * mode: C +-- +2.37.3 + diff --git a/0013-IOMMU-x86-disallow-device-assignment-to-PoD-guests.patch b/0013-IOMMU-x86-disallow-device-assignment-to-PoD-guests.patch deleted file mode 100644 index 8416c96..0000000 --- a/0013-IOMMU-x86-disallow-device-assignment-to-PoD-guests.patch +++ /dev/null @@ -1,229 +0,0 @@ -From 838f6c211f7f05f107e1acdfb0977ab61ec0bf2e Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 7 Jun 2022 14:03:20 +0200 -Subject: [PATCH 13/51] IOMMU/x86: disallow device assignment to PoD guests -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -While it is okay for IOMMU page tables to be set up for guests starting -in PoD mode, actual device assignment may only occur once all PoD -entries have been removed from the P2M. So far this was enforced only -for boot-time assignment, and only in the tool stack. - -Also use the new function to replace p2m_pod_entry_count(): Its unlocked -access to p2m->pod.entry_count wasn't really okay (irrespective of the -result being stale by the time the caller gets to see it). Nor was the -use of that function in line with the immediately preceding comment: A -PoD guest isn't just one with a non-zero entry count, but also one with -a non-empty cache (e.g. prior to actually launching the guest). - -To allow the tool stack to see a consistent snapshot of PoD state, move -the tail of XENMEM_{get,set}_pod_target handling into a function, adding -proper locking there. - -In libxl take the liberty to use the new local variable r also for a -pre-existing call into libxc. 
- -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -master commit: ad4312d764e8b40a1e45b64aac6d840a60c59f13 -master date: 2022-05-02 08:48:02 +0200 ---- - xen/arch/x86/mm.c | 6 +--- - xen/arch/x86/mm/p2m-pod.c | 43 ++++++++++++++++++++++++++++- - xen/common/vm_event.c | 2 +- - xen/drivers/passthrough/x86/iommu.c | 3 +- - xen/include/asm-x86/p2m.h | 21 +++++++------- - 5 files changed, 57 insertions(+), 18 deletions(-) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index e222d9aa98ee..4ee2de11051d 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -4777,7 +4777,6 @@ long arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg) - { - xen_pod_target_t target; - struct domain *d; -- struct p2m_domain *p2m; - - if ( copy_from_guest(&target, arg, 1) ) - return -EFAULT; -@@ -4812,10 +4811,7 @@ long arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg) - } - else if ( rc >= 0 ) - { -- p2m = p2m_get_hostp2m(d); -- target.tot_pages = domain_tot_pages(d); -- target.pod_cache_pages = p2m->pod.count; -- target.pod_entries = p2m->pod.entry_count; -+ p2m_pod_get_mem_target(d, &target); - - if ( __copy_to_guest(arg, &target, 1) ) - { -diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c -index d8d1a0ce7ed7..a3c9d8a97423 100644 ---- a/xen/arch/x86/mm/p2m-pod.c -+++ b/xen/arch/x86/mm/p2m-pod.c -@@ -20,6 +20,7 @@ - */ - - #include <xen/event.h> -+#include <xen/iocap.h> - #include <xen/ioreq.h> - #include <xen/mm.h> - #include <xen/sched.h> -@@ -362,7 +363,10 @@ p2m_pod_set_mem_target(struct domain *d, unsigned long target) - - ASSERT( pod_target >= p2m->pod.count ); - -- ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/); -+ if ( has_arch_pdevs(d) || cache_flush_permitted(d) ) -+ ret = -ENOTEMPTY; -+ else -+ ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/); - - out: - pod_unlock(p2m); -@@ -370,6 +374,23 @@ out: - return ret; - } - -+void p2m_pod_get_mem_target(const struct domain *d, xen_pod_target_t *target) -+{ -+ struct p2m_domain *p2m = p2m_get_hostp2m(d); -+ -+ ASSERT(is_hvm_domain(d)); -+ -+ pod_lock(p2m); -+ lock_page_alloc(p2m); -+ -+ target->tot_pages = domain_tot_pages(d); -+ target->pod_cache_pages = p2m->pod.count; -+ target->pod_entries = p2m->pod.entry_count; -+ -+ unlock_page_alloc(p2m); -+ pod_unlock(p2m); -+} -+ - int p2m_pod_empty_cache(struct domain *d) - { - struct p2m_domain *p2m = p2m_get_hostp2m(d); -@@ -1387,6 +1408,9 @@ guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn, - if ( !paging_mode_translate(d) ) - return -EINVAL; - -+ if ( has_arch_pdevs(d) || cache_flush_permitted(d) ) -+ return -ENOTEMPTY; -+ - do { - rc = mark_populate_on_demand(d, gfn, chunk_order); - -@@ -1408,3 +1432,20 @@ void p2m_pod_init(struct p2m_domain *p2m) - for ( i = 0; i < ARRAY_SIZE(p2m->pod.mrp.list); ++i ) - p2m->pod.mrp.list[i] = gfn_x(INVALID_GFN); - } -+ -+bool p2m_pod_active(const struct domain *d) -+{ -+ struct p2m_domain *p2m; -+ bool res; -+ -+ if ( !is_hvm_domain(d) ) -+ return false; -+ -+ p2m = p2m_get_hostp2m(d); -+ -+ pod_lock(p2m); -+ res = p2m->pod.entry_count | p2m->pod.count; -+ pod_unlock(p2m); -+ -+ return res; -+} -diff --git a/xen/common/vm_event.c b/xen/common/vm_event.c -index 70ab3ba406ff..21d2f0edf727 100644 ---- a/xen/common/vm_event.c -+++ b/xen/common/vm_event.c -@@ -639,7 +639,7 @@ int vm_event_domctl(struct domain *d, struct xen_domctl_vm_event_op *vec) - - rc = -EXDEV; - /* Disallow paging in a PoD guest */ -- if ( 
p2m_pod_entry_count(p2m_get_hostp2m(d)) ) -+ if ( p2m_pod_active(d) ) - break; - - /* domain_pause() not required here, see XSA-99 */ -diff --git a/xen/drivers/passthrough/x86/iommu.c b/xen/drivers/passthrough/x86/iommu.c -index a36a6bd4b249..dc9936e16930 100644 ---- a/xen/drivers/passthrough/x86/iommu.c -+++ b/xen/drivers/passthrough/x86/iommu.c -@@ -502,11 +502,12 @@ bool arch_iommu_use_permitted(const struct domain *d) - { - /* - * Prevent device assign if mem paging, mem sharing or log-dirty -- * have been enabled for this domain. -+ * have been enabled for this domain, or if PoD is still in active use. - */ - return d == dom_io || - (likely(!mem_sharing_enabled(d)) && - likely(!mem_paging_enabled(d)) && -+ likely(!p2m_pod_active(d)) && - likely(!p2m_get_hostp2m(d)->global_logdirty)); - } - -diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h -index 357a8087481e..f2af7a746ced 100644 ---- a/xen/include/asm-x86/p2m.h -+++ b/xen/include/asm-x86/p2m.h -@@ -661,6 +661,12 @@ int p2m_pod_empty_cache(struct domain *d); - * domain matches target */ - int p2m_pod_set_mem_target(struct domain *d, unsigned long target); - -+/* Obtain a consistent snapshot of PoD related domain state. */ -+void p2m_pod_get_mem_target(const struct domain *d, xen_pod_target_t *target); -+ -+/* Check whether PoD is (still) active in a domain. */ -+bool p2m_pod_active(const struct domain *d); -+ - /* Scan pod cache when offline/broken page triggered */ - int - p2m_pod_offline_or_broken_hit(struct page_info *p); -@@ -669,11 +675,6 @@ p2m_pod_offline_or_broken_hit(struct page_info *p); - void - p2m_pod_offline_or_broken_replace(struct page_info *p); - --static inline long p2m_pod_entry_count(const struct p2m_domain *p2m) --{ -- return p2m->pod.entry_count; --} -- - void p2m_pod_init(struct p2m_domain *p2m); - - #else -@@ -689,6 +690,11 @@ static inline int p2m_pod_empty_cache(struct domain *d) - return 0; - } - -+static inline bool p2m_pod_active(const struct domain *d) -+{ -+ return false; -+} -+ - static inline int p2m_pod_offline_or_broken_hit(struct page_info *p) - { - return 0; -@@ -699,11 +705,6 @@ static inline void p2m_pod_offline_or_broken_replace(struct page_info *p) - ASSERT_UNREACHABLE(); - } - --static inline long p2m_pod_entry_count(const struct p2m_domain *p2m) --{ -- return 0; --} -- - static inline void p2m_pod_init(struct p2m_domain *p2m) {} - - #endif --- -2.35.1 - diff --git a/0013-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch b/0013-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch new file mode 100644 index 0000000..dee9d9c --- /dev/null +++ b/0013-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch @@ -0,0 +1,189 @@ +From 914fc8e8b4cc003e90d51bee0aef54687358530a Mon Sep 17 00:00:00 2001 +From: Henry Wang <Henry.Wang@arm.com> +Date: Tue, 11 Oct 2022 14:55:21 +0200 +Subject: [PATCH 13/26] xen/arm: Construct the P2M pages pool for guests + +This commit constructs the p2m pages pool for guests from the +data structure and helper perspective. + +This is implemented by: + +- Adding a `struct paging_domain` which contains a freelist, a +counter variable and a spinlock to `struct arch_domain` to +indicate the free p2m pages and the number of p2m total pages in +the p2m pages pool. + +- Adding a helper `p2m_get_allocation` to get the p2m pool size. + +- Adding a helper `p2m_set_allocation` to set the p2m pages pool +size. This helper should be called before allocating memory for +a guest. + +- Adding a helper `p2m_teardown_allocation` to free the p2m pages +pool. 
This helper should be called during xl domain destruction.
+
+This is part of CVE-2022-33747 / XSA-409.
+
+Signed-off-by: Henry Wang <Henry.Wang@arm.com>
+Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
+master commit: 55914f7fc91a468649b8a3ec3f53ae1c4aca6670
+master date: 2022-10-11 14:28:39 +0200
+---
+ xen/arch/arm/p2m.c           | 88 ++++++++++++++++++++++++++++++++++++
+ xen/include/asm-arm/domain.h | 10 ++++
+ xen/include/asm-arm/p2m.h    |  4 ++
+ 3 files changed, 102 insertions(+)
+
+diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
+index 27418ee5ee98..d8957dd8727c 100644
+--- a/xen/arch/arm/p2m.c
++++ b/xen/arch/arm/p2m.c
+@@ -50,6 +50,92 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn)
+     return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48));
+ }
+ 
++/* Return the size of the pool, rounded up to the nearest MB */
++unsigned int p2m_get_allocation(struct domain *d)
++{
++    unsigned long nr_pages = ACCESS_ONCE(d->arch.paging.p2m_total_pages);
++
++    return ROUNDUP(nr_pages, 1 << (20 - PAGE_SHIFT)) >> (20 - PAGE_SHIFT);
++}
++
++/*
++ * Set the pool of pages to the required number of pages.
++ * Returns 0 for success, non-zero for failure.
++ * Call with d->arch.paging.lock held.
++ */
++int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted)
++{
++    struct page_info *pg;
++
++    ASSERT(spin_is_locked(&d->arch.paging.lock));
++
++    for ( ; ; )
++    {
++        if ( d->arch.paging.p2m_total_pages < pages )
++        {
++            /* Need to allocate more memory from domheap */
++            pg = alloc_domheap_page(NULL, 0);
++            if ( pg == NULL )
++            {
++                printk(XENLOG_ERR "Failed to allocate P2M pages.\n");
++                return -ENOMEM;
++            }
++            ACCESS_ONCE(d->arch.paging.p2m_total_pages) =
++                d->arch.paging.p2m_total_pages + 1;
++            page_list_add_tail(pg, &d->arch.paging.p2m_freelist);
++        }
++        else if ( d->arch.paging.p2m_total_pages > pages )
++        {
++            /* Need to return memory to domheap */
++            pg = page_list_remove_head(&d->arch.paging.p2m_freelist);
++            if( pg )
++            {
++                ACCESS_ONCE(d->arch.paging.p2m_total_pages) =
++                    d->arch.paging.p2m_total_pages - 1;
++                free_domheap_page(pg);
++            }
++            else
++            {
++                printk(XENLOG_ERR
++                       "Failed to free P2M pages, P2M freelist is empty.\n");
++                return -ENOMEM;
++            }
++        }
++        else
++            break;
++
++        /* Check to see if we need to yield and try again */
++        if ( preempted && general_preempt_check() )
++        {
++            *preempted = true;
++            return -ERESTART;
++        }
++    }
++
++    return 0;
++}
++
++int p2m_teardown_allocation(struct domain *d)
++{
++    int ret = 0;
++    bool preempted = false;
++
++    spin_lock(&d->arch.paging.lock);
++    if ( d->arch.paging.p2m_total_pages != 0 )
++    {
++        ret = p2m_set_allocation(d, 0, &preempted);
++        if ( preempted )
++        {
++            spin_unlock(&d->arch.paging.lock);
++            return -ERESTART;
++        }
++        ASSERT(d->arch.paging.p2m_total_pages == 0);
++    }
++    spin_unlock(&d->arch.paging.lock);
++
++    return ret;
++}
++
+ /* Unlock the flush and do a P2M TLB flush if necessary */
+ void p2m_write_unlock(struct p2m_domain *p2m)
+ {
+@@ -1599,7 +1685,9 @@ int p2m_init(struct domain *d)
+     unsigned int cpu;
+ 
+     rwlock_init(&p2m->lock);
++    spin_lock_init(&d->arch.paging.lock);
+     INIT_PAGE_LIST_HEAD(&p2m->pages);
++    INIT_PAGE_LIST_HEAD(&d->arch.paging.p2m_freelist);
+ 
+     p2m->vmid = INVALID_VMID;
+ 
+diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
+index 7f8ddd3f5c3b..2f31795ab96d 100644
+--- a/xen/include/asm-arm/domain.h
++++ b/xen/include/asm-arm/domain.h
+@@ -40,6 +40,14 @@ struct vtimer {
+     uint64_t cval;
+ };
+ 
++struct paging_domain {
++    spinlock_t lock;
++    /* Free 
P2M pages from the pre-allocated P2M pool */ ++ struct page_list_head p2m_freelist; ++ /* Number of pages from the pre-allocated P2M pool */ ++ unsigned long p2m_total_pages; ++}; ++ + struct arch_domain + { + #ifdef CONFIG_ARM_64 +@@ -51,6 +59,8 @@ struct arch_domain + + struct hvm_domain hvm; + ++ struct paging_domain paging; ++ + struct vmmio vmmio; + + /* Continuable domain_relinquish_resources(). */ +diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h +index b3ba83283e11..c9598740bd02 100644 +--- a/xen/include/asm-arm/p2m.h ++++ b/xen/include/asm-arm/p2m.h +@@ -218,6 +218,10 @@ void p2m_restore_state(struct vcpu *n); + /* Print debugging/statistial info about a domain's p2m */ + void p2m_dump_info(struct domain *d); + ++unsigned int p2m_get_allocation(struct domain *d); ++int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted); ++int p2m_teardown_allocation(struct domain *d); ++ + static inline void p2m_write_lock(struct p2m_domain *p2m) + { + write_lock(&p2m->lock); +-- +2.37.3 + diff --git a/0014-x86-msr-handle-reads-to-MSR_P5_MC_-ADDR-TYPE.patch b/0014-x86-msr-handle-reads-to-MSR_P5_MC_-ADDR-TYPE.patch deleted file mode 100644 index 69049f1..0000000 --- a/0014-x86-msr-handle-reads-to-MSR_P5_MC_-ADDR-TYPE.patch +++ /dev/null @@ -1,121 +0,0 @@ -From 9ebe2ba83644ec6cd33a93c68dab5f551adcbea0 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 7 Jun 2022 14:04:16 +0200 -Subject: [PATCH 14/51] x86/msr: handle reads to MSR_P5_MC_{ADDR,TYPE} -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Windows Server 2019 Essentials will unconditionally attempt to read -P5_MC_ADDR MSR at boot and throw a BSOD if injected a #GP. - -Fix this by mapping MSR_P5_MC_{ADDR,TYPE} to -MSR_IA32_MCi_{ADDR,STATUS}, as reported also done by hardware in Intel -SDM "Mapping of the Pentium Processor Machine-Check Errors to the -Machine-Check Architecture" section. 
- -Reported-by: Steffen Einsle <einsle@phptrix.de> -Signed-off-by: Roger Pau Monné <roger.pau@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: ce59e472b581e4923f6892172dde62b88c8aa8b7 -master date: 2022-05-02 08:49:12 +0200 ---- - xen/arch/x86/cpu/mcheck/mce.h | 6 ++++++ - xen/arch/x86/cpu/mcheck/mce_intel.c | 19 +++++++++++++++++++ - xen/arch/x86/cpu/mcheck/vmce.c | 2 ++ - xen/arch/x86/msr.c | 2 ++ - xen/include/asm-x86/msr-index.h | 3 +++ - 5 files changed, 32 insertions(+) - -diff --git a/xen/arch/x86/cpu/mcheck/mce.h b/xen/arch/x86/cpu/mcheck/mce.h -index 195362691904..192315ecfa3d 100644 ---- a/xen/arch/x86/cpu/mcheck/mce.h -+++ b/xen/arch/x86/cpu/mcheck/mce.h -@@ -169,6 +169,12 @@ static inline int mce_vendor_bank_msr(const struct vcpu *v, uint32_t msr) - if (msr >= MSR_IA32_MC0_CTL2 && - msr < MSR_IA32_MCx_CTL2(v->arch.vmce.mcg_cap & MCG_CAP_COUNT) ) - return 1; -+ fallthrough; -+ -+ case X86_VENDOR_CENTAUR: -+ case X86_VENDOR_SHANGHAI: -+ if (msr == MSR_P5_MC_ADDR || msr == MSR_P5_MC_TYPE) -+ return 1; - break; - - case X86_VENDOR_AMD: -diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c b/xen/arch/x86/cpu/mcheck/mce_intel.c -index bb9f3a3ff795..d364e9bf5ad1 100644 ---- a/xen/arch/x86/cpu/mcheck/mce_intel.c -+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c -@@ -1001,8 +1001,27 @@ int vmce_intel_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val) - - int vmce_intel_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val) - { -+ const struct cpuid_policy *cp = v->domain->arch.cpuid; - unsigned int bank = msr - MSR_IA32_MC0_CTL2; - -+ switch ( msr ) -+ { -+ case MSR_P5_MC_ADDR: -+ /* -+ * Bank 0 is used for the 'bank 0 quirk' on older processors. -+ * See vcpu_fill_mc_msrs() for reference. -+ */ -+ *val = v->arch.vmce.bank[1].mci_addr; -+ return 1; -+ -+ case MSR_P5_MC_TYPE: -+ *val = v->arch.vmce.bank[1].mci_status; -+ return 1; -+ } -+ -+ if ( !(cp->x86_vendor & X86_VENDOR_INTEL) ) -+ return 0; -+ - if ( bank < GUEST_MC_BANK_NUM ) - { - *val = v->arch.vmce.bank[bank].mci_ctl2; -diff --git a/xen/arch/x86/cpu/mcheck/vmce.c b/xen/arch/x86/cpu/mcheck/vmce.c -index eb6434a3ba20..0899df58bcbf 100644 ---- a/xen/arch/x86/cpu/mcheck/vmce.c -+++ b/xen/arch/x86/cpu/mcheck/vmce.c -@@ -150,6 +150,8 @@ static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val) - default: - switch ( boot_cpu_data.x86_vendor ) - { -+ case X86_VENDOR_CENTAUR: -+ case X86_VENDOR_SHANGHAI: - case X86_VENDOR_INTEL: - ret = vmce_intel_rdmsr(v, msr, val); - break; -diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c -index aaedb2c31287..da305c7aa4c9 100644 ---- a/xen/arch/x86/msr.c -+++ b/xen/arch/x86/msr.c -@@ -282,6 +282,8 @@ int guest_rdmsr(struct vcpu *v, uint32_t msr, uint64_t *val) - *val = msrs->misc_features_enables.raw; - break; - -+ case MSR_P5_MC_ADDR: -+ case MSR_P5_MC_TYPE: - case MSR_IA32_MCG_CAP ... MSR_IA32_MCG_CTL: /* 0x179 -> 0x17b */ - case MSR_IA32_MCx_CTL2(0) ... MSR_IA32_MCx_CTL2(31): /* 0x280 -> 0x29f */ - case MSR_IA32_MCx_CTL(0) ... MSR_IA32_MCx_MISC(31): /* 0x400 -> 0x47f */ -diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h -index 3e038db618ff..31964b88af7a 100644 ---- a/xen/include/asm-x86/msr-index.h -+++ b/xen/include/asm-x86/msr-index.h -@@ -15,6 +15,9 @@ - * abbreviated name. Exceptions will be considered on a case-by-case basis. 
- */ - -+#define MSR_P5_MC_ADDR 0 -+#define MSR_P5_MC_TYPE 0x00000001 -+ - #define MSR_APIC_BASE 0x0000001b - #define APIC_BASE_BSP (_AC(1, ULL) << 8) - #define APIC_BASE_EXTD (_AC(1, ULL) << 10) --- -2.35.1 - diff --git a/0014-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch b/0014-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch new file mode 100644 index 0000000..fe24269 --- /dev/null +++ b/0014-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch @@ -0,0 +1,108 @@ +From 3a16da801e14b8ff996b6f7408391ce488abd925 Mon Sep 17 00:00:00 2001 +From: Henry Wang <Henry.Wang@arm.com> +Date: Tue, 11 Oct 2022 14:55:40 +0200 +Subject: [PATCH 14/26] xen/arm, libxl: Implement XEN_DOMCTL_shadow_op for Arm + +This commit implements the `XEN_DOMCTL_shadow_op` support in Xen +for Arm. The p2m pages pool size for xl guests is supposed to be +determined by `XEN_DOMCTL_shadow_op`. Hence, this commit: + +- Introduces a function `p2m_domctl` and implements the subops +`XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` and +`XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION` of `XEN_DOMCTL_shadow_op`. + +- Adds the `XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` support in libxl. + +Therefore enabling the setting of shadow memory pool size +when creating a guest from xl and getting shadow memory pool size +from Xen. + +Note that the `XEN_DOMCTL_shadow_op` added in this commit is only +a dummy op, and the functionality of setting/getting p2m memory pool +size for xl guests will be added in following commits. + +This is part of CVE-2022-33747 / XSA-409. + +Signed-off-by: Henry Wang <Henry.Wang@arm.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +master commit: cf2a68d2ffbc3ce95e01449d46180bddb10d24a0 +master date: 2022-10-11 14:28:42 +0200 +--- + tools/libs/light/libxl_arm.c | 12 ++++++++++++ + xen/arch/arm/domctl.c | 32 ++++++++++++++++++++++++++++++++ + 2 files changed, 44 insertions(+) + +diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c +index 73a95e83af24..22a0c561bbc6 100644 +--- a/tools/libs/light/libxl_arm.c ++++ b/tools/libs/light/libxl_arm.c +@@ -131,6 +131,18 @@ int libxl__arch_domain_create(libxl__gc *gc, + libxl__domain_build_state *state, + uint32_t domid) + { ++ libxl_ctx *ctx = libxl__gc_owner(gc); ++ unsigned int shadow_mb = DIV_ROUNDUP(d_config->b_info.shadow_memkb, 1024); ++ ++ int r = xc_shadow_control(ctx->xch, domid, ++ XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION, ++ &shadow_mb, 0); ++ if (r) { ++ LOGED(ERROR, domid, ++ "Failed to set %u MiB shadow allocation", shadow_mb); ++ return ERROR_FAIL; ++ } ++ + return 0; + } + +diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c +index 1baf25c3d98b..9bf72e693019 100644 +--- a/xen/arch/arm/domctl.c ++++ b/xen/arch/arm/domctl.c +@@ -47,11 +47,43 @@ static int handle_vuart_init(struct domain *d, + return rc; + } + ++static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc, ++ XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) ++{ ++ if ( unlikely(d == current->domain) ) ++ { ++ printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n"); ++ return -EINVAL; ++ } ++ ++ if ( unlikely(d->is_dying) ) ++ { ++ printk(XENLOG_ERR "Tried to do a p2m domctl op on dying domain %u\n", ++ d->domain_id); ++ return -EINVAL; ++ } ++ ++ switch ( sc->op ) ++ { ++ case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION: ++ return 0; ++ case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION: ++ return 0; ++ default: ++ { ++ printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op); ++ return -EINVAL; ++ } ++ } ++} ++ + long arch_do_domctl(struct xen_domctl *domctl, 
struct domain *d,
+                     XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
+ {
+     switch ( domctl->cmd )
+     {
++    case XEN_DOMCTL_shadow_op:
++        return p2m_domctl(d, &domctl->u.shadow_op, u_domctl);
+     case XEN_DOMCTL_cacheflush:
+     {
+         gfn_t s = _gfn(domctl->u.cacheflush.start_pfn);
+-- 
+2.37.3
+ 
diff --git a/0015-kconfig-detect-LD-implementation.patch b/0015-kconfig-detect-LD-implementation.patch
deleted file mode 100644
index 4507bc7..0000000
--- a/0015-kconfig-detect-LD-implementation.patch
+++ /dev/null
@@ -1,46 +0,0 @@
-From 3754bd128d1a6b3d5864d1a3ee5d27b67d35387a Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
-Date: Tue, 7 Jun 2022 14:05:06 +0200
-Subject: [PATCH 15/51] kconfig: detect LD implementation
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Detect GNU and LLVM ld implementations. This is required for further
-patches that will introduce diverging behaviour depending on the
-linker implementation in use.
-
-Note that LLVM ld returns "compatible with GNU linkers" as part of the
-version string, so be on the safe side and use '^' to only match at
-the start of the line in case LLVM ever decides to change the text to
-use "compatible with GNU ld" instead.
-
-Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
-Reviewed-by: Michal Orzel <michal.orzel@arm.com>
-Acked-by: Julien Grall <jgrall@amazon.com>
-master commit: c70c4b624f85f7d4e28c70a804a0a3f20d73092b
-master date: 2022-05-02 08:50:39 +0200
----
- xen/Kconfig | 6 ++++++
- 1 file changed, 6 insertions(+)
-
-diff --git a/xen/Kconfig b/xen/Kconfig
-index bcbd2758e5d3..0c89afd50fcf 100644
---- a/xen/Kconfig
-+++ b/xen/Kconfig
-@@ -23,6 +23,12 @@ config CLANG_VERSION
- 	int
- 	default $(shell,$(BASEDIR)/scripts/clang-version.sh $(CC))
- 
-+config LD_IS_GNU
-+	def_bool $(success,$(LD) --version | head -n 1 | grep -q "^GNU ld")
-+
-+config LD_IS_LLVM
-+	def_bool $(success,$(LD) --version | head -n 1 | grep -q "^LLD")
-+
- # -fvisibility=hidden reduces -fpic cost, if it's available
- config CC_HAS_VISIBILITY_ATTRIBUTE
- 	def_bool $(cc-option,-fvisibility=hidden)
---
-2.35.1
-
diff --git a/0015-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch b/0015-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch
new file mode 100644
index 0000000..704543a
--- /dev/null
+++ b/0015-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch
@@ -0,0 +1,289 @@
+From 44e9dcc48b81bca202a5b31926125a6a59a4c72e Mon Sep 17 00:00:00 2001
+From: Henry Wang <Henry.Wang@arm.com>
+Date: Tue, 11 Oct 2022 14:55:53 +0200
+Subject: [PATCH 15/26] xen/arm: Allocate and free P2M pages from the P2M pool
+
+This commit sets up and tears down the p2m pages pool for
+non-privileged Arm guests by calling `p2m_set_allocation` and
+`p2m_teardown_allocation`.
+
+- For dom0, P2M pages should come from the heap directly instead of
+the p2m pool, so that the kernel may take advantage of the extended
+regions.
+
+- For xl guests, the setting of the p2m pool is called in
+`XEN_DOMCTL_shadow_op` and the p2m pool is destroyed in
+`domain_relinquish_resources`. Note that domctl->u.shadow_op.mb is
+updated with the new size when setting the p2m pool.
+
+- For dom0less domUs, the setting of the p2m pool is called before
+allocating memory during domain creation. Users can specify the p2m
+pool size via the `xen,domain-p2m-mem-mb` dts property.
+
+To actually allocate/free pages from the p2m pool, this commit adds
+two helper functions, namely `p2m_alloc_page` and `p2m_free_page`, to
+`struct p2m_domain`. 
By replacing the `alloc_domheap_page` and +`free_domheap_page` with these two helper functions, p2m pages can +be added/removed from the list of p2m pool rather than from the heap. + +Since page from `p2m_alloc_page` is cleaned, take the opportunity +to remove the redundant `clean_page` in `p2m_create_table`. + +This is part of CVE-2022-33747 / XSA-409. + +Signed-off-by: Henry Wang <Henry.Wang@arm.com> +Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> +master commit: cbea5a1149ca7fd4b7cdbfa3ec2e4f109b601ff7 +master date: 2022-10-11 14:28:44 +0200 +--- + docs/misc/arm/device-tree/booting.txt | 8 ++++ + xen/arch/arm/domain.c | 6 +++ + xen/arch/arm/domain_build.c | 29 ++++++++++++++ + xen/arch/arm/domctl.c | 23 ++++++++++- + xen/arch/arm/p2m.c | 57 +++++++++++++++++++++++++-- + 5 files changed, 118 insertions(+), 5 deletions(-) + +diff --git a/docs/misc/arm/device-tree/booting.txt b/docs/misc/arm/device-tree/booting.txt +index 71895663a4de..d92ccc56ffe0 100644 +--- a/docs/misc/arm/device-tree/booting.txt ++++ b/docs/misc/arm/device-tree/booting.txt +@@ -182,6 +182,14 @@ with the following properties: + Both #address-cells and #size-cells need to be specified because + both sub-nodes (described shortly) have reg properties. + ++- xen,domain-p2m-mem-mb ++ ++ Optional. A 32-bit integer specifying the amount of megabytes of RAM ++ used for the domain P2M pool. This is in-sync with the shadow_memory ++ option in xl.cfg. Leaving this field empty in device tree will lead to ++ the default size of domain P2M pool, i.e. 1MB per guest vCPU plus 4KB ++ per MB of guest RAM plus 512KB for guest extended regions. ++ + Under the "xen,domain" compatible node, one or more sub-nodes are present + for the DomU kernel and ramdisk. + +diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c +index 2694c39127c5..a818f33a1afa 100644 +--- a/xen/arch/arm/domain.c ++++ b/xen/arch/arm/domain.c +@@ -997,6 +997,7 @@ enum { + PROG_page, + PROG_mapping, + PROG_p2m, ++ PROG_p2m_pool, + PROG_done, + }; + +@@ -1062,6 +1063,11 @@ int domain_relinquish_resources(struct domain *d) + if ( ret ) + return ret; + ++ PROGRESS(p2m_pool): ++ ret = p2m_teardown_allocation(d); ++ if( ret ) ++ return ret; ++ + PROGRESS(done): + break; + +diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c +index d02bacbcd1ed..8aec3755ca5d 100644 +--- a/xen/arch/arm/domain_build.c ++++ b/xen/arch/arm/domain_build.c +@@ -2833,6 +2833,21 @@ static void __init find_gnttab_region(struct domain *d, + kinfo->gnttab_start, kinfo->gnttab_start + kinfo->gnttab_size); + } + ++static unsigned long __init domain_p2m_pages(unsigned long maxmem_kb, ++ unsigned int smp_cpus) ++{ ++ /* ++ * Keep in sync with libxl__get_required_paging_memory(). ++ * 256 pages (1MB) per vcpu, plus 1 page per MiB of RAM for the P2M map, ++ * plus 128 pages to cover extended regions. 
++ */ ++ unsigned long memkb = 4 * (256 * smp_cpus + (maxmem_kb / 1024) + 128); ++ ++ BUILD_BUG_ON(PAGE_SIZE != SZ_4K); ++ ++ return DIV_ROUND_UP(memkb, 1024) << (20 - PAGE_SHIFT); ++} ++ + static int __init construct_domain(struct domain *d, struct kernel_info *kinfo) + { + unsigned int i; +@@ -2924,6 +2939,8 @@ static int __init construct_domU(struct domain *d, + struct kernel_info kinfo = {}; + int rc; + u64 mem; ++ u32 p2m_mem_mb; ++ unsigned long p2m_pages; + + rc = dt_property_read_u64(node, "memory", &mem); + if ( !rc ) +@@ -2933,6 +2950,18 @@ static int __init construct_domU(struct domain *d, + } + kinfo.unassigned_mem = (paddr_t)mem * SZ_1K; + ++ rc = dt_property_read_u32(node, "xen,domain-p2m-mem-mb", &p2m_mem_mb); ++ /* If xen,domain-p2m-mem-mb is not specified, use the default value. */ ++ p2m_pages = rc ? ++ p2m_mem_mb << (20 - PAGE_SHIFT) : ++ domain_p2m_pages(mem, d->max_vcpus); ++ ++ spin_lock(&d->arch.paging.lock); ++ rc = p2m_set_allocation(d, p2m_pages, NULL); ++ spin_unlock(&d->arch.paging.lock); ++ if ( rc != 0 ) ++ return rc; ++ + printk("*** LOADING DOMU cpus=%u memory=%"PRIx64"KB ***\n", d->max_vcpus, mem); + + kinfo.vpl011 = dt_property_read_bool(node, "vpl011"); +diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c +index 9bf72e693019..c8fdeb124084 100644 +--- a/xen/arch/arm/domctl.c ++++ b/xen/arch/arm/domctl.c +@@ -50,6 +50,9 @@ static int handle_vuart_init(struct domain *d, + static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc, + XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) + { ++ long rc; ++ bool preempted = false; ++ + if ( unlikely(d == current->domain) ) + { + printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n"); +@@ -66,9 +69,27 @@ static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc, + switch ( sc->op ) + { + case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION: +- return 0; ++ { ++ /* Allow and handle preemption */ ++ spin_lock(&d->arch.paging.lock); ++ rc = p2m_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted); ++ spin_unlock(&d->arch.paging.lock); ++ ++ if ( preempted ) ++ /* Not finished. Set up to re-run the call. */ ++ rc = hypercall_create_continuation(__HYPERVISOR_domctl, "h", ++ u_domctl); ++ else ++ /* Finished. Return the new allocation. */ ++ sc->mb = p2m_get_allocation(d); ++ ++ return rc; ++ } + case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION: ++ { ++ sc->mb = p2m_get_allocation(d); + return 0; ++ } + default: + { + printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op); +diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c +index d8957dd8727c..b2d856a801af 100644 +--- a/xen/arch/arm/p2m.c ++++ b/xen/arch/arm/p2m.c +@@ -50,6 +50,54 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn) + return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48)); + } + ++static struct page_info *p2m_alloc_page(struct domain *d) ++{ ++ struct page_info *pg; ++ ++ spin_lock(&d->arch.paging.lock); ++ /* ++ * For hardware domain, there should be no limit in the number of pages that ++ * can be allocated, so that the kernel may take advantage of the extended ++ * regions. Hence, allocate p2m pages for hardware domains from heap. 
++ */ ++ if ( is_hardware_domain(d) ) ++ { ++ pg = alloc_domheap_page(NULL, 0); ++ if ( pg == NULL ) ++ { ++ printk(XENLOG_G_ERR "Failed to allocate P2M pages for hwdom.\n"); ++ spin_unlock(&d->arch.paging.lock); ++ return NULL; ++ } ++ } ++ else ++ { ++ pg = page_list_remove_head(&d->arch.paging.p2m_freelist); ++ if ( unlikely(!pg) ) ++ { ++ spin_unlock(&d->arch.paging.lock); ++ return NULL; ++ } ++ d->arch.paging.p2m_total_pages--; ++ } ++ spin_unlock(&d->arch.paging.lock); ++ ++ return pg; ++} ++ ++static void p2m_free_page(struct domain *d, struct page_info *pg) ++{ ++ spin_lock(&d->arch.paging.lock); ++ if ( is_hardware_domain(d) ) ++ free_domheap_page(pg); ++ else ++ { ++ d->arch.paging.p2m_total_pages++; ++ page_list_add_tail(pg, &d->arch.paging.p2m_freelist); ++ } ++ spin_unlock(&d->arch.paging.lock); ++} ++ + /* Return the size of the pool, rounded up to the nearest MB */ + unsigned int p2m_get_allocation(struct domain *d) + { +@@ -751,7 +799,7 @@ static int p2m_create_table(struct p2m_domain *p2m, lpae_t *entry) + + ASSERT(!p2m_is_valid(*entry)); + +- page = alloc_domheap_page(NULL, 0); ++ page = p2m_alloc_page(p2m->domain); + if ( page == NULL ) + return -ENOMEM; + +@@ -878,7 +926,7 @@ static void p2m_free_entry(struct p2m_domain *p2m, + pg = mfn_to_page(mfn); + + page_list_del(pg, &p2m->pages); +- free_domheap_page(pg); ++ p2m_free_page(p2m->domain, pg); + } + + static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry, +@@ -902,7 +950,7 @@ static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry, + ASSERT(level < target); + ASSERT(p2m_is_superpage(*entry, level)); + +- page = alloc_domheap_page(NULL, 0); ++ page = p2m_alloc_page(p2m->domain); + if ( !page ) + return false; + +@@ -1641,7 +1689,7 @@ int p2m_teardown(struct domain *d) + + while ( (pg = page_list_remove_head(&p2m->pages)) ) + { +- free_domheap_page(pg); ++ p2m_free_page(p2m->domain, pg); + count++; + /* Arbitrarily preempt every 512 iterations */ + if ( !(count % 512) && hypercall_preempt_check() ) +@@ -1665,6 +1713,7 @@ void p2m_final_teardown(struct domain *d) + return; + + ASSERT(page_list_empty(&p2m->pages)); ++ ASSERT(page_list_empty(&d->arch.paging.p2m_freelist)); + + if ( p2m->root ) + free_domheap_pages(p2m->root, P2M_ROOT_ORDER); +-- +2.37.3 + diff --git a/0016-gnttab-correct-locking-on-transitive-grant-copy-erro.patch b/0016-gnttab-correct-locking-on-transitive-grant-copy-erro.patch new file mode 100644 index 0000000..6283d47 --- /dev/null +++ b/0016-gnttab-correct-locking-on-transitive-grant-copy-erro.patch @@ -0,0 +1,66 @@ +From 32cb81501c8b858fe9a451650804ec3024a8b364 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 11 Oct 2022 14:56:29 +0200 +Subject: [PATCH 16/26] gnttab: correct locking on transitive grant copy error + path + +While the comment next to the lock dropping in preparation of +recursively calling acquire_grant_for_copy() mistakenly talks about the +rd == td case (excluded a few lines further up), the same concerns apply +to the calling of release_grant_for_copy() on a subsequent error path. + +This is CVE-2022-33748 / XSA-411. 
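+
+In sketch form (editor's illustration condensing the hunk below), the
+error path now follows the same drop/re-acquire discipline as the
+acquire side:
+
+    active_entry_release(act);
+    grant_read_unlock(rgt);
+
+    /* Only now is it safe to call code that may lock another table. */
+    release_grant_for_copy(td, trans_gref, readonly);
+
+    grant_read_lock(rgt);
+    act = active_entry_acquire(rgt, gref);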
+ +Fixes: ad48fb963dbf ("gnttab: fix transitive grant handling") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +master commit: 6e3aab858eef614a21a782a3b73acc88e74690ea +master date: 2022-10-11 14:29:30 +0200 +--- + xen/common/grant_table.c | 19 ++++++++++++++++--- + 1 file changed, 16 insertions(+), 3 deletions(-) + +diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c +index 4c742cd8fe81..d8ca645b96ff 100644 +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -2613,9 +2613,8 @@ acquire_grant_for_copy( + trans_domid); + + /* +- * acquire_grant_for_copy() could take the lock on the +- * remote table (if rd == td), so we have to drop the lock +- * here and reacquire. ++ * acquire_grant_for_copy() will take the lock on the remote table, ++ * so we have to drop the lock here and reacquire. + */ + active_entry_release(act); + grant_read_unlock(rgt); +@@ -2652,11 +2651,25 @@ acquire_grant_for_copy( + act->trans_gref != trans_gref || + !act->is_sub_page)) ) + { ++ /* ++ * Like above for acquire_grant_for_copy() we need to drop and then ++ * re-acquire the locks here to prevent lock order inversion issues. ++ * Unlike for acquire_grant_for_copy() we don't need to re-check ++ * anything, as release_grant_for_copy() doesn't depend on the grant ++ * table entry: It only updates internal state and the status flags. ++ */ ++ active_entry_release(act); ++ grant_read_unlock(rgt); ++ + release_grant_for_copy(td, trans_gref, readonly); + rcu_unlock_domain(td); ++ ++ grant_read_lock(rgt); ++ act = active_entry_acquire(rgt, gref); + reduce_status_for_pin(rd, act, status, readonly); + active_entry_release(act); + grant_read_unlock(rgt); ++ + put_page(*page); + *page = NULL; + return ERESTART; +-- +2.37.3 + diff --git a/0016-linker-lld-do-not-generate-quoted-section-names.patch b/0016-linker-lld-do-not-generate-quoted-section-names.patch deleted file mode 100644 index 5b3a8cd..0000000 --- a/0016-linker-lld-do-not-generate-quoted-section-names.patch +++ /dev/null @@ -1,54 +0,0 @@ -From 88b653f73928117461dc250acd1e830a47a14c2b Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com> -Date: Tue, 7 Jun 2022 14:05:24 +0200 -Subject: [PATCH 16/51] linker/lld: do not generate quoted section names -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -LLVM LD doesn't strip the quotes from the section names, and so the -resulting binary ends up with section names like: - - [ 1] ".text" PROGBITS ffff82d040200000 00008000 - 000000000018cbc1 0000000000000000 AX 0 0 4096 - -This confuses some tools (like gdb) and prevents proper parsing of the -binary. - -The issue has already been reported and is being fixed in LLD. In -order to workaround this issue and keep the GNU ld support define -different DECL_SECTION macros depending on the used ld -implementation. - -Drop the quotes from the definitions of the debug sections in -DECL_DEBUG{2}, as those quotes are not required for GNU ld either. 
-
-Fixes: 6254920587c3 ('x86: quote section names when defining them in linker script')
-Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-master commit: 702c9a800eb3ecd4b8595998d37a769d470c5bb0
-master date: 2022-05-02 08:51:45 +0200
----
- xen/arch/x86/xen.lds.S | 6 +++++-
- 1 file changed, 5 insertions(+), 1 deletion(-)
-
-diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
-index 4c58f3209c3d..bc9b9651b192 100644
---- a/xen/arch/x86/xen.lds.S
-+++ b/xen/arch/x86/xen.lds.S
-@@ -18,7 +18,11 @@ ENTRY(efi_start)
- #else /* !EFI */
-
- #define FORMAT "elf64-x86-64"
---#define DECL_SECTION(x) #x : AT(ADDR(#x) - __XEN_VIRT_START)
-+#ifdef CONFIG_LD_IS_GNU
-+# define DECL_SECTION(x) x : AT(ADDR(#x) - __XEN_VIRT_START)
-+#else
-+# define DECL_SECTION(x) x : AT(ADDR(x) - __XEN_VIRT_START)
-+#endif
-
- ENTRY(start_pa)
-
---
-2.35.1
-
diff --git a/0017-tools-libxl-Replace-deprecated-soundhw-on-QEMU-comma.patch b/0017-tools-libxl-Replace-deprecated-soundhw-on-QEMU-comma.patch
new file mode 100644
index 0000000..ffbc311
--- /dev/null
+++ b/0017-tools-libxl-Replace-deprecated-soundhw-on-QEMU-comma.patch
@@ -0,0 +1,112 @@
+From e85e2a3c17b6cd38de041cdaf14d9efdcdabad1a Mon Sep 17 00:00:00 2001
+From: Anthony PERARD <anthony.perard@citrix.com>
+Date: Tue, 11 Oct 2022 14:59:10 +0200
+Subject: [PATCH 17/26] tools/libxl: Replace deprecated -soundhw on QEMU
+ command line
+
+-soundhw is deprecated since 825ff02911c9 ("audio: add soundhw
+deprecation notice"), QEMU v5.1, and has been removed for the upcoming
+v7.1 by 039a68373c45 ("introduce -audio as a replacement for -soundhw").
+
+Instead we can just add the sound card with "-device", for most options
+that "-soundhw" could handle. "-device" is an option that existed
+before QEMU 1.0, and could already be used to add audio hardware.
+
+The list of possible options for libxl's "soundhw" is taken from
+QEMU 7.0.
+
+The options for "soundhw" are listed in order of preference in
+the manual. The first three (hda, ac97, es1370) are PCI devices and
+easy to test on Linux, and the last four are ISA devices which don't
+seem to work out of the box on Linux.
+
+The sound card 'pcspk' isn't listed even if it used to be accepted by
+'-soundhw', because QEMU crashes when trying to add it to a Xen domain.
+Also, it wouldn't work with "-device"; it might need to be "-machine
+pcspk-audiodev=default" instead.
+
+Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
+Reviewed-by: Jason Andryuk <jandryuk@gmail.com>
+master commit: 62ca138c2c052187783aca3957d3f47c4dcfd683
+master date: 2022-08-18 09:25:50 +0200
+---
+ docs/man/xl.cfg.5.pod.in | 6 +++---
+ tools/libs/light/libxl_dm.c | 19 ++++++++++++++++++-
+ tools/libs/light/libxl_types_internal.idl | 10 ++++++++++
+ 3 files changed, 31 insertions(+), 4 deletions(-)
+
+diff --git a/docs/man/xl.cfg.5.pod.in b/docs/man/xl.cfg.5.pod.in
+index eda1e77ebd06..ab7541f22c3e 100644
+--- a/docs/man/xl.cfg.5.pod.in
++++ b/docs/man/xl.cfg.5.pod.in
+@@ -2545,9 +2545,9 @@ The form serial=DEVICE is also accepted for backwards compatibility.
+
+ =item B<soundhw="DEVICE">
+
+-Select the virtual sound card to expose to the guest. The valid
+-devices are defined by the device model configuration, please see the
+-B<qemu(1)> manpage for details. The default is not to export any sound
++Select the virtual sound card to expose to the guest. The valid devices are
++B<hda>, B<ac97>, B<es1370>, B<adlib>, B<cs4231a>, B<gus>, B<sb16> if they are
++available with the device model QEMU. The default is not to export any sound
+ device.
+
+ =item B<vkb_device=BOOLEAN>
+diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c
+index 04bf5d85632e..fc264a3a13a6 100644
+--- a/tools/libs/light/libxl_dm.c
++++ b/tools/libs/light/libxl_dm.c
+@@ -1204,6 +1204,7 @@ static int libxl__build_device_model_args_new(libxl__gc *gc,
+     uint64_t ram_size;
+     const char *path, *chardev;
+     bool is_stubdom = libxl_defbool_val(b_info->device_model_stubdomain);
++    int rc;
+
+     dm_args = flexarray_make(gc, 16, 1);
+     dm_envs = flexarray_make(gc, 16, 1);
+@@ -1531,7 +1532,23 @@ static int libxl__build_device_model_args_new(libxl__gc *gc,
+         }
+     }
+     if (b_info->u.hvm.soundhw) {
+-        flexarray_vappend(dm_args, "-soundhw", b_info->u.hvm.soundhw, NULL);
++        libxl__qemu_soundhw soundhw;
++
++        rc = libxl__qemu_soundhw_from_string(b_info->u.hvm.soundhw, &soundhw);
++        if (rc) {
++            LOGD(ERROR, guest_domid, "Unknown soundhw option '%s'", b_info->u.hvm.soundhw);
++            return ERROR_INVAL;
++        }
++
++        switch (soundhw) {
++        case LIBXL__QEMU_SOUNDHW_HDA:
++            flexarray_vappend(dm_args, "-device", "intel-hda",
++                              "-device", "hda-duplex", NULL);
++            break;
++        default:
++            flexarray_append_pair(dm_args, "-device",
++                                  (char*)libxl__qemu_soundhw_to_string(soundhw));
++        }
+     }
+     if (!libxl__acpi_defbool_val(b_info)) {
+         flexarray_append(dm_args, "-no-acpi");
+diff --git a/tools/libs/light/libxl_types_internal.idl b/tools/libs/light/libxl_types_internal.idl
+index 3593e21dbb64..caa08d3229cd 100644
+--- a/tools/libs/light/libxl_types_internal.idl
++++ b/tools/libs/light/libxl_types_internal.idl
+@@ -55,3 +55,13 @@ libxl__device_action = Enumeration("device_action", [
+     (1, "ADD"),
+     (2, "REMOVE"),
+     ])
++
++libxl__qemu_soundhw = Enumeration("qemu_soundhw", [
++    (1, "ac97"),
++    (2, "adlib"),
++    (3, "cs4231a"),
++    (4, "es1370"),
++    (5, "gus"),
++    (6, "hda"),
++    (7, "sb16"),
++    ])
+--
+2.37.3
+
diff --git a/0017-xen-io-Fix-race-between-sending-an-I-O-and-domain-sh.patch b/0017-xen-io-Fix-race-between-sending-an-I-O-and-domain-sh.patch
deleted file mode 100644
index bc48a84..0000000
--- a/0017-xen-io-Fix-race-between-sending-an-I-O-and-domain-sh.patch
+++ /dev/null
@@ -1,142 +0,0 @@
-From 982a314bd3000a16c3128afadb36a8ff41029adc Mon Sep 17 00:00:00 2001
-From: Julien Grall <jgrall@amazon.com>
-Date: Tue, 7 Jun 2022 14:06:11 +0200
-Subject: [PATCH 17/51] xen: io: Fix race between sending an I/O and domain
- shutdown
-
-Xen provides hypercalls to shutdown (SCHEDOP_shutdown{,_code}) and
-resume a domain (XEN_DOMCTL_resumedomain). They can be used for
-checkpointing, where the expectation is that the domain continues
-afterwards as if nothing had happened.
-
-hvmemul_do_io() and handle_pio() will act differently if the return
-code of hvm_send_ioreq() (resp. hvmemul_do_pio_buffer()) is X86EMUL_RETRY.
-
-In this case, the I/O state will be reset to STATE_IOREQ_NONE (i.e
-no I/O is pending) and/or the PC will not be advanced.
-
-If the shutdown request happens right after the I/O was sent to the
-IOREQ server, then the emulation code will re-execute the instruction
-and therefore forward the same I/O again (at least when reading an IO port).
-
-This would be a problem if the access has a side effect. A dumb example
-is a device implementing a counter which is incremented by one on every
-access. When running shutdown/resume in a loop, the value read by the
-OS may not be the old value + 1.
- -Add an extra boolean in the structure hvm_vcpu_io to indicate whether -the I/O was suspended. This is then used in place of checking the domain -is shutting down in hvmemul_do_io() and handle_pio() as they should -act on suspend (i.e. vcpu_start_shutdown_deferral() returns false) rather -than shutdown. - -Signed-off-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Paul Durrant <paul@xen.org> -master commit: b7e0d8978810b534725e94a321736496928f00a5 -master date: 2022-05-06 17:16:22 +0100 ---- - xen/arch/arm/ioreq.c | 3 ++- - xen/arch/x86/hvm/emulate.c | 3 ++- - xen/arch/x86/hvm/io.c | 7 ++++--- - xen/common/ioreq.c | 4 ++++ - xen/include/xen/sched.h | 5 +++++ - 5 files changed, 17 insertions(+), 5 deletions(-) - -diff --git a/xen/arch/arm/ioreq.c b/xen/arch/arm/ioreq.c -index 308650b40051..fbccef212bf1 100644 ---- a/xen/arch/arm/ioreq.c -+++ b/xen/arch/arm/ioreq.c -@@ -80,9 +80,10 @@ enum io_state try_fwd_ioserv(struct cpu_user_regs *regs, - return IO_ABORT; - - vio->req = p; -+ vio->suspended = false; - - rc = ioreq_send(s, &p, 0); -- if ( rc != IO_RETRY || v->domain->is_shutting_down ) -+ if ( rc != IO_RETRY || vio->suspended ) - vio->req.state = STATE_IOREQ_NONE; - else if ( !ioreq_needs_completion(&vio->req) ) - rc = IO_HANDLED; -diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c -index 76a2ccfafe23..7da348b5d486 100644 ---- a/xen/arch/x86/hvm/emulate.c -+++ b/xen/arch/x86/hvm/emulate.c -@@ -239,6 +239,7 @@ static int hvmemul_do_io( - ASSERT(p.count); - - vio->req = p; -+ vio->suspended = false; - - rc = hvm_io_intercept(&p); - -@@ -334,7 +335,7 @@ static int hvmemul_do_io( - else - { - rc = ioreq_send(s, &p, 0); -- if ( rc != X86EMUL_RETRY || currd->is_shutting_down ) -+ if ( rc != X86EMUL_RETRY || vio->suspended ) - vio->req.state = STATE_IOREQ_NONE; - else if ( !ioreq_needs_completion(&vio->req) ) - rc = X86EMUL_OKAY; -diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c -index 93f1d1503fa6..80915f27e488 100644 ---- a/xen/arch/x86/hvm/io.c -+++ b/xen/arch/x86/hvm/io.c -@@ -138,10 +138,11 @@ bool handle_pio(uint16_t port, unsigned int size, int dir) - - case X86EMUL_RETRY: - /* -- * We should not advance RIP/EIP if the domain is shutting down or -- * if X86EMUL_RETRY has been returned by an internal handler. -+ * We should not advance RIP/EIP if the vio was suspended (e.g. -+ * because the domain is shutting down) or if X86EMUL_RETRY has -+ * been returned by an internal handler. 
- */ -- if ( curr->domain->is_shutting_down || !vcpu_ioreq_pending(curr) ) -+ if ( vio->suspended || !vcpu_ioreq_pending(curr) ) - return false; - break; - -diff --git a/xen/common/ioreq.c b/xen/common/ioreq.c -index d732dc045df9..42414b750bef 100644 ---- a/xen/common/ioreq.c -+++ b/xen/common/ioreq.c -@@ -1256,6 +1256,7 @@ int ioreq_send(struct ioreq_server *s, ioreq_t *proto_p, - struct vcpu *curr = current; - struct domain *d = curr->domain; - struct ioreq_vcpu *sv; -+ struct vcpu_io *vio = &curr->io; - - ASSERT(s); - -@@ -1263,7 +1264,10 @@ int ioreq_send(struct ioreq_server *s, ioreq_t *proto_p, - return ioreq_send_buffered(s, proto_p); - - if ( unlikely(!vcpu_start_shutdown_deferral(curr)) ) -+ { -+ vio->suspended = true; - return IOREQ_STATUS_RETRY; -+ } - - list_for_each_entry ( sv, - &s->ioreq_vcpu_list, -diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h -index 28146ee404e6..9671062360ac 100644 ---- a/xen/include/xen/sched.h -+++ b/xen/include/xen/sched.h -@@ -159,6 +159,11 @@ enum vio_completion { - struct vcpu_io { - /* I/O request in flight to device model. */ - enum vio_completion completion; -+ /* -+ * Indicate whether the I/O was not handled because the domain -+ * is about to be paused. -+ */ -+ bool suspended; - ioreq_t req; - }; - --- -2.35.1 - diff --git a/0018-build-suppress-GNU-ld-warning-about-RWX-load-segment.patch b/0018-build-suppress-GNU-ld-warning-about-RWX-load-segment.patch deleted file mode 100644 index b20a99a..0000000 --- a/0018-build-suppress-GNU-ld-warning-about-RWX-load-segment.patch +++ /dev/null @@ -1,35 +0,0 @@ -From 4890031d224262a6cf43d3bef1af4a16c13db306 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 7 Jun 2022 14:06:51 +0200 -Subject: [PATCH 18/51] build: suppress GNU ld warning about RWX load segments - -We cannot really avoid such and we're also not really at risk because of -them, as we control page table permissions ourselves rather than relying -on a loader of some sort. Present GNU ld master started warning about -such, and hence 2.39 is anticipated to have this warning. - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Julien Grall <jgrall@amazon.com> -master commit: 68f5aac012b9ae36ce9b65d9ca9cc9f232191ad3 -master date: 2022-05-18 11:17:19 +0200 ---- - xen/Makefile | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/xen/Makefile b/xen/Makefile -index ce4eca3ee4d7..4d9abe704628 100644 ---- a/xen/Makefile -+++ b/xen/Makefile -@@ -260,6 +260,8 @@ endif - - AFLAGS += -D__ASSEMBLY__ - -+LDFLAGS-$(call ld-option,--warn-rwx-segments) += --no-warn-rwx-segments -+ - CFLAGS += $(CFLAGS-y) - # allow extra CFLAGS externally via EXTRA_CFLAGS_XEN_CORE - CFLAGS += $(EXTRA_CFLAGS_XEN_CORE) --- -2.35.1 - diff --git a/0018-x86-CPUID-surface-suitable-value-in-EBX-of-XSTATE-su.patch b/0018-x86-CPUID-surface-suitable-value-in-EBX-of-XSTATE-su.patch new file mode 100644 index 0000000..d6ade98 --- /dev/null +++ b/0018-x86-CPUID-surface-suitable-value-in-EBX-of-XSTATE-su.patch @@ -0,0 +1,44 @@ +From e8882bcfe35520e950ba60acd6e67e65f1ce90a8 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 11 Oct 2022 14:59:26 +0200 +Subject: [PATCH 18/26] x86/CPUID: surface suitable value in EBX of XSTATE + subleaf 1 + +While the SDM isn't very clear about this, our present behavior make +Linux 5.19 unhappy. 
As of commit 8ad7e8f69695 ("x86/fpu/xsave: Support +XSAVEC in the kernel") they're using this CPUID output also to size +the compacted area used by XSAVEC. Getting back zero there isn't really +liked, yet for PV that's the default on capable hardware: XSAVES isn't +exposed to PV domains. + +Considering that the size reported is that of the compacted save area, +I view Linux'es assumption as appropriate (short of the SDM properly +considering the case). Therefore we need to populate the field also when +only XSAVEC is supported for a guest. + +Fixes: 460b9a4b3630 ("x86/xsaves: enable xsaves/xrstors for hvm guest") +Fixes: 8d050ed1097c ("x86: don't expose XSAVES capability to PV guests") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: c3bd0b83ea5b7c0da6542687436042eeea1e7909 +master date: 2022-08-24 14:23:59 +0200 +--- + xen/arch/x86/cpuid.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c +index ff335f16390d..a647331f4793 100644 +--- a/xen/arch/x86/cpuid.c ++++ b/xen/arch/x86/cpuid.c +@@ -1060,7 +1060,7 @@ void guest_cpuid(const struct vcpu *v, uint32_t leaf, + switch ( subleaf ) + { + case 1: +- if ( p->xstate.xsaves ) ++ if ( p->xstate.xsavec || p->xstate.xsaves ) + { + /* + * TODO: Figure out what to do for XSS state. VT-x manages +-- +2.37.3 + diff --git a/0019-build-silence-GNU-ld-warning-about-executable-stacks.patch b/0019-build-silence-GNU-ld-warning-about-executable-stacks.patch deleted file mode 100644 index e4d739b..0000000 --- a/0019-build-silence-GNU-ld-warning-about-executable-stacks.patch +++ /dev/null @@ -1,35 +0,0 @@ -From 1bc669a568a9f4bdab9e9ddb95823ba370dc0baf Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 7 Jun 2022 14:07:11 +0200 -Subject: [PATCH 19/51] build: silence GNU ld warning about executable stacks - -While for C files the compiler is supposed to arrange for emitting -respective information, for assembly sources we're responsible ourselves. -Present GNU ld master started warning about such, and hence 2.39 is -anticipated to have this warning. - -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> -Acked-by: Julien Grall <jgrall@amazon.com> -master commit: 62d22296a95d259c934ca2f39ac511d729cfbb68 -master date: 2022-05-18 11:18:45 +0200 ---- - xen/Makefile | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/xen/Makefile b/xen/Makefile -index 4d9abe704628..971028eda240 100644 ---- a/xen/Makefile -+++ b/xen/Makefile -@@ -260,6 +260,8 @@ endif - - AFLAGS += -D__ASSEMBLY__ - -+$(call cc-option-add,AFLAGS,CC,-Wa$(comma)--noexecstack) -+ - LDFLAGS-$(call ld-option,--warn-rwx-segments) += --no-warn-rwx-segments - - CFLAGS += $(CFLAGS-y) --- -2.35.1 - diff --git a/0019-xen-sched-introduce-cpupool_update_node_affinity.patch b/0019-xen-sched-introduce-cpupool_update_node_affinity.patch new file mode 100644 index 0000000..957d0fe --- /dev/null +++ b/0019-xen-sched-introduce-cpupool_update_node_affinity.patch @@ -0,0 +1,257 @@ +From d4e971ad12dd27913dffcf96b5de378ea7b476e1 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 11 Oct 2022 14:59:40 +0200 +Subject: [PATCH 19/26] xen/sched: introduce cpupool_update_node_affinity() + +For updating the node affinities of all domains in a cpupool add a new +function cpupool_update_node_affinity(). 
+ +In order to avoid multiple allocations of cpumasks carve out memory +allocation and freeing from domain_update_node_affinity() into new +helpers, which can be used by cpupool_update_node_affinity(). + +Modify domain_update_node_affinity() to take an additional parameter +for passing the allocated memory in and to allocate and free the memory +via the new helpers in case NULL was passed. + +This will help later to pre-allocate the cpumasks in order to avoid +allocations in stop-machine context. + +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +Tested-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: a83fa1e2b96ace65b45dde6954d67012633a082b +master date: 2022-09-05 11:42:30 +0100 +--- + xen/common/sched/core.c | 54 ++++++++++++++++++++++++++------------ + xen/common/sched/cpupool.c | 39 +++++++++++++++------------ + xen/common/sched/private.h | 7 +++++ + xen/include/xen/sched.h | 9 ++++++- + 4 files changed, 74 insertions(+), 35 deletions(-) + +diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c +index f07bd2681fcb..065a83eca912 100644 +--- a/xen/common/sched/core.c ++++ b/xen/common/sched/core.c +@@ -1824,9 +1824,28 @@ int vcpu_affinity_domctl(struct domain *d, uint32_t cmd, + return ret; + } + +-void domain_update_node_affinity(struct domain *d) ++bool alloc_affinity_masks(struct affinity_masks *affinity) + { +- cpumask_var_t dom_cpumask, dom_cpumask_soft; ++ if ( !alloc_cpumask_var(&affinity->hard) ) ++ return false; ++ if ( !alloc_cpumask_var(&affinity->soft) ) ++ { ++ free_cpumask_var(affinity->hard); ++ return false; ++ } ++ ++ return true; ++} ++ ++void free_affinity_masks(struct affinity_masks *affinity) ++{ ++ free_cpumask_var(affinity->soft); ++ free_cpumask_var(affinity->hard); ++} ++ ++void domain_update_node_aff(struct domain *d, struct affinity_masks *affinity) ++{ ++ struct affinity_masks masks; + cpumask_t *dom_affinity; + const cpumask_t *online; + struct sched_unit *unit; +@@ -1836,14 +1855,16 @@ void domain_update_node_affinity(struct domain *d) + if ( !d->vcpu || !d->vcpu[0] ) + return; + +- if ( !zalloc_cpumask_var(&dom_cpumask) ) +- return; +- if ( !zalloc_cpumask_var(&dom_cpumask_soft) ) ++ if ( !affinity ) + { +- free_cpumask_var(dom_cpumask); +- return; ++ affinity = &masks; ++ if ( !alloc_affinity_masks(affinity) ) ++ return; + } + ++ cpumask_clear(affinity->hard); ++ cpumask_clear(affinity->soft); ++ + online = cpupool_domain_master_cpumask(d); + + spin_lock(&d->node_affinity_lock); +@@ -1864,22 +1885,21 @@ void domain_update_node_affinity(struct domain *d) + */ + for_each_sched_unit ( d, unit ) + { +- cpumask_or(dom_cpumask, dom_cpumask, unit->cpu_hard_affinity); +- cpumask_or(dom_cpumask_soft, dom_cpumask_soft, +- unit->cpu_soft_affinity); ++ cpumask_or(affinity->hard, affinity->hard, unit->cpu_hard_affinity); ++ cpumask_or(affinity->soft, affinity->soft, unit->cpu_soft_affinity); + } + /* Filter out non-online cpus */ +- cpumask_and(dom_cpumask, dom_cpumask, online); +- ASSERT(!cpumask_empty(dom_cpumask)); ++ cpumask_and(affinity->hard, affinity->hard, online); ++ ASSERT(!cpumask_empty(affinity->hard)); + /* And compute the intersection between hard, online and soft */ +- cpumask_and(dom_cpumask_soft, dom_cpumask_soft, dom_cpumask); ++ cpumask_and(affinity->soft, affinity->soft, affinity->hard); + + /* + * If not empty, the intersection of hard, soft and online is the + * narrowest set we want. If empty, we fall back to hard&online. 
+ */ +- dom_affinity = cpumask_empty(dom_cpumask_soft) ? +- dom_cpumask : dom_cpumask_soft; ++ dom_affinity = cpumask_empty(affinity->soft) ? affinity->hard ++ : affinity->soft; + + nodes_clear(d->node_affinity); + for_each_cpu ( cpu, dom_affinity ) +@@ -1888,8 +1908,8 @@ void domain_update_node_affinity(struct domain *d) + + spin_unlock(&d->node_affinity_lock); + +- free_cpumask_var(dom_cpumask_soft); +- free_cpumask_var(dom_cpumask); ++ if ( affinity == &masks ) ++ free_affinity_masks(affinity); + } + + typedef long ret_t; +diff --git a/xen/common/sched/cpupool.c b/xen/common/sched/cpupool.c +index 8c6e6eb9ccd5..45b6ff99561a 100644 +--- a/xen/common/sched/cpupool.c ++++ b/xen/common/sched/cpupool.c +@@ -401,6 +401,25 @@ int cpupool_move_domain(struct domain *d, struct cpupool *c) + return ret; + } + ++/* Update affinities of all domains in a cpupool. */ ++static void cpupool_update_node_affinity(const struct cpupool *c) ++{ ++ struct affinity_masks masks; ++ struct domain *d; ++ ++ if ( !alloc_affinity_masks(&masks) ) ++ return; ++ ++ rcu_read_lock(&domlist_read_lock); ++ ++ for_each_domain_in_cpupool(d, c) ++ domain_update_node_aff(d, &masks); ++ ++ rcu_read_unlock(&domlist_read_lock); ++ ++ free_affinity_masks(&masks); ++} ++ + /* + * assign a specific cpu to a cpupool + * cpupool_lock must be held +@@ -408,7 +427,6 @@ int cpupool_move_domain(struct domain *d, struct cpupool *c) + static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu) + { + int ret; +- struct domain *d; + const cpumask_t *cpus; + + cpus = sched_get_opt_cpumask(c->gran, cpu); +@@ -433,12 +451,7 @@ static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu) + + rcu_read_unlock(&sched_res_rculock); + +- rcu_read_lock(&domlist_read_lock); +- for_each_domain_in_cpupool(d, c) +- { +- domain_update_node_affinity(d); +- } +- rcu_read_unlock(&domlist_read_lock); ++ cpupool_update_node_affinity(c); + + return 0; + } +@@ -447,18 +460,14 @@ static int cpupool_unassign_cpu_finish(struct cpupool *c) + { + int cpu = cpupool_moving_cpu; + const cpumask_t *cpus; +- struct domain *d; + int ret; + + if ( c != cpupool_cpu_moving ) + return -EADDRNOTAVAIL; + +- /* +- * We need this for scanning the domain list, both in +- * cpu_disable_scheduler(), and at the bottom of this function. 
+- */ + rcu_read_lock(&domlist_read_lock); + ret = cpu_disable_scheduler(cpu); ++ rcu_read_unlock(&domlist_read_lock); + + rcu_read_lock(&sched_res_rculock); + cpus = get_sched_res(cpu)->cpus; +@@ -485,11 +494,7 @@ static int cpupool_unassign_cpu_finish(struct cpupool *c) + } + rcu_read_unlock(&sched_res_rculock); + +- for_each_domain_in_cpupool(d, c) +- { +- domain_update_node_affinity(d); +- } +- rcu_read_unlock(&domlist_read_lock); ++ cpupool_update_node_affinity(c); + + return ret; + } +diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h +index a870320146ef..2b04b01a0c0a 100644 +--- a/xen/common/sched/private.h ++++ b/xen/common/sched/private.h +@@ -593,6 +593,13 @@ affinity_balance_cpumask(const struct sched_unit *unit, int step, + cpumask_copy(mask, unit->cpu_hard_affinity); + } + ++struct affinity_masks { ++ cpumask_var_t hard; ++ cpumask_var_t soft; ++}; ++ ++bool alloc_affinity_masks(struct affinity_masks *affinity); ++void free_affinity_masks(struct affinity_masks *affinity); + void sched_rm_cpu(unsigned int cpu); + const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu); + void schedule_dump(struct cpupool *c); +diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h +index 9671062360ac..3f4225738a40 100644 +--- a/xen/include/xen/sched.h ++++ b/xen/include/xen/sched.h +@@ -655,8 +655,15 @@ static inline void get_knownalive_domain(struct domain *d) + ASSERT(!(atomic_read(&d->refcnt) & DOMAIN_DESTROYED)); + } + ++struct affinity_masks; ++ + int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity); +-void domain_update_node_affinity(struct domain *d); ++void domain_update_node_aff(struct domain *d, struct affinity_masks *affinity); ++ ++static inline void domain_update_node_affinity(struct domain *d) ++{ ++ domain_update_node_aff(d, NULL); ++} + + /* + * To be implemented by each architecture, sanity checking the configuration +-- +2.37.3 + diff --git a/0020-ns16550-use-poll-mode-if-INTERRUPT_LINE-is-0xff.patch b/0020-ns16550-use-poll-mode-if-INTERRUPT_LINE-is-0xff.patch deleted file mode 100644 index baa1e15..0000000 --- a/0020-ns16550-use-poll-mode-if-INTERRUPT_LINE-is-0xff.patch +++ /dev/null @@ -1,50 +0,0 @@ -From f1be0b62a03b90a40a03e21f965e4cbb89809bb1 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= - <marmarek@invisiblethingslab.com> -Date: Tue, 7 Jun 2022 14:07:34 +0200 -Subject: [PATCH 20/51] ns16550: use poll mode if INTERRUPT_LINE is 0xff -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Intel LPSS has INTERRUPT_LINE set to 0xff by default, that is declared -by the PCI Local Bus Specification Revision 3.0 (from 2004) as -"unknown"/"no connection". Fallback to poll mode in this case. -The 0xff handling is x86-specific, the surrounding code is guarded with -CONFIG_X86 anyway. 
- -Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -master commit: 6a2ea1a2370a0c8a0210accac0ae62e68c185134 -master date: 2022-05-20 12:19:45 +0200 ---- - xen/drivers/char/ns16550.c | 13 +++++++++++++ - 1 file changed, 13 insertions(+) - -diff --git a/xen/drivers/char/ns16550.c b/xen/drivers/char/ns16550.c -index 30596d60d4ed..2d2bd2a02469 100644 ---- a/xen/drivers/char/ns16550.c -+++ b/xen/drivers/char/ns16550.c -@@ -1221,6 +1221,19 @@ pci_uart_config(struct ns16550 *uart, bool_t skip_amt, unsigned int idx) - pci_conf_read8(PCI_SBDF(0, b, d, f), - PCI_INTERRUPT_LINE) : 0; - -+#ifdef CONFIG_X86 -+ /* -+ * PCI Local Bus Specification Revision 3.0 defines 0xff value -+ * as special only for X86. -+ */ -+ if ( uart->irq == 0xff ) -+ uart->irq = 0; -+#endif -+ if ( !uart->irq ) -+ printk(XENLOG_INFO -+ "ns16550: %pp: no legacy IRQ, using poll mode\n", -+ &PCI_SBDF(0, b, d, f)); -+ - return 0; - } - } --- -2.35.1 - diff --git a/0020-xen-sched-carve-out-memory-allocation-and-freeing-fr.patch b/0020-xen-sched-carve-out-memory-allocation-and-freeing-fr.patch new file mode 100644 index 0000000..30784c3 --- /dev/null +++ b/0020-xen-sched-carve-out-memory-allocation-and-freeing-fr.patch @@ -0,0 +1,263 @@ +From c377ceab0a007690a1e71c81a5232613c99e944d Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 11 Oct 2022 15:00:05 +0200 +Subject: [PATCH 20/26] xen/sched: carve out memory allocation and freeing from + schedule_cpu_rm() + +In order to prepare not allocating or freeing memory from +schedule_cpu_rm(), move this functionality to dedicated functions. + +For now call those functions from schedule_cpu_rm(). + +No change of behavior expected. + +Signed-off-by: Juergen Gross <jgross@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: d42be6f83480b3ada286dc18444331a816be88a3 +master date: 2022-09-05 11:42:30 +0100 +--- + xen/common/sched/core.c | 143 ++++++++++++++++++++++--------------- + xen/common/sched/private.h | 11 +++ + 2 files changed, 98 insertions(+), 56 deletions(-) + +diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c +index 065a83eca912..2decb1161a63 100644 +--- a/xen/common/sched/core.c ++++ b/xen/common/sched/core.c +@@ -3221,6 +3221,75 @@ out: + return ret; + } + ++/* ++ * Allocate all memory needed for free_cpu_rm_data(), as allocations cannot ++ * be made in stop_machine() context. ++ * ++ * Between alloc_cpu_rm_data() and the real cpu removal action the relevant ++ * contents of struct sched_resource can't change, as the cpu in question is ++ * locked against any other movement to or from cpupools, and the data copied ++ * by alloc_cpu_rm_data() is modified only in case the cpu in question is ++ * being moved from or to a cpupool. 
++ */ ++struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu) ++{ ++ struct cpu_rm_data *data; ++ const struct sched_resource *sr; ++ unsigned int idx; ++ ++ rcu_read_lock(&sched_res_rculock); ++ ++ sr = get_sched_res(cpu); ++ data = xmalloc_flex_struct(struct cpu_rm_data, sr, sr->granularity - 1); ++ if ( !data ) ++ goto out; ++ ++ data->old_ops = sr->scheduler; ++ data->vpriv_old = idle_vcpu[cpu]->sched_unit->priv; ++ data->ppriv_old = sr->sched_priv; ++ ++ for ( idx = 0; idx < sr->granularity - 1; idx++ ) ++ { ++ data->sr[idx] = sched_alloc_res(); ++ if ( data->sr[idx] ) ++ { ++ data->sr[idx]->sched_unit_idle = sched_alloc_unit_mem(); ++ if ( !data->sr[idx]->sched_unit_idle ) ++ { ++ sched_res_free(&data->sr[idx]->rcu); ++ data->sr[idx] = NULL; ++ } ++ } ++ if ( !data->sr[idx] ) ++ { ++ while ( idx > 0 ) ++ sched_res_free(&data->sr[--idx]->rcu); ++ XFREE(data); ++ goto out; ++ } ++ ++ data->sr[idx]->curr = data->sr[idx]->sched_unit_idle; ++ data->sr[idx]->scheduler = &sched_idle_ops; ++ data->sr[idx]->granularity = 1; ++ ++ /* We want the lock not to change when replacing the resource. */ ++ data->sr[idx]->schedule_lock = sr->schedule_lock; ++ } ++ ++ out: ++ rcu_read_unlock(&sched_res_rculock); ++ ++ return data; ++} ++ ++void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu) ++{ ++ sched_free_udata(mem->old_ops, mem->vpriv_old); ++ sched_free_pdata(mem->old_ops, mem->ppriv_old, cpu); ++ ++ xfree(mem); ++} ++ + /* + * Remove a pCPU from its cpupool. Its scheduler becomes &sched_idle_ops + * (the idle scheduler). +@@ -3229,53 +3298,23 @@ out: + */ + int schedule_cpu_rm(unsigned int cpu) + { +- void *ppriv_old, *vpriv_old; +- struct sched_resource *sr, **sr_new = NULL; ++ struct sched_resource *sr; ++ struct cpu_rm_data *data; + struct sched_unit *unit; +- struct scheduler *old_ops; + spinlock_t *old_lock; + unsigned long flags; +- int idx, ret = -ENOMEM; ++ int idx = 0; + unsigned int cpu_iter; + ++ data = alloc_cpu_rm_data(cpu); ++ if ( !data ) ++ return -ENOMEM; ++ + rcu_read_lock(&sched_res_rculock); + + sr = get_sched_res(cpu); +- old_ops = sr->scheduler; + +- if ( sr->granularity > 1 ) +- { +- sr_new = xmalloc_array(struct sched_resource *, sr->granularity - 1); +- if ( !sr_new ) +- goto out; +- for ( idx = 0; idx < sr->granularity - 1; idx++ ) +- { +- sr_new[idx] = sched_alloc_res(); +- if ( sr_new[idx] ) +- { +- sr_new[idx]->sched_unit_idle = sched_alloc_unit_mem(); +- if ( !sr_new[idx]->sched_unit_idle ) +- { +- sched_res_free(&sr_new[idx]->rcu); +- sr_new[idx] = NULL; +- } +- } +- if ( !sr_new[idx] ) +- { +- for ( idx--; idx >= 0; idx-- ) +- sched_res_free(&sr_new[idx]->rcu); +- goto out; +- } +- sr_new[idx]->curr = sr_new[idx]->sched_unit_idle; +- sr_new[idx]->scheduler = &sched_idle_ops; +- sr_new[idx]->granularity = 1; +- +- /* We want the lock not to change when replacing the resource. */ +- sr_new[idx]->schedule_lock = sr->schedule_lock; +- } +- } +- +- ret = 0; ++ ASSERT(sr->granularity); + ASSERT(sr->cpupool != NULL); + ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus)); + ASSERT(!cpumask_test_cpu(cpu, sr->cpupool->cpu_valid)); +@@ -3283,10 +3322,6 @@ int schedule_cpu_rm(unsigned int cpu) + /* See comment in schedule_cpu_add() regarding lock switching. 
*/ + old_lock = pcpu_schedule_lock_irqsave(cpu, &flags); + +- vpriv_old = idle_vcpu[cpu]->sched_unit->priv; +- ppriv_old = sr->sched_priv; +- +- idx = 0; + for_each_cpu ( cpu_iter, sr->cpus ) + { + per_cpu(sched_res_idx, cpu_iter) = 0; +@@ -3300,27 +3335,27 @@ int schedule_cpu_rm(unsigned int cpu) + else + { + /* Initialize unit. */ +- unit = sr_new[idx]->sched_unit_idle; +- unit->res = sr_new[idx]; ++ unit = data->sr[idx]->sched_unit_idle; ++ unit->res = data->sr[idx]; + unit->is_running = true; + sched_unit_add_vcpu(unit, idle_vcpu[cpu_iter]); + sched_domain_insert_unit(unit, idle_vcpu[cpu_iter]->domain); + + /* Adjust cpu masks of resources (old and new). */ + cpumask_clear_cpu(cpu_iter, sr->cpus); +- cpumask_set_cpu(cpu_iter, sr_new[idx]->cpus); ++ cpumask_set_cpu(cpu_iter, data->sr[idx]->cpus); + cpumask_set_cpu(cpu_iter, &sched_res_mask); + + /* Init timer. */ +- init_timer(&sr_new[idx]->s_timer, s_timer_fn, NULL, cpu_iter); ++ init_timer(&data->sr[idx]->s_timer, s_timer_fn, NULL, cpu_iter); + + /* Last resource initializations and insert resource pointer. */ +- sr_new[idx]->master_cpu = cpu_iter; +- set_sched_res(cpu_iter, sr_new[idx]); ++ data->sr[idx]->master_cpu = cpu_iter; ++ set_sched_res(cpu_iter, data->sr[idx]); + + /* Last action: set the new lock pointer. */ + smp_mb(); +- sr_new[idx]->schedule_lock = &sched_free_cpu_lock; ++ data->sr[idx]->schedule_lock = &sched_free_cpu_lock; + + idx++; + } +@@ -3336,16 +3371,12 @@ int schedule_cpu_rm(unsigned int cpu) + /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */ + spin_unlock_irqrestore(old_lock, flags); + +- sched_deinit_pdata(old_ops, ppriv_old, cpu); ++ sched_deinit_pdata(data->old_ops, data->ppriv_old, cpu); + +- sched_free_udata(old_ops, vpriv_old); +- sched_free_pdata(old_ops, ppriv_old, cpu); +- +-out: + rcu_read_unlock(&sched_res_rculock); +- xfree(sr_new); ++ free_cpu_rm_data(data, cpu); + +- return ret; ++ return 0; + } + + struct scheduler *scheduler_get_default(void) +diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h +index 2b04b01a0c0a..e286849a1312 100644 +--- a/xen/common/sched/private.h ++++ b/xen/common/sched/private.h +@@ -600,6 +600,15 @@ struct affinity_masks { + + bool alloc_affinity_masks(struct affinity_masks *affinity); + void free_affinity_masks(struct affinity_masks *affinity); ++ ++/* Memory allocation related data for schedule_cpu_rm(). 
*/ ++struct cpu_rm_data { ++ const struct scheduler *old_ops; ++ void *ppriv_old; ++ void *vpriv_old; ++ struct sched_resource *sr[]; ++}; ++ + void sched_rm_cpu(unsigned int cpu); + const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu); + void schedule_dump(struct cpupool *c); +@@ -608,6 +617,8 @@ struct scheduler *scheduler_alloc(unsigned int sched_id); + void scheduler_free(struct scheduler *sched); + int cpu_disable_scheduler(unsigned int cpu); + int schedule_cpu_add(unsigned int cpu, struct cpupool *c); ++struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu); ++void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu); + int schedule_cpu_rm(unsigned int cpu); + int sched_move_domain(struct domain *d, struct cpupool *c); + struct cpupool *cpupool_get_by_id(unsigned int poolid); +-- +2.37.3 + diff --git a/0021-PCI-don-t-allow-pci-phantom-to-mark-real-devices-as-.patch b/0021-PCI-don-t-allow-pci-phantom-to-mark-real-devices-as-.patch deleted file mode 100644 index 1312bda..0000000 --- a/0021-PCI-don-t-allow-pci-phantom-to-mark-real-devices-as-.patch +++ /dev/null @@ -1,56 +0,0 @@ -From 8e11ec8fbf6f933f8854f4bc54226653316903f2 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 7 Jun 2022 14:08:06 +0200 -Subject: [PATCH 21/51] PCI: don't allow "pci-phantom=" to mark real devices as - phantom functions -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -IOMMU code mapping / unmapping devices and interrupts will misbehave if -a wrong command line option declared a function "phantom" when there's a -real device at that position. Warn about this and adjust the specified -stride (in the worst case ignoring the option altogether). - -Requested-by: Andrew Cooper <andrew.cooper3@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -master commit: 444b555dc9e09fa3ce90f066e0c88dec9b47f422 -master date: 2022-05-20 12:20:35 +0200 ---- - xen/drivers/passthrough/pci.c | 19 ++++++++++++++++++- - 1 file changed, 18 insertions(+), 1 deletion(-) - -diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c -index 395958698e6a..e0491c908f10 100644 ---- a/xen/drivers/passthrough/pci.c -+++ b/xen/drivers/passthrough/pci.c -@@ -382,7 +382,24 @@ static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn) - phantom_devs[i].slot == PCI_SLOT(devfn) && - phantom_devs[i].stride > PCI_FUNC(devfn) ) - { -- pdev->phantom_stride = phantom_devs[i].stride; -+ pci_sbdf_t sbdf = pdev->sbdf; -+ unsigned int stride = phantom_devs[i].stride; -+ -+ while ( (sbdf.fn += stride) > PCI_FUNC(devfn) ) -+ { -+ if ( pci_conf_read16(sbdf, PCI_VENDOR_ID) == 0xffff && -+ pci_conf_read16(sbdf, PCI_DEVICE_ID) == 0xffff ) -+ continue; -+ stride <<= 1; -+ printk(XENLOG_WARNING -+ "%pp looks to be a real device; bumping %04x:%02x:%02x stride to %u\n", -+ &sbdf, phantom_devs[i].seg, -+ phantom_devs[i].bus, phantom_devs[i].slot, -+ stride); -+ sbdf = pdev->sbdf; -+ } -+ if ( PCI_FUNC(stride) ) -+ pdev->phantom_stride = stride; - break; - } - } --- -2.35.1 - diff --git a/0021-xen-sched-fix-cpu-hotplug.patch b/0021-xen-sched-fix-cpu-hotplug.patch new file mode 100644 index 0000000..ea0b732 --- /dev/null +++ b/0021-xen-sched-fix-cpu-hotplug.patch @@ -0,0 +1,307 @@ +From 4f3204c2bc66db18c61600dd3e08bf1fd9584a1b Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 11 Oct 2022 15:00:19 +0200 +Subject: [PATCH 21/26] xen/sched: fix cpu hotplug 
+ +Cpu unplugging is calling schedule_cpu_rm() via stop_machine_run() with +interrupts disabled, thus any memory allocation or freeing must be +avoided. + +Since commit 5047cd1d5dea ("xen/common: Use enhanced +ASSERT_ALLOC_CONTEXT in xmalloc()") this restriction is being enforced +via an assertion, which will now fail. + +Fix this by allocating needed memory before entering stop_machine_run() +and freeing any memory only after having finished stop_machine_run(). + +Fixes: 1ec410112cdd ("xen/sched: support differing granularity in schedule_cpu_[add/rm]()") +Reported-by: Gao Ruifeng <ruifeng.gao@intel.com> +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> +Tested-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: d84473689611eed32fd90b27e614f28af767fa3f +master date: 2022-09-05 11:42:30 +0100 +--- + xen/common/sched/core.c | 25 +++++++++++--- + xen/common/sched/cpupool.c | 69 +++++++++++++++++++++++++++++--------- + xen/common/sched/private.h | 5 +-- + 3 files changed, 77 insertions(+), 22 deletions(-) + +diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c +index 2decb1161a63..900aab8f66a7 100644 +--- a/xen/common/sched/core.c ++++ b/xen/common/sched/core.c +@@ -3231,7 +3231,7 @@ out: + * by alloc_cpu_rm_data() is modified only in case the cpu in question is + * being moved from or to a cpupool. + */ +-struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu) ++struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu, bool aff_alloc) + { + struct cpu_rm_data *data; + const struct sched_resource *sr; +@@ -3244,6 +3244,17 @@ struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu) + if ( !data ) + goto out; + ++ if ( aff_alloc ) ++ { ++ if ( !alloc_affinity_masks(&data->affinity) ) ++ { ++ XFREE(data); ++ goto out; ++ } ++ } ++ else ++ memset(&data->affinity, 0, sizeof(data->affinity)); ++ + data->old_ops = sr->scheduler; + data->vpriv_old = idle_vcpu[cpu]->sched_unit->priv; + data->ppriv_old = sr->sched_priv; +@@ -3264,6 +3275,7 @@ struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu) + { + while ( idx > 0 ) + sched_res_free(&data->sr[--idx]->rcu); ++ free_affinity_masks(&data->affinity); + XFREE(data); + goto out; + } +@@ -3286,6 +3298,7 @@ void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu) + { + sched_free_udata(mem->old_ops, mem->vpriv_old); + sched_free_pdata(mem->old_ops, mem->ppriv_old, cpu); ++ free_affinity_masks(&mem->affinity); + + xfree(mem); + } +@@ -3296,17 +3309,18 @@ void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu) + * The cpu is already marked as "free" and not valid any longer for its + * cpupool. 
+ */ +-int schedule_cpu_rm(unsigned int cpu) ++int schedule_cpu_rm(unsigned int cpu, struct cpu_rm_data *data) + { + struct sched_resource *sr; +- struct cpu_rm_data *data; + struct sched_unit *unit; + spinlock_t *old_lock; + unsigned long flags; + int idx = 0; + unsigned int cpu_iter; ++ bool free_data = !data; + +- data = alloc_cpu_rm_data(cpu); ++ if ( !data ) ++ data = alloc_cpu_rm_data(cpu, false); + if ( !data ) + return -ENOMEM; + +@@ -3374,7 +3388,8 @@ int schedule_cpu_rm(unsigned int cpu) + sched_deinit_pdata(data->old_ops, data->ppriv_old, cpu); + + rcu_read_unlock(&sched_res_rculock); +- free_cpu_rm_data(data, cpu); ++ if ( free_data ) ++ free_cpu_rm_data(data, cpu); + + return 0; + } +diff --git a/xen/common/sched/cpupool.c b/xen/common/sched/cpupool.c +index 45b6ff99561a..b5a948639aad 100644 +--- a/xen/common/sched/cpupool.c ++++ b/xen/common/sched/cpupool.c +@@ -402,22 +402,28 @@ int cpupool_move_domain(struct domain *d, struct cpupool *c) + } + + /* Update affinities of all domains in a cpupool. */ +-static void cpupool_update_node_affinity(const struct cpupool *c) ++static void cpupool_update_node_affinity(const struct cpupool *c, ++ struct affinity_masks *masks) + { +- struct affinity_masks masks; ++ struct affinity_masks local_masks; + struct domain *d; + +- if ( !alloc_affinity_masks(&masks) ) +- return; ++ if ( !masks ) ++ { ++ if ( !alloc_affinity_masks(&local_masks) ) ++ return; ++ masks = &local_masks; ++ } + + rcu_read_lock(&domlist_read_lock); + + for_each_domain_in_cpupool(d, c) +- domain_update_node_aff(d, &masks); ++ domain_update_node_aff(d, masks); + + rcu_read_unlock(&domlist_read_lock); + +- free_affinity_masks(&masks); ++ if ( masks == &local_masks ) ++ free_affinity_masks(masks); + } + + /* +@@ -451,15 +457,17 @@ static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu) + + rcu_read_unlock(&sched_res_rculock); + +- cpupool_update_node_affinity(c); ++ cpupool_update_node_affinity(c, NULL); + + return 0; + } + +-static int cpupool_unassign_cpu_finish(struct cpupool *c) ++static int cpupool_unassign_cpu_finish(struct cpupool *c, ++ struct cpu_rm_data *mem) + { + int cpu = cpupool_moving_cpu; + const cpumask_t *cpus; ++ struct affinity_masks *masks = mem ? &mem->affinity : NULL; + int ret; + + if ( c != cpupool_cpu_moving ) +@@ -482,7 +490,7 @@ static int cpupool_unassign_cpu_finish(struct cpupool *c) + */ + if ( !ret ) + { +- ret = schedule_cpu_rm(cpu); ++ ret = schedule_cpu_rm(cpu, mem); + if ( ret ) + cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus); + else +@@ -494,7 +502,7 @@ static int cpupool_unassign_cpu_finish(struct cpupool *c) + } + rcu_read_unlock(&sched_res_rculock); + +- cpupool_update_node_affinity(c); ++ cpupool_update_node_affinity(c, masks); + + return ret; + } +@@ -558,7 +566,7 @@ static long cpupool_unassign_cpu_helper(void *info) + cpupool_cpu_moving->cpupool_id, cpupool_moving_cpu); + spin_lock(&cpupool_lock); + +- ret = cpupool_unassign_cpu_finish(c); ++ ret = cpupool_unassign_cpu_finish(c, NULL); + + spin_unlock(&cpupool_lock); + debugtrace_printk("cpupool_unassign_cpu ret=%ld\n", ret); +@@ -701,7 +709,7 @@ static int cpupool_cpu_add(unsigned int cpu) + * This function is called in stop_machine context, so we can be sure no + * non-idle vcpu is active on the system. 
+ */ +-static void cpupool_cpu_remove(unsigned int cpu) ++static void cpupool_cpu_remove(unsigned int cpu, struct cpu_rm_data *mem) + { + int ret; + +@@ -709,7 +717,7 @@ static void cpupool_cpu_remove(unsigned int cpu) + + if ( !cpumask_test_cpu(cpu, &cpupool_free_cpus) ) + { +- ret = cpupool_unassign_cpu_finish(cpupool0); ++ ret = cpupool_unassign_cpu_finish(cpupool0, mem); + BUG_ON(ret); + } + cpumask_clear_cpu(cpu, &cpupool_free_cpus); +@@ -775,7 +783,7 @@ static void cpupool_cpu_remove_forced(unsigned int cpu) + { + ret = cpupool_unassign_cpu_start(c, master_cpu); + BUG_ON(ret); +- ret = cpupool_unassign_cpu_finish(c); ++ ret = cpupool_unassign_cpu_finish(c, NULL); + BUG_ON(ret); + } + } +@@ -993,12 +1001,24 @@ void dump_runq(unsigned char key) + static int cpu_callback( + struct notifier_block *nfb, unsigned long action, void *hcpu) + { ++ static struct cpu_rm_data *mem; ++ + unsigned int cpu = (unsigned long)hcpu; + int rc = 0; + + switch ( action ) + { + case CPU_DOWN_FAILED: ++ if ( system_state <= SYS_STATE_active ) ++ { ++ if ( mem ) ++ { ++ free_cpu_rm_data(mem, cpu); ++ mem = NULL; ++ } ++ rc = cpupool_cpu_add(cpu); ++ } ++ break; + case CPU_ONLINE: + if ( system_state <= SYS_STATE_active ) + rc = cpupool_cpu_add(cpu); +@@ -1006,12 +1026,31 @@ static int cpu_callback( + case CPU_DOWN_PREPARE: + /* Suspend/Resume don't change assignments of cpus to cpupools. */ + if ( system_state <= SYS_STATE_active ) ++ { + rc = cpupool_cpu_remove_prologue(cpu); ++ if ( !rc ) ++ { ++ ASSERT(!mem); ++ mem = alloc_cpu_rm_data(cpu, true); ++ rc = mem ? 0 : -ENOMEM; ++ } ++ } + break; + case CPU_DYING: + /* Suspend/Resume don't change assignments of cpus to cpupools. */ + if ( system_state <= SYS_STATE_active ) +- cpupool_cpu_remove(cpu); ++ { ++ ASSERT(mem); ++ cpupool_cpu_remove(cpu, mem); ++ } ++ break; ++ case CPU_DEAD: ++ if ( system_state <= SYS_STATE_active ) ++ { ++ ASSERT(mem); ++ free_cpu_rm_data(mem, cpu); ++ mem = NULL; ++ } + break; + case CPU_RESUME_FAILED: + cpupool_cpu_remove_forced(cpu); +diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h +index e286849a1312..0126a4bb9ed3 100644 +--- a/xen/common/sched/private.h ++++ b/xen/common/sched/private.h +@@ -603,6 +603,7 @@ void free_affinity_masks(struct affinity_masks *affinity); + + /* Memory allocation related data for schedule_cpu_rm(). 
*/ + struct cpu_rm_data { ++ struct affinity_masks affinity; + const struct scheduler *old_ops; + void *ppriv_old; + void *vpriv_old; +@@ -617,9 +618,9 @@ struct scheduler *scheduler_alloc(unsigned int sched_id); + void scheduler_free(struct scheduler *sched); + int cpu_disable_scheduler(unsigned int cpu); + int schedule_cpu_add(unsigned int cpu, struct cpupool *c); +-struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu); ++struct cpu_rm_data *alloc_cpu_rm_data(unsigned int cpu, bool aff_alloc); + void free_cpu_rm_data(struct cpu_rm_data *mem, unsigned int cpu); +-int schedule_cpu_rm(unsigned int cpu); ++int schedule_cpu_rm(unsigned int cpu, struct cpu_rm_data *mem); + int sched_move_domain(struct domain *d, struct cpupool *c); + struct cpupool *cpupool_get_by_id(unsigned int poolid); + void cpupool_put(struct cpupool *pool); +-- +2.37.3 + diff --git a/0022-Config.mk-correct-PIE-related-option-s-in-EMBEDDED_E.patch b/0022-Config.mk-correct-PIE-related-option-s-in-EMBEDDED_E.patch new file mode 100644 index 0000000..03f485a --- /dev/null +++ b/0022-Config.mk-correct-PIE-related-option-s-in-EMBEDDED_E.patch @@ -0,0 +1,58 @@ +From 2b694dd2932be78431b14257f23b738f2fc8f6a1 Mon Sep 17 00:00:00 2001 +From: Jan Beulich <jbeulich@suse.com> +Date: Tue, 11 Oct 2022 15:00:33 +0200 +Subject: [PATCH 22/26] Config.mk: correct PIE-related option(s) in + EMBEDDED_EXTRA_CFLAGS + +I haven't been able to find evidence of "-nopie" ever having been a +supported compiler option. The correct spelling is "-no-pie". +Furthermore like "-pie" this is an option which is solely passed to the +linker. The compiler only recognizes "-fpie" / "-fPIE" / "-fno-pie", and +it doesn't infer these options from "-pie" / "-no-pie". + +Add the compiler recognized form, but for the possible case of the +variable also being used somewhere for linking keep the linker option as +well (with corrected spelling). + +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Julien Grall <jgrall@amazon.com> + +Build: Drop -no-pie from EMBEDDED_EXTRA_CFLAGS + +This breaks all Clang builds, as demostrated by Gitlab CI. + +Contrary to the description in ecd6b9759919, -no-pie is not even an option +passed to the linker. GCC's actual behaviour is to inhibit the passing of +-pie to the linker, as well as selecting different cr0 artefacts to be linked. + +EMBEDDED_EXTRA_CFLAGS is not used for $(CC)-doing-linking, and not liable to +gain such a usecase. 
+ +Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> +Acked-by: Jan Beulich <jbeulich@suse.com> +Tested-by: Stefano Stabellini <sstabellini@kernel.org> +Fixes: ecd6b9759919 ("Config.mk: correct PIE-related option(s) in EMBEDDED_EXTRA_CFLAGS") +master commit: ecd6b9759919fa6335b0be1b5fc5cce29a30c4f1 +master date: 2022-09-08 09:25:26 +0200 +master commit: 13a7c0074ac8fb31f6c0485429b7a20a1946cb22 +master date: 2022-09-27 15:40:42 -0700 +--- + Config.mk | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/Config.mk b/Config.mk +index 46de3cd1e0e1..6f95067b8de6 100644 +--- a/Config.mk ++++ b/Config.mk +@@ -197,7 +197,7 @@ endif + APPEND_LDFLAGS += $(foreach i, $(APPEND_LIB), -L$(i)) + APPEND_CFLAGS += $(foreach i, $(APPEND_INCLUDES), -I$(i)) + +-EMBEDDED_EXTRA_CFLAGS := -nopie -fno-stack-protector -fno-stack-protector-all ++EMBEDDED_EXTRA_CFLAGS := -fno-pie -fno-stack-protector -fno-stack-protector-all + EMBEDDED_EXTRA_CFLAGS += -fno-exceptions -fno-asynchronous-unwind-tables + + XEN_EXTFILES_URL ?= http://xenbits.xen.org/xen-extfiles +-- +2.37.3 + diff --git a/0022-x86-pv-Clean-up-_get_page_type.patch b/0022-x86-pv-Clean-up-_get_page_type.patch deleted file mode 100644 index 0270beb..0000000 --- a/0022-x86-pv-Clean-up-_get_page_type.patch +++ /dev/null @@ -1,180 +0,0 @@ -From b152dfbc3ad71a788996440b18174d995c3bffc9 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Thu, 9 Jun 2022 15:27:19 +0200 -Subject: [PATCH 22/51] x86/pv: Clean up _get_page_type() - -Various fixes for clarity, ahead of making complicated changes. - - * Split the overflow check out of the if/else chain for type handling, as - it's somewhat unrelated. - * Comment the main if/else chain to explain what is going on. Adjust one - ASSERT() and state the bit layout for validate-locked and partial states. - * Correct the comment about TLB flushing, as it's backwards. The problem - case is when writeable mappings are retained to a page becoming read-only, - as it allows the guest to bypass Xen's safety checks for updates. - * Reduce the scope of 'y'. It is an artefact of the cmpxchg loop and not - valid for use by subsequent logic. Switch to using ACCESS_ONCE() to treat - all reads as explicitly volatile. The only thing preventing the validated - wait-loop being infinite is the compiler barrier hidden in cpu_relax(). - * Replace one page_get_owner(page) with the already-calculated 'd' already in - scope. - -No functional change. - -This is part of XSA-401 / CVE-2022-26362. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: George Dunlap <george.dunlap@citrix.com> -master commit: 9186e96b199e4f7e52e033b238f9fe869afb69c7 -master date: 2022-06-09 14:20:36 +0200 ---- - xen/arch/x86/mm.c | 72 +++++++++++++++++++++++++++++++++++++++-------- - 1 file changed, 61 insertions(+), 11 deletions(-) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 4ee2de11051d..79ad7fdd2b82 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -2906,16 +2906,17 @@ static int _put_page_type(struct page_info *page, unsigned int flags, - static int _get_page_type(struct page_info *page, unsigned long type, - bool preemptible) - { -- unsigned long nx, x, y = page->u.inuse.type_info; -+ unsigned long nx, x; - int rc = 0; - - ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2))); - ASSERT(!in_irq()); - -- for ( ; ; ) -+ for ( unsigned long y = ACCESS_ONCE(page->u.inuse.type_info); ; ) - { - x = y; - nx = x + 1; -+ - if ( unlikely((nx & PGT_count_mask) == 0) ) - { - gdprintk(XENLOG_WARNING, -@@ -2923,8 +2924,15 @@ static int _get_page_type(struct page_info *page, unsigned long type, - mfn_x(page_to_mfn(page))); - return -EINVAL; - } -- else if ( unlikely((x & PGT_count_mask) == 0) ) -+ -+ if ( unlikely((x & PGT_count_mask) == 0) ) - { -+ /* -+ * Typeref 0 -> 1. -+ * -+ * Type changes are permitted when the typeref is 0. If the type -+ * actually changes, the page needs re-validating. -+ */ - struct domain *d = page_get_owner(page); - - if ( d && shadow_mode_enabled(d) ) -@@ -2935,8 +2943,8 @@ static int _get_page_type(struct page_info *page, unsigned long type, - { - /* - * On type change we check to flush stale TLB entries. It is -- * vital that no other CPUs are left with mappings of a frame -- * which is about to become writeable to the guest. -+ * vital that no other CPUs are left with writeable mappings -+ * to a frame which is intending to become pgtable/segdesc. - */ - cpumask_t *mask = this_cpu(scratch_cpumask); - -@@ -2948,7 +2956,7 @@ static int _get_page_type(struct page_info *page, unsigned long type, - - if ( unlikely(!cpumask_empty(mask)) && - /* Shadow mode: track only writable pages. */ -- (!shadow_mode_enabled(page_get_owner(page)) || -+ (!shadow_mode_enabled(d) || - ((nx & PGT_type_mask) == PGT_writable_page)) ) - { - perfc_incr(need_flush_tlb_flush); -@@ -2979,7 +2987,14 @@ static int _get_page_type(struct page_info *page, unsigned long type, - } - else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) ) - { -- /* Don't log failure if it could be a recursive-mapping attempt. */ -+ /* -+ * else, we're trying to take a new reference, of the wrong type. -+ * -+ * This (being able to prohibit use of the wrong type) is what the -+ * typeref system exists for, but skip printing the failure if it -+ * looks like a recursive mapping, as subsequent logic might -+ * ultimately permit the attempt. -+ */ - if ( ((x & PGT_type_mask) == PGT_l2_page_table) && - (type == PGT_l1_page_table) ) - return -EINVAL; -@@ -2998,18 +3013,46 @@ static int _get_page_type(struct page_info *page, unsigned long type, - } - else if ( unlikely(!(x & PGT_validated)) ) - { -+ /* -+ * else, the count is non-zero, and we're grabbing the right type; -+ * but the page hasn't been validated yet. -+ * -+ * The page is in one of two states (depending on PGT_partial), -+ * and should have exactly one reference. 
-+ */
-+ ASSERT((x & (PGT_type_mask | PGT_count_mask)) == (type | 1));
-+
- if ( !(x & PGT_partial) )
- {
-- /* Someone else is updating validation of this page. Wait... */
-+ /*
-+ * The page has been left in the "validate locked" state
-+ * (i.e. PGT_[type] | 1) which means that a concurrent caller
-+ * of _get_page_type() is in the middle of validation.
-+ *
-+ * Spin waiting for the concurrent user to complete (partial
-+ * or fully validated), then restart our attempt to acquire a
-+ * type reference.
-+ */
- do {
- if ( preemptible && hypercall_preempt_check() )
- return -EINTR;
- cpu_relax();
-- } while ( (y = page->u.inuse.type_info) == x );
-+ } while ( (y = ACCESS_ONCE(page->u.inuse.type_info)) == x );
- continue;
- }
-- /* Type ref count was left at 1 when PGT_partial got set. */
-- ASSERT((x & PGT_count_mask) == 1);
-+
-+ /*
-+ * The page has been left in the "partial" state
-+ * (i.e., PGT_[type] | PGT_partial | 1).
-+ *
-+ * Rather than bumping the type count, we need to try to grab the
-+ * validation lock; if we succeed, we need to validate the page,
-+ * then drop the general ref associated with the PGT_partial bit.
-+ *
-+ * We grab the validation lock by setting nx to (PGT_[type] | 1)
-+ * (i.e., non-zero type count, neither PGT_validated nor
-+ * PGT_partial set).
-+ */
- nx = x & ~PGT_partial;
- }
-
-@@ -3058,6 +3101,13 @@ static int _get_page_type(struct page_info *page, unsigned long type,
- }
-
- out:
-+ /*
-+ * Did we drop the PGT_partial bit when acquiring the typeref? If so,
-+ * drop the general reference that went along with it.
-+ *
-+ * N.B. validate_page() may have re-set PGT_partial, not reflected in
-+ * nx, but will have taken an extra ref when doing so.
-+ */
- if ( (x & PGT_partial) && !(nx & PGT_partial) )
- put_page(page);
-
---
-2.35.1
-
diff --git a/0023-tools-xenstore-minor-fix-of-the-migration-stream-doc.patch b/0023-tools-xenstore-minor-fix-of-the-migration-stream-doc.patch
new file mode 100644
index 0000000..45f7509
--- /dev/null
+++ b/0023-tools-xenstore-minor-fix-of-the-migration-stream-doc.patch
@@ -0,0 +1,41 @@
+From 49510071ee93905378e54664778760ed3908d447 Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Tue, 11 Oct 2022 15:00:59 +0200
+Subject: [PATCH 23/26] tools/xenstore: minor fix of the migration stream doc
+
+Drop mentioning the non-existent read-only socket in the migration
+stream description document.
+
+The related record field was removed in commit 8868a0e3f674 ("docs:
+update the xenstore migration stream documentation").
+
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Acked-by: Julien Grall <jgrall@amazon.com>
+master commit: ace1d2eff80d3d66c37ae765dae3e3cb5697e5a4
+master date: 2022-09-08 09:25:58 +0200
+---
+ docs/designs/xenstore-migration.md | 8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+diff --git a/docs/designs/xenstore-migration.md b/docs/designs/xenstore-migration.md
+index 5f1155273ec3..78530bbb0ef4 100644
+--- a/docs/designs/xenstore-migration.md
++++ b/docs/designs/xenstore-migration.md
+@@ -129,11 +129,9 @@ xenstored state that needs to be restored.
+ | `evtchn-fd` | The file descriptor used to communicate with |
+ | | the event channel driver |
+
+-xenstored will resume in the original process context. Hence `rw-socket-fd` and
+-`ro-socket-fd` simply specify the file descriptors of the sockets. Sockets
+-are not always used, however, and so -1 will be used to denote an unused
+-socket.
+-
++xenstored will resume in the original process context.
Hence `rw-socket-fd` ++simply specifies the file descriptor of the socket. Sockets are not always ++used, however, and so -1 will be used to denote an unused socket. + + \pagebreak + +-- +2.37.3 + diff --git a/0023-x86-pv-Fix-ABAC-cmpxchg-race-in-_get_page_type.patch b/0023-x86-pv-Fix-ABAC-cmpxchg-race-in-_get_page_type.patch deleted file mode 100644 index 1e3febd..0000000 --- a/0023-x86-pv-Fix-ABAC-cmpxchg-race-in-_get_page_type.patch +++ /dev/null @@ -1,201 +0,0 @@ -From 8dab3f79b122e69cbcdebca72cdc14f004ee2193 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Thu, 9 Jun 2022 15:27:37 +0200 -Subject: [PATCH 23/51] x86/pv: Fix ABAC cmpxchg() race in _get_page_type() - -_get_page_type() suffers from a race condition where it incorrectly assumes -that because 'x' was read and a subsequent a cmpxchg() succeeds, the type -cannot have changed in-between. Consider: - -CPU A: - 1. Creates an L2e referencing pg - `-> _get_page_type(pg, PGT_l1_page_table), sees count 0, type PGT_writable_page - 2. Issues flush_tlb_mask() -CPU B: - 3. Creates a writeable mapping of pg - `-> _get_page_type(pg, PGT_writable_page), count increases to 1 - 4. Writes into new mapping, creating a TLB entry for pg - 5. Removes the writeable mapping of pg - `-> _put_page_type(pg), count goes back down to 0 -CPU A: - 7. Issues cmpxchg(), setting count 1, type PGT_l1_page_table - -CPU B now has a writeable mapping to pg, which Xen believes is a pagetable and -suitably protected (i.e. read-only). The TLB flush in step 2 must be deferred -until after the guest is prohibited from creating new writeable mappings, -which is after step 7. - -Defer all safety actions until after the cmpxchg() has successfully taken the -intended typeref, because that is what prevents concurrent users from using -the old type. - -Also remove the early validation for writeable and shared pages. This removes -race conditions where one half of a parallel mapping attempt can return -successfully before: - * The IOMMU pagetables are in sync with the new page type - * Writeable mappings to shared pages have been torn down - -This is part of XSA-401 / CVE-2022-26362. - -Reported-by: Jann Horn <jannh@google.com> -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: George Dunlap <george.dunlap@citrix.com> -master commit: 8cc5036bc385112a82f1faff27a0970e6440dfed -master date: 2022-06-09 14:21:04 +0200 ---- - xen/arch/x86/mm.c | 116 ++++++++++++++++++++++++++-------------------- - 1 file changed, 67 insertions(+), 49 deletions(-) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 79ad7fdd2b82..c6429b0f749a 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -2933,56 +2933,12 @@ static int _get_page_type(struct page_info *page, unsigned long type, - * Type changes are permitted when the typeref is 0. If the type - * actually changes, the page needs re-validating. - */ -- struct domain *d = page_get_owner(page); -- -- if ( d && shadow_mode_enabled(d) ) -- shadow_prepare_page_type_change(d, page, type); - - ASSERT(!(x & PGT_pae_xen_l2)); - if ( (x & PGT_type_mask) != type ) - { -- /* -- * On type change we check to flush stale TLB entries. It is -- * vital that no other CPUs are left with writeable mappings -- * to a frame which is intending to become pgtable/segdesc. 
-- */ -- cpumask_t *mask = this_cpu(scratch_cpumask); -- -- BUG_ON(in_irq()); -- cpumask_copy(mask, d->dirty_cpumask); -- -- /* Don't flush if the timestamp is old enough */ -- tlbflush_filter(mask, page->tlbflush_timestamp); -- -- if ( unlikely(!cpumask_empty(mask)) && -- /* Shadow mode: track only writable pages. */ -- (!shadow_mode_enabled(d) || -- ((nx & PGT_type_mask) == PGT_writable_page)) ) -- { -- perfc_incr(need_flush_tlb_flush); -- /* -- * If page was a page table make sure the flush is -- * performed using an IPI in order to avoid changing the -- * type of a page table page under the feet of -- * spurious_page_fault(). -- */ -- flush_mask(mask, -- (x & PGT_type_mask) && -- (x & PGT_type_mask) <= PGT_root_page_table -- ? FLUSH_TLB | FLUSH_FORCE_IPI -- : FLUSH_TLB); -- } -- -- /* We lose existing type and validity. */ - nx &= ~(PGT_type_mask | PGT_validated); - nx |= type; -- -- /* -- * No special validation needed for writable pages. -- * Page tables and GDT/LDT need to be scanned for validity. -- */ -- if ( type == PGT_writable_page || type == PGT_shared_page ) -- nx |= PGT_validated; - } - } - else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) ) -@@ -3063,6 +3019,56 @@ static int _get_page_type(struct page_info *page, unsigned long type, - return -EINTR; - } - -+ /* -+ * One typeref has been taken and is now globally visible. -+ * -+ * The page is either in the "validate locked" state (PGT_[type] | 1) or -+ * fully validated (PGT_[type] | PGT_validated | >0). -+ */ -+ -+ if ( unlikely((x & PGT_count_mask) == 0) ) -+ { -+ struct domain *d = page_get_owner(page); -+ -+ if ( d && shadow_mode_enabled(d) ) -+ shadow_prepare_page_type_change(d, page, type); -+ -+ if ( (x & PGT_type_mask) != type ) -+ { -+ /* -+ * On type change we check to flush stale TLB entries. It is -+ * vital that no other CPUs are left with writeable mappings -+ * to a frame which is intending to become pgtable/segdesc. -+ */ -+ cpumask_t *mask = this_cpu(scratch_cpumask); -+ -+ BUG_ON(in_irq()); -+ cpumask_copy(mask, d->dirty_cpumask); -+ -+ /* Don't flush if the timestamp is old enough */ -+ tlbflush_filter(mask, page->tlbflush_timestamp); -+ -+ if ( unlikely(!cpumask_empty(mask)) && -+ /* Shadow mode: track only writable pages. */ -+ (!shadow_mode_enabled(d) || -+ ((nx & PGT_type_mask) == PGT_writable_page)) ) -+ { -+ perfc_incr(need_flush_tlb_flush); -+ /* -+ * If page was a page table make sure the flush is -+ * performed using an IPI in order to avoid changing the -+ * type of a page table page under the feet of -+ * spurious_page_fault(). -+ */ -+ flush_mask(mask, -+ (x & PGT_type_mask) && -+ (x & PGT_type_mask) <= PGT_root_page_table -+ ? FLUSH_TLB | FLUSH_FORCE_IPI -+ : FLUSH_TLB); -+ } -+ } -+ } -+ - if ( unlikely(((x & PGT_type_mask) == PGT_writable_page) != - (type == PGT_writable_page)) ) - { -@@ -3091,13 +3097,25 @@ static int _get_page_type(struct page_info *page, unsigned long type, - - if ( unlikely(!(nx & PGT_validated)) ) - { -- if ( !(x & PGT_partial) ) -+ /* -+ * No special validation needed for writable or shared pages. Page -+ * tables and GDT/LDT need to have their contents audited. -+ * -+ * per validate_page(), non-atomic updates are fine here. 
-+ */ -+ if ( type == PGT_writable_page || type == PGT_shared_page ) -+ page->u.inuse.type_info |= PGT_validated; -+ else - { -- page->nr_validated_ptes = 0; -- page->partial_flags = 0; -- page->linear_pt_count = 0; -+ if ( !(x & PGT_partial) ) -+ { -+ page->nr_validated_ptes = 0; -+ page->partial_flags = 0; -+ page->linear_pt_count = 0; -+ } -+ -+ rc = validate_page(page, type, preemptible); - } -- rc = validate_page(page, type, preemptible); - } - - out: --- -2.35.1 - diff --git a/0024-x86-page-Introduce-_PAGE_-constants-for-memory-types.patch b/0024-x86-page-Introduce-_PAGE_-constants-for-memory-types.patch deleted file mode 100644 index 409b72f..0000000 --- a/0024-x86-page-Introduce-_PAGE_-constants-for-memory-types.patch +++ /dev/null @@ -1,53 +0,0 @@ -From 9cfd796ae05421ded8e4f70b2c55352491cfa841 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Thu, 9 Jun 2022 15:27:53 +0200 -Subject: [PATCH 24/51] x86/page: Introduce _PAGE_* constants for memory types - -... rather than opencoding the PAT/PCD/PWT attributes in __PAGE_HYPERVISOR_* -constants. These are going to be needed by forthcoming logic. - -No functional change. - -This is part of XSA-402. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 1be8707c75bf4ba68447c74e1618b521dd432499 -master date: 2022-06-09 14:21:38 +0200 ---- - xen/include/asm-x86/page.h | 12 ++++++++++-- - 1 file changed, 10 insertions(+), 2 deletions(-) - -diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h -index 1d080cffbe84..2e542050f65a 100644 ---- a/xen/include/asm-x86/page.h -+++ b/xen/include/asm-x86/page.h -@@ -331,6 +331,14 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t); - - #define PAGE_CACHE_ATTRS (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) - -+/* Memory types, encoded under Xen's choice of MSR_PAT. */ -+#define _PAGE_WB ( 0) -+#define _PAGE_WT ( _PAGE_PWT) -+#define _PAGE_UCM ( _PAGE_PCD ) -+#define _PAGE_UC ( _PAGE_PCD | _PAGE_PWT) -+#define _PAGE_WC (_PAGE_PAT ) -+#define _PAGE_WP (_PAGE_PAT | _PAGE_PWT) -+ - /* - * Debug option: Ensure that granted mappings are not implicitly unmapped. 
- * WARNING: This will need to be disabled to run OSes that use the spare PTE -@@ -349,8 +357,8 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t); - #define __PAGE_HYPERVISOR_RX (_PAGE_PRESENT | _PAGE_ACCESSED) - #define __PAGE_HYPERVISOR (__PAGE_HYPERVISOR_RX | \ - _PAGE_DIRTY | _PAGE_RW) --#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_PCD) --#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_PCD | _PAGE_PWT) -+#define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_UCM) -+#define __PAGE_HYPERVISOR_UC (__PAGE_HYPERVISOR | _PAGE_UC) - #define __PAGE_HYPERVISOR_SHSTK (__PAGE_HYPERVISOR_RO | _PAGE_DIRTY) - - #define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */ --- -2.35.1 - diff --git a/0024-xen-gnttab-fix-gnttab_acquire_resource.patch b/0024-xen-gnttab-fix-gnttab_acquire_resource.patch new file mode 100644 index 0000000..898503f --- /dev/null +++ b/0024-xen-gnttab-fix-gnttab_acquire_resource.patch @@ -0,0 +1,69 @@ +From b9560762392c01b3ee84148c07be8017cb42dbc9 Mon Sep 17 00:00:00 2001 +From: Juergen Gross <jgross@suse.com> +Date: Tue, 11 Oct 2022 15:01:22 +0200 +Subject: [PATCH 24/26] xen/gnttab: fix gnttab_acquire_resource() + +Commit 9dc46386d89d ("gnttab: work around "may be used uninitialized" +warning") was wrong, as vaddrs can legitimately be NULL in case +XENMEM_resource_grant_table_id_status was specified for a grant table +v1. This would result in crashes in debug builds due to +ASSERT_UNREACHABLE() triggering. + +Check vaddrs only to be NULL in the rc == 0 case. + +Expand the tests in tools/tests/resource to tickle this path, and verify that +using XENMEM_resource_grant_table_id_status on a v1 grant table fails. + +Fixes: 9dc46386d89d ("gnttab: work around "may be used uninitialized" warning") +Signed-off-by: Juergen Gross <jgross@suse.com> +Reviewed-by: Jan Beulich <jbeulich@suse.com> # xen +Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> +master commit: 52daa6a8483e4fbd6757c9d1b791e23931791608 +master date: 2022-09-09 16:28:38 +0100 +--- + tools/tests/resource/test-resource.c | 15 +++++++++++++++ + xen/common/grant_table.c | 2 +- + 2 files changed, 16 insertions(+), 1 deletion(-) + +diff --git a/tools/tests/resource/test-resource.c b/tools/tests/resource/test-resource.c +index 0557f8a1b585..37dfff4dcd20 100644 +--- a/tools/tests/resource/test-resource.c ++++ b/tools/tests/resource/test-resource.c +@@ -106,6 +106,21 @@ static void test_gnttab(uint32_t domid, unsigned int nr_frames, + if ( rc ) + return fail(" Fail: Unmap grant table %d - %s\n", + errno, strerror(errno)); ++ ++ /* ++ * Verify that an attempt to map the status frames fails, as the domain is ++ * in gnttab v1 mode. ++ */ ++ res = xenforeignmemory_map_resource( ++ fh, domid, XENMEM_resource_grant_table, ++ XENMEM_resource_grant_table_id_status, 0, 1, ++ (void **)&gnttab, PROT_READ | PROT_WRITE, 0); ++ ++ if ( res ) ++ { ++ fail(" Fail: Managed to map gnttab v2 status frames in v1 mode\n"); ++ xenforeignmemory_unmap_resource(fh, res); ++ } + } + + static void test_domain_configurations(void) +diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c +index d8ca645b96ff..76272b3c8add 100644 +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -4142,7 +4142,7 @@ int gnttab_acquire_resource( + * on non-error paths, and hence it needs setting to NULL at the top of the + * function. Leave some runtime safety. 
+ */ +- if ( !vaddrs ) ++ if ( !rc && !vaddrs ) + { + ASSERT_UNREACHABLE(); + rc = -ENODATA; +-- +2.37.3 + diff --git a/0025-x86-Don-t-change-the-cacheability-of-the-directmap.patch b/0025-x86-Don-t-change-the-cacheability-of-the-directmap.patch deleted file mode 100644 index 0a24a0a..0000000 --- a/0025-x86-Don-t-change-the-cacheability-of-the-directmap.patch +++ /dev/null @@ -1,223 +0,0 @@ -From 74193f4292d9cfc2874866e941d9939d8f33fcef Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Thu, 9 Jun 2022 15:28:23 +0200 -Subject: [PATCH 25/51] x86: Don't change the cacheability of the directmap - -Changeset 55f97f49b7ce ("x86: Change cache attributes of Xen 1:1 page mappings -in response to guest mapping requests") attempted to keep the cacheability -consistent between different mappings of the same page. - -The reason wasn't described in the changelog, but it is understood to be in -regards to a concern over machine check exceptions, owing to errata when using -mixed cacheabilities. It did this primarily by updating Xen's mapping of the -page in the direct map when the guest mapped a page with reduced cacheability. - -Unfortunately, the logic didn't actually prevent mixed cacheability from -occurring: - * A guest could map a page normally, and then map the same page with - different cacheability; nothing prevented this. - * The cacheability of the directmap was always latest-takes-precedence in - terms of guest requests. - * Grant-mapped frames with lesser cacheability didn't adjust the page's - cacheattr settings. - * The map_domain_page() function still unconditionally created WB mappings, - irrespective of the page's cacheattr settings. - -Additionally, update_xen_mappings() had a bug where the alias calculation was -wrong for mfn's which were .init content, which should have been treated as -fully guest pages, not Xen pages. - -Worse yet, the logic introduced a vulnerability whereby necessary -pagetable/segdesc adjustments made by Xen in the validation logic could become -non-coherent between the cache and main memory. The CPU could subsequently -operate on the stale value in the cache, rather than the safe value in main -memory. - -The directmap contains primarily mappings of RAM. PAT/MTRR conflict -resolution is asymmetric, and generally for MTRR=WB ranges, PAT of lesser -cacheability resolves to being coherent. The special case is WC mappings, -which are non-coherent against MTRR=WB regions (except for fully-coherent -CPUs). - -Xen must not have any WC cacheability in the directmap, to prevent Xen's -actions from creating non-coherency. (Guest actions creating non-coherency is -dealt with in subsequent patches.) As all memory types for MTRR=WB ranges -inter-operate coherently, so leave Xen's directmap mappings as WB. - -Only PV guests with access to devices can use reduced-cacheability mappings to -begin with, and they're trusted not to mount DoSs against the system anyway. - -Drop PGC_cacheattr_{base,mask} entirely, and the logic to manipulate them. -Shift the later PGC_* constants up, to gain 3 extra bits in the main reference -count. Retain the check in get_page_from_l1e() for special_pages() because a -guest has no business using reduced cacheability on these. - -This reverts changeset 55f97f49b7ce6c3520c555d19caac6cf3f9a5df0 - -This is CVE-2022-26363, part of XSA-402. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: George Dunlap <george.dunlap@citrix.com> -master commit: ae09597da34aee6bc5b76475c5eea6994457e854 -master date: 2022-06-09 14:22:08 +0200 ---- - xen/arch/x86/mm.c | 84 ++++------------------------------------ - xen/include/asm-x86/mm.h | 23 +++++------ - 2 files changed, 17 insertions(+), 90 deletions(-) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index c6429b0f749a..ab32d13a1a0d 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -783,28 +783,6 @@ bool is_iomem_page(mfn_t mfn) - return (page_get_owner(page) == dom_io); - } - --static int update_xen_mappings(unsigned long mfn, unsigned int cacheattr) --{ -- int err = 0; -- bool alias = mfn >= PFN_DOWN(xen_phys_start) && -- mfn < PFN_UP(xen_phys_start + xen_virt_end - XEN_VIRT_START); -- unsigned long xen_va = -- XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT); -- -- if ( boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) ) -- return 0; -- -- if ( unlikely(alias) && cacheattr ) -- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, 0); -- if ( !err ) -- err = map_pages_to_xen((unsigned long)mfn_to_virt(mfn), _mfn(mfn), 1, -- PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr)); -- if ( unlikely(alias) && !cacheattr && !err ) -- err = map_pages_to_xen(xen_va, _mfn(mfn), 1, PAGE_HYPERVISOR); -- -- return err; --} -- - #ifndef NDEBUG - struct mmio_emul_range_ctxt { - const struct domain *d; -@@ -1009,47 +987,14 @@ get_page_from_l1e( - goto could_not_pin; - } - -- if ( pte_flags_to_cacheattr(l1f) != -- ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) ) -+ if ( (l1f & PAGE_CACHE_ATTRS) != _PAGE_WB && is_special_page(page) ) - { -- unsigned long x, nx, y = page->count_info; -- unsigned long cacheattr = pte_flags_to_cacheattr(l1f); -- int err; -- -- if ( is_special_page(page) ) -- { -- if ( write ) -- put_page_type(page); -- put_page(page); -- gdprintk(XENLOG_WARNING, -- "Attempt to change cache attributes of Xen heap page\n"); -- return -EACCES; -- } -- -- do { -- x = y; -- nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base); -- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x ); -- -- err = update_xen_mappings(mfn, cacheattr); -- if ( unlikely(err) ) -- { -- cacheattr = y & PGC_cacheattr_mask; -- do { -- x = y; -- nx = (x & ~PGC_cacheattr_mask) | cacheattr; -- } while ( (y = cmpxchg(&page->count_info, x, nx)) != x ); -- -- if ( write ) -- put_page_type(page); -- put_page(page); -- -- gdprintk(XENLOG_WARNING, "Error updating mappings for mfn %" PRI_mfn -- " (pfn %" PRI_pfn ", from L1 entry %" PRIpte ") for d%d\n", -- mfn, get_gpfn_from_mfn(mfn), -- l1e_get_intpte(l1e), l1e_owner->domain_id); -- return err; -- } -+ if ( write ) -+ put_page_type(page); -+ put_page(page); -+ gdprintk(XENLOG_WARNING, -+ "Attempt to change cache attributes of Xen heap page\n"); -+ return -EACCES; - } - - return 0; -@@ -2467,24 +2412,9 @@ static int mod_l4_entry(l4_pgentry_t *pl4e, - */ - static int cleanup_page_mappings(struct page_info *page) - { -- unsigned int cacheattr = -- (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base; - int rc = 0; - unsigned long mfn = mfn_x(page_to_mfn(page)); - -- /* -- * If we've modified xen mappings as a result of guest cache -- * attributes, restore them to the "normal" state. 
-- */
-- if ( unlikely(cacheattr) )
-- {
-- page->count_info &= ~PGC_cacheattr_mask;
--
-- BUG_ON(is_special_page(page));
--
-- rc = update_xen_mappings(mfn, 0);
-- }
--
- /*
- * If this may be in a PV domain's IOMMU, remove it.
- *
-diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
-index cb9052749963..8a9a43bb0a9d 100644
---- a/xen/include/asm-x86/mm.h
-+++ b/xen/include/asm-x86/mm.h
-@@ -69,25 +69,22 @@
- /* Set when is using a page as a page table */
- #define _PGC_page_table PG_shift(3)
- #define PGC_page_table PG_mask(1, 3)
-- /* 3-bit PAT/PCD/PWT cache-attribute hint. */
--#define PGC_cacheattr_base PG_shift(6)
--#define PGC_cacheattr_mask PG_mask(7, 6)
- /* Page is broken? */
--#define _PGC_broken PG_shift(7)
--#define PGC_broken PG_mask(1, 7)
-+#define _PGC_broken PG_shift(4)
-+#define PGC_broken PG_mask(1, 4)
- /* Mutually-exclusive page states: { inuse, offlining, offlined, free }. */
--#define PGC_state PG_mask(3, 9)
--#define PGC_state_inuse PG_mask(0, 9)
--#define PGC_state_offlining PG_mask(1, 9)
--#define PGC_state_offlined PG_mask(2, 9)
--#define PGC_state_free PG_mask(3, 9)
-+#define PGC_state PG_mask(3, 6)
-+#define PGC_state_inuse PG_mask(0, 6)
-+#define PGC_state_offlining PG_mask(1, 6)
-+#define PGC_state_offlined PG_mask(2, 6)
-+#define PGC_state_free PG_mask(3, 6)
- #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st)
- /* Page is not reference counted (see below for caveats) */
--#define _PGC_extra PG_shift(10)
--#define PGC_extra PG_mask(1, 10)
-+#define _PGC_extra PG_shift(7)
-+#define PGC_extra PG_mask(1, 7)
-
- /* Count of references to this frame. */
--#define PGC_count_width PG_shift(10)
-+#define PGC_count_width PG_shift(7)
- #define PGC_count_mask ((1UL<<PGC_count_width)-1)
-
- /*
---
-2.35.1
-
diff --git a/0025-x86-wire-up-VCPUOP_register_vcpu_time_memory_area-fo.patch b/0025-x86-wire-up-VCPUOP_register_vcpu_time_memory_area-fo.patch
new file mode 100644
index 0000000..849ef60
--- /dev/null
+++ b/0025-x86-wire-up-VCPUOP_register_vcpu_time_memory_area-fo.patch
@@ -0,0 +1,59 @@
+From 3f4da85ca8816f6617529c80850eaddd80ea0f1f Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 11 Oct 2022 15:01:36 +0200
+Subject: [PATCH 25/26] x86: wire up VCPUOP_register_vcpu_time_memory_area for
+ 32-bit guests
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Ever since its introduction, VCPUOP_register_vcpu_time_memory_area has
+been available only to native domains. Linux, for example, would attempt
+to use it irrespective of guest bitness (including in its so-called
+PVHVM mode) as long as it finds XEN_PVCLOCK_TSC_STABLE_BIT set (which we
+set only for clocksource=tsc, which in turn needs engaging via command
+line option).
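As context for the compat hunk below: this is the call a guest makes to register a secondary vcpu_time_info area. A hypothetical guest-side sketch (types per xen/include/public/vcpu.h; error handling omitted, and the hypercall wrapper name follows the Linux convention):

    static struct vcpu_time_info ti;

    /* Sketch: after this patch, a 32-bit guest can do this too. */
    static void register_secondary_time_area(int cpu)
    {
        struct vcpu_register_time_memory_area area = { .addr.v = &ti };

        HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, cpu, &area);
    }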
+ +Fixes: a5d39947cb89 ("Allow guests to register secondary vcpu_time_info") +Signed-off-by: Jan Beulich <jbeulich@suse.com> +Acked-by: Roger Pau Monné <roger.pau@citrix.com> +master commit: b726541d94bd0a80b5864d17a2cd2e6d73a3fe0a +master date: 2022-09-29 14:47:45 +0200 +--- + xen/arch/x86/x86_64/domain.c | 20 ++++++++++++++++++++ + 1 file changed, 20 insertions(+) + +diff --git a/xen/arch/x86/x86_64/domain.c b/xen/arch/x86/x86_64/domain.c +index c46dccc25a54..d51d99344796 100644 +--- a/xen/arch/x86/x86_64/domain.c ++++ b/xen/arch/x86/x86_64/domain.c +@@ -54,6 +54,26 @@ arch_compat_vcpu_op( + break; + } + ++ case VCPUOP_register_vcpu_time_memory_area: ++ { ++ struct compat_vcpu_register_time_memory_area area = { .addr.p = 0 }; ++ ++ rc = -EFAULT; ++ if ( copy_from_guest(&area.addr.h, arg, 1) ) ++ break; ++ ++ if ( area.addr.h.c != area.addr.p || ++ !compat_handle_okay(area.addr.h, 1) ) ++ break; ++ ++ rc = 0; ++ guest_from_compat_handle(v->arch.time_info_guest, area.addr.h); ++ ++ force_update_vcpu_system_time(v); ++ ++ break; ++ } ++ + case VCPUOP_get_physid: + rc = arch_do_vcpu_op(cmd, v, arg); + break; +-- +2.37.3 + diff --git a/0026-x86-Split-cache_flush-out-of-cache_writeback.patch b/0026-x86-Split-cache_flush-out-of-cache_writeback.patch deleted file mode 100644 index 50f70f4..0000000 --- a/0026-x86-Split-cache_flush-out-of-cache_writeback.patch +++ /dev/null @@ -1,294 +0,0 @@ -From 8eafa2d871ae51d461256e4a14175e24df330c70 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Thu, 9 Jun 2022 15:28:48 +0200 -Subject: [PATCH 26/51] x86: Split cache_flush() out of cache_writeback() - -Subsequent changes will want a fully flushing version. - -Use the new helper rather than opencoding it in flush_area_local(). This -resolves an outstanding issue where the conditional sfence is on the wrong -side of the clflushopt loop. clflushopt is ordered with respect to older -stores, not to younger stores. - -Rename gnttab_cache_flush()'s helper to avoid colliding in name. -grant_table.c can see the prototype from cache.h so the build fails -otherwise. - -This is part of XSA-402. 
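The ordering point made above is worth spelling out: CLFLUSHOPT is ordered only against older stores, so the serialising SFENCE must follow the flush loop rather than precede it. A minimal sketch (illustrative only; assumes an assembler that knows clflushopt):

    static void flush_range(const void *addr, size_t size, size_t line)
    {
        const void *end = addr + size;

        /* Align down to the start of the first cache line. */
        addr = (const void *)((unsigned long)addr & ~(line - 1));

        for ( ; addr < end; addr += line )
            asm volatile ( "clflushopt %0" :: "m" (*(const char *)addr) );

        /* The fence goes *after* the loop: a leading sfence would only
         * order the flushes against stores older than itself. */
        asm volatile ( "sfence" ::: "memory" );
    }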
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 9a67ffee3371506e1cbfdfff5b90658d4828f6a2 -master date: 2022-06-09 14:22:38 +0200 ---- - xen/arch/x86/flushtlb.c | 84 ++++++++++++++++++++++++--- - xen/common/grant_table.c | 4 +- - xen/drivers/passthrough/vtd/extern.h | 1 - - xen/drivers/passthrough/vtd/iommu.c | 53 +---------------- - xen/drivers/passthrough/vtd/x86/vtd.c | 5 -- - xen/include/asm-x86/cache.h | 7 +++ - 6 files changed, 88 insertions(+), 66 deletions(-) - -diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c -index 25798df50f54..0c912b8669f8 100644 ---- a/xen/arch/x86/flushtlb.c -+++ b/xen/arch/x86/flushtlb.c -@@ -234,7 +234,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags) - if ( flags & FLUSH_CACHE ) - { - const struct cpuinfo_x86 *c = ¤t_cpu_data; -- unsigned long i, sz = 0; -+ unsigned long sz = 0; - - if ( order < (BITS_PER_LONG - PAGE_SHIFT) ) - sz = 1UL << (order + PAGE_SHIFT); -@@ -244,13 +244,7 @@ unsigned int flush_area_local(const void *va, unsigned int flags) - c->x86_clflush_size && c->x86_cache_size && sz && - ((sz >> 10) < c->x86_cache_size) ) - { -- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT); -- for ( i = 0; i < sz; i += c->x86_clflush_size ) -- alternative_input(".byte " __stringify(NOP_DS_PREFIX) ";" -- " clflush %0", -- "data16 clflush %0", /* clflushopt */ -- X86_FEATURE_CLFLUSHOPT, -- "m" (((const char *)va)[i])); -+ cache_flush(va, sz); - flags &= ~FLUSH_CACHE; - } - else -@@ -265,6 +259,80 @@ unsigned int flush_area_local(const void *va, unsigned int flags) - return flags; - } - -+void cache_flush(const void *addr, unsigned int size) -+{ -+ /* -+ * This function may be called before current_cpu_data is established. -+ * Hence a fallback is needed to prevent the loop below becoming infinite. -+ */ -+ unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16; -+ const void *end = addr + size; -+ -+ addr -= (unsigned long)addr & (clflush_size - 1); -+ for ( ; addr < end; addr += clflush_size ) -+ { -+ /* -+ * Note regarding the "ds" prefix use: it's faster to do a clflush -+ * + prefix than a clflush + nop, and hence the prefix is added instead -+ * of letting the alternative framework fill the gap by appending nops. -+ */ -+ alternative_io("ds; clflush %[p]", -+ "data16 clflush %[p]", /* clflushopt */ -+ X86_FEATURE_CLFLUSHOPT, -+ /* no outputs */, -+ [p] "m" (*(const char *)(addr))); -+ } -+ -+ alternative("", "sfence", X86_FEATURE_CLFLUSHOPT); -+} -+ -+void cache_writeback(const void *addr, unsigned int size) -+{ -+ unsigned int clflush_size; -+ const void *end = addr + size; -+ -+ /* Fall back to CLFLUSH{,OPT} when CLWB isn't available. */ -+ if ( !boot_cpu_has(X86_FEATURE_CLWB) ) -+ return cache_flush(addr, size); -+ -+ /* -+ * This function may be called before current_cpu_data is established. -+ * Hence a fallback is needed to prevent the loop below becoming infinite. -+ */ -+ clflush_size = current_cpu_data.x86_clflush_size ?: 16; -+ addr -= (unsigned long)addr & (clflush_size - 1); -+ for ( ; addr < end; addr += clflush_size ) -+ { -+/* -+ * The arguments to a macro must not include preprocessor directives. Doing so -+ * results in undefined behavior, so we have to create some defines here in -+ * order to avoid it. 
-+ */ -+#if defined(HAVE_AS_CLWB) -+# define CLWB_ENCODING "clwb %[p]" -+#elif defined(HAVE_AS_XSAVEOPT) -+# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */ -+#else -+# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */ -+#endif -+ -+#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr)) -+#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT) -+# define INPUT BASE_INPUT -+#else -+# define INPUT(addr) "a" (addr), BASE_INPUT(addr) -+#endif -+ -+ asm volatile (CLWB_ENCODING :: INPUT(addr)); -+ -+#undef INPUT -+#undef BASE_INPUT -+#undef CLWB_ENCODING -+ } -+ -+ asm volatile ("sfence" ::: "memory"); -+} -+ - unsigned int guest_flush_tlb_flags(const struct domain *d) - { - bool shadow = paging_mode_shadow(d); -diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c -index 66f8ce71741c..4c742cd8fe81 100644 ---- a/xen/common/grant_table.c -+++ b/xen/common/grant_table.c -@@ -3431,7 +3431,7 @@ gnttab_swap_grant_ref(XEN_GUEST_HANDLE_PARAM(gnttab_swap_grant_ref_t) uop, - return 0; - } - --static int cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref) -+static int _cache_flush(const gnttab_cache_flush_t *cflush, grant_ref_t *cur_ref) - { - struct domain *d, *owner; - struct page_info *page; -@@ -3525,7 +3525,7 @@ gnttab_cache_flush(XEN_GUEST_HANDLE_PARAM(gnttab_cache_flush_t) uop, - return -EFAULT; - for ( ; ; ) - { -- int ret = cache_flush(&op, cur_ref); -+ int ret = _cache_flush(&op, cur_ref); - - if ( ret < 0 ) - return ret; -diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h -index 01e010a10d61..401079299725 100644 ---- a/xen/drivers/passthrough/vtd/extern.h -+++ b/xen/drivers/passthrough/vtd/extern.h -@@ -76,7 +76,6 @@ int __must_check qinval_device_iotlb_sync(struct vtd_iommu *iommu, - struct pci_dev *pdev, - u16 did, u16 size, u64 addr); - --unsigned int get_cache_line_size(void); - void flush_all_cache(void); - - uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node); -diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c -index 8975c1de61bc..bc377c9bcfa4 100644 ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -31,6 +31,7 @@ - #include <xen/pci.h> - #include <xen/pci_regs.h> - #include <xen/keyhandler.h> -+#include <asm/cache.h> - #include <asm/msi.h> - #include <asm/nops.h> - #include <asm/irq.h> -@@ -206,54 +207,6 @@ static void check_cleanup_domid_map(const struct domain *d, - } - } - --static void sync_cache(const void *addr, unsigned int size) --{ -- static unsigned long clflush_size = 0; -- const void *end = addr + size; -- -- if ( clflush_size == 0 ) -- clflush_size = get_cache_line_size(); -- -- addr -= (unsigned long)addr & (clflush_size - 1); -- for ( ; addr < end; addr += clflush_size ) --/* -- * The arguments to a macro must not include preprocessor directives. Doing so -- * results in undefined behavior, so we have to create some defines here in -- * order to avoid it. 
-- */
--#if defined(HAVE_AS_CLWB)
--# define CLWB_ENCODING "clwb %[p]"
--#elif defined(HAVE_AS_XSAVEOPT)
--# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
--#else
--# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
--#endif
--
--#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
--#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
--# define INPUT BASE_INPUT
--#else
--# define INPUT(addr) "a" (addr), BASE_INPUT(addr)
--#endif
-- /*
-- * Note regarding the use of NOP_DS_PREFIX: it's faster to do a clflush
-- * + prefix than a clflush + nop, and hence the prefix is added instead
-- * of letting the alternative framework fill the gap by appending nops.
-- */
-- alternative_io_2(".byte " __stringify(NOP_DS_PREFIX) "; clflush %[p]",
-- "data16 clflush %[p]", /* clflushopt */
-- X86_FEATURE_CLFLUSHOPT,
-- CLWB_ENCODING,
-- X86_FEATURE_CLWB, /* no outputs */,
-- INPUT(addr));
--#undef INPUT
--#undef BASE_INPUT
--#undef CLWB_ENCODING
--
-- alternative_2("", "sfence", X86_FEATURE_CLFLUSHOPT,
-- "sfence", X86_FEATURE_CLWB);
--}
--
- /* Allocate page table, return its machine address */
- uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node)
- {
-@@ -273,7 +226,7 @@ uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node)
- clear_page(vaddr);
-
- if ( (iommu_ops.init ? &iommu_ops : &vtd_ops)->sync_cache )
-- sync_cache(vaddr, PAGE_SIZE);
-+ cache_writeback(vaddr, PAGE_SIZE);
- unmap_domain_page(vaddr);
- cur_pg++;
- }
-@@ -1305,7 +1258,7 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd)
- iommu->nr_pt_levels = agaw_to_level(agaw);
-
- if ( !ecap_coherent(iommu->ecap) )
-- vtd_ops.sync_cache = sync_cache;
-+ vtd_ops.sync_cache = cache_writeback;
-
- /* allocate domain id bitmap */
- iommu->domid_bitmap = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_dom));
-diff --git a/xen/drivers/passthrough/vtd/x86/vtd.c b/xen/drivers/passthrough/vtd/x86/vtd.c
-index 6681dccd6970..55f0faa521cb 100644
---- a/xen/drivers/passthrough/vtd/x86/vtd.c
-+++ b/xen/drivers/passthrough/vtd/x86/vtd.c
-@@ -47,11 +47,6 @@ void unmap_vtd_domain_page(const void *va)
- unmap_domain_page(va);
- }
-
--unsigned int get_cache_line_size(void)
--{
-- return ((cpuid_ebx(1) >> 8) & 0xff) * 8;
--}
--
- void flush_all_cache()
- {
- wbinvd();
-diff --git a/xen/include/asm-x86/cache.h b/xen/include/asm-x86/cache.h
-index 1f7173d8c72c..e4770efb22b9 100644
---- a/xen/include/asm-x86/cache.h
-+++ b/xen/include/asm-x86/cache.h
-@@ -11,4 +11,11 @@
-
- #define __read_mostly __section(".data.read_mostly")
-
-+#ifndef __ASSEMBLY__
-+
-+void cache_flush(const void *addr, unsigned int size);
-+void cache_writeback(const void *addr, unsigned int size);
-+
-+#endif
-+
- #endif
---
-2.35.1
-
diff --git a/0026-x86-vpmu-Fix-race-condition-in-vpmu_load.patch b/0026-x86-vpmu-Fix-race-condition-in-vpmu_load.patch
new file mode 100644
index 0000000..0f33747
--- /dev/null
+++ b/0026-x86-vpmu-Fix-race-condition-in-vpmu_load.patch
@@ -0,0 +1,97 @@
+From 1bce7fb1f702da4f7a749c6f1457ecb20bf74fca Mon Sep 17 00:00:00 2001
+From: Tamas K Lengyel <tamas.lengyel@intel.com>
+Date: Tue, 11 Oct 2022 15:01:48 +0200
+Subject: [PATCH 26/26] x86/vpmu: Fix race-condition in vpmu_load
+
+The vPMU code-base attempts to perform an optimization on saving/reloading the
+PMU context by keeping track of what vCPU ran on each pCPU. When a pCPU is
+getting scheduled, it checks whether the previous vCPU is the current one, and
+if not it attempts a call to vpmu_save_force. Unfortunately, if the previous
+vCPU is already being scheduled to run on another pCPU, its state will already
+be runnable, which results in an ASSERT failure.
+
+Fix this by always performing a pmu context save in vpmu_save when called from
+vpmu_switch_from, and doing a vpmu_load when called from vpmu_switch_to.
+
+While this introduces a small overhead when the same vCPU is getting
+rescheduled on the same pCPU, the ASSERT failure is avoided and the code is a
+lot easier to reason about.
+
+Signed-off-by: Tamas K Lengyel <tamas.lengyel@intel.com>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+master commit: defa4e51d20a143bdd4395a075bf0933bb38a9a4
+master date: 2022-09-30 09:53:49 +0200
+---
+ xen/arch/x86/cpu/vpmu.c | 42 ++++-------------------------------------
+ 1 file changed, 4 insertions(+), 38 deletions(-)
+
+diff --git a/xen/arch/x86/cpu/vpmu.c b/xen/arch/x86/cpu/vpmu.c
+index 16e91a3694fe..b6c2ec3cd047 100644
+--- a/xen/arch/x86/cpu/vpmu.c
++++ b/xen/arch/x86/cpu/vpmu.c
+@@ -368,58 +368,24 @@ void vpmu_save(struct vcpu *v)
+ vpmu->last_pcpu = pcpu;
+ per_cpu(last_vcpu, pcpu) = v;
+
++ vpmu_set(vpmu, VPMU_CONTEXT_SAVE);
++
+ if ( vpmu->arch_vpmu_ops )
+ if ( vpmu->arch_vpmu_ops->arch_vpmu_save(v, 0) )
+ vpmu_reset(vpmu, VPMU_CONTEXT_LOADED);
+
++ vpmu_reset(vpmu, VPMU_CONTEXT_SAVE);
++
+ apic_write(APIC_LVTPC, PMU_APIC_VECTOR | APIC_LVT_MASKED);
+ }
+
+ int vpmu_load(struct vcpu *v, bool_t from_guest)
+ {
+ struct vpmu_struct *vpmu = vcpu_vpmu(v);
+- int pcpu = smp_processor_id();
+- struct vcpu *prev = NULL;
+
+ if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_ALLOCATED) )
+ return 0;
+
+- /* First time this VCPU is running here */
+- if ( vpmu->last_pcpu != pcpu )
+- {
+- /*
+- * Get the context from last pcpu that we ran on. Note that if another
+- * VCPU is running there it must have saved this VPCU's context before
+- * startig to run (see below).
+- * There should be no race since remote pcpu will disable interrupts
+- * before saving the context.
+- */
+- if ( vpmu_is_set(vpmu, VPMU_CONTEXT_LOADED) )
+- {
+- on_selected_cpus(cpumask_of(vpmu->last_pcpu),
+- vpmu_save_force, (void *)v, 1);
+- vpmu_reset(vpmu, VPMU_CONTEXT_LOADED);
+- }
+- }
+-
+- /* Prevent forced context save from remote CPU */
+- local_irq_disable();
+-
+- prev = per_cpu(last_vcpu, pcpu);
+-
+- if ( prev != v && prev )
+- {
+- vpmu = vcpu_vpmu(prev);
+-
+- /* Someone ran here before us */
+- vpmu_save_force(prev);
+- vpmu_reset(vpmu, VPMU_CONTEXT_LOADED);
+-
+- vpmu = vcpu_vpmu(v);
+- }
+-
+- local_irq_enable();
+-
+ /* Only when PMU is counting, we load PMU context immediately. */
+ if ( !vpmu_is_set(vpmu, VPMU_RUNNING) ||
+ (!has_vlapic(vpmu_vcpu(vpmu)->domain) &&
+--
+2.37.3
+
diff --git a/0027-x86-amd-Work-around-CLFLUSH-ordering-on-older-parts.patch b/0027-x86-amd-Work-around-CLFLUSH-ordering-on-older-parts.patch
deleted file mode 100644
index 060bc99..0000000
--- a/0027-x86-amd-Work-around-CLFLUSH-ordering-on-older-parts.patch
+++ /dev/null
@@ -1,95 +0,0 @@
-From c4815be949aae6583a9a22897beb96b095b4f1a2 Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Thu, 9 Jun 2022 15:29:13 +0200
-Subject: [PATCH 27/51] x86/amd: Work around CLFLUSH ordering on older parts
-
-On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything,
-including reads and writes to the address, and LFENCE/SFENCE instructions.
-
-This creates a multitude of problematic corner cases, laid out in the manual.
-Arrange to use MFENCE on both sides of the CLFLUSH to force proper ordering.
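To make the AMD corner case concrete: on the affected parts neither LFENCE nor SFENCE orders CLFLUSH, so the workaround brackets the flush loop with MFENCE on both sides. A hypothetical sketch (the patched Xen code does this via the alternatives framework instead):

    static void clflush_range_amd(const void *addr, size_t size, size_t line)
    {
        const void *end = addr + size;

        addr = (const void *)((unsigned long)addr & ~(line - 1));

        asm volatile ( "mfence" ::: "memory" );  /* order vs. older accesses */
        for ( ; addr < end; addr += line )
            asm volatile ( "clflush %0" :: "m" (*(const char *)addr) );
        asm volatile ( "mfence" ::: "memory" );  /* order vs. younger accesses */
    }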
- -This is part of XSA-402. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 062868a5a8b428b85db589fa9a6d6e43969ffeb9 -master date: 2022-06-09 14:23:07 +0200 ---- - xen/arch/x86/cpu/amd.c | 8 ++++++++ - xen/arch/x86/flushtlb.c | 13 ++++++++++++- - xen/include/asm-x86/cpufeatures.h | 1 + - 3 files changed, 21 insertions(+), 1 deletion(-) - -diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c -index a8e37dbb1f5c..b3b9a0df5fed 100644 ---- a/xen/arch/x86/cpu/amd.c -+++ b/xen/arch/x86/cpu/amd.c -@@ -812,6 +812,14 @@ static void init_amd(struct cpuinfo_x86 *c) - if (!cpu_has_lfence_dispatch) - __set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability); - -+ /* -+ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with -+ * everything, including reads and writes to address, and -+ * LFENCE/SFENCE instructions. -+ */ -+ if (!cpu_has_clflushopt) -+ setup_force_cpu_cap(X86_BUG_CLFLUSH_MFENCE); -+ - switch(c->x86) - { - case 0xf ... 0x11: -diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c -index 0c912b8669f8..dcbb4064012e 100644 ---- a/xen/arch/x86/flushtlb.c -+++ b/xen/arch/x86/flushtlb.c -@@ -259,6 +259,13 @@ unsigned int flush_area_local(const void *va, unsigned int flags) - return flags; - } - -+/* -+ * On pre-CLFLUSHOPT AMD CPUs, CLFLUSH is weakly ordered with everything, -+ * including reads and writes to address, and LFENCE/SFENCE instructions. -+ * -+ * This function only works safely after alternatives have run. Luckily, at -+ * the time of writing, we don't flush the caches that early. -+ */ - void cache_flush(const void *addr, unsigned int size) - { - /* -@@ -268,6 +275,8 @@ void cache_flush(const void *addr, unsigned int size) - unsigned int clflush_size = current_cpu_data.x86_clflush_size ?: 16; - const void *end = addr + size; - -+ alternative("", "mfence", X86_BUG_CLFLUSH_MFENCE); -+ - addr -= (unsigned long)addr & (clflush_size - 1); - for ( ; addr < end; addr += clflush_size ) - { -@@ -283,7 +292,9 @@ void cache_flush(const void *addr, unsigned int size) - [p] "m" (*(const char *)(addr))); - } - -- alternative("", "sfence", X86_FEATURE_CLFLUSHOPT); -+ alternative_2("", -+ "sfence", X86_FEATURE_CLFLUSHOPT, -+ "mfence", X86_BUG_CLFLUSH_MFENCE); - } - - void cache_writeback(const void *addr, unsigned int size) -diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h -index 7413febd7ad8..ff3157d52d13 100644 ---- a/xen/include/asm-x86/cpufeatures.h -+++ b/xen/include/asm-x86/cpufeatures.h -@@ -47,6 +47,7 @@ XEN_CPUFEATURE(XEN_IBT, X86_SYNTH(27)) /* Xen uses CET Indirect Branch - - #define X86_BUG_FPU_PTRS X86_BUG( 0) /* (F)X{SAVE,RSTOR} doesn't save/restore FOP/FIP/FDP. */ - #define X86_BUG_NULL_SEG X86_BUG( 1) /* NULL-ing a selector preserves the base and limit. */ -+#define X86_BUG_CLFLUSH_MFENCE X86_BUG( 2) /* MFENCE needed to serialise CLFLUSH */ - - /* Total number of capability words, inc synth and bug words. 
*/
- #define NCAPINTS (FSCAPINTS + X86_NR_SYNTH + X86_NR_BUG) /* N 32-bit words worth of info */
---
-2.35.1
-
diff --git a/0028-x86-pv-Track-and-flush-non-coherent-mappings-of-RAM.patch b/0028-x86-pv-Track-and-flush-non-coherent-mappings-of-RAM.patch
deleted file mode 100644
index af60348..0000000
--- a/0028-x86-pv-Track-and-flush-non-coherent-mappings-of-RAM.patch
+++ /dev/null
@@ -1,160 +0,0 @@
-From dc020d8d1ba420e2dd0e7a40f5045db897f3c4f4 Mon Sep 17 00:00:00 2001
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Date: Thu, 9 Jun 2022 15:29:38 +0200
-Subject: [PATCH 28/51] x86/pv: Track and flush non-coherent mappings of RAM
-
-There are legitimate uses of WC mappings of RAM, e.g. for DMA buffers with
-devices that make non-coherent writes. The Linux sound subsystem makes
-extensive use of this technique.
-
-For such use cases, the guest's DMA buffer is mapped and consistently used as
-WC, and Xen doesn't interact with the buffer.
-
-However, a mischievous guest can use WC mappings to deliberately create
-non-coherency between the cache and RAM, and use this to trick Xen into
-validating a pagetable which isn't actually safe.
-
-Allocate a new PGT_non_coherent to track the non-coherency of mappings. Set
-it whenever a non-coherent writeable mapping is created. If the page is used
-as anything other than PGT_writable_page, force a cache flush before
-validation. Also force a cache flush before the page is returned to the heap.
-
-This is CVE-2022-26364, part of XSA-402.
-
-Reported-by: Jann Horn <jannh@google.com>
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: George Dunlap <george.dunlap@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-master commit: c1c9cae3a9633054b177c5de21ad7268162b2f2c
-master date: 2022-06-09 14:23:37 +0200
----
- xen/arch/x86/mm.c | 38 +++++++++++++++++++++++++++++++++++
- xen/arch/x86/pv/grant_table.c | 21 +++++++++++++++++++
- xen/include/asm-x86/mm.h | 6 +++++-
- 3 files changed, 64 insertions(+), 1 deletion(-)
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index ab32d13a1a0d..bab9624fabb7 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -997,6 +997,15 @@ get_page_from_l1e(
- return -EACCES;
- }
-
-+ /*
-+ * Track writeable non-coherent mappings to RAM pages, to trigger a cache
-+ * flush later if the target is used as anything but a PGT_writeable page.
-+ * We care about all writeable mappings, including foreign mappings.
-+ */
-+ if ( !boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) &&
-+ (l1f & (PAGE_CACHE_ATTRS | _PAGE_RW)) == (_PAGE_WC | _PAGE_RW) )
-+ set_bit(_PGT_non_coherent, &page->u.inuse.type_info);
-+
- return 0;
-
- could_not_pin:
-@@ -2454,6 +2463,19 @@ static int cleanup_page_mappings(struct page_info *page)
- }
- }
-
-+ /*
-+ * Flush the cache if there were previously non-coherent writeable
-+ * mappings of this page. This forces the page to be coherent before it
-+ * is freed back to the heap.
-+ */
-+ if ( __test_and_clear_bit(_PGT_non_coherent, &page->u.inuse.type_info) )
-+ {
-+ void *addr = __map_domain_page(page);
-+
-+ cache_flush(addr, PAGE_SIZE);
-+ unmap_domain_page(addr);
-+ }
-+
- return rc;
- }
-
-@@ -3027,6 +3049,22 @@ static int _get_page_type(struct page_info *page, unsigned long type,
-
- if ( unlikely(!(nx & PGT_validated)) )
- {
-+ /*
-+ * Flush the cache if there were previously non-coherent mappings of
-+ * this page, and we're trying to use it as anything other than a
-+ * writeable page. This forces the page to be coherent before we
-+ * validate its contents for safety.
-+ */ -+ if ( (nx & PGT_non_coherent) && type != PGT_writable_page ) -+ { -+ void *addr = __map_domain_page(page); -+ -+ cache_flush(addr, PAGE_SIZE); -+ unmap_domain_page(addr); -+ -+ page->u.inuse.type_info &= ~PGT_non_coherent; -+ } -+ - /* - * No special validation needed for writable or shared pages. Page - * tables and GDT/LDT need to have their contents audited. -diff --git a/xen/arch/x86/pv/grant_table.c b/xen/arch/x86/pv/grant_table.c -index 0325618c9883..81c72e61ed55 100644 ---- a/xen/arch/x86/pv/grant_table.c -+++ b/xen/arch/x86/pv/grant_table.c -@@ -109,7 +109,17 @@ int create_grant_pv_mapping(uint64_t addr, mfn_t frame, - - ol1e = *pl1e; - if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) ) -+ { -+ /* -+ * We always create mappings in this path. However, our caller, -+ * map_grant_ref(), only passes potentially non-zero cache_flags for -+ * MMIO frames, so this path doesn't create non-coherent mappings of -+ * RAM frames and there's no need to calculate PGT_non_coherent. -+ */ -+ ASSERT(!cache_flags || is_iomem_page(frame)); -+ - rc = GNTST_okay; -+ } - - out_unlock: - page_unlock(page); -@@ -294,7 +304,18 @@ int replace_grant_pv_mapping(uint64_t addr, mfn_t frame, - l1e_get_flags(ol1e), addr, grant_pte_flags); - - if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) ) -+ { -+ /* -+ * Generally, replace_grant_pv_mapping() is used to destroy mappings -+ * (n1le = l1e_empty()), but it can be a present mapping on the -+ * GNTABOP_unmap_and_replace path. -+ * -+ * In such cases, the PTE is fully transplanted from its old location -+ * via steal_linear_addr(), so we need not perform PGT_non_coherent -+ * checking here. -+ */ - rc = GNTST_okay; -+ } - - out_unlock: - page_unlock(page); -diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h -index 8a9a43bb0a9d..7464167ae192 100644 ---- a/xen/include/asm-x86/mm.h -+++ b/xen/include/asm-x86/mm.h -@@ -53,8 +53,12 @@ - #define _PGT_partial PG_shift(8) - #define PGT_partial PG_mask(1, 8) - -+/* Has this page been mapped writeable with a non-coherent memory type? */ -+#define _PGT_non_coherent PG_shift(9) -+#define PGT_non_coherent PG_mask(1, 9) -+ - /* Count of uses of this frame as its current type. */ --#define PGT_count_width PG_shift(8) -+#define PGT_count_width PG_shift(9) - #define PGT_count_mask ((1UL<<PGT_count_width)-1) - - /* Are the 'type mask' bits identical? */ --- -2.35.1 - diff --git a/0029-x86-mm-account-for-PGT_pae_xen_l2-in-recently-added-.patch b/0029-x86-mm-account-for-PGT_pae_xen_l2-in-recently-added-.patch deleted file mode 100644 index 90ce4cf..0000000 --- a/0029-x86-mm-account-for-PGT_pae_xen_l2-in-recently-added-.patch +++ /dev/null @@ -1,37 +0,0 @@ -From 0b4e62847c5af1a59eea8d17093feccd550d1c26 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Fri, 10 Jun 2022 10:28:28 +0200 -Subject: [PATCH 29/51] x86/mm: account for PGT_pae_xen_l2 in recently added - assertion - -While PGT_pae_xen_l2 will be zapped once the type refcount of an L2 page -reaches zero, it'll be retained as long as the type refcount is non- -zero. Hence any checking against the requested type needs to either zap -the bit from the type or include it in the used mask. 
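A worked example of why the mask needed the extra bit (bit values symbolic):

    /*
     * Suppose a PAE L2 with Xen mappings:
     *   x    = PGT_l2_page_table | PGT_pae_xen_l2 | 1   (one ref, not yet validated)
     *   type = PGT_l2_page_table | PGT_pae_xen_l2       (caller's request)
     * Old check: x & (PGT_type_mask | PGT_count_mask) strips PGT_pae_xen_l2
     * from 'x', while (type | 1) keeps it, so the ASSERT fires spuriously.
     * Including PGT_pae_xen_l2 in the mask keeps both sides consistent.
     */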
- -Fixes: 9186e96b199e ("x86/pv: Clean up _get_page_type()") -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> -master commit: c2095ac76be0f4a1940346c9ffb49fb967345060 -master date: 2022-06-10 10:21:06 +0200 ---- - xen/arch/x86/mm.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index bab9624fabb7..c1b9a3bb102a 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -2928,7 +2928,8 @@ static int _get_page_type(struct page_info *page, unsigned long type, - * The page is in one of two states (depending on PGT_partial), - * and should have exactly one reference. - */ -- ASSERT((x & (PGT_type_mask | PGT_count_mask)) == (type | 1)); -+ ASSERT((x & (PGT_type_mask | PGT_pae_xen_l2 | PGT_count_mask)) == -+ (type | 1)); - - if ( !(x & PGT_partial) ) - { --- -2.35.1 - diff --git a/0030-x86-spec-ctrl-Make-VERW-flushing-runtime-conditional.patch b/0030-x86-spec-ctrl-Make-VERW-flushing-runtime-conditional.patch deleted file mode 100644 index af25b5c..0000000 --- a/0030-x86-spec-ctrl-Make-VERW-flushing-runtime-conditional.patch +++ /dev/null @@ -1,258 +0,0 @@ -From 0e80f9f61168d4e4f008da75762cee0118f802ed Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Mon, 13 Jun 2022 16:19:01 +0100 -Subject: [PATCH 30/51] x86/spec-ctrl: Make VERW flushing runtime conditional -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Currently, VERW flushing to mitigate MDS is boot time conditional per domain -type. However, to provide mitigations for DRPW (CVE-2022-21166), we need to -conditionally use VERW based on the trustworthiness of the guest, and the -devices passed through. - -Remove the PV/HVM alternatives and instead issue a VERW on the return-to-guest -path depending on the SCF_verw bit in cpuinfo spec_ctrl_flags. - -Introduce spec_ctrl_init_domain() and d->arch.verw to calculate the VERW -disposition at domain creation time, and context switch the SCF_verw bit. - -For now, VERW flushing is used and controlled exactly as before, but later -patches will add per-domain cases too. - -No change in behaviour. - -This is part of XSA-404. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -(cherry picked from commit e06b95c1d44ab80da255219fc9f1e2fc423edcb6) ---- - docs/misc/xen-command-line.pandoc | 5 ++--- - xen/arch/x86/domain.c | 12 ++++++++++-- - xen/arch/x86/hvm/vmx/entry.S | 2 +- - xen/arch/x86/spec_ctrl.c | 30 +++++++++++++++++------------ - xen/include/asm-x86/cpufeatures.h | 3 +-- - xen/include/asm-x86/domain.h | 3 +++ - xen/include/asm-x86/spec_ctrl.h | 2 ++ - xen/include/asm-x86/spec_ctrl_asm.h | 16 +++++++++++++-- - 8 files changed, 51 insertions(+), 22 deletions(-) - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index 1d08fb7e9aa6..d5cb09f86541 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -2258,9 +2258,8 @@ in place for guests to use. - Use of a positive boolean value for either of these options is invalid. - - The booleans `pv=`, `hvm=`, `msr-sc=`, `rsb=` and `md-clear=` offer fine --grained control over the alternative blocks used by Xen. These impact Xen's --ability to protect itself, and Xen's ability to virtualise support for guests --to use. -+grained control over the primitives by Xen. 
These impact Xen's ability to -+protect itself, and Xen's ability to virtualise support for guests to use. - - * `pv=` and `hvm=` offer control over all suboptions for PV and HVM guests - respectively. -diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c -index ef1812dc1402..1fe6644a71ae 100644 ---- a/xen/arch/x86/domain.c -+++ b/xen/arch/x86/domain.c -@@ -863,6 +863,8 @@ int arch_domain_create(struct domain *d, - - d->arch.msr_relaxed = config->arch.misc_flags & XEN_X86_MSR_RELAXED; - -+ spec_ctrl_init_domain(d); -+ - return 0; - - fail: -@@ -2017,14 +2019,15 @@ static void __context_switch(void) - void context_switch(struct vcpu *prev, struct vcpu *next) - { - unsigned int cpu = smp_processor_id(); -+ struct cpu_info *info = get_cpu_info(); - const struct domain *prevd = prev->domain, *nextd = next->domain; - unsigned int dirty_cpu = read_atomic(&next->dirty_cpu); - - ASSERT(prev != next); - ASSERT(local_irq_is_enabled()); - -- get_cpu_info()->use_pv_cr3 = false; -- get_cpu_info()->xen_cr3 = 0; -+ info->use_pv_cr3 = false; -+ info->xen_cr3 = 0; - - if ( unlikely(dirty_cpu != cpu) && dirty_cpu != VCPU_CPU_CLEAN ) - { -@@ -2088,6 +2091,11 @@ void context_switch(struct vcpu *prev, struct vcpu *next) - *last_id = next_id; - } - } -+ -+ /* Update the top-of-stack block with the VERW disposition. */ -+ info->spec_ctrl_flags &= ~SCF_verw; -+ if ( nextd->arch.verw ) -+ info->spec_ctrl_flags |= SCF_verw; - } - - sched_context_switched(prev, next); -diff --git a/xen/arch/x86/hvm/vmx/entry.S b/xen/arch/x86/hvm/vmx/entry.S -index 49651f3c435a..5f5de45a1309 100644 ---- a/xen/arch/x86/hvm/vmx/entry.S -+++ b/xen/arch/x86/hvm/vmx/entry.S -@@ -87,7 +87,7 @@ UNLIKELY_END(realmode) - - /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ - /* SPEC_CTRL_EXIT_TO_VMX Req: %rsp=regs/cpuinfo Clob: */ -- ALTERNATIVE "", __stringify(verw CPUINFO_verw_sel(%rsp)), X86_FEATURE_SC_VERW_HVM -+ DO_SPEC_CTRL_COND_VERW - - mov VCPU_hvm_guest_cr2(%rbx),%rax - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index c19464da70ce..21730aa03071 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -36,8 +36,8 @@ static bool __initdata opt_msr_sc_pv = true; - static bool __initdata opt_msr_sc_hvm = true; - static int8_t __initdata opt_rsb_pv = -1; - static bool __initdata opt_rsb_hvm = true; --static int8_t __initdata opt_md_clear_pv = -1; --static int8_t __initdata opt_md_clear_hvm = -1; -+static int8_t __read_mostly opt_md_clear_pv = -1; -+static int8_t __read_mostly opt_md_clear_hvm = -1; - - /* Cmdline controls for Xen's speculative settings. */ - static enum ind_thunk { -@@ -932,6 +932,13 @@ static __init void mds_calculations(uint64_t caps) - } - } - -+void spec_ctrl_init_domain(struct domain *d) -+{ -+ bool pv = is_pv_domain(d); -+ -+ d->arch.verw = pv ? opt_md_clear_pv : opt_md_clear_hvm; -+} -+ - void __init init_speculation_mitigations(void) - { - enum ind_thunk thunk = THUNK_DEFAULT; -@@ -1196,21 +1203,20 @@ void __init init_speculation_mitigations(void) - boot_cpu_has(X86_FEATURE_MD_CLEAR)); - - /* -- * Enable MDS defences as applicable. The PV blocks need using all the -- * time, and the Idle blocks need using if either PV or HVM defences are -- * used. -+ * Enable MDS defences as applicable. The Idle blocks need using if -+ * either PV or HVM defences are used. - * - * HVM is more complicated. The MD_CLEAR microcode extends L1D_FLUSH with -- * equivelent semantics to avoid needing to perform both flushes on the -- * HVM path. 
The HVM blocks don't need activating if our hypervisor told -- * us it was handling L1D_FLUSH, or we are using L1D_FLUSH ourselves. -+ * equivalent semantics to avoid needing to perform both flushes on the -+ * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH. -+ * -+ * After calculating the appropriate idle setting, simplify -+ * opt_md_clear_hvm to mean just "should we VERW on the way into HVM -+ * guests", so spec_ctrl_init_domain() can calculate suitable settings. - */ -- if ( opt_md_clear_pv ) -- setup_force_cpu_cap(X86_FEATURE_SC_VERW_PV); - if ( opt_md_clear_pv || opt_md_clear_hvm ) - setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE); -- if ( opt_md_clear_hvm && !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush ) -- setup_force_cpu_cap(X86_FEATURE_SC_VERW_HVM); -+ opt_md_clear_hvm &= !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush; - - /* - * Warn the user if they are on MLPDS/MFBDS-vulnerable hardware with HT -diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h -index ff3157d52d13..bd45a144ee78 100644 ---- a/xen/include/asm-x86/cpufeatures.h -+++ b/xen/include/asm-x86/cpufeatures.h -@@ -35,8 +35,7 @@ XEN_CPUFEATURE(SC_RSB_HVM, X86_SYNTH(19)) /* RSB overwrite needed for HVM - XEN_CPUFEATURE(XEN_SELFSNOOP, X86_SYNTH(20)) /* SELFSNOOP gets used by Xen itself */ - XEN_CPUFEATURE(SC_MSR_IDLE, X86_SYNTH(21)) /* (SC_MSR_PV || SC_MSR_HVM) && default_xen_spec_ctrl */ - XEN_CPUFEATURE(XEN_LBR, X86_SYNTH(22)) /* Xen uses MSR_DEBUGCTL.LBR */ --XEN_CPUFEATURE(SC_VERW_PV, X86_SYNTH(23)) /* VERW used by Xen for PV */ --XEN_CPUFEATURE(SC_VERW_HVM, X86_SYNTH(24)) /* VERW used by Xen for HVM */ -+/* Bits 23,24 unused. */ - XEN_CPUFEATURE(SC_VERW_IDLE, X86_SYNTH(25)) /* VERW used by Xen for idle */ - XEN_CPUFEATURE(XEN_SHSTK, X86_SYNTH(26)) /* Xen uses CET Shadow Stacks */ - XEN_CPUFEATURE(XEN_IBT, X86_SYNTH(27)) /* Xen uses CET Indirect Branch Tracking */ -diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h -index 92d54de0b9a1..2398a1d99da9 100644 ---- a/xen/include/asm-x86/domain.h -+++ b/xen/include/asm-x86/domain.h -@@ -319,6 +319,9 @@ struct arch_domain - uint32_t pci_cf8; - uint8_t cmos_idx; - -+ /* Use VERW on return-to-guest for its flushing side effect. */ -+ bool verw; -+ - union { - struct pv_domain pv; - struct hvm_domain hvm; -diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h -index f76029523610..751355f471f4 100644 ---- a/xen/include/asm-x86/spec_ctrl.h -+++ b/xen/include/asm-x86/spec_ctrl.h -@@ -24,6 +24,7 @@ - #define SCF_use_shadow (1 << 0) - #define SCF_ist_wrmsr (1 << 1) - #define SCF_ist_rsb (1 << 2) -+#define SCF_verw (1 << 3) - - #ifndef __ASSEMBLY__ - -@@ -32,6 +33,7 @@ - #include <asm/msr-index.h> - - void init_speculation_mitigations(void); -+void spec_ctrl_init_domain(struct domain *d); - - extern bool opt_ibpb; - extern bool opt_ssbd; -diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h -index 02b3b18ce69f..5a590bac44aa 100644 ---- a/xen/include/asm-x86/spec_ctrl_asm.h -+++ b/xen/include/asm-x86/spec_ctrl_asm.h -@@ -136,6 +136,19 @@ - #endif - .endm - -+.macro DO_SPEC_CTRL_COND_VERW -+/* -+ * Requires %rsp=cpuinfo -+ * -+ * Issue a VERW for its flushing side effect, if indicated. This is a Spectre -+ * v1 gadget, but the IRET/VMEntry is serialising. 
-+ */ -+ testb $SCF_verw, CPUINFO_spec_ctrl_flags(%rsp) -+ jz .L\@_verw_skip -+ verw CPUINFO_verw_sel(%rsp) -+.L\@_verw_skip: -+.endm -+ - .macro DO_SPEC_CTRL_ENTRY maybexen:req - /* - * Requires %rsp=regs (also cpuinfo if !maybexen) -@@ -231,8 +244,7 @@ - #define SPEC_CTRL_EXIT_TO_PV \ - ALTERNATIVE "", \ - DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV; \ -- ALTERNATIVE "", __stringify(verw CPUINFO_verw_sel(%rsp)), \ -- X86_FEATURE_SC_VERW_PV -+ DO_SPEC_CTRL_COND_VERW - - /* - * Use in IST interrupt/exception context. May interrupt Xen or PV context. --- -2.35.1 - diff --git a/0031-x86-spec-ctrl-Enumeration-for-MMIO-Stale-Data-contro.patch b/0031-x86-spec-ctrl-Enumeration-for-MMIO-Stale-Data-contro.patch deleted file mode 100644 index 3b91fb5..0000000 --- a/0031-x86-spec-ctrl-Enumeration-for-MMIO-Stale-Data-contro.patch +++ /dev/null @@ -1,98 +0,0 @@ -From a83108736db0ddaa5855f5abda6dcc8ae4fe25e9 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Mon, 20 Sep 2021 18:47:49 +0100 -Subject: [PATCH 31/51] x86/spec-ctrl: Enumeration for MMIO Stale Data controls -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The three *_NO bits indicate non-susceptibility to the SSDP, FBSDP and PSDP -data movement primitives. - -FB_CLEAR indicates that the VERW instruction has re-gained it's Fill Buffer -flushing side effect. This is only enumerated on parts where VERW had -previously lost it's flushing side effect due to the MDS/TAA vulnerabilities -being fixed in hardware. - -FB_CLEAR_CTRL is available on a subset of FB_CLEAR parts where the Fill Buffer -clearing side effect of VERW can be turned off for performance reasons. - -This is part of XSA-404. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -(cherry picked from commit 2ebe8fe9b7e0d36e9ec3cfe4552b2b197ef0dcec) ---- - xen/arch/x86/spec_ctrl.c | 11 ++++++++--- - xen/include/asm-x86/msr-index.h | 6 ++++++ - 2 files changed, 14 insertions(+), 3 deletions(-) - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 21730aa03071..d285538bde9f 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -323,7 +323,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - * Hardware read-only information, stating immunity to certain issues, or - * suggestions of which mitigation to use. - */ -- printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s\n", -+ printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", - (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "", - (caps & ARCH_CAPS_IBRS_ALL) ? " IBRS_ALL" : "", - (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", -@@ -332,13 +332,16 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "", - (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "", - (caps & ARCH_CAPS_TAA_NO) ? " TAA_NO" : "", -+ (caps & ARCH_CAPS_SBDR_SSDP_NO) ? " SBDR_SSDP_NO" : "", -+ (caps & ARCH_CAPS_FBSDP_NO) ? " FBSDP_NO" : "", -+ (caps & ARCH_CAPS_PSDP_NO) ? " PSDP_NO" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS_ALWAYS)) ? " IBRS_ALWAYS" : "", - (e8b & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS)) ? " STIBP_ALWAYS" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS_FAST)) ? " IBRS_FAST" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS_SAME_MODE)) ? " IBRS_SAME_MODE" : ""); - - /* Hardware features which need driving to mitigate issues. 
*/ -- printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s\n", -+ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n", - (e8b & cpufeat_mask(X86_FEATURE_IBPB)) || - (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBPB" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS)) || -@@ -353,7 +356,9 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - (_7d0 & cpufeat_mask(X86_FEATURE_MD_CLEAR)) ? " MD_CLEAR" : "", - (_7d0 & cpufeat_mask(X86_FEATURE_SRBDS_CTRL)) ? " SRBDS_CTRL" : "", - (e8b & cpufeat_mask(X86_FEATURE_VIRT_SSBD)) ? " VIRT_SSBD" : "", -- (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : ""); -+ (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "", -+ (caps & ARCH_CAPS_FB_CLEAR) ? " FB_CLEAR" : "", -+ (caps & ARCH_CAPS_FB_CLEAR_CTRL) ? " FB_CLEAR_CTRL" : ""); - - /* Compiled-in support which pertains to mitigations. */ - if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) ) -diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h -index 31964b88af7a..72bc32ba04ff 100644 ---- a/xen/include/asm-x86/msr-index.h -+++ b/xen/include/asm-x86/msr-index.h -@@ -66,6 +66,11 @@ - #define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6) - #define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7) - #define ARCH_CAPS_TAA_NO (_AC(1, ULL) << 8) -+#define ARCH_CAPS_SBDR_SSDP_NO (_AC(1, ULL) << 13) -+#define ARCH_CAPS_FBSDP_NO (_AC(1, ULL) << 14) -+#define ARCH_CAPS_PSDP_NO (_AC(1, ULL) << 15) -+#define ARCH_CAPS_FB_CLEAR (_AC(1, ULL) << 17) -+#define ARCH_CAPS_FB_CLEAR_CTRL (_AC(1, ULL) << 18) - - #define MSR_FLUSH_CMD 0x0000010b - #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) -@@ -83,6 +88,7 @@ - #define MCU_OPT_CTRL_RNGDS_MITG_DIS (_AC(1, ULL) << 0) - #define MCU_OPT_CTRL_RTM_ALLOW (_AC(1, ULL) << 1) - #define MCU_OPT_CTRL_RTM_LOCKED (_AC(1, ULL) << 2) -+#define MCU_OPT_CTRL_FB_CLEAR_DIS (_AC(1, ULL) << 3) - - #define MSR_RTIT_OUTPUT_BASE 0x00000560 - #define MSR_RTIT_OUTPUT_MASK 0x00000561 --- -2.35.1 - diff --git a/0032-x86-spec-ctrl-Add-spec-ctrl-unpriv-mmio.patch b/0032-x86-spec-ctrl-Add-spec-ctrl-unpriv-mmio.patch deleted file mode 100644 index c63891a..0000000 --- a/0032-x86-spec-ctrl-Add-spec-ctrl-unpriv-mmio.patch +++ /dev/null @@ -1,187 +0,0 @@ -From 2e82446cb252f6c8ac697e81f4155872c69afde4 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Mon, 13 Jun 2022 19:18:32 +0100 -Subject: [PATCH 32/51] x86/spec-ctrl: Add spec-ctrl=unpriv-mmio -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Per Xen's support statement, PCI passthrough should be to trusted domains -because the overall system security depends on factors outside of Xen's -control. - -As such, Xen, in a supported configuration, is not vulnerable to DRPW/SBDR. - -However, users who have risk assessed their configuration may be happy with -the risk of DoS, but unhappy with the risk of cross-domain data leakage. Such -users should enable this option. - -On CPUs vulnerable to MDS, the existing mitigations are the best we can do to -mitigate MMIO cross-domain data leakage. - -On CPUs fixed to MDS but vulnerable MMIO stale data leakage, this option: - - * On CPUs susceptible to FBSDP, mitigates cross-domain fill buffer leakage - using FB_CLEAR. - * On CPUs susceptible to SBDR, mitigates RNG data recovery by engaging the - srb-lock, previously used to mitigate SRBDS. - -Both mitigations require microcode from IPU 2022.1, May 2022. - -This is part of XSA-404. 
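A hedged sketch of the policy this message describes, distilled from the hunks that follow rather than copied from them (ARCH_CAPS_FB_CLEAR is bit 17, per the msr-index.h hunk in the previous patch):

    #include <stdbool.h>
    #include <stdint.h>

    #define ARCH_CAPS_FB_CLEAR (1ULL << 17)  /* from the msr-index.h hunk above */

    /*
     * Engage FB_CLEAR-based VERW flushing only when the administrator
     * asserts that unprivileged domains have (or will have) MMIO mappings
     * and the CPU has regained the fill-buffer flushing side effect.
     */
    static bool want_fb_clear_mmio(bool unpriv_mmio, uint64_t caps)
    {
        return unpriv_mmio && (caps & ARCH_CAPS_FB_CLEAR);
    }

This mirrors the opt_fb_clear_mmio assignment, guarded by opt_unpriv_mmio, in the spec_ctrl.c hunk below.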
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -(cherry picked from commit 8c24b70fedcb52633b2370f834d8a2be3f7fa38e) ---- - docs/misc/xen-command-line.pandoc | 14 +++++++-- - xen/arch/x86/spec_ctrl.c | 48 ++++++++++++++++++++++++------- - 2 files changed, 48 insertions(+), 14 deletions(-) - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index d5cb09f86541..a642e43476a2 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -2235,7 +2235,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). - ### spec-ctrl (x86) - > `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb,md-clear}=<bool>, - > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu, --> l1d-flush,branch-harden,srb-lock}=<bool> ]` -+> l1d-flush,branch-harden,srb-lock,unpriv-mmio}=<bool> ]` - - Controls for speculative execution sidechannel mitigations. By default, Xen - will pick the most appropriate mitigations based on compiled in support, -@@ -2314,8 +2314,16 @@ Xen will enable this mitigation. - On hardware supporting SRBDS_CTRL, the `srb-lock=` option can be used to force - or prevent Xen from protect the Special Register Buffer from leaking stale - data. By default, Xen will enable this mitigation, except on parts where MDS --is fixed and TAA is fixed/mitigated (in which case, there is believed to be no --way for an attacker to obtain the stale data). -+is fixed and TAA is fixed/mitigated and there are no unprivileged MMIO -+mappings (in which case, there is believed to be no way for an attacker to -+obtain stale data). -+ -+The `unpriv-mmio=` boolean indicates whether the system has (or will have) -+less than fully privileged domains granted access to MMIO devices. By -+default, this option is disabled. If enabled, Xen will use the `FB_CLEAR` -+and/or `SRBDS_CTRL` functionality available in the Intel May 2022 microcode -+release to mitigate cross-domain leakage of data via the MMIO Stale Data -+vulnerabilities. - - ### sync_console - > `= <boolean>` -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index d285538bde9f..099113ba41e6 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -67,6 +67,8 @@ static bool __initdata cpu_has_bug_msbds_only; /* => minimal HT impact. */ - static bool __initdata cpu_has_bug_mds; /* Any other M{LP,SB,FB}DS combination. */ - - static int8_t __initdata opt_srb_lock = -1; -+static bool __initdata opt_unpriv_mmio; -+static bool __read_mostly opt_fb_clear_mmio; - - static int __init parse_spec_ctrl(const char *s) - { -@@ -184,6 +186,8 @@ static int __init parse_spec_ctrl(const char *s) - opt_branch_harden = val; - else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 ) - opt_srb_lock = val; -+ else if ( (val = parse_boolean("unpriv-mmio", s, ss)) >= 0 ) -+ opt_unpriv_mmio = val; - else - rc = -EINVAL; - -@@ -392,7 +396,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - opt_srb_lock ? " SRB_LOCK+" : " SRB_LOCK-", - opt_ibpb ? " IBPB" : "", - opt_l1d_flush ? " L1D_FLUSH" : "", -- opt_md_clear_pv || opt_md_clear_hvm ? " VERW" : "", -+ opt_md_clear_pv || opt_md_clear_hvm || -+ opt_fb_clear_mmio ? " VERW" : "", - opt_branch_harden ? " BRANCH_HARDEN" : ""); - - /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */ -@@ -941,7 +946,9 @@ void spec_ctrl_init_domain(struct domain *d) - { - bool pv = is_pv_domain(d); - -- d->arch.verw = pv ? 
opt_md_clear_pv : opt_md_clear_hvm; -+ d->arch.verw = -+ (pv ? opt_md_clear_pv : opt_md_clear_hvm) || -+ (opt_fb_clear_mmio && is_iommu_enabled(d)); - } - - void __init init_speculation_mitigations(void) -@@ -1195,6 +1202,18 @@ void __init init_speculation_mitigations(void) - - mds_calculations(caps); - -+ /* -+ * Parts which enumerate FB_CLEAR are those which are post-MDS_NO and have -+ * reintroduced the VERW fill buffer flushing side effect because of a -+ * susceptibility to FBSDP. -+ * -+ * If unprivileged guests have (or will have) MMIO mappings, we can -+ * mitigate cross-domain leakage of fill buffer data by issuing VERW on -+ * the return-to-guest path. -+ */ -+ if ( opt_unpriv_mmio ) -+ opt_fb_clear_mmio = caps & ARCH_CAPS_FB_CLEAR; -+ - /* - * By default, enable PV and HVM mitigations on MDS-vulnerable hardware. - * This will only be a token effort for MLPDS/MFBDS when HT is enabled, -@@ -1208,18 +1227,20 @@ void __init init_speculation_mitigations(void) - boot_cpu_has(X86_FEATURE_MD_CLEAR)); - - /* -- * Enable MDS defences as applicable. The Idle blocks need using if -- * either PV or HVM defences are used. -+ * Enable MDS/MMIO defences as applicable. The Idle blocks need using if -+ * either the PV or HVM MDS defences are used, or if we may give MMIO -+ * access to untrusted guests. - * - * HVM is more complicated. The MD_CLEAR microcode extends L1D_FLUSH with - * equivalent semantics to avoid needing to perform both flushes on the -- * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH. -+ * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH (for -+ * MDS mitigations. L1D_FLUSH is not safe for MMIO mitigations.) - * - * After calculating the appropriate idle setting, simplify - * opt_md_clear_hvm to mean just "should we VERW on the way into HVM - * guests", so spec_ctrl_init_domain() can calculate suitable settings. - */ -- if ( opt_md_clear_pv || opt_md_clear_hvm ) -+ if ( opt_md_clear_pv || opt_md_clear_hvm || opt_fb_clear_mmio ) - setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE); - opt_md_clear_hvm &= !(caps & ARCH_CAPS_SKIP_L1DFL) && !opt_l1d_flush; - -@@ -1284,14 +1305,19 @@ void __init init_speculation_mitigations(void) - * On some SRBDS-affected hardware, it may be safe to relax srb-lock by - * default. - * -- * On parts which enumerate MDS_NO and not TAA_NO, TSX is the only known -- * way to access the Fill Buffer. If TSX isn't available (inc. SKU -- * reasons on some models), or TSX is explicitly disabled, then there is -- * no need for the extra overhead to protect RDRAND/RDSEED. -+ * All parts with SRBDS_CTRL suffer SSDP, the mechanism by which stale RNG -+ * data becomes available to other contexts. To recover the data, an -+ * attacker needs to use: -+ * - SBDS (MDS or TAA to sample the cores fill buffer) -+ * - SBDR (Architecturally retrieve stale transaction buffer contents) -+ * - DRPW (Architecturally latch stale fill buffer data) -+ * -+ * On MDS_NO parts, and with TAA_NO or TSX unavailable/disabled, and there -+ * is no unprivileged MMIO access, the RNG data doesn't need protecting. 
- */ - if ( cpu_has_srbds_ctrl ) - { -- if ( opt_srb_lock == -1 && -+ if ( opt_srb_lock == -1 && !opt_unpriv_mmio && - (caps & (ARCH_CAPS_MDS_NO|ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO && - (!cpu_has_hle || ((caps & ARCH_CAPS_TSX_CTRL) && rtm_disabled)) ) - opt_srb_lock = 0; --- -2.35.1 - diff --git a/0033-IOMMU-x86-work-around-bogus-gcc12-warning-in-hvm_gsi.patch b/0033-IOMMU-x86-work-around-bogus-gcc12-warning-in-hvm_gsi.patch deleted file mode 100644 index 07f488d..0000000 --- a/0033-IOMMU-x86-work-around-bogus-gcc12-warning-in-hvm_gsi.patch +++ /dev/null @@ -1,52 +0,0 @@ -From 460b08d6c6c16b3f32aa138e772b759ae02a4479 Mon Sep 17 00:00:00 2001 -From: Jan Beulich <jbeulich@suse.com> -Date: Tue, 12 Jul 2022 11:10:34 +0200 -Subject: [PATCH 33/51] IOMMU/x86: work around bogus gcc12 warning in - hvm_gsi_eoi() -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -As per [1] the expansion of the pirq_dpci() macro causes a -Waddress -controlled warning (enabled implicitly in our builds, if not by default) -tying the middle part of the involved conditional expression to the -surrounding boolean context. Work around this by introducing a local -inline function in the affected source file. - -Reported-by: Andrew Cooper <andrew.cooper3@citrix.com> -Signed-off-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Roger Pau Monné <roger.pau@citrix.com> - -[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102967 -master commit: 80ad8db8a4d9bb24952f0aea788ce6f47566fa76 -master date: 2022-06-15 10:19:32 +0200 ---- - xen/drivers/passthrough/x86/hvm.c | 12 ++++++++++++ - 1 file changed, 12 insertions(+) - -diff --git a/xen/drivers/passthrough/x86/hvm.c b/xen/drivers/passthrough/x86/hvm.c -index 0b37cd145b60..ba0f6c53d742 100644 ---- a/xen/drivers/passthrough/x86/hvm.c -+++ b/xen/drivers/passthrough/x86/hvm.c -@@ -25,6 +25,18 @@ - #include <asm/hvm/support.h> - #include <asm/io_apic.h> - -+/* -+ * Gcc12 takes issue with pirq_dpci() being used in boolean context (see gcc -+ * bug 102967). While we can't replace the macro definition in the header by an -+ * inline function, we can do so here. -+ */ -+static inline struct hvm_pirq_dpci *_pirq_dpci(struct pirq *pirq) -+{ -+ return pirq_dpci(pirq); -+} -+#undef pirq_dpci -+#define pirq_dpci(pirq) _pirq_dpci(pirq) -+ - static DEFINE_PER_CPU(struct list_head, dpci_list); - - /* --- -2.35.1 - diff --git a/0034-ehci-dbgp-fix-selecting-n-th-ehci-controller.patch b/0034-ehci-dbgp-fix-selecting-n-th-ehci-controller.patch deleted file mode 100644 index ac71ab8..0000000 --- a/0034-ehci-dbgp-fix-selecting-n-th-ehci-controller.patch +++ /dev/null @@ -1,36 +0,0 @@ -From 5cb8142076ce1ce53eafd7e00acb4d0eac4e7784 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= - <marmarek@invisiblethingslab.com> -Date: Tue, 12 Jul 2022 11:11:35 +0200 -Subject: [PATCH 34/51] ehci-dbgp: fix selecting n-th ehci controller -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The ehci<n> number was parsed but ignored. 
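The bug class is easy to reproduce outside Xen; a minimal hedged sketch using the standard strtoul() in place of Xen's simple_strtoul():

    #include <stdlib.h>

    /*
     * Pre-fix shape: the conversion ran, but its result was discarded,
     * so "ehci<n>" always selected controller 0.
     */
    static unsigned int pick_controller(const char *opt_dbgp)
    {
        char *end;
        unsigned int num = 0;

        if ( opt_dbgp[4] )
            num = strtoul(opt_dbgp + 4, &end, 10);  /* the fix: keep the result */

        return num;
    }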
- -Fixes: 322ecbe4ac85 ("console: add EHCI debug port based serial console") -Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: d6d0cb659fda64430d4649f8680c5cead32da8fd -master date: 2022-06-16 14:23:37 +0100 ---- - xen/drivers/char/ehci-dbgp.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xen/drivers/char/ehci-dbgp.c b/xen/drivers/char/ehci-dbgp.c -index c893d246defa..66b4811af24a 100644 ---- a/xen/drivers/char/ehci-dbgp.c -+++ b/xen/drivers/char/ehci-dbgp.c -@@ -1478,7 +1478,7 @@ void __init ehci_dbgp_init(void) - unsigned int num = 0; - - if ( opt_dbgp[4] ) -- simple_strtoul(opt_dbgp + 4, &e, 10); -+ num = simple_strtoul(opt_dbgp + 4, &e, 10); - - dbgp->cap = find_dbgp(dbgp, num); - if ( !dbgp->cap ) --- -2.35.1 - diff --git a/0035-tools-xenstored-Harden-corrupt.patch b/0035-tools-xenstored-Harden-corrupt.patch deleted file mode 100644 index bb0f7f1..0000000 --- a/0035-tools-xenstored-Harden-corrupt.patch +++ /dev/null @@ -1,44 +0,0 @@ -From 81ee3d08351be1ef2a14d371993604098d6a4673 Mon Sep 17 00:00:00 2001 -From: Julien Grall <jgrall@amazon.com> -Date: Tue, 12 Jul 2022 11:12:13 +0200 -Subject: [PATCH 35/51] tools/xenstored: Harden corrupt() - -At the moment, corrupt() is neither checking for allocation failure -nor freeing the allocated memory. - -Harden the code by printing ENOMEM if the allocation failed and -free 'str' after the last use. - -This is not considered to be a security issue because corrupt() should -only be called when Xenstored thinks the database is corrupted. Note -that the trigger (i.e. a guest reliably provoking the call) would be -a security issue. - -Fixes: 06d17943f0cd ("Added a basic integrity checker, and some basic ability to recover from store") -Signed-off-by: Julien Grall <jgrall@amazon.com> -Reviewed-by: Juergen Gross <jgross@suse.com> -master commit: db3382dd4f468c763512d6bf91c96773395058fb -master date: 2022-06-23 13:44:10 +0100 ---- - tools/xenstore/xenstored_core.c | 5 ++++- - 1 file changed, 4 insertions(+), 1 deletion(-) - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 91d093a12ea6..0c8ee276f837 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -2087,7 +2087,10 @@ void corrupt(struct connection *conn, const char *fmt, ...) - va_end(arglist); - - log("corruption detected by connection %i: err %s: %s", -- conn ? (int)conn->id : -1, strerror(saved_errno), str); -+ conn ? (int)conn->id : -1, strerror(saved_errno), -+ str ?: "ENOMEM"); -+ -+ talloc_free(str); - - check_store(); - } --- -2.35.1 - diff --git a/0036-x86-spec-ctrl-Only-adjust-MSR_SPEC_CTRL-for-idle-wit.patch b/0036-x86-spec-ctrl-Only-adjust-MSR_SPEC_CTRL-for-idle-wit.patch deleted file mode 100644 index 8bc0768..0000000 --- a/0036-x86-spec-ctrl-Only-adjust-MSR_SPEC_CTRL-for-idle-wit.patch +++ /dev/null @@ -1,93 +0,0 @@ -From 09d533f4c80b7eaf9fb4e36ebba8259580857a9d Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 12 Jul 2022 11:12:46 +0200 -Subject: [PATCH 36/51] x86/spec-ctrl: Only adjust MSR_SPEC_CTRL for idle with - legacy IBRS -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Back at the time of the original Spectre-v2 fixes, it was recommended to clear -MSR_SPEC_CTRL when going idle. 
This is because of the side effects on the -sibling thread caused by the microcode IBRS and STIBP implementations which -were retrofitted to existing CPUs. - -However, there are no relevant cross-thread impacts for the hardware -IBRS/STIBP implementations, so this logic should not be used on Intel CPUs -supporting eIBRS, or any AMD CPUs; doing so only adds unnecessary latency to -the idle path. - -Furthermore, there's no point playing with MSR_SPEC_CTRL in the idle paths if -SMT is disabled for other reasons. - -Fixes: 8d03080d2a33 ("x86/spec-ctrl: Cease using thunk=lfence on AMD") -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -master commit: ffc7694e0c99eea158c32aa164b7d1e1bb1dc46b -master date: 2022-06-30 18:07:13 +0100 ---- - xen/arch/x86/spec_ctrl.c | 10 ++++++++-- - xen/include/asm-x86/cpufeatures.h | 2 +- - xen/include/asm-x86/spec_ctrl.h | 5 +++-- - 3 files changed, 12 insertions(+), 5 deletions(-) - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 099113ba41e6..1ed5ceda8b46 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -1150,8 +1150,14 @@ void __init init_speculation_mitigations(void) - /* (Re)init BSP state now that default_spec_ctrl_flags has been calculated. */ - init_shadow_spec_ctrl_state(); - -- /* If Xen is using any MSR_SPEC_CTRL settings, adjust the idle path. */ -- if ( default_xen_spec_ctrl ) -+ /* -+ * For microcoded IBRS only (i.e. Intel, pre eIBRS), it is recommended to -+ * clear MSR_SPEC_CTRL before going idle, to avoid impacting sibling -+ * threads. Activate this if SMT is enabled, and Xen is using a non-zero -+ * MSR_SPEC_CTRL setting. -+ */ -+ if ( boot_cpu_has(X86_FEATURE_IBRSB) && !(caps & ARCH_CAPS_IBRS_ALL) && -+ hw_smt_enabled && default_xen_spec_ctrl ) - setup_force_cpu_cap(X86_FEATURE_SC_MSR_IDLE); - - xpti_init_default(caps); -diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h -index bd45a144ee78..493d338a085e 100644 ---- a/xen/include/asm-x86/cpufeatures.h -+++ b/xen/include/asm-x86/cpufeatures.h -@@ -33,7 +33,7 @@ XEN_CPUFEATURE(SC_MSR_HVM, X86_SYNTH(17)) /* MSR_SPEC_CTRL used by Xen fo - XEN_CPUFEATURE(SC_RSB_PV, X86_SYNTH(18)) /* RSB overwrite needed for PV */ - XEN_CPUFEATURE(SC_RSB_HVM, X86_SYNTH(19)) /* RSB overwrite needed for HVM */ - XEN_CPUFEATURE(XEN_SELFSNOOP, X86_SYNTH(20)) /* SELFSNOOP gets used by Xen itself */ --XEN_CPUFEATURE(SC_MSR_IDLE, X86_SYNTH(21)) /* (SC_MSR_PV || SC_MSR_HVM) && default_xen_spec_ctrl */ -+XEN_CPUFEATURE(SC_MSR_IDLE, X86_SYNTH(21)) /* Clear MSR_SPEC_CTRL on idle */ - XEN_CPUFEATURE(XEN_LBR, X86_SYNTH(22)) /* Xen uses MSR_DEBUGCTL.LBR */ - /* Bits 23,24 unused. */ - XEN_CPUFEATURE(SC_VERW_IDLE, X86_SYNTH(25)) /* VERW used by Xen for idle */ -diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h -index 751355f471f4..7e83e0179fb9 100644 ---- a/xen/include/asm-x86/spec_ctrl.h -+++ b/xen/include/asm-x86/spec_ctrl.h -@@ -78,7 +78,8 @@ static always_inline void spec_ctrl_enter_idle(struct cpu_info *info) - uint32_t val = 0; - - /* -- * Branch Target Injection: -+ * It is recommended in some cases to clear MSR_SPEC_CTRL when going idle, -+ * to avoid impacting sibling threads. - * - * Latch the new shadow value, then enable shadowing, then update the MSR. - * There are no SMP issues here; only local processor ordering concerns. 
-@@ -114,7 +115,7 @@ static always_inline void spec_ctrl_exit_idle(struct cpu_info *info) - uint32_t val = info->xen_spec_ctrl; - - /* -- * Branch Target Injection: -+ * Restore MSR_SPEC_CTRL on exit from idle. - * - * Disable shadowing before updating the MSR. There are no SMP issues - * here; only local processor ordering concerns. --- -2.35.1 - diff --git a/0037-x86-spec-ctrl-Knobs-for-STIBP-and-PSFD-and-follow-ha.patch b/0037-x86-spec-ctrl-Knobs-for-STIBP-and-PSFD-and-follow-ha.patch deleted file mode 100644 index 156aa58..0000000 --- a/0037-x86-spec-ctrl-Knobs-for-STIBP-and-PSFD-and-follow-ha.patch +++ /dev/null @@ -1,234 +0,0 @@ -From db6ca8176ccc4ff7dfe3c06969af9ebfab0d7b04 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 12 Jul 2022 11:13:33 +0200 -Subject: [PATCH 37/51] x86/spec-ctrl: Knobs for STIBP and PSFD, and follow - hardware STIBP hint -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -STIBP and PSFD are slightly weird bits, because they're both implied by other -bits in MSR_SPEC_CTRL. Add fine grain controls for them, and take the -implications into account when setting IBRS/SSBD. - -Rearrange the IBPB text/variables/logic to keep all the MSR_SPEC_CTRL bits -together, for consistency. - -However, AMD have a hardware hint CPUID bit recommending that STIBP be set -unilaterally. This is advertised on Zen3, so follow the recommendation. -Furthermore, in such cases, set STIBP behind the guest's back for now. This -has negligible overhead for the guest, but saves a WRMSR on vmentry. This is -the only default change. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> -master commit: fef244b179c06fcdfa581f7d57fa6e578c49ff50 -master date: 2022-06-30 18:07:13 +0100 ---- - docs/misc/xen-command-line.pandoc | 21 +++++++--- - xen/arch/x86/hvm/svm/vmcb.c | 9 +++++ - xen/arch/x86/spec_ctrl.c | 67 ++++++++++++++++++++++++++----- - 3 files changed, 82 insertions(+), 15 deletions(-) - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index a642e43476a2..46e9c58d35cd 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -2234,8 +2234,9 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). - - ### spec-ctrl (x86) - > `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb,md-clear}=<bool>, --> bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,eager-fpu, --> l1d-flush,branch-harden,srb-lock,unpriv-mmio}=<bool> ]` -+> bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd, -+> eager-fpu,l1d-flush,branch-harden,srb-lock, -+> unpriv-mmio}=<bool> ]` - - Controls for speculative execution sidechannel mitigations. By default, Xen - will pick the most appropriate mitigations based on compiled in support, -@@ -2285,9 +2286,10 @@ On hardware supporting IBRS (Indirect Branch Restricted Speculation), the - If Xen is not using IBRS itself, functionality is still set up so IBRS can be - virtualised for guests. - --On hardware supporting IBPB (Indirect Branch Prediction Barrier), the `ibpb=` --option can be used to force (the default) or prevent Xen from issuing branch --prediction barriers on vcpu context switches. -+On hardware supporting STIBP (Single Thread Indirect Branch Predictors), the -+`stibp=` option can be used to force or prevent Xen using the feature itself. 
-+By default, Xen will use STIBP when IBRS is in use (IBRS implies STIBP), and -+when hardware hints recommend using it as a blanket setting. - - On hardware supporting SSBD (Speculative Store Bypass Disable), the `ssbd=` - option can be used to force or prevent Xen using the feature itself. On AMD -@@ -2295,6 +2297,15 @@ hardware, this is a global option applied at boot, and not virtualised for - guest use. On Intel hardware, the feature is virtualised for guests, - independently of Xen's choice of setting. - -+On hardware supporting PSFD (Predictive Store Forwarding Disable), the `psfd=` -+option can be used to force or prevent Xen using the feature itself. By -+default, Xen will not use PSFD. PSFD is implied by SSBD, and SSBD is off by -+default. -+ -+On hardware supporting IBPB (Indirect Branch Prediction Barrier), the `ibpb=` -+option can be used to force (the default) or prevent Xen from issuing branch -+prediction barriers on vcpu context switches. -+ - On all hardware, the `eager-fpu=` option can be used to force or prevent Xen - from using fully eager FPU context switches. This is currently implemented as - a global control. By default, Xen will choose to use fully eager context -diff --git a/xen/arch/x86/hvm/svm/vmcb.c b/xen/arch/x86/hvm/svm/vmcb.c -index 565e997155f2..ef7224eb5dd7 100644 ---- a/xen/arch/x86/hvm/svm/vmcb.c -+++ b/xen/arch/x86/hvm/svm/vmcb.c -@@ -29,6 +29,7 @@ - #include <asm/hvm/support.h> - #include <asm/hvm/svm/svm.h> - #include <asm/hvm/svm/svmdebug.h> -+#include <asm/spec_ctrl.h> - - struct vmcb_struct *alloc_vmcb(void) - { -@@ -176,6 +177,14 @@ static int construct_vmcb(struct vcpu *v) - vmcb->_pause_filter_thresh = SVM_PAUSETHRESH_INIT; - } - -+ /* -+ * When default_xen_spec_ctrl simply SPEC_CTRL_STIBP, default this behind -+ * the back of the VM too. Our SMT topology isn't accurate, the overhead -+ * is neglegable, and doing this saves a WRMSR on the vmentry path. -+ */ -+ if ( default_xen_spec_ctrl == SPEC_CTRL_STIBP ) -+ v->arch.msrs->spec_ctrl.raw = SPEC_CTRL_STIBP; -+ - return 0; - } - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 1ed5ceda8b46..dfdd45c358c4 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -48,9 +48,13 @@ static enum ind_thunk { - THUNK_LFENCE, - THUNK_JMP, - } opt_thunk __initdata = THUNK_DEFAULT; -+ - static int8_t __initdata opt_ibrs = -1; -+int8_t __initdata opt_stibp = -1; -+bool __read_mostly opt_ssbd; -+int8_t __initdata opt_psfd = -1; -+ - bool __read_mostly opt_ibpb = true; --bool __read_mostly opt_ssbd = false; - int8_t __read_mostly opt_eager_fpu = -1; - int8_t __read_mostly opt_l1d_flush = -1; - static bool __initdata opt_branch_harden = true; -@@ -172,12 +176,20 @@ static int __init parse_spec_ctrl(const char *s) - else - rc = -EINVAL; - } -+ -+ /* Bits in MSR_SPEC_CTRL. */ - else if ( (val = parse_boolean("ibrs", s, ss)) >= 0 ) - opt_ibrs = val; -- else if ( (val = parse_boolean("ibpb", s, ss)) >= 0 ) -- opt_ibpb = val; -+ else if ( (val = parse_boolean("stibp", s, ss)) >= 0 ) -+ opt_stibp = val; - else if ( (val = parse_boolean("ssbd", s, ss)) >= 0 ) - opt_ssbd = val; -+ else if ( (val = parse_boolean("psfd", s, ss)) >= 0 ) -+ opt_psfd = val; -+ -+ /* Misc settings. 
*/ -+ else if ( (val = parse_boolean("ibpb", s, ss)) >= 0 ) -+ opt_ibpb = val; - else if ( (val = parse_boolean("eager-fpu", s, ss)) >= 0 ) - opt_eager_fpu = val; - else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 ) -@@ -376,7 +388,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - "\n"); - - /* Settings for Xen's protection, irrespective of guests. */ -- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s, Other:%s%s%s%s%s\n", -+ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s\n", - thunk == THUNK_NONE ? "N/A" : - thunk == THUNK_RETPOLINE ? "RETPOLINE" : - thunk == THUNK_LFENCE ? "LFENCE" : -@@ -390,6 +402,9 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - (!boot_cpu_has(X86_FEATURE_SSBD) && - !boot_cpu_has(X86_FEATURE_AMD_SSBD)) ? "" : - (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-", -+ (!boot_cpu_has(X86_FEATURE_PSFD) && -+ !boot_cpu_has(X86_FEATURE_INTEL_PSFD)) ? "" : -+ (default_xen_spec_ctrl & SPEC_CTRL_PSFD) ? " PSFD+" : " PSFD-", - !(caps & ARCH_CAPS_TSX_CTRL) ? "" : - (opt_tsx & 1) ? " TSX+" : " TSX-", - !cpu_has_srbds_ctrl ? "" : -@@ -979,10 +994,7 @@ void __init init_speculation_mitigations(void) - if ( !has_spec_ctrl ) - printk(XENLOG_WARNING "?!? CET active, but no MSR_SPEC_CTRL?\n"); - else if ( opt_ibrs == -1 ) -- { - opt_ibrs = ibrs = true; -- default_xen_spec_ctrl |= SPEC_CTRL_IBRS | SPEC_CTRL_STIBP; -- } - - if ( opt_thunk == THUNK_DEFAULT || opt_thunk == THUNK_RETPOLINE ) - thunk = THUNK_JMP; -@@ -1086,14 +1098,49 @@ void __init init_speculation_mitigations(void) - setup_force_cpu_cap(X86_FEATURE_SC_MSR_HVM); - } - -- /* If we have IBRS available, see whether we should use it. */ -+ /* Figure out default_xen_spec_ctrl. */ - if ( has_spec_ctrl && ibrs ) -- default_xen_spec_ctrl |= SPEC_CTRL_IBRS; -+ { -+ /* IBRS implies STIBP. */ -+ if ( opt_stibp == -1 ) -+ opt_stibp = 1; -+ -+ default_xen_spec_ctrl |= SPEC_CTRL_IBRS; -+ } -+ -+ /* -+ * Use STIBP by default if the hardware hint is set. Otherwise, leave it -+ * off as it a severe performance pentalty on pre-eIBRS Intel hardware -+ * where it was retrofitted in microcode. -+ */ -+ if ( opt_stibp == -1 ) -+ opt_stibp = !!boot_cpu_has(X86_FEATURE_STIBP_ALWAYS); -+ -+ if ( opt_stibp && (boot_cpu_has(X86_FEATURE_STIBP) || -+ boot_cpu_has(X86_FEATURE_AMD_STIBP)) ) -+ default_xen_spec_ctrl |= SPEC_CTRL_STIBP; - -- /* If we have SSBD available, see whether we should use it. */ - if ( opt_ssbd && (boot_cpu_has(X86_FEATURE_SSBD) || - boot_cpu_has(X86_FEATURE_AMD_SSBD)) ) -+ { -+ /* SSBD implies PSFD */ -+ if ( opt_psfd == -1 ) -+ opt_psfd = 1; -+ - default_xen_spec_ctrl |= SPEC_CTRL_SSBD; -+ } -+ -+ /* -+ * Don't use PSFD by default. AMD designed the predictor to -+ * auto-clear on privilege change. PSFD is implied by SSBD, which is -+ * off by default. 
-+ */ -+ if ( opt_psfd == -1 ) -+ opt_psfd = 0; -+ -+ if ( opt_psfd && (boot_cpu_has(X86_FEATURE_PSFD) || -+ boot_cpu_has(X86_FEATURE_INTEL_PSFD)) ) -+ default_xen_spec_ctrl |= SPEC_CTRL_PSFD; - - /* - * PV guests can create RSB entries for any linear address they control, --- -2.35.1 - diff --git a/0038-libxc-fix-compilation-error-with-gcc13.patch b/0038-libxc-fix-compilation-error-with-gcc13.patch deleted file mode 100644 index 8056742..0000000 --- a/0038-libxc-fix-compilation-error-with-gcc13.patch +++ /dev/null @@ -1,33 +0,0 @@ -From cd3d6b4cd46cd05590805b4a6c0b6654af60106e Mon Sep 17 00:00:00 2001 -From: Charles Arnold <carnold@suse.com> -Date: Tue, 12 Jul 2022 11:14:07 +0200 -Subject: [PATCH 38/51] libxc: fix compilation error with gcc13 - -xc_psr.c:161:5: error: conflicting types for 'xc_psr_cmt_get_data' -due to enum/integer mismatch; - -Signed-off-by: Charles Arnold <carnold@suse.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -Acked-by: Anthony PERARD <anthony.perard@citrix.com> -master commit: 8eeae8c2b4efefda8e946461e86cf2ae9c18e5a9 -master date: 2022-07-06 13:06:40 +0200 ---- - tools/include/xenctrl.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tools/include/xenctrl.h b/tools/include/xenctrl.h -index 07b96e6671a5..893ae39e4a95 100644 ---- a/tools/include/xenctrl.h -+++ b/tools/include/xenctrl.h -@@ -2516,7 +2516,7 @@ int xc_psr_cmt_get_l3_event_mask(xc_interface *xch, uint32_t *event_mask); - int xc_psr_cmt_get_l3_cache_size(xc_interface *xch, uint32_t cpu, - uint32_t *l3_cache_size); - int xc_psr_cmt_get_data(xc_interface *xch, uint32_t rmid, uint32_t cpu, -- uint32_t psr_cmt_type, uint64_t *monitor_data, -+ xc_psr_cmt_type type, uint64_t *monitor_data, - uint64_t *tsc); - int xc_psr_cmt_enabled(xc_interface *xch); - --- -2.35.1 - diff --git a/0039-x86-spec-ctrl-Honour-spec-ctrl-0-for-unpriv-mmio-sub.patch b/0039-x86-spec-ctrl-Honour-spec-ctrl-0-for-unpriv-mmio-sub.patch deleted file mode 100644 index 1797a8f..0000000 --- a/0039-x86-spec-ctrl-Honour-spec-ctrl-0-for-unpriv-mmio-sub.patch +++ /dev/null @@ -1,32 +0,0 @@ -From 61b9c2ceeb94b0cdaff01023cc5523b1f13e66e2 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 12 Jul 2022 11:14:34 +0200 -Subject: [PATCH 39/51] x86/spec-ctrl: Honour spec-ctrl=0 for unpriv-mmio - sub-option - -This was an oversight from when unpriv-mmio was introduced. 
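The contract being restored, sketched with the option names used in this series: a bare `spec-ctrl=0` must force every sub-option off, so each newly added knob needs a line in parse_spec_ctrl()'s global-disable branch:

    /* Hedged sketch of the global-disable branch; neighbouring options
     * abbreviated to those visible in this series. */
    if ( !val )                    /* spec-ctrl=0 / spec-ctrl=no */
    {
        opt_l1d_flush = 0;
        opt_branch_harden = false;
        opt_srb_lock = 0;
        opt_unpriv_mmio = false;   /* the line this patch adds */
    }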
- -Fixes: 8c24b70fedcb ("x86/spec-ctrl: Add spec-ctrl=unpriv-mmio") -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 4cdb519d797c19ebb8fadc5938cdb47479d5a21b -master date: 2022-07-11 15:21:35 +0100 ---- - xen/arch/x86/spec_ctrl.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index dfdd45c358c4..ae74943c1053 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -122,6 +122,7 @@ static int __init parse_spec_ctrl(const char *s) - opt_l1d_flush = 0; - opt_branch_harden = false; - opt_srb_lock = 0; -+ opt_unpriv_mmio = false; - } - else if ( val > 0 ) - rc = -EINVAL; --- -2.35.1 - diff --git a/0040-xen-cmdline-Extend-parse_boolean-to-signal-a-name-ma.patch b/0040-xen-cmdline-Extend-parse_boolean-to-signal-a-name-ma.patch deleted file mode 100644 index 3512590..0000000 --- a/0040-xen-cmdline-Extend-parse_boolean-to-signal-a-name-ma.patch +++ /dev/null @@ -1,87 +0,0 @@ -From eec5b02403a9df2523527caad24f17af5060fbe7 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 12 Jul 2022 11:15:03 +0200 -Subject: [PATCH 40/51] xen/cmdline: Extend parse_boolean() to signal a name - match - -This will help parsing a sub-option which has boolean and non-boolean options -available. - -First, rework 'int val' into 'bool has_neg_prefix'. This inverts it's value, -but the resulting logic is far easier to follow. - -Second, reject anything of the form 'no-$FOO=' which excludes ambiguous -constructs such as 'no-$foo=yes' which have never been valid. - -This just leaves the case where everything is otherwise fine, but parse_bool() -can't interpret the provided string. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Juergen Gross <jgross@suse.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 382326cac528dd1eb0d04efd5c05363c453e29f4 -master date: 2022-07-11 15:21:35 +0100 ---- - xen/common/kernel.c | 20 ++++++++++++++++---- - xen/include/xen/lib.h | 3 ++- - 2 files changed, 18 insertions(+), 5 deletions(-) - -diff --git a/xen/common/kernel.c b/xen/common/kernel.c -index e119e5401f9d..7ed96521f97a 100644 ---- a/xen/common/kernel.c -+++ b/xen/common/kernel.c -@@ -272,9 +272,9 @@ int parse_bool(const char *s, const char *e) - int parse_boolean(const char *name, const char *s, const char *e) - { - size_t slen, nlen; -- int val = !!strncmp(s, "no-", 3); -+ bool has_neg_prefix = !strncmp(s, "no-", 3); - -- if ( !val ) -+ if ( has_neg_prefix ) - s += 3; - - slen = e ? ({ ASSERT(e >= s); e - s; }) : strlen(s); -@@ -286,11 +286,23 @@ int parse_boolean(const char *name, const char *s, const char *e) - - /* Exact, unadorned name? Result depends on the 'no-' prefix. */ - if ( slen == nlen ) -- return val; -+ return !has_neg_prefix; -+ -+ /* Inexact match with a 'no-' prefix? Not valid. */ -+ if ( has_neg_prefix ) -+ return -1; - - /* =$SOMETHING? Defer to the regular boolean parsing. */ - if ( s[nlen] == '=' ) -- return parse_bool(&s[nlen + 1], e); -+ { -+ int b = parse_bool(&s[nlen + 1], e); -+ -+ if ( b >= 0 ) -+ return b; -+ -+ /* Not a boolean, but the name matched. Signal specially. */ -+ return -2; -+ } - - /* Unrecognised. Give up. 
*/ - return -1; -diff --git a/xen/include/xen/lib.h b/xen/include/xen/lib.h -index c6987973bf88..2296044caf79 100644 ---- a/xen/include/xen/lib.h -+++ b/xen/include/xen/lib.h -@@ -80,7 +80,8 @@ int parse_bool(const char *s, const char *e); - /** - * Given a specific name, parses a string of the form: - * [no-]$NAME[=...] -- * returning 0 or 1 for a recognised boolean, or -1 for an error. -+ * returning 0 or 1 for a recognised boolean. Returns -1 for general errors, -+ * and -2 for "not a boolean, but $NAME= matches". - */ - int parse_boolean(const char *name, const char *s, const char *e); - --- -2.35.1 - diff --git a/0041-x86-spec-ctrl-Add-fine-grained-cmdline-suboptions-fo.patch b/0041-x86-spec-ctrl-Add-fine-grained-cmdline-suboptions-fo.patch deleted file mode 100644 index 9964bb9..0000000 --- a/0041-x86-spec-ctrl-Add-fine-grained-cmdline-suboptions-fo.patch +++ /dev/null @@ -1,137 +0,0 @@ -From f066c8bb3e5686141cef6fa1dc86ea9f37c5388a Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 12 Jul 2022 11:15:37 +0200 -Subject: [PATCH 41/51] x86/spec-ctrl: Add fine-grained cmdline suboptions for - primitives - -Support controling the PV/HVM suboption of msr-sc/rsb/md-clear, which -previously wasn't possible. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -master commit: 27357c394ba6e1571a89105b840ce1c6f026485c -master date: 2022-07-11 15:21:35 +0100 ---- - docs/misc/xen-command-line.pandoc | 12 ++++-- - xen/arch/x86/spec_ctrl.c | 66 ++++++++++++++++++++++++++----- - 2 files changed, 66 insertions(+), 12 deletions(-) - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index 46e9c58d35cd..1bbdb55129cc 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -2233,7 +2233,8 @@ not be able to control the state of the mitigation. - By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). - - ### spec-ctrl (x86) --> `= List of [ <bool>, xen=<bool>, {pv,hvm,msr-sc,rsb,md-clear}=<bool>, -+> `= List of [ <bool>, xen=<bool>, {pv,hvm}=<bool>, -+> {msr-sc,rsb,md-clear}=<bool>|{pv,hvm}=<bool>, - > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd, - > eager-fpu,l1d-flush,branch-harden,srb-lock, - > unpriv-mmio}=<bool> ]` -@@ -2258,12 +2259,17 @@ in place for guests to use. - - Use of a positive boolean value for either of these options is invalid. - --The booleans `pv=`, `hvm=`, `msr-sc=`, `rsb=` and `md-clear=` offer fine -+The `pv=`, `hvm=`, `msr-sc=`, `rsb=` and `md-clear=` options offer fine - grained control over the primitives by Xen. These impact Xen's ability to --protect itself, and Xen's ability to virtualise support for guests to use. -+protect itself, and/or Xen's ability to virtualise support for guests to use. - - * `pv=` and `hvm=` offer control over all suboptions for PV and HVM guests - respectively. -+* Each other option can be used either as a plain boolean -+ (e.g. `spec-ctrl=rsb` to control both the PV and HVM sub-options), or with -+ `pv=` or `hvm=` subsuboptions (e.g. `spec-ctrl=rsb=no-hvm` to disable HVM -+ RSB only). -+ - * `msr-sc=` offers control over Xen's support for manipulating `MSR_SPEC_CTRL` - on entry and exit. These blocks are necessary to virtualise support for - guests and if disabled, guests will be unable to use IBRS/STIBP/SSBD/etc. 
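As a hedged usage note, constructed from the parsing logic below rather than taken from the patch itself, the new fine-grained forms compose like this:

    spec-ctrl=msr-sc=no-hvm    # keep PV MSR_SPEC_CTRL handling, drop it for HVM
    spec-ctrl=rsb=pv           # RSB stuffing for PV guests only
    spec-ctrl=md-clear=no      # plain boolean still controls both guest types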
-diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index ae74943c1053..9507e5da60a9 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -147,20 +147,68 @@ static int __init parse_spec_ctrl(const char *s) - opt_rsb_hvm = val; - opt_md_clear_hvm = val; - } -- else if ( (val = parse_boolean("msr-sc", s, ss)) >= 0 ) -+ else if ( (val = parse_boolean("msr-sc", s, ss)) != -1 ) - { -- opt_msr_sc_pv = val; -- opt_msr_sc_hvm = val; -+ switch ( val ) -+ { -+ case 0: -+ case 1: -+ opt_msr_sc_pv = opt_msr_sc_hvm = val; -+ break; -+ -+ case -2: -+ s += strlen("msr-sc="); -+ if ( (val = parse_boolean("pv", s, ss)) >= 0 ) -+ opt_msr_sc_pv = val; -+ else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) -+ opt_msr_sc_hvm = val; -+ else -+ default: -+ rc = -EINVAL; -+ break; -+ } - } -- else if ( (val = parse_boolean("rsb", s, ss)) >= 0 ) -+ else if ( (val = parse_boolean("rsb", s, ss)) != -1 ) - { -- opt_rsb_pv = val; -- opt_rsb_hvm = val; -+ switch ( val ) -+ { -+ case 0: -+ case 1: -+ opt_rsb_pv = opt_rsb_hvm = val; -+ break; -+ -+ case -2: -+ s += strlen("rsb="); -+ if ( (val = parse_boolean("pv", s, ss)) >= 0 ) -+ opt_rsb_pv = val; -+ else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) -+ opt_rsb_hvm = val; -+ else -+ default: -+ rc = -EINVAL; -+ break; -+ } - } -- else if ( (val = parse_boolean("md-clear", s, ss)) >= 0 ) -+ else if ( (val = parse_boolean("md-clear", s, ss)) != -1 ) - { -- opt_md_clear_pv = val; -- opt_md_clear_hvm = val; -+ switch ( val ) -+ { -+ case 0: -+ case 1: -+ opt_md_clear_pv = opt_md_clear_hvm = val; -+ break; -+ -+ case -2: -+ s += strlen("md-clear="); -+ if ( (val = parse_boolean("pv", s, ss)) >= 0 ) -+ opt_md_clear_pv = val; -+ else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) -+ opt_md_clear_hvm = val; -+ else -+ default: -+ rc = -EINVAL; -+ break; -+ } - } - - /* Xen's speculative sidechannel mitigation settings. */ --- -2.35.1 - diff --git a/0042-tools-helpers-fix-build-of-xen-init-dom0-with-Werror.patch b/0042-tools-helpers-fix-build-of-xen-init-dom0-with-Werror.patch deleted file mode 100644 index eea790a..0000000 --- a/0042-tools-helpers-fix-build-of-xen-init-dom0-with-Werror.patch +++ /dev/null @@ -1,28 +0,0 @@ -From 14fd97e3de939a63a6e467f240efb49fe226a5dc Mon Sep 17 00:00:00 2001 -From: Anthony PERARD <anthony.perard@citrix.com> -Date: Tue, 12 Jul 2022 11:16:10 +0200 -Subject: [PATCH 42/51] tools/helpers: fix build of xen-init-dom0 with -Werror - -Missing prototype of asprintf() without _GNU_SOURCE. 
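A hedged standalone reproduction, not part of the patch: asprintf() is a GNU extension, so glibc only declares it when _GNU_SOURCE is defined before the first libc header, and with -Werror the missing prototype becomes a hard build failure:

    #define _GNU_SOURCE   /* must precede every libc include */
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        char *path;

        /* Without the define above, -Werror builds fail here with an
         * implicit-declaration error. */
        if ( asprintf(&path, "/local/domain/%u", 0u) < 0 )
            return 1;

        free(path);
        return 0;
    }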
- -Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> -Reviewed-by: Henry Wang <Henry.Wang@arm.com> -master commit: d693b22733044d68e9974766b5c9e6259c9b1708 -master date: 2022-07-12 08:38:35 +0200 ---- - tools/helpers/xen-init-dom0.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/tools/helpers/xen-init-dom0.c b/tools/helpers/xen-init-dom0.c -index c99224a4b607..b4861c9e8041 100644 ---- a/tools/helpers/xen-init-dom0.c -+++ b/tools/helpers/xen-init-dom0.c -@@ -1,3 +1,5 @@ -+#define _GNU_SOURCE -+ - #include <stdlib.h> - #include <stdint.h> - #include <string.h> --- -2.35.1 - diff --git a/0043-libxl-check-return-value-of-libxl__xs_directory-in-n.patch b/0043-libxl-check-return-value-of-libxl__xs_directory-in-n.patch deleted file mode 100644 index 0c2470a..0000000 --- a/0043-libxl-check-return-value-of-libxl__xs_directory-in-n.patch +++ /dev/null @@ -1,38 +0,0 @@ -From 744accad1b73223b3261e3e678e16e030d83b179 Mon Sep 17 00:00:00 2001 -From: Anthony PERARD <anthony.perard@citrix.com> -Date: Tue, 12 Jul 2022 11:16:30 +0200 -Subject: [PATCH 43/51] libxl: check return value of libxl__xs_directory in - name2bdf - -libxl__xs_directory() can potentially return NULL without setting `n`. -As `n` isn't initialised, we need to check libxl__xs_directory() -return value before checking `n`. Otherwise, `n` might be non-zero -with `bdfs` NULL which would lead to a segv. - -Fixes: 57bff091f4 ("libxl: add 'name' field to 'libxl_device_pci' in the IDL...") -Reported-by: "G.R." <firemeteor@users.sourceforge.net> -Signed-off-by: Anthony PERARD <anthony.perard@citrix.com> -Reviewed-by: Juergen Gross <jgross@suse.com> -Tested-by: "G.R." <firemeteor@users.sourceforge.net> -master commit: d778089ac70e5b8e3bdea0c85fc8c0b9ed0eaf2f -master date: 2022-07-12 08:38:51 +0200 ---- - tools/libs/light/libxl_pci.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/tools/libs/light/libxl_pci.c b/tools/libs/light/libxl_pci.c -index 4bbbfe9f168f..ce3bf7c0ae81 100644 ---- a/tools/libs/light/libxl_pci.c -+++ b/tools/libs/light/libxl_pci.c -@@ -859,7 +859,7 @@ static int name2bdf(libxl__gc *gc, libxl_device_pci *pci) - int rc = ERROR_NOTFOUND; - - bdfs = libxl__xs_directory(gc, XBT_NULL, PCI_INFO_PATH, &n); -- if (!n) -+ if (!bdfs || !n) - goto out; - - for (i = 0; i < n; i++) { --- -2.35.1 - diff --git a/0044-x86-spec-ctrl-Rework-spec_ctrl_flags-context-switchi.patch b/0044-x86-spec-ctrl-Rework-spec_ctrl_flags-context-switchi.patch deleted file mode 100644 index d8517f8..0000000 --- a/0044-x86-spec-ctrl-Rework-spec_ctrl_flags-context-switchi.patch +++ /dev/null @@ -1,167 +0,0 @@ -From 3a280cbae7022b83af91c27a8e2211ba3b1234f5 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Fri, 1 Jul 2022 15:59:40 +0100 -Subject: [PATCH 44/51] x86/spec-ctrl: Rework spec_ctrl_flags context switching - -We are shortly going to need to context switch new bits in both the vcpu and -S3 paths. Introduce SCF_IST_MASK and SCF_DOM_MASK, and rework d->arch.verw -into d->arch.spec_ctrl_flags to accommodate. - -No functional change. - -This is part of XSA-407. 
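The heart of the rework, restated as a sketch using the names from the hunks below: per-domain bits are grouped under SCF_DOM_MASK and spliced into the top-of-stack block at context switch, while SCF_IST_MASK collects the bits the S3 resume path must suppress until microcode is reloaded:

    #define SCF_ist_wrmsr  (1 << 1)
    #define SCF_verw       (1 << 3)

    #define SCF_IST_MASK   (SCF_ist_wrmsr)   /* inhibited across S3 resume */
    #define SCF_DOM_MASK   (SCF_verw)        /* merged per domain */

    /* In context_switch(): keep the fixed/IST bits, splice in the next
     * domain's policy. */
    info->spec_ctrl_flags =
        (info->spec_ctrl_flags & ~SCF_DOM_MASK) |
        (nextd->arch.spec_ctrl_flags & SCF_DOM_MASK);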
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 5796912f7279d9348a3166655588d30eae9f72cc) ---- - xen/arch/x86/acpi/power.c | 8 ++++---- - xen/arch/x86/domain.c | 8 ++++---- - xen/arch/x86/spec_ctrl.c | 9 ++++++--- - xen/include/asm-x86/domain.h | 3 +-- - xen/include/asm-x86/spec_ctrl.h | 30 ++++++++++++++++++++++++++++- - xen/include/asm-x86/spec_ctrl_asm.h | 3 --- - 6 files changed, 44 insertions(+), 17 deletions(-) - -diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c -index 5eaa77f66a28..dd397f713067 100644 ---- a/xen/arch/x86/acpi/power.c -+++ b/xen/arch/x86/acpi/power.c -@@ -248,8 +248,8 @@ static int enter_state(u32 state) - error = 0; - - ci = get_cpu_info(); -- /* Avoid NMI/#MC using MSR_SPEC_CTRL until we've reloaded microcode. */ -- ci->spec_ctrl_flags &= ~SCF_ist_wrmsr; -+ /* Avoid NMI/#MC using unsafe MSRs until we've reloaded microcode. */ -+ ci->spec_ctrl_flags &= ~SCF_IST_MASK; - - ACPI_FLUSH_CPU_CACHE(); - -@@ -292,8 +292,8 @@ static int enter_state(u32 state) - if ( !recheck_cpu_features(0) ) - panic("Missing previously available feature(s)\n"); - -- /* Re-enabled default NMI/#MC use of MSR_SPEC_CTRL. */ -- ci->spec_ctrl_flags |= (default_spec_ctrl_flags & SCF_ist_wrmsr); -+ /* Re-enabled default NMI/#MC use of MSRs now microcode is loaded. */ -+ ci->spec_ctrl_flags |= (default_spec_ctrl_flags & SCF_IST_MASK); - - if ( boot_cpu_has(X86_FEATURE_IBRSB) || boot_cpu_has(X86_FEATURE_IBRS) ) - { -diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c -index 1fe6644a71ae..82a0b73cf6ef 100644 ---- a/xen/arch/x86/domain.c -+++ b/xen/arch/x86/domain.c -@@ -2092,10 +2092,10 @@ void context_switch(struct vcpu *prev, struct vcpu *next) - } - } - -- /* Update the top-of-stack block with the VERW disposition. */ -- info->spec_ctrl_flags &= ~SCF_verw; -- if ( nextd->arch.verw ) -- info->spec_ctrl_flags |= SCF_verw; -+ /* Update the top-of-stack block with the new spec_ctrl settings. */ -+ info->spec_ctrl_flags = -+ (info->spec_ctrl_flags & ~SCF_DOM_MASK) | -+ (nextd->arch.spec_ctrl_flags & SCF_DOM_MASK); - } - - sched_context_switched(prev, next); -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 9507e5da60a9..7e646680f1c7 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -1010,9 +1010,12 @@ void spec_ctrl_init_domain(struct domain *d) - { - bool pv = is_pv_domain(d); - -- d->arch.verw = -- (pv ? opt_md_clear_pv : opt_md_clear_hvm) || -- (opt_fb_clear_mmio && is_iommu_enabled(d)); -+ bool verw = ((pv ? opt_md_clear_pv : opt_md_clear_hvm) || -+ (opt_fb_clear_mmio && is_iommu_enabled(d))); -+ -+ d->arch.spec_ctrl_flags = -+ (verw ? SCF_verw : 0) | -+ 0; - } - - void __init init_speculation_mitigations(void) -diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h -index 2398a1d99da9..e4c099262cb7 100644 ---- a/xen/include/asm-x86/domain.h -+++ b/xen/include/asm-x86/domain.h -@@ -319,8 +319,7 @@ struct arch_domain - uint32_t pci_cf8; - uint8_t cmos_idx; - -- /* Use VERW on return-to-guest for its flushing side effect. 
*/ -- bool verw; -+ uint8_t spec_ctrl_flags; /* See SCF_DOM_MASK */ - - union { - struct pv_domain pv; -diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h -index 7e83e0179fb9..3cd72e40305f 100644 ---- a/xen/include/asm-x86/spec_ctrl.h -+++ b/xen/include/asm-x86/spec_ctrl.h -@@ -20,12 +20,40 @@ - #ifndef __X86_SPEC_CTRL_H__ - #define __X86_SPEC_CTRL_H__ - --/* Encoding of cpuinfo.spec_ctrl_flags */ -+/* -+ * Encoding of: -+ * cpuinfo.spec_ctrl_flags -+ * default_spec_ctrl_flags -+ * domain.spec_ctrl_flags -+ * -+ * Live settings are in the top-of-stack block, because they need to be -+ * accessable when XPTI is active. Some settings are fixed from boot, some -+ * context switched per domain, and some inhibited in the S3 path. -+ */ - #define SCF_use_shadow (1 << 0) - #define SCF_ist_wrmsr (1 << 1) - #define SCF_ist_rsb (1 << 2) - #define SCF_verw (1 << 3) - -+/* -+ * The IST paths (NMI/#MC) can interrupt any arbitrary context. Some -+ * functionality requires updated microcode to work. -+ * -+ * On boot, this is easy; we load microcode before figuring out which -+ * speculative protections to apply. However, on the S3 resume path, we must -+ * be able to disable the configured mitigations until microcode is reloaded. -+ * -+ * These are the controls to inhibit on the S3 resume path until microcode has -+ * been reloaded. -+ */ -+#define SCF_IST_MASK (SCF_ist_wrmsr) -+ -+/* -+ * Some speculative protections are per-domain. These settings are merged -+ * into the top-of-stack block in the context switch path. -+ */ -+#define SCF_DOM_MASK (SCF_verw) -+ - #ifndef __ASSEMBLY__ - - #include <asm/alternative.h> -diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h -index 5a590bac44aa..66b00d511fc6 100644 ---- a/xen/include/asm-x86/spec_ctrl_asm.h -+++ b/xen/include/asm-x86/spec_ctrl_asm.h -@@ -248,9 +248,6 @@ - - /* - * Use in IST interrupt/exception context. May interrupt Xen or PV context. -- * Fine grain control of SCF_ist_wrmsr is needed for safety in the S3 resume -- * path to avoid using MSR_SPEC_CTRL before the microcode introducing it has -- * been reloaded. - */ - .macro SPEC_CTRL_ENTRY_FROM_INTR_IST - /* --- -2.35.1 - diff --git a/0045-x86-spec-ctrl-Rename-SCF_ist_wrmsr-to-SCF_ist_sc_msr.patch b/0045-x86-spec-ctrl-Rename-SCF_ist_wrmsr-to-SCF_ist_sc_msr.patch deleted file mode 100644 index 5b841a6..0000000 --- a/0045-x86-spec-ctrl-Rename-SCF_ist_wrmsr-to-SCF_ist_sc_msr.patch +++ /dev/null @@ -1,110 +0,0 @@ -From 31aa2a20bfefc3a8a200da54a56471bf99f9630e Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 28 Jun 2022 14:36:56 +0100 -Subject: [PATCH 45/51] x86/spec-ctrl: Rename SCF_ist_wrmsr to SCF_ist_sc_msr - -We are about to introduce SCF_ist_ibpb, at which point SCF_ist_wrmsr becomes -ambiguous. - -No functional change. - -This is part of XSA-407. 
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 76d6a36f645dfdbad8830559d4d52caf36efc75e) ---- - xen/arch/x86/spec_ctrl.c | 6 +++--- - xen/include/asm-x86/spec_ctrl.h | 4 ++-- - xen/include/asm-x86/spec_ctrl_asm.h | 8 ++++---- - 3 files changed, 9 insertions(+), 9 deletions(-) - -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 7e646680f1c7..89f95c083e1b 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -1115,7 +1115,7 @@ void __init init_speculation_mitigations(void) - { - if ( opt_msr_sc_pv ) - { -- default_spec_ctrl_flags |= SCF_ist_wrmsr; -+ default_spec_ctrl_flags |= SCF_ist_sc_msr; - setup_force_cpu_cap(X86_FEATURE_SC_MSR_PV); - } - -@@ -1126,7 +1126,7 @@ void __init init_speculation_mitigations(void) - * Xen's value is not restored atomically. An early NMI hitting - * the VMExit path needs to restore Xen's value for safety. - */ -- default_spec_ctrl_flags |= SCF_ist_wrmsr; -+ default_spec_ctrl_flags |= SCF_ist_sc_msr; - setup_force_cpu_cap(X86_FEATURE_SC_MSR_HVM); - } - } -@@ -1139,7 +1139,7 @@ void __init init_speculation_mitigations(void) - * on real hardware matches the availability of MSR_SPEC_CTRL in the - * first place. - * -- * No need for SCF_ist_wrmsr because Xen's value is restored -+ * No need for SCF_ist_sc_msr because Xen's value is restored - * atomically WRT NMIs in the VMExit path. - * - * TODO: Adjust cpu_has_svm_spec_ctrl to be usable earlier on boot. -diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h -index 3cd72e40305f..f8f0ac47e759 100644 ---- a/xen/include/asm-x86/spec_ctrl.h -+++ b/xen/include/asm-x86/spec_ctrl.h -@@ -31,7 +31,7 @@ - * context switched per domain, and some inhibited in the S3 path. - */ - #define SCF_use_shadow (1 << 0) --#define SCF_ist_wrmsr (1 << 1) -+#define SCF_ist_sc_msr (1 << 1) - #define SCF_ist_rsb (1 << 2) - #define SCF_verw (1 << 3) - -@@ -46,7 +46,7 @@ - * These are the controls to inhibit on the S3 resume path until microcode has - * been reloaded. - */ --#define SCF_IST_MASK (SCF_ist_wrmsr) -+#define SCF_IST_MASK (SCF_ist_sc_msr) - - /* - * Some speculative protections are per-domain. These settings are merged -diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h -index 66b00d511fc6..0ff1b118f882 100644 ---- a/xen/include/asm-x86/spec_ctrl_asm.h -+++ b/xen/include/asm-x86/spec_ctrl_asm.h -@@ -266,8 +266,8 @@ - - .L\@_skip_rsb: - -- test $SCF_ist_wrmsr, %al -- jz .L\@_skip_wrmsr -+ test $SCF_ist_sc_msr, %al -+ jz .L\@_skip_msr_spec_ctrl - - xor %edx, %edx - testb $3, UREGS_cs(%rsp) -@@ -290,7 +290,7 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - * to speculate around the WRMSR. As a result, we need a dispatch - * serialising instruction in the else clause. 
- */ --.L\@_skip_wrmsr: -+.L\@_skip_msr_spec_ctrl: - lfence - UNLIKELY_END(\@_serialise) - .endm -@@ -301,7 +301,7 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): - * Requires %rbx=stack_end - * Clobbers %rax, %rcx, %rdx - */ -- testb $SCF_ist_wrmsr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) -+ testb $SCF_ist_sc_msr, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%rbx) - jz .L\@_skip - - DO_SPEC_CTRL_EXIT_TO_XEN --- -2.35.1 - diff --git a/0046-x86-spec-ctrl-Rename-opt_ibpb-to-opt_ibpb_ctxt_switc.patch b/0046-x86-spec-ctrl-Rename-opt_ibpb-to-opt_ibpb_ctxt_switc.patch deleted file mode 100644 index a950639..0000000 --- a/0046-x86-spec-ctrl-Rename-opt_ibpb-to-opt_ibpb_ctxt_switc.patch +++ /dev/null @@ -1,97 +0,0 @@ -From e7671561c84322860875745e57b228a7a310f2bf Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Mon, 4 Jul 2022 21:32:17 +0100 -Subject: [PATCH 46/51] x86/spec-ctrl: Rename opt_ibpb to opt_ibpb_ctxt_switch - -We are about to introduce the use of IBPB at different points in Xen, making -opt_ibpb ambiguous. Rename it to opt_ibpb_ctxt_switch. - -No functional change. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit a8e5ef079d6f5c88c472e3e620db5a8d1402a50d) ---- - xen/arch/x86/domain.c | 2 +- - xen/arch/x86/spec_ctrl.c | 10 +++++----- - xen/include/asm-x86/spec_ctrl.h | 2 +- - 3 files changed, 7 insertions(+), 7 deletions(-) - -diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c -index 82a0b73cf6ef..0d39981550ca 100644 ---- a/xen/arch/x86/domain.c -+++ b/xen/arch/x86/domain.c -@@ -2064,7 +2064,7 @@ void context_switch(struct vcpu *prev, struct vcpu *next) - - ctxt_switch_levelling(next); - -- if ( opt_ibpb && !is_idle_domain(nextd) ) -+ if ( opt_ibpb_ctxt_switch && !is_idle_domain(nextd) ) - { - static DEFINE_PER_CPU(unsigned int, last); - unsigned int *last_id = &this_cpu(last); -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 89f95c083e1b..f4ae36eae2d0 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -54,7 +54,7 @@ int8_t __initdata opt_stibp = -1; - bool __read_mostly opt_ssbd; - int8_t __initdata opt_psfd = -1; - --bool __read_mostly opt_ibpb = true; -+bool __read_mostly opt_ibpb_ctxt_switch = true; - int8_t __read_mostly opt_eager_fpu = -1; - int8_t __read_mostly opt_l1d_flush = -1; - static bool __initdata opt_branch_harden = true; -@@ -117,7 +117,7 @@ static int __init parse_spec_ctrl(const char *s) - - opt_thunk = THUNK_JMP; - opt_ibrs = 0; -- opt_ibpb = false; -+ opt_ibpb_ctxt_switch = false; - opt_ssbd = false; - opt_l1d_flush = 0; - opt_branch_harden = false; -@@ -238,7 +238,7 @@ static int __init parse_spec_ctrl(const char *s) - - /* Misc settings. */ - else if ( (val = parse_boolean("ibpb", s, ss)) >= 0 ) -- opt_ibpb = val; -+ opt_ibpb_ctxt_switch = val; - else if ( (val = parse_boolean("eager-fpu", s, ss)) >= 0 ) - opt_eager_fpu = val; - else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 ) -@@ -458,7 +458,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - (opt_tsx & 1) ? " TSX+" : " TSX-", - !cpu_has_srbds_ctrl ? "" : - opt_srb_lock ? " SRB_LOCK+" : " SRB_LOCK-", -- opt_ibpb ? " IBPB" : "", -+ opt_ibpb_ctxt_switch ? " IBPB-ctxt" : "", - opt_l1d_flush ? " L1D_FLUSH" : "", - opt_md_clear_pv || opt_md_clear_hvm || - opt_fb_clear_mmio ? " VERW" : "", -@@ -1240,7 +1240,7 @@ void __init init_speculation_mitigations(void) - - /* Check we have hardware IBPB support before using it... 
*/ - if ( !boot_cpu_has(X86_FEATURE_IBRSB) && !boot_cpu_has(X86_FEATURE_IBPB) ) -- opt_ibpb = false; -+ opt_ibpb_ctxt_switch = false; - - /* Check whether Eager FPU should be enabled by default. */ - if ( opt_eager_fpu == -1 ) -diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h -index f8f0ac47e759..fb4365575620 100644 ---- a/xen/include/asm-x86/spec_ctrl.h -+++ b/xen/include/asm-x86/spec_ctrl.h -@@ -63,7 +63,7 @@ - void init_speculation_mitigations(void); - void spec_ctrl_init_domain(struct domain *d); - --extern bool opt_ibpb; -+extern bool opt_ibpb_ctxt_switch; - extern bool opt_ssbd; - extern int8_t opt_eager_fpu; - extern int8_t opt_l1d_flush; --- -2.35.1 - diff --git a/0047-x86-spec-ctrl-Rework-SPEC_CTRL_ENTRY_FROM_INTR_IST.patch b/0047-x86-spec-ctrl-Rework-SPEC_CTRL_ENTRY_FROM_INTR_IST.patch deleted file mode 100644 index 3ce9fd9..0000000 --- a/0047-x86-spec-ctrl-Rework-SPEC_CTRL_ENTRY_FROM_INTR_IST.patch +++ /dev/null @@ -1,106 +0,0 @@ -From 2a9e690a0ad5d54dca4166e089089a07bbe7fc85 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Fri, 1 Jul 2022 15:59:40 +0100 -Subject: [PATCH 47/51] x86/spec-ctrl: Rework SPEC_CTRL_ENTRY_FROM_INTR_IST - -We are shortly going to add a conditional IBPB in this path. - -Therefore, we cannot hold spec_ctrl_flags in %eax, and rely on only clobbering -it after we're done with its contents. %rbx is available for use, and the -more normal register to hold preserved information in. - -With %rax freed up, use it instead of %rdx for the RSB tmp register, and for -the adjustment to spec_ctrl_flags. - -This leaves no use of %rdx, except as 0 for the upper half of WRMSR. In -practice, %rdx is 0 from SAVE_ALL on all paths and isn't likely to change in -the foreseeable future, so update the macro entry requirements to state this -dependency. This marginal optimisation can be revisited if circumstances -change. - -No practical change. - -This is part of XSA-407. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit e9b8d31981f184c6539f91ec54bd9cae29cdae36) ---- - xen/arch/x86/x86_64/entry.S | 4 ++-- - xen/include/asm-x86/spec_ctrl_asm.h | 21 ++++++++++----------- - 2 files changed, 12 insertions(+), 13 deletions(-) - -diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S -index 2a86938f1f32..a1810bf4d311 100644 ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -932,7 +932,7 @@ ENTRY(double_fault) - - GET_STACK_END(14) - -- SPEC_CTRL_ENTRY_FROM_INTR_IST /* Req: %rsp=regs, %r14=end, Clob: acd */ -+ SPEC_CTRL_ENTRY_FROM_INTR_IST /* Req: %rsp=regs, %r14=end, %rdx=0, Clob: abcd */ - /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - - mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rbx -@@ -968,7 +968,7 @@ handle_ist_exception: - - GET_STACK_END(14) - -- SPEC_CTRL_ENTRY_FROM_INTR_IST /* Req: %rsp=regs, %r14=end, Clob: acd */ -+ SPEC_CTRL_ENTRY_FROM_INTR_IST /* Req: %rsp=regs, %r14=end, %rdx=0, Clob: abcd */ - /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. 
*/ - - mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx -diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h -index 0ff1b118f882..15e24cde00d1 100644 ---- a/xen/include/asm-x86/spec_ctrl_asm.h -+++ b/xen/include/asm-x86/spec_ctrl_asm.h -@@ -251,34 +251,33 @@ - */ - .macro SPEC_CTRL_ENTRY_FROM_INTR_IST - /* -- * Requires %rsp=regs, %r14=stack_end -- * Clobbers %rax, %rcx, %rdx -+ * Requires %rsp=regs, %r14=stack_end, %rdx=0 -+ * Clobbers %rax, %rbx, %rcx, %rdx - * - * This is logical merge of DO_OVERWRITE_RSB and DO_SPEC_CTRL_ENTRY - * maybexen=1, but with conditionals rather than alternatives. - */ -- movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %eax -+ movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %ebx - -- test $SCF_ist_rsb, %al -+ test $SCF_ist_rsb, %bl - jz .L\@_skip_rsb - -- DO_OVERWRITE_RSB tmp=rdx /* Clobbers %rcx/%rdx */ -+ DO_OVERWRITE_RSB /* Clobbers %rax/%rcx */ - - .L\@_skip_rsb: - -- test $SCF_ist_sc_msr, %al -+ test $SCF_ist_sc_msr, %bl - jz .L\@_skip_msr_spec_ctrl - -- xor %edx, %edx -+ xor %eax, %eax - testb $3, UREGS_cs(%rsp) -- setnz %dl -- not %edx -- and %dl, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) -+ setnz %al -+ not %eax -+ and %al, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) - - /* Load Xen's intended value. */ - mov $MSR_SPEC_CTRL, %ecx - movzbl STACK_CPUINFO_FIELD(xen_spec_ctrl)(%r14), %eax -- xor %edx, %edx - wrmsr - - /* Opencoded UNLIKELY_START() with no condition. */ --- -2.35.1 - diff --git a/0048-x86-spec-ctrl-Support-IBPB-on-entry.patch b/0048-x86-spec-ctrl-Support-IBPB-on-entry.patch deleted file mode 100644 index d5ad043..0000000 --- a/0048-x86-spec-ctrl-Support-IBPB-on-entry.patch +++ /dev/null @@ -1,300 +0,0 @@ -From 76c5fcee9027fb8823dd501086f0ff3ee3c4231c Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Thu, 24 Feb 2022 13:44:33 +0000 -Subject: [PATCH 48/51] x86/spec-ctrl: Support IBPB-on-entry - -We are going to need this to mitigate Branch Type Confusion on AMD/Hygon CPUs, -but as we've talked about using it in other cases too, arrange to support it -generally. However, this is also very expensive in some cases, so we're going -to want per-domain controls. - -Introduce SCF_ist_ibpb and SCF_entry_ibpb controls, adding them to the IST and -DOM masks as appropriate. Also introduce X86_FEATURE_IBPB_ENTRY_{PV,HVM} to -to patch the code blocks. - -For SVM, the STGI is serialising enough to protect against Spectre-v1 attacks, -so no "else lfence" is necessary. VT-x will use use the MSR host load list, -so doesn't need any code in the VMExit path. - -For the IST path, we can't safely check CPL==0 to skip a flush, as we might -have hit an entry path before it's IBPB. As IST hitting Xen is rare, flush -irrespective of CPL. A later path, SCF_ist_sc_msr, provides Spectre-v1 -safety. - -For the PV paths, we know we're interrupting CPL>0, while for the INTR paths, -we can safely check CPL==0. Only flush when interrupting guest context. - -An "else lfence" is needed for safety, but we want to be able to skip it on -unaffected CPUs, so the block wants to be an alternative, which means the -lfence has to be inline rather than UNLIKELY() (the replacement block doesn't -have displacements fixed up for anything other than the first instruction). - -As with SPEC_CTRL_ENTRY_FROM_INTR_IST, %rdx is 0 on entry so rely on this to -shrink the logic marginally. Update the comments to specify this new -dependency. - -This is part of XSA-407. 
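
The decision logic described above is implemented in assembly (the DO_SPEC_CTRL_COND_IBPB macro in the spec_ctrl_asm.h hunk below). As a reading aid, here is a hedged C rendering of that logic; wrmsr() is a stub standing in for the WRMSR instruction, and the MSR/flag constants match the patched headers.

    /* C sketch of DO_SPEC_CTRL_COND_IBPB's logic; not the real entry code. */
    #include <stdbool.h>
    #include <stdint.h>

    #define MSR_PRED_CMD    0x00000049
    #define PRED_CMD_IBPB   (1u << 0)
    #define SCF_entry_ibpb  (1u << 5)

    static void wrmsr(uint32_t msr, uint64_t val) { (void)msr; (void)val; }

    static void cond_ibpb(uint8_t spec_ctrl_flags, bool maybexen, uint16_t saved_cs)
    {
        if ( !(spec_ctrl_flags & SCF_entry_ibpb) )
            return; /* the real skip path ends in lfence, for Spectre-v1 safety */

        /*
         * Entries which may interrupt Xen check the saved CPL and skip the
         * expensive barrier when not interrupting guest context.  IST entries
         * cannot do this safely, so they flush irrespective of CPL.
         */
        if ( maybexen && (saved_cs & 3) == 0 )
            return;

        wrmsr(MSR_PRED_CMD, PRED_CMD_IBPB);
    }
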
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 53a570b285694947776d5190f591a0d5b9b18de7) ---- - xen/arch/x86/hvm/svm/entry.S | 18 ++++++++++- - xen/arch/x86/hvm/vmx/vmcs.c | 4 +++ - xen/arch/x86/x86_64/compat/entry.S | 2 +- - xen/arch/x86/x86_64/entry.S | 12 +++---- - xen/include/asm-x86/cpufeatures.h | 2 ++ - xen/include/asm-x86/spec_ctrl.h | 6 ++-- - xen/include/asm-x86/spec_ctrl_asm.h | 49 +++++++++++++++++++++++++++-- - 7 files changed, 81 insertions(+), 12 deletions(-) - -diff --git a/xen/arch/x86/hvm/svm/entry.S b/xen/arch/x86/hvm/svm/entry.S -index 4ae55a2ef605..0ff4008060fa 100644 ---- a/xen/arch/x86/hvm/svm/entry.S -+++ b/xen/arch/x86/hvm/svm/entry.S -@@ -97,7 +97,19 @@ __UNLIKELY_END(nsvm_hap) - - GET_CURRENT(bx) - -- /* SPEC_CTRL_ENTRY_FROM_SVM Req: %rsp=regs/cpuinfo Clob: acd */ -+ /* SPEC_CTRL_ENTRY_FROM_SVM Req: %rsp=regs/cpuinfo, %rdx=0 Clob: acd */ -+ -+ .macro svm_vmexit_cond_ibpb -+ testb $SCF_entry_ibpb, CPUINFO_xen_spec_ctrl(%rsp) -+ jz .L_skip_ibpb -+ -+ mov $MSR_PRED_CMD, %ecx -+ mov $PRED_CMD_IBPB, %eax -+ wrmsr -+.L_skip_ibpb: -+ .endm -+ ALTERNATIVE "", svm_vmexit_cond_ibpb, X86_FEATURE_IBPB_ENTRY_HVM -+ - ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_HVM - - .macro svm_vmexit_spec_ctrl -@@ -114,6 +126,10 @@ __UNLIKELY_END(nsvm_hap) - ALTERNATIVE "", svm_vmexit_spec_ctrl, X86_FEATURE_SC_MSR_HVM - /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - -+ /* -+ * STGI is executed unconditionally, and is sufficiently serialising -+ * to safely resolve any Spectre-v1 concerns in the above logic. -+ */ - stgi - GLOBAL(svm_stgi_label) - mov %rsp,%rdi -diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c -index f9f9bc18cdbc..dd817cee4e69 100644 ---- a/xen/arch/x86/hvm/vmx/vmcs.c -+++ b/xen/arch/x86/hvm/vmx/vmcs.c -@@ -1345,6 +1345,10 @@ static int construct_vmcs(struct vcpu *v) - rc = vmx_add_msr(v, MSR_FLUSH_CMD, FLUSH_CMD_L1D, - VMX_MSR_GUEST_LOADONLY); - -+ if ( !rc && (d->arch.spec_ctrl_flags & SCF_entry_ibpb) ) -+ rc = vmx_add_msr(v, MSR_PRED_CMD, PRED_CMD_IBPB, -+ VMX_MSR_HOST); -+ - out: - vmx_vmcs_exit(v); - -diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S -index 5fd6dbbd4513..b86d38d1c50d 100644 ---- a/xen/arch/x86/x86_64/compat/entry.S -+++ b/xen/arch/x86/x86_64/compat/entry.S -@@ -18,7 +18,7 @@ ENTRY(entry_int82) - movl $HYPERCALL_VECTOR, 4(%rsp) - SAVE_ALL compat=1 /* DPL1 gate, restricted to 32bit PV guests only. */ - -- SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, Clob: acd */ -+ SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ - /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - - CR4_PV32_RESTORE -diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S -index a1810bf4d311..fba8ae498f74 100644 ---- a/xen/arch/x86/x86_64/entry.S -+++ b/xen/arch/x86/x86_64/entry.S -@@ -260,7 +260,7 @@ ENTRY(lstar_enter) - movl $TRAP_syscall, 4(%rsp) - SAVE_ALL - -- SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, Clob: acd */ -+ SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ - /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - - GET_STACK_END(bx) -@@ -298,7 +298,7 @@ ENTRY(cstar_enter) - movl $TRAP_syscall, 4(%rsp) - SAVE_ALL - -- SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, Clob: acd */ -+ SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ - /* WARNING! 
`ret`, `call *`, `jmp *` not safe before this point. */ - - GET_STACK_END(bx) -@@ -338,7 +338,7 @@ GLOBAL(sysenter_eflags_saved) - movl $TRAP_syscall, 4(%rsp) - SAVE_ALL - -- SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, Clob: acd */ -+ SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ - /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - - GET_STACK_END(bx) -@@ -392,7 +392,7 @@ ENTRY(int80_direct_trap) - movl $0x80, 4(%rsp) - SAVE_ALL - -- SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, Clob: acd */ -+ SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ - /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - - GET_STACK_END(bx) -@@ -674,7 +674,7 @@ ENTRY(common_interrupt) - - GET_STACK_END(14) - -- SPEC_CTRL_ENTRY_FROM_INTR /* Req: %rsp=regs, %r14=end, Clob: acd */ -+ SPEC_CTRL_ENTRY_FROM_INTR /* Req: %rsp=regs, %r14=end, %rdx=0, Clob: acd */ - /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - - mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx -@@ -708,7 +708,7 @@ GLOBAL(handle_exception) - - GET_STACK_END(14) - -- SPEC_CTRL_ENTRY_FROM_INTR /* Req: %rsp=regs, %r14=end, Clob: acd */ -+ SPEC_CTRL_ENTRY_FROM_INTR /* Req: %rsp=regs, %r14=end, %rdx=0, Clob: acd */ - /* WARNING! `ret`, `call *`, `jmp *` not safe before this point. */ - - mov STACK_CPUINFO_FIELD(xen_cr3)(%r14), %rcx -diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h -index 493d338a085e..672c9ee22ba2 100644 ---- a/xen/include/asm-x86/cpufeatures.h -+++ b/xen/include/asm-x86/cpufeatures.h -@@ -39,6 +39,8 @@ XEN_CPUFEATURE(XEN_LBR, X86_SYNTH(22)) /* Xen uses MSR_DEBUGCTL.LBR */ - XEN_CPUFEATURE(SC_VERW_IDLE, X86_SYNTH(25)) /* VERW used by Xen for idle */ - XEN_CPUFEATURE(XEN_SHSTK, X86_SYNTH(26)) /* Xen uses CET Shadow Stacks */ - XEN_CPUFEATURE(XEN_IBT, X86_SYNTH(27)) /* Xen uses CET Indirect Branch Tracking */ -+XEN_CPUFEATURE(IBPB_ENTRY_PV, X86_SYNTH(28)) /* MSR_PRED_CMD used by Xen for PV */ -+XEN_CPUFEATURE(IBPB_ENTRY_HVM, X86_SYNTH(29)) /* MSR_PRED_CMD used by Xen for HVM */ - - /* Bug words follow the synthetic words. */ - #define X86_NR_BUG 1 -diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h -index fb4365575620..3fc599a817c4 100644 ---- a/xen/include/asm-x86/spec_ctrl.h -+++ b/xen/include/asm-x86/spec_ctrl.h -@@ -34,6 +34,8 @@ - #define SCF_ist_sc_msr (1 << 1) - #define SCF_ist_rsb (1 << 2) - #define SCF_verw (1 << 3) -+#define SCF_ist_ibpb (1 << 4) -+#define SCF_entry_ibpb (1 << 5) - - /* - * The IST paths (NMI/#MC) can interrupt any arbitrary context. Some -@@ -46,13 +48,13 @@ - * These are the controls to inhibit on the S3 resume path until microcode has - * been reloaded. - */ --#define SCF_IST_MASK (SCF_ist_sc_msr) -+#define SCF_IST_MASK (SCF_ist_sc_msr | SCF_ist_ibpb) - - /* - * Some speculative protections are per-domain. These settings are merged - * into the top-of-stack block in the context switch path. 
- */ --#define SCF_DOM_MASK (SCF_verw) -+#define SCF_DOM_MASK (SCF_verw | SCF_entry_ibpb) - - #ifndef __ASSEMBLY__ - -diff --git a/xen/include/asm-x86/spec_ctrl_asm.h b/xen/include/asm-x86/spec_ctrl_asm.h -index 15e24cde00d1..9eb4ad9ab71d 100644 ---- a/xen/include/asm-x86/spec_ctrl_asm.h -+++ b/xen/include/asm-x86/spec_ctrl_asm.h -@@ -88,6 +88,35 @@ - * - SPEC_CTRL_EXIT_TO_{SVM,VMX} - */ - -+.macro DO_SPEC_CTRL_COND_IBPB maybexen:req -+/* -+ * Requires %rsp=regs (also cpuinfo if !maybexen) -+ * Requires %r14=stack_end (if maybexen), %rdx=0 -+ * Clobbers %rax, %rcx, %rdx -+ * -+ * Conditionally issue IBPB if SCF_entry_ibpb is active. In the maybexen -+ * case, we can safely look at UREGS_cs to skip taking the hit when -+ * interrupting Xen. -+ */ -+ .if \maybexen -+ testb $SCF_entry_ibpb, STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14) -+ jz .L\@_skip -+ testb $3, UREGS_cs(%rsp) -+ .else -+ testb $SCF_entry_ibpb, CPUINFO_xen_spec_ctrl(%rsp) -+ .endif -+ jz .L\@_skip -+ -+ mov $MSR_PRED_CMD, %ecx -+ mov $PRED_CMD_IBPB, %eax -+ wrmsr -+ jmp .L\@_done -+ -+.L\@_skip: -+ lfence -+.L\@_done: -+.endm -+ - .macro DO_OVERWRITE_RSB tmp=rax - /* - * Requires nothing -@@ -225,12 +254,16 @@ - - /* Use after an entry from PV context (syscall/sysenter/int80/int82/etc). */ - #define SPEC_CTRL_ENTRY_FROM_PV \ -+ ALTERNATIVE "", __stringify(DO_SPEC_CTRL_COND_IBPB maybexen=0), \ -+ X86_FEATURE_IBPB_ENTRY_PV; \ - ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV; \ - ALTERNATIVE "", __stringify(DO_SPEC_CTRL_ENTRY maybexen=0), \ - X86_FEATURE_SC_MSR_PV - - /* Use in interrupt/exception context. May interrupt Xen or PV context. */ - #define SPEC_CTRL_ENTRY_FROM_INTR \ -+ ALTERNATIVE "", __stringify(DO_SPEC_CTRL_COND_IBPB maybexen=1), \ -+ X86_FEATURE_IBPB_ENTRY_PV; \ - ALTERNATIVE "", DO_OVERWRITE_RSB, X86_FEATURE_SC_RSB_PV; \ - ALTERNATIVE "", __stringify(DO_SPEC_CTRL_ENTRY maybexen=1), \ - X86_FEATURE_SC_MSR_PV -@@ -254,11 +287,23 @@ - * Requires %rsp=regs, %r14=stack_end, %rdx=0 - * Clobbers %rax, %rbx, %rcx, %rdx - * -- * This is logical merge of DO_OVERWRITE_RSB and DO_SPEC_CTRL_ENTRY -- * maybexen=1, but with conditionals rather than alternatives. -+ * This is logical merge of: -+ * DO_SPEC_CTRL_COND_IBPB maybexen=0 -+ * DO_OVERWRITE_RSB -+ * DO_SPEC_CTRL_ENTRY maybexen=1 -+ * but with conditionals rather than alternatives. - */ - movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %ebx - -+ test $SCF_ist_ibpb, %bl -+ jz .L\@_skip_ibpb -+ -+ mov $MSR_PRED_CMD, %ecx -+ mov $PRED_CMD_IBPB, %eax -+ wrmsr -+ -+.L\@_skip_ibpb: -+ - test $SCF_ist_rsb, %bl - jz .L\@_skip_rsb - --- -2.35.1 - diff --git a/0049-x86-cpuid-Enumeration-for-BTC_NO.patch b/0049-x86-cpuid-Enumeration-for-BTC_NO.patch deleted file mode 100644 index 0e5d119..0000000 --- a/0049-x86-cpuid-Enumeration-for-BTC_NO.patch +++ /dev/null @@ -1,106 +0,0 @@ -From 0826c7596d35c887b3b7858137c7ac374d9ef17a Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Mon, 16 May 2022 15:48:24 +0100 -Subject: [PATCH 49/51] x86/cpuid: Enumeration for BTC_NO - -BTC_NO indicates that hardware is not succeptable to Branch Type Confusion. - -Zen3 CPUs don't suffer BTC. - -This is part of XSA-407. 
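
The bit being enumerated above is CPUID leaf 0x80000008, EBX bit 29, as the libxl/xen-cpuid hunks below record. A small userspace sketch of probing it directly, using GCC/Clang's cpuid.h; this is purely illustrative and not part of the patch:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* BTC_NO lives in CPUID.0x80000008:EBX[29]. */
        if ( __get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx) &&
             (ebx & (1u << 29)) )
            puts("BTC_NO: not susceptible to Branch Type Confusion");
        else
            puts("BTC_NO not enumerated (hardware may still be unaffected,"
                 " e.g. bare-metal Zen3 predating the bit's allocation)");
        return 0;
    }
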
- -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit 76cb04ad64f3ab9ae785988c40655a71dde9c319) ---- - tools/libs/light/libxl_cpuid.c | 1 + - tools/misc/xen-cpuid.c | 2 +- - xen/arch/x86/cpu/amd.c | 10 ++++++++++ - xen/arch/x86/spec_ctrl.c | 5 +++-- - xen/include/public/arch-x86/cpufeatureset.h | 1 + - 5 files changed, 16 insertions(+), 3 deletions(-) - -diff --git a/tools/libs/light/libxl_cpuid.c b/tools/libs/light/libxl_cpuid.c -index d462f9e421ed..bf6fdee360a9 100644 ---- a/tools/libs/light/libxl_cpuid.c -+++ b/tools/libs/light/libxl_cpuid.c -@@ -288,6 +288,7 @@ int libxl_cpuid_parse_config(libxl_cpuid_policy_list *cpuid, const char* str) - {"virt-ssbd", 0x80000008, NA, CPUID_REG_EBX, 25, 1}, - {"ssb-no", 0x80000008, NA, CPUID_REG_EBX, 26, 1}, - {"psfd", 0x80000008, NA, CPUID_REG_EBX, 28, 1}, -+ {"btc-no", 0x80000008, NA, CPUID_REG_EBX, 29, 1}, - - {"nc", 0x80000008, NA, CPUID_REG_ECX, 0, 8}, - {"apicidsize", 0x80000008, NA, CPUID_REG_ECX, 12, 4}, -diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c -index bc7dcf55757a..fe22f5f5b68b 100644 ---- a/tools/misc/xen-cpuid.c -+++ b/tools/misc/xen-cpuid.c -@@ -158,7 +158,7 @@ static const char *const str_e8b[32] = - /* [22] */ [23] = "ppin", - [24] = "amd-ssbd", [25] = "virt-ssbd", - [26] = "ssb-no", -- [28] = "psfd", -+ [28] = "psfd", [29] = "btc-no", - }; - - static const char *const str_7d0[32] = -diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c -index b3b9a0df5fed..b158e3acb5c7 100644 ---- a/xen/arch/x86/cpu/amd.c -+++ b/xen/arch/x86/cpu/amd.c -@@ -847,6 +847,16 @@ static void init_amd(struct cpuinfo_x86 *c) - warning_add(text); - } - break; -+ -+ case 0x19: -+ /* -+ * Zen3 (Fam19h model < 0x10) parts are not susceptible to -+ * Branch Type Confusion, but predate the allocation of the -+ * BTC_NO bit. Fill it back in if we're not virtualised. -+ */ -+ if (!cpu_has_hypervisor && !cpu_has(c, X86_FEATURE_BTC_NO)) -+ __set_bit(X86_FEATURE_BTC_NO, c->x86_capability); -+ break; - } - - display_cacheinfo(c); -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index f4ae36eae2d0..0f101c057f3e 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -388,7 +388,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - * Hardware read-only information, stating immunity to certain issues, or - * suggestions of which mitigation to use. - */ -- printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", -+ printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", - (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "", - (caps & ARCH_CAPS_IBRS_ALL) ? " IBRS_ALL" : "", - (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", -@@ -403,7 +403,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - (e8b & cpufeat_mask(X86_FEATURE_IBRS_ALWAYS)) ? " IBRS_ALWAYS" : "", - (e8b & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS)) ? " STIBP_ALWAYS" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS_FAST)) ? " IBRS_FAST" : "", -- (e8b & cpufeat_mask(X86_FEATURE_IBRS_SAME_MODE)) ? " IBRS_SAME_MODE" : ""); -+ (e8b & cpufeat_mask(X86_FEATURE_IBRS_SAME_MODE)) ? " IBRS_SAME_MODE" : "", -+ (e8b & cpufeat_mask(X86_FEATURE_BTC_NO)) ? " BTC_NO" : ""); - - /* Hardware features which need driving to mitigate issues. 
*/ - printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n", -diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h -index 743b857dcd5c..e7b8167800a2 100644 ---- a/xen/include/public/arch-x86/cpufeatureset.h -+++ b/xen/include/public/arch-x86/cpufeatureset.h -@@ -266,6 +266,7 @@ XEN_CPUFEATURE(AMD_SSBD, 8*32+24) /*S MSR_SPEC_CTRL.SSBD available */ - XEN_CPUFEATURE(VIRT_SSBD, 8*32+25) /* MSR_VIRT_SPEC_CTRL.SSBD */ - XEN_CPUFEATURE(SSB_NO, 8*32+26) /*A Hardware not vulnerable to SSB */ - XEN_CPUFEATURE(PSFD, 8*32+28) /*S MSR_SPEC_CTRL.PSFD */ -+XEN_CPUFEATURE(BTC_NO, 8*32+29) /*A Hardware not vulnerable to Branch Type Confusion */ - - /* Intel-defined CPU features, CPUID level 0x00000007:0.edx, word 9 */ - XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A AVX512 Neural Network Instructions */ --- -2.35.1 - diff --git a/0050-x86-spec-ctrl-Enable-Zen2-chickenbit.patch b/0050-x86-spec-ctrl-Enable-Zen2-chickenbit.patch deleted file mode 100644 index c83844d..0000000 --- a/0050-x86-spec-ctrl-Enable-Zen2-chickenbit.patch +++ /dev/null @@ -1,106 +0,0 @@ -From 5457a6870eb1369b868f7b8e833966ed43a773ad Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Tue, 15 Mar 2022 18:30:25 +0000 -Subject: [PATCH 50/51] x86/spec-ctrl: Enable Zen2 chickenbit - -... as instructed in the Branch Type Confusion whitepaper. - -This is part of XSA-407. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -(cherry picked from commit 9deaf2d932f08c16c6b96a1c426e4b1142c0cdbe) ---- - xen/arch/x86/cpu/amd.c | 28 ++++++++++++++++++++++++++++ - xen/arch/x86/cpu/cpu.h | 1 + - xen/arch/x86/cpu/hygon.c | 6 ++++++ - xen/include/asm-x86/msr-index.h | 1 + - 4 files changed, 36 insertions(+) - -diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c -index b158e3acb5c7..37ac84ddd74d 100644 ---- a/xen/arch/x86/cpu/amd.c -+++ b/xen/arch/x86/cpu/amd.c -@@ -731,6 +731,31 @@ void amd_init_ssbd(const struct cpuinfo_x86 *c) - printk_once(XENLOG_ERR "No SSBD controls available\n"); - } - -+/* -+ * On Zen2 we offer this chicken (bit) on the altar of Speculation. -+ * -+ * Refer to the AMD Branch Type Confusion whitepaper: -+ * https://XXX -+ * -+ * Setting this unnamed bit supposedly causes prediction information on -+ * non-branch instructions to be ignored. It is to be set unilaterally in -+ * newer microcode. -+ * -+ * This chickenbit is something unrelated on Zen1, and Zen1 vs Zen2 isn't a -+ * simple model number comparison, so use STIBP as a heuristic to separate the -+ * two uarches in Fam17h(AMD)/18h(Hygon). 
-+ */ -+void amd_init_spectral_chicken(void) -+{ -+ uint64_t val, chickenbit = 1 << 1; -+ -+ if (cpu_has_hypervisor || !boot_cpu_has(X86_FEATURE_AMD_STIBP)) -+ return; -+ -+ if (rdmsr_safe(MSR_AMD64_DE_CFG2, val) == 0 && !(val & chickenbit)) -+ wrmsr_safe(MSR_AMD64_DE_CFG2, val | chickenbit); -+} -+ - void __init detect_zen2_null_seg_behaviour(void) - { - uint64_t base; -@@ -796,6 +821,9 @@ static void init_amd(struct cpuinfo_x86 *c) - - amd_init_ssbd(c); - -+ if (c->x86 == 0x17) -+ amd_init_spectral_chicken(); -+ - /* Probe for NSCB on Zen2 CPUs when not virtualised */ - if (!cpu_has_hypervisor && !cpu_has_nscb && c == &boot_cpu_data && - c->x86 == 0x17) -diff --git a/xen/arch/x86/cpu/cpu.h b/xen/arch/x86/cpu/cpu.h -index b593bd85f04f..145bc5156a86 100644 ---- a/xen/arch/x86/cpu/cpu.h -+++ b/xen/arch/x86/cpu/cpu.h -@@ -22,4 +22,5 @@ void early_init_amd(struct cpuinfo_x86 *c); - void amd_log_freq(const struct cpuinfo_x86 *c); - void amd_init_lfence(struct cpuinfo_x86 *c); - void amd_init_ssbd(const struct cpuinfo_x86 *c); -+void amd_init_spectral_chicken(void); - void detect_zen2_null_seg_behaviour(void); -diff --git a/xen/arch/x86/cpu/hygon.c b/xen/arch/x86/cpu/hygon.c -index cdc94130dd2e..6f8d491297e8 100644 ---- a/xen/arch/x86/cpu/hygon.c -+++ b/xen/arch/x86/cpu/hygon.c -@@ -40,6 +40,12 @@ static void init_hygon(struct cpuinfo_x86 *c) - c->x86 == 0x18) - detect_zen2_null_seg_behaviour(); - -+ /* -+ * TODO: Check heuristic safety with Hygon first -+ if (c->x86 == 0x18) -+ amd_init_spectral_chicken(); -+ */ -+ - /* - * Hygon CPUs before Zen2 don't clear segment bases/limits when - * loading a NULL selector. -diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h -index 72bc32ba04ff..d3735e499e0f 100644 ---- a/xen/include/asm-x86/msr-index.h -+++ b/xen/include/asm-x86/msr-index.h -@@ -361,6 +361,7 @@ - #define MSR_AMD64_DE_CFG 0xc0011029 - #define AMD64_DE_CFG_LFENCE_SERIALISE (_AC(1, ULL) << 1) - #define MSR_AMD64_EX_CFG 0xc001102c -+#define MSR_AMD64_DE_CFG2 0xc00110e3 - - #define MSR_AMD64_DR0_ADDRESS_MASK 0xc0011027 - #define MSR_AMD64_DR1_ADDRESS_MASK 0xc0011019 --- -2.35.1 - diff --git a/0051-x86-spec-ctrl-Mitigate-Branch-Type-Confusion-when-po.patch b/0051-x86-spec-ctrl-Mitigate-Branch-Type-Confusion-when-po.patch deleted file mode 100644 index e313ede..0000000 --- a/0051-x86-spec-ctrl-Mitigate-Branch-Type-Confusion-when-po.patch +++ /dev/null @@ -1,305 +0,0 @@ -From 0a5387a01165b46c8c85e7f7e2ddbe60a7f5db44 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper <andrew.cooper3@citrix.com> -Date: Mon, 27 Jun 2022 19:29:40 +0100 -Subject: [PATCH 51/51] x86/spec-ctrl: Mitigate Branch Type Confusion when - possible - -Branch Type Confusion affects AMD/Hygon CPUs on Zen2 and earlier. To -mitigate, we require SMT safety (STIBP on Zen2, no-SMT on Zen1), and to issue -an IBPB on each entry to Xen, to flush the BTB. - -Due to performance concerns, dom0 (which is trusted in most configurations) is -excluded from protections by default. - -Therefore: - * Use STIBP by default on Zen2 too, which now means we want it on by default - on all hardware supporting STIBP. - * Break the current IBPB logic out into a new function, extending it with - IBPB-at-entry logic. - * Change the existing IBPB-at-ctxt-switch boolean to be tristate, and disable - it by default when IBPB-at-entry is providing sufficient safety. - -If all PV guests on the system are trusted, then it is recommended to boot -with `spec-ctrl=ibpb-entry=no-pv`, as this will provide an additional marginal -perf improvement. 
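
The amd_init_spectral_chicken() hunk above uses a read-test-write idiom so the chicken bit is only written when it is actually clear, and tolerates the MSR being absent. A self-contained sketch of that idiom, assuming stub rdmsr_safe()/wrmsr_safe() helpers modelled on Xen's fault-tolerant MSR accessors; the MSR index matches the patched msr-index.h:

    #include <stdint.h>

    #define MSR_AMD64_DE_CFG2 0xc00110e3

    /* Stubs: return 0 on success, non-zero if the MSR access faults. */
    static int rdmsr_safe(uint32_t msr, uint64_t *val) { (void)msr; *val = 0; return 0; }
    static int wrmsr_safe(uint32_t msr, uint64_t val) { (void)msr; (void)val; return 0; }

    static void set_spectral_chicken(void)
    {
        uint64_t val;
        const uint64_t chickenbit = 1ull << 1;

        /* Tolerate a missing MSR, and avoid redundant writes. */
        if ( rdmsr_safe(MSR_AMD64_DE_CFG2, &val) == 0 && !(val & chickenbit) )
            wrmsr_safe(MSR_AMD64_DE_CFG2, val | chickenbit);
    }
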
- -This is part of XSA-407 / CVE-2022-23825. - -Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com> -Reviewed-by: Jan Beulich <jbeulich@suse.com> -(cherry picked from commit d8cb7e0f069e0f106d24941355b59b45a731eabe) ---- - docs/misc/xen-command-line.pandoc | 14 ++-- - xen/arch/x86/spec_ctrl.c | 113 ++++++++++++++++++++++++++---- - xen/include/asm-x86/spec_ctrl.h | 2 +- - 3 files changed, 112 insertions(+), 17 deletions(-) - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index 1bbdb55129cc..bd6826d0ae05 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -2234,7 +2234,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). - - ### spec-ctrl (x86) - > `= List of [ <bool>, xen=<bool>, {pv,hvm}=<bool>, --> {msr-sc,rsb,md-clear}=<bool>|{pv,hvm}=<bool>, -+> {msr-sc,rsb,md-clear,ibpb-entry}=<bool>|{pv,hvm}=<bool>, - > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd, - > eager-fpu,l1d-flush,branch-harden,srb-lock, - > unpriv-mmio}=<bool> ]` -@@ -2259,9 +2259,10 @@ in place for guests to use. - - Use of a positive boolean value for either of these options is invalid. - --The `pv=`, `hvm=`, `msr-sc=`, `rsb=` and `md-clear=` options offer fine --grained control over the primitives by Xen. These impact Xen's ability to --protect itself, and/or Xen's ability to virtualise support for guests to use. -+The `pv=`, `hvm=`, `msr-sc=`, `rsb=`, `md-clear=` and `ibpb-entry=` options -+offer fine grained control over the primitives by Xen. These impact Xen's -+ability to protect itself, and/or Xen's ability to virtualise support for -+guests to use. - - * `pv=` and `hvm=` offer control over all suboptions for PV and HVM guests - respectively. -@@ -2280,6 +2281,11 @@ protect itself, and/or Xen's ability to virtualise support for guests to use. - compatibility with development versions of this fix, `mds=` is also accepted - on Xen 4.12 and earlier as an alias. Consult vendor documentation in - preference to here.* -+* `ibpb-entry=` offers control over whether IBPB (Indirect Branch Prediction -+ Barrier) is used on entry to Xen. This is used by default on hardware -+ vulnerable to Branch Type Confusion, but for performance reasons, dom0 is -+ unprotected by default. If it necessary to protect dom0 too, boot with -+ `spec-ctrl=ibpb-entry`. - - If Xen was compiled with INDIRECT_THUNK support, `bti-thunk=` can be used to - select which of the thunks gets patched into the `__x86_indirect_thunk_%reg` -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 0f101c057f3e..1d9796c34d71 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -39,6 +39,10 @@ static bool __initdata opt_rsb_hvm = true; - static int8_t __read_mostly opt_md_clear_pv = -1; - static int8_t __read_mostly opt_md_clear_hvm = -1; - -+static int8_t __read_mostly opt_ibpb_entry_pv = -1; -+static int8_t __read_mostly opt_ibpb_entry_hvm = -1; -+static bool __read_mostly opt_ibpb_entry_dom0; -+ - /* Cmdline controls for Xen's speculative settings. */ - static enum ind_thunk { - THUNK_DEFAULT, /* Decide which thunk to use at boot time. 
*/ -@@ -54,7 +58,7 @@ int8_t __initdata opt_stibp = -1; - bool __read_mostly opt_ssbd; - int8_t __initdata opt_psfd = -1; - --bool __read_mostly opt_ibpb_ctxt_switch = true; -+int8_t __read_mostly opt_ibpb_ctxt_switch = -1; - int8_t __read_mostly opt_eager_fpu = -1; - int8_t __read_mostly opt_l1d_flush = -1; - static bool __initdata opt_branch_harden = true; -@@ -114,6 +118,9 @@ static int __init parse_spec_ctrl(const char *s) - opt_rsb_hvm = false; - opt_md_clear_pv = 0; - opt_md_clear_hvm = 0; -+ opt_ibpb_entry_pv = 0; -+ opt_ibpb_entry_hvm = 0; -+ opt_ibpb_entry_dom0 = false; - - opt_thunk = THUNK_JMP; - opt_ibrs = 0; -@@ -140,12 +147,14 @@ static int __init parse_spec_ctrl(const char *s) - opt_msr_sc_pv = val; - opt_rsb_pv = val; - opt_md_clear_pv = val; -+ opt_ibpb_entry_pv = val; - } - else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) - { - opt_msr_sc_hvm = val; - opt_rsb_hvm = val; - opt_md_clear_hvm = val; -+ opt_ibpb_entry_hvm = val; - } - else if ( (val = parse_boolean("msr-sc", s, ss)) != -1 ) - { -@@ -210,6 +219,28 @@ static int __init parse_spec_ctrl(const char *s) - break; - } - } -+ else if ( (val = parse_boolean("ibpb-entry", s, ss)) != -1 ) -+ { -+ switch ( val ) -+ { -+ case 0: -+ case 1: -+ opt_ibpb_entry_pv = opt_ibpb_entry_hvm = -+ opt_ibpb_entry_dom0 = val; -+ break; -+ -+ case -2: -+ s += strlen("ibpb-entry="); -+ if ( (val = parse_boolean("pv", s, ss)) >= 0 ) -+ opt_ibpb_entry_pv = val; -+ else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) -+ opt_ibpb_entry_hvm = val; -+ else -+ default: -+ rc = -EINVAL; -+ break; -+ } -+ } - - /* Xen's speculative sidechannel mitigation settings. */ - else if ( !strncmp(s, "bti-thunk=", 10) ) -@@ -477,27 +508,31 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - * mitigation support for guests. - */ - #ifdef CONFIG_HVM -- printk(" Support for HVM VMs:%s%s%s%s%s\n", -+ printk(" Support for HVM VMs:%s%s%s%s%s%s\n", - (boot_cpu_has(X86_FEATURE_SC_MSR_HVM) || - boot_cpu_has(X86_FEATURE_SC_RSB_HVM) || - boot_cpu_has(X86_FEATURE_MD_CLEAR) || -+ boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) || - opt_eager_fpu) ? "" : " None", - boot_cpu_has(X86_FEATURE_SC_MSR_HVM) ? " MSR_SPEC_CTRL" : "", - boot_cpu_has(X86_FEATURE_SC_RSB_HVM) ? " RSB" : "", - opt_eager_fpu ? " EAGER_FPU" : "", -- boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : ""); -+ boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : "", -+ boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) ? " IBPB-entry" : ""); - - #endif - #ifdef CONFIG_PV -- printk(" Support for PV VMs:%s%s%s%s%s\n", -+ printk(" Support for PV VMs:%s%s%s%s%s%s\n", - (boot_cpu_has(X86_FEATURE_SC_MSR_PV) || - boot_cpu_has(X86_FEATURE_SC_RSB_PV) || - boot_cpu_has(X86_FEATURE_MD_CLEAR) || -+ boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) || - opt_eager_fpu) ? "" : " None", - boot_cpu_has(X86_FEATURE_SC_MSR_PV) ? " MSR_SPEC_CTRL" : "", - boot_cpu_has(X86_FEATURE_SC_RSB_PV) ? " RSB" : "", - opt_eager_fpu ? " EAGER_FPU" : "", -- boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : ""); -+ boot_cpu_has(X86_FEATURE_MD_CLEAR) ? " MD_CLEAR" : "", -+ boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) ? " IBPB-entry" : ""); - - printk(" XPTI (64-bit PV only): Dom0 %s, DomU %s (with%s PCID)\n", - opt_xpti_hwdom ? "enabled" : "disabled", -@@ -759,6 +794,55 @@ static bool __init should_use_eager_fpu(void) - } - } - -+static void __init ibpb_calculations(void) -+{ -+ /* Check we have hardware IBPB support before using it... 
*/ -+ if ( !boot_cpu_has(X86_FEATURE_IBRSB) && !boot_cpu_has(X86_FEATURE_IBPB) ) -+ { -+ opt_ibpb_entry_hvm = opt_ibpb_entry_pv = opt_ibpb_ctxt_switch = 0; -+ opt_ibpb_entry_dom0 = false; -+ return; -+ } -+ -+ /* -+ * IBPB-on-entry mitigations for Branch Type Confusion. -+ * -+ * IBPB && !BTC_NO selects all AMD/Hygon hardware, not known to be safe, -+ * that we can provide some form of mitigation on. -+ */ -+ if ( opt_ibpb_entry_pv == -1 ) -+ opt_ibpb_entry_pv = (IS_ENABLED(CONFIG_PV) && -+ boot_cpu_has(X86_FEATURE_IBPB) && -+ !boot_cpu_has(X86_FEATURE_BTC_NO)); -+ if ( opt_ibpb_entry_hvm == -1 ) -+ opt_ibpb_entry_hvm = (IS_ENABLED(CONFIG_HVM) && -+ boot_cpu_has(X86_FEATURE_IBPB) && -+ !boot_cpu_has(X86_FEATURE_BTC_NO)); -+ -+ if ( opt_ibpb_entry_pv ) -+ { -+ setup_force_cpu_cap(X86_FEATURE_IBPB_ENTRY_PV); -+ -+ /* -+ * We only need to flush in IST context if we're protecting against PV -+ * guests. HVM IBPB-on-entry protections are both atomic with -+ * NMI/#MC, so can't interrupt Xen ahead of having already flushed the -+ * BTB. -+ */ -+ default_spec_ctrl_flags |= SCF_ist_ibpb; -+ } -+ if ( opt_ibpb_entry_hvm ) -+ setup_force_cpu_cap(X86_FEATURE_IBPB_ENTRY_HVM); -+ -+ /* -+ * If we're using IBPB-on-entry to protect against PV and HVM guests -+ * (ignoring dom0 if trusted), then there's no need to also issue IBPB on -+ * context switch too. -+ */ -+ if ( opt_ibpb_ctxt_switch == -1 ) -+ opt_ibpb_ctxt_switch = !(opt_ibpb_entry_hvm && opt_ibpb_entry_pv); -+} -+ - /* Calculate whether this CPU is vulnerable to L1TF. */ - static __init void l1tf_calculations(uint64_t caps) - { -@@ -1014,8 +1098,12 @@ void spec_ctrl_init_domain(struct domain *d) - bool verw = ((pv ? opt_md_clear_pv : opt_md_clear_hvm) || - (opt_fb_clear_mmio && is_iommu_enabled(d))); - -+ bool ibpb = ((pv ? opt_ibpb_entry_pv : opt_ibpb_entry_hvm) && -+ (d->domain_id != 0 || opt_ibpb_entry_dom0)); -+ - d->arch.spec_ctrl_flags = - (verw ? SCF_verw : 0) | -+ (ibpb ? SCF_entry_ibpb : 0) | - 0; - } - -@@ -1162,12 +1250,15 @@ void __init init_speculation_mitigations(void) - } - - /* -- * Use STIBP by default if the hardware hint is set. Otherwise, leave it -- * off as it a severe performance pentalty on pre-eIBRS Intel hardware -- * where it was retrofitted in microcode. -+ * Use STIBP by default on all AMD systems. Zen3 and later enumerate -+ * STIBP_ALWAYS, but STIBP is needed on Zen2 as part of the mitigations -+ * for Branch Type Confusion. -+ * -+ * Leave STIBP off by default on Intel. Pre-eIBRS systems suffer a -+ * substantial perf hit when it was implemented in microcode. - */ - if ( opt_stibp == -1 ) -- opt_stibp = !!boot_cpu_has(X86_FEATURE_STIBP_ALWAYS); -+ opt_stibp = !!boot_cpu_has(X86_FEATURE_AMD_STIBP); - - if ( opt_stibp && (boot_cpu_has(X86_FEATURE_STIBP) || - boot_cpu_has(X86_FEATURE_AMD_STIBP)) ) -@@ -1239,9 +1330,7 @@ void __init init_speculation_mitigations(void) - if ( opt_rsb_hvm ) - setup_force_cpu_cap(X86_FEATURE_SC_RSB_HVM); - -- /* Check we have hardware IBPB support before using it... */ -- if ( !boot_cpu_has(X86_FEATURE_IBRSB) && !boot_cpu_has(X86_FEATURE_IBPB) ) -- opt_ibpb_ctxt_switch = false; -+ ibpb_calculations(); - - /* Check whether Eager FPU should be enabled by default. 
 */
-     if ( opt_eager_fpu == -1 )
-diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h
-index 3fc599a817c4..9403b81dc7af 100644
---- a/xen/include/asm-x86/spec_ctrl.h
-+++ b/xen/include/asm-x86/spec_ctrl.h
-@@ -65,7 +65,7 @@
- void init_speculation_mitigations(void);
- void spec_ctrl_init_domain(struct domain *d);
- 
--extern bool opt_ibpb_ctxt_switch;
-+extern int8_t opt_ibpb_ctxt_switch;
- extern bool opt_ssbd;
- extern int8_t opt_eager_fpu;
- extern int8_t opt_l1d_flush;
--- 
-2.35.1
-
@@ -1,6 +1,6 @@
-Xen upstream patchset #1 for 4.16.2-pre
+Xen upstream patchset #0 for 4.16.3-pre
 Containing patches from
-RELEASE-4.16.1 (13fee86475f3831d7a1ecf6d7e0acbc2ac779f7e)
 to
-staging-4.16 (0a5387a01165b46c8c85e7f7e2ddbe60a7f5db44)
+staging-4.16 (1bce7fb1f702da4f7a749c6f1457ecb20bf74fca)
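
For reference, the xen-command-line.pandoc hunk in the XSA-407 patch above adds two boot-time forms worth calling out. A hedged usage sketch: the option strings come from the patched documentation, while the xen_cmdline= wrapper is only illustrative of wherever your bootloader passes Xen's command line.

    # Also protect dom0, which is excluded from IBPB-on-entry by default:
    xen_cmdline="spec-ctrl=ibpb-entry"

    # All PV guests on the system trusted: skip the PV-entry protections
    # for a marginal performance improvement:
    xen_cmdline="spec-ctrl=ibpb-entry=no-pv"
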