git: 131bad17ead4 - stable/13 - vmm: permit some IPIs to be handled by userspace

From: Corvin Köhne <corvink@FreeBSD.org>
Date: Fri, 09 Dec 2022 13:18:59 UTC
The branch stable/13 has been updated by corvink:

URL: https://cgit.FreeBSD.org/src/commit/?id=131bad17ead4938179fb5446538324667987c349

commit 131bad17ead4938179fb5446538324667987c349
Author:     Corvin Köhne <CorvinK@beckhoff.com>
AuthorDate: 2022-09-07 07:07:03 +0000
Commit:     Corvin Köhne <corvink@FreeBSD.org>
CommitDate: 2022-12-09 13:18:08 +0000

    vmm: permit some IPIs to be handled by userspace
    
    Add VM_EXITCODE_IPI to permit returning unhandled IPIs to userland.
    INIT and STARTUP IPIs are now returned to userland. For backward
    compatibility, the new exit code is gated behind a new capability,
    VM_CAP_IPI_EXIT, which userland has to enable explicitly.
    
    Reviewed by:            jhb
    Differential Revision:  https://reviews.freebsd.org/D35623
    Sponsored by:           Beckhoff Automation GmbH & Co. KG
    
    (cherry picked from commit 0bda8d3e9f7a5c04881219723436616b23041e5f)
---
 sys/amd64/include/vmm.h        |   8 ++
 sys/amd64/vmm/amd/svm.c        |  10 +++
 sys/amd64/vmm/intel/vmx.c      |   8 ++
 sys/amd64/vmm/io/vlapic.c      | 192 ++++++++++++++++++++++++++---------------
 sys/amd64/vmm/io/vlapic.h      |   2 +
 sys/amd64/vmm/io/vlapic_priv.h |   2 +
 sys/amd64/vmm/vmm.c            |   9 ++
 usr.sbin/bhyve/bhyverun.c      |  34 ++++++++
 usr.sbin/bhyve/spinup_ap.c     |   3 +
 9 files changed, 198 insertions(+), 70 deletions(-)

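Before the diff itself, a minimal sketch of how a frontend consumes the new
interface. This is an illustration under assumptions, not part of the commit:
ctx is the usual struct vmctx handle, handle_ipi_exit() is a hypothetical
name, and spinup_ap() is bhyve's existing AP bring-up helper (declared in
usr.sbin/bhyve/spinup_ap.h). It mirrors the vmexit_ipi() handler added to
bhyverun.c below.

    #include <sys/param.h>
    #include <sys/cpuset.h>

    #include <x86/apicreg.h>

    #include <machine/vmm.h>

    #include <vmmapi.h>

    #include "spinup_ap.h"

    /*
     * Opt in first, e.g. via
     *     error = vm_set_capability(ctx, vcpu, VM_CAP_IPI_EXIT, 1);
     * as done in spinup_vcpu()/spinup_ap() in the hunks below.
     */

    static int
    handle_ipi_exit(struct vmctx *ctx, struct vm_exit *vme)
    {
    	int error, i;

    	switch (vme->u.ipi.mode) {
    	case APIC_DELMODE_INIT:
    		/* An INIT IPI parks every targeted vCPU. */
    		CPU_FOREACH_ISSET(i, &vme->u.ipi.dmask) {
    			error = vm_suspend_cpu(ctx, i);
    			if (error != 0)
    				return (error);
    		}
    		return (0);
    	case APIC_DELMODE_STARTUP:
    		/* A SIPI starts each target at vector << PAGE_SHIFT. */
    		CPU_FOREACH_ISSET(i, &vme->u.ipi.dmask)
    			spinup_ap(ctx, i, vme->u.ipi.vector << PAGE_SHIFT);
    		return (0);
    	default:
    		/* Fixed and NMI IPIs stay entirely in the kernel. */
    		return (-1);
    	}
    }
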
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index ce61e16522aa..a957ecb0f852 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -31,6 +31,7 @@
 #ifndef _VMM_H_
 #define	_VMM_H_
 
+#include <sys/cpuset.h>
 #include <sys/sdt.h>
 #include <x86/segments.h>
 
@@ -486,6 +487,7 @@ enum vm_cap_type {
 	VM_CAP_BPT_EXIT,
 	VM_CAP_RDPID,
 	VM_CAP_RDTSCP,
+	VM_CAP_IPI_EXIT,
 	VM_CAP_MAX
 };
 
@@ -633,6 +635,7 @@ enum vm_exitcode {
 	VM_EXITCODE_DEBUG,
 	VM_EXITCODE_VMINSN,
 	VM_EXITCODE_BPT,
+	VM_EXITCODE_IPI,
 	VM_EXITCODE_MAX
 };
 
@@ -740,6 +743,11 @@ struct vm_exit {
 		struct {
 			enum vm_suspend_how how;
 		} suspended;
+		struct {
+			uint32_t mode;
+			uint8_t vector;
+			cpuset_t dmask;
+		} ipi;
 		struct vm_task_switch task_switch;
 	} u;
 };
diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c
index 474350e9a1fa..f50a4c8c9097 100644
--- a/sys/amd64/vmm/amd/svm.c
+++ b/sys/amd64/vmm/amd/svm.c
@@ -2315,6 +2315,7 @@ static int
 svm_setcap(void *arg, int vcpu, int type, int val)
 {
 	struct svm_softc *sc;
+	struct vlapic *vlapic;
 	int error;
 
 	sc = arg;
@@ -2333,6 +2334,10 @@ svm_setcap(void *arg, int vcpu, int type, int val)
 		if (val == 0)
 			error = EINVAL;
 		break;
+	case VM_CAP_IPI_EXIT:
+		vlapic = vm_lapic(sc->vm, vcpu);
+		vlapic->ipi_exit = val;
+		break;
 	default:
 		error = ENOENT;
 		break;
@@ -2344,6 +2349,7 @@ static int
 svm_getcap(void *arg, int vcpu, int type, int *retval)
 {
 	struct svm_softc *sc;
+	struct vlapic *vlapic;
 	int error;
 
 	sc = arg;
@@ -2361,6 +2367,10 @@ svm_getcap(void *arg, int vcpu, int type, int *retval)
 	case VM_CAP_UNRESTRICTED_GUEST:
 		*retval = 1;	/* unrestricted guest is always enabled */
 		break;
+	case VM_CAP_IPI_EXIT:
+		vlapic = vm_lapic(sc->vm, vcpu);
+		*retval = vlapic->ipi_exit;
+		break;
 	default:
 		error = ENOENT;
 		break;
diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c
index 9a621b9bb3a2..d88abbc62342 100644
--- a/sys/amd64/vmm/intel/vmx.c
+++ b/sys/amd64/vmm/intel/vmx.c
@@ -3504,6 +3504,7 @@ vmx_getcap(void *arg, int vcpu, int type, int *retval)
 			ret = 0;
 		break;
 	case VM_CAP_BPT_EXIT:
+	case VM_CAP_IPI_EXIT:
 		ret = 0;
 		break;
 	default:
@@ -3521,6 +3522,7 @@ vmx_setcap(void *arg, int vcpu, int type, int val)
 {
 	struct vmx *vmx = arg;
 	struct vmcs *vmcs = &vmx->vmcs[vcpu];
+	struct vlapic *vlapic;
 	uint32_t baseval;
 	uint32_t *pptr;
 	int error;
@@ -3599,6 +3601,12 @@ vmx_setcap(void *arg, int vcpu, int type, int val)
 			reg = VMCS_EXCEPTION_BITMAP;
 		}
 		break;
+	case VM_CAP_IPI_EXIT:
+		retval = 0;
+
+		vlapic = vm_lapic(vmx->vm, vcpu);
+		vlapic->ipi_exit = val;
+		break;
 	default:
 		break;
 	}
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
index c0ca167ef11f..d2e60fc3baeb 100644
--- a/sys/amd64/vmm/io/vlapic.c
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$");
 
 static void vlapic_set_error(struct vlapic *, uint32_t, bool);
 static void vlapic_callout_handler(void *arg);
+static void vlapic_reset(struct vlapic *vlapic);
 
 static __inline uint32_t
 vlapic_get_id(struct vlapic *vlapic)
@@ -957,13 +958,12 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu)
 {
 	int i;
 	bool phys;
-	cpuset_t dmask;
+	cpuset_t dmask, ipimask;
 	uint64_t icrval;
-	uint32_t dest, vec, mode;
+	uint32_t dest, vec, mode, shorthand;
 	struct vlapic *vlapic2;
 	struct vm_exit *vmexit;
 	struct LAPIC *lapic;
-	uint16_t maxcpus;
 
 	lapic = vlapic->apic_page;
 	lapic->icr_lo &= ~APIC_DELSTAT_PEND;
@@ -975,97 +975,147 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu)
 		dest = icrval >> (32 + 24);
 	vec = icrval & APIC_VECTOR_MASK;
 	mode = icrval & APIC_DELMODE_MASK;
+	phys = (icrval & APIC_DESTMODE_LOG) == 0;
+	shorthand = icrval & APIC_DEST_MASK;
 
-	if (mode == APIC_DELMODE_FIXED && vec < 16) {
-		vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, false);
-		VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec);
-		return (0);
+	VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec);
+
+	switch (shorthand) {
+	case APIC_DEST_DESTFLD:
+		vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false, x2apic(vlapic));
+		break;
+	case APIC_DEST_SELF:
+		CPU_SETOF(vlapic->vcpuid, &dmask);
+		break;
+	case APIC_DEST_ALLISELF:
+		dmask = vm_active_cpus(vlapic->vm);
+		break;
+	case APIC_DEST_ALLESELF:
+		dmask = vm_active_cpus(vlapic->vm);
+		CPU_CLR(vlapic->vcpuid, &dmask);
+		break;
+	default:
+		__assert_unreachable();
 	}
 
-	VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec);
+	/*
+	 * ipimask is a set of vCPUs needing userland handling of the current
+	 * IPI.
+	 */
+	CPU_ZERO(&ipimask);
 
-	if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) {
-		switch (icrval & APIC_DEST_MASK) {
-		case APIC_DEST_DESTFLD:
-			phys = ((icrval & APIC_DESTMODE_LOG) == 0);
-			vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false,
-			    x2apic(vlapic));
-			break;
-		case APIC_DEST_SELF:
-			CPU_SETOF(vlapic->vcpuid, &dmask);
-			break;
-		case APIC_DEST_ALLISELF:
-			dmask = vm_active_cpus(vlapic->vm);
-			break;
-		case APIC_DEST_ALLESELF:
-			dmask = vm_active_cpus(vlapic->vm);
-			CPU_CLR(vlapic->vcpuid, &dmask);
-			break;
-		default:
-			CPU_ZERO(&dmask);	/* satisfy gcc */
-			break;
+	switch (mode) {
+	case APIC_DELMODE_FIXED:
+		if (vec < 16) {
+			vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR,
+			    false);
+			VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec);
+			return (0);
 		}
 
 		CPU_FOREACH_ISSET(i, &dmask) {
-			if (mode == APIC_DELMODE_FIXED) {
-				lapic_intr_edge(vlapic->vm, i, vec);
-				vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid,
-						    IPIS_SENT, i, 1);
-				VLAPIC_CTR2(vlapic, "vlapic sending ipi %d "
-				    "to vcpuid %d", vec, i);
-			} else {
-				vm_inject_nmi(vlapic->vm, i);
-				VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi "
-				    "to vcpuid %d", i);
-			}
+			lapic_intr_edge(vlapic->vm, i, vec);
+			vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid,
+			    IPIS_SENT, i, 1);
+			VLAPIC_CTR2(vlapic,
+			    "vlapic sending ipi %d to vcpuid %d", vec, i);
 		}
 
-		return (0);	/* handled completely in the kernel */
-	}
+		break;
+	case APIC_DELMODE_NMI:
+		CPU_FOREACH_ISSET(i, &dmask) {
+			vm_inject_nmi(vlapic->vm, i);
+			VLAPIC_CTR1(vlapic,
+			    "vlapic sending ipi nmi to vcpuid %d", i);
+		}
 
-	maxcpus = vm_get_maxcpus(vlapic->vm);
-	if (mode == APIC_DELMODE_INIT) {
+		break;
+	case APIC_DELMODE_INIT:
 		if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT)
-			return (0);
-
-		if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) {
-			vlapic2 = vm_lapic(vlapic->vm, dest);
-
-			/* move from INIT to waiting-for-SIPI state */
-			if (vlapic2->boot_state == BS_INIT) {
-				vlapic2->boot_state = BS_SIPI;
-			}
+			break;
 
-			return (0);
+		CPU_FOREACH_ISSET(i, &dmask) {
+			/*
+			 * Userland which doesn't support the IPI exit requires
+			 * that the boot state is set to SIPI here.
+			 */
+			vlapic2 = vm_lapic(vlapic->vm, i);
+			vlapic2->boot_state = BS_SIPI;
+			CPU_SET(i, &ipimask);
 		}
-	}
-
-	if (mode == APIC_DELMODE_STARTUP) {
-		if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) {
-			vlapic2 = vm_lapic(vlapic->vm, dest);
 
+		break;
+	case APIC_DELMODE_STARTUP:
+		CPU_FOREACH_ISSET(i, &dmask) {
+			vlapic2 = vm_lapic(vlapic->vm, i);
 			/*
 			 * Ignore SIPIs in any state other than wait-for-SIPI
 			 */
 			if (vlapic2->boot_state != BS_SIPI)
-				return (0);
-
+				continue;
 			vlapic2->boot_state = BS_RUNNING;
+			CPU_SET(i, &ipimask);
+		}
+
+		break;
+	default:
+		return (1);
+	}
 
-			*retu = true;
-			vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
-			vmexit->exitcode = VM_EXITCODE_SPINUP_AP;
-			vmexit->u.spinup_ap.vcpu = dest;
-			vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT;
+	if (!CPU_EMPTY(&ipimask)) {
+		vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
+		vmexit->exitcode = VM_EXITCODE_IPI;
+		vmexit->u.ipi.mode = mode;
+		vmexit->u.ipi.vector = vec;
+		vmexit->u.ipi.dmask = dmask;
 
-			return (0);
+		*retu = true;
+
+		/*
+		 * Old bhyve versions don't support the IPI exit. Translate it
+		 * into the old style.
+		 */
+		if (!vlapic->ipi_exit) {
+			if (mode == APIC_DELMODE_STARTUP) {
+				vmexit->exitcode = VM_EXITCODE_SPINUP_AP;
+				vmexit->u.spinup_ap.vcpu = CPU_FFS(&ipimask) - 1;
+				vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT;
+			} else {
+				*retu = false;
+			}
 		}
 	}
 
-	/*
-	 * This will cause a return to userland.
-	 */
-	return (1);
+	return (0);
+}
+
+static void
+vlapic_handle_init(struct vm *vm, int vcpuid, void *arg)
+{
+	struct vlapic *vlapic = vm_lapic(vm, vcpuid);
+
+	vlapic_reset(vlapic);
+
+	/* vlapic_reset modifies the boot state. */
+	vlapic->boot_state = BS_SIPI;
+}
+
+int
+vm_handle_ipi(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu)
+{
+	*retu = true;
+	switch (vme->u.ipi.mode) {
+	case APIC_DELMODE_INIT:
+		vm_smp_rendezvous(vm, vcpuid, vme->u.ipi.dmask,
+		    vlapic_handle_init, NULL);
+		break;
+	case APIC_DELMODE_STARTUP:
+		break;
+	default:
+		return (1);
+	}
+
+	return (0);
 }
 
 void
@@ -1467,6 +1517,8 @@ vlapic_init(struct vlapic *vlapic)
 	if (vlapic->vcpuid == 0)
 		vlapic->msr_apicbase |= APICBASE_BSP;
 
+	vlapic->ipi_exit = false;
+
 	vlapic_reset(vlapic);
 }
 
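To make the reworked ICR decode above concrete, here is the classic
INIT-SIPI sequence a guest BSP writes to wake AP 1, annotated with what the
new code does (register values composed from the x86/apicreg.h constants;
the target APIC ID sits in ICR_HI for xAPIC mode):

    /*
     * ICR_HI = 0x01000000  -> physical destination, APIC ID 1
     * ICR_LO = 0x00004500  -> APIC_LEVEL_ASSERT | APIC_DELMODE_INIT
     *   => dmask = {1}; vCPU 1 moves to BS_SIPI, and with ipi_exit set
     *      the sender exits to userland with VM_EXITCODE_IPI.
     * ICR_LO = 0x00004609  -> APIC_LEVEL_ASSERT | APIC_DELMODE_STARTUP,
     *                         vector 0x09
     *   => userland calls spinup_ap(ctx, 1, 0x09 << PAGE_SHIFT), i.e.
     *      the AP starts executing at guest-physical address 0x9000.
     */
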
diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h
index b87657c8bb51..87f3d0c2660f 100644
--- a/sys/amd64/vmm/io/vlapic.h
+++ b/sys/amd64/vmm/io/vlapic.h
@@ -115,4 +115,6 @@ void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val);
 int vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta);
 #endif
 
+int vm_handle_ipi(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu);
+
 #endif	/* _VLAPIC_H_ */
diff --git a/sys/amd64/vmm/io/vlapic_priv.h b/sys/amd64/vmm/io/vlapic_priv.h
index fe7965cb65d7..4b3e9009e68c 100644
--- a/sys/amd64/vmm/io/vlapic_priv.h
+++ b/sys/amd64/vmm/io/vlapic_priv.h
@@ -183,6 +183,8 @@ struct vlapic {
 	 */
 	uint32_t	svr_last;
 	uint32_t	lvt_last[VLAPIC_MAXLVT_INDEX + 1];
+
+	bool		ipi_exit;
 };
 
 void vlapic_init(struct vlapic *vlapic);
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 4de924287336..06ec385ba25d 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -1821,6 +1821,15 @@ restart:
 		}
 	}
 
+	/*
+	 * VM_EXITCODE_INST_EMUL could access the apic which could transform the
+	 * exit code into VM_EXITCODE_IPI.
+	 */
+	if (error == 0 && vme->exitcode == VM_EXITCODE_IPI) {
+		retu = false;
+		error = vm_handle_ipi(vm, vcpuid, vme, &retu);
+	}
+
 	if (error == 0 && retu == false)
 		goto restart;
 
diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c
index c1e0904d13ed..c14c0c60837f 100644
--- a/usr.sbin/bhyve/bhyverun.c
+++ b/usr.sbin/bhyve/bhyverun.c
@@ -46,6 +46,7 @@ __FBSDID("$FreeBSD$");
 #endif
 
 #include <amd64/vmm/intel/vmcs.h>
+#include <x86/apicreg.h>
 
 #include <machine/atomic.h>
 #include <machine/segments.h>
@@ -936,6 +937,35 @@ vmexit_breakpoint(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
 	return (VMEXIT_CONTINUE);
 }
 
+static int
+vmexit_ipi(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+	int error = -1;
+	int i;
+	switch (vmexit->u.ipi.mode) {
+	case APIC_DELMODE_INIT:
+		CPU_FOREACH_ISSET (i, &vmexit->u.ipi.dmask) {
+			error = vm_suspend_cpu(ctx, i);
+			if (error) {
+				warnx("%s: failed to suspend cpu %d\n",
+				    __func__, i);
+				break;
+			}
+		}
+		break;
+	case APIC_DELMODE_STARTUP:
+		CPU_FOREACH_ISSET (i, &vmexit->u.ipi.dmask) {
+			spinup_ap(ctx, i, vmexit->u.ipi.vector << PAGE_SHIFT);
+		}
+		error = 0;
+		break;
+	default:
+		break;
+	}
+
+	return (error);
+}
+
 static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
 	[VM_EXITCODE_INOUT]  = vmexit_inout,
 	[VM_EXITCODE_INOUT_STR]  = vmexit_inout,
@@ -952,6 +982,7 @@ static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
 	[VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
 	[VM_EXITCODE_DEBUG] = vmexit_debug,
 	[VM_EXITCODE_BPT] = vmexit_breakpoint,
+	[VM_EXITCODE_IPI] = vmexit_ipi,
 };
 
 static void
@@ -1140,6 +1171,9 @@ spinup_vcpu(struct vmctx *ctx, int vcpu, bool suspend)
 	error = vm_set_capability(ctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
 	assert(error == 0);
 
+	error = vm_set_capability(ctx, vcpu, VM_CAP_IPI_EXIT, 1);
+	assert(error == 0);
+
 	fbsdrun_addcpu(ctx, vcpu, rip, suspend);
 }
 
diff --git a/usr.sbin/bhyve/spinup_ap.c b/usr.sbin/bhyve/spinup_ap.c
index 2b7e602f8003..438091e564e7 100644
--- a/usr.sbin/bhyve/spinup_ap.c
+++ b/usr.sbin/bhyve/spinup_ap.c
@@ -98,6 +98,9 @@ spinup_ap(struct vmctx *ctx, int newcpu, uint64_t rip)
 	error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
 	assert(error == 0);
 
+	error = vm_set_capability(ctx, newcpu, VM_CAP_IPI_EXIT, 1);
+	assert(error == 0);
+
 	spinup_ap_realmode(ctx, newcpu, &rip);
 
 	vm_resume_cpu(ctx, newcpu);
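
As a closing compatibility note: a frontend that must also run on kernels
predating this commit can probe for the capability before enabling it, since
the vlapic keeps producing the legacy VM_EXITCODE_SPINUP_AP exit while
ipi_exit stays false. A sketch (not from the commit; assumes <assert.h> and
the declarations used above):

    int error, unused;

    /*
     * Kernels without VM_CAP_IPI_EXIT reject the query with ENOENT
     * (the default case in svm_getcap()/vmx_getcap()); in that case
     * the old VM_EXITCODE_SPINUP_AP behaviour still applies.
     */
    if (vm_get_capability(ctx, vcpu, VM_CAP_IPI_EXIT, &unused) == 0) {
    	error = vm_set_capability(ctx, vcpu, VM_CAP_IPI_EXIT, 1);
    	assert(error == 0);
    }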