git: c0f35dbf19c3 - main - vmm: Use a cpuset_t for vCPUs waiting for STARTUP IPIs.

From: John Baldwin <jhb@FreeBSD.org>
Date: Fri, 18 Nov 2022 18:26:54 UTC
The branch main has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=c0f35dbf19c3c8825bd2b321d8efd582807d1940

commit c0f35dbf19c3c8825bd2b321d8efd582807d1940
Author:     John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2022-11-18 18:05:10 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2022-11-18 18:25:38 +0000

    vmm: Use a cpuset_t for vCPUs waiting for STARTUP IPIs.
    
    Retire the boot_state member of struct vlapic and instead use a cpuset
    in the VM to track vCPUs waiting for STARTUP IPIs.  INIT IPIs add
    vCPUs to this set, and STARTUP IPIs remove vCPUs from the set.
    STARTUP IPIs are only reported to userland for vCPUs that were removed
    from the set.
    
    In particular, this permits a subsequent change to allocate vCPUs on
    demand when the vCPU may not be allocated until after a STARTUP IPI is
    reported to userland.
    
    Reviewed by:    corvink, markj
    Differential Revision:  https://reviews.freebsd.org/D37173
---
 sys/amd64/include/vmm.h        |  3 +++
 sys/amd64/vmm/io/vlapic.c      | 46 ++++++++++--------------------------------
 sys/amd64/vmm/io/vlapic_priv.h |  7 -------
 sys/amd64/vmm/vmm.c            | 27 +++++++++++++++++++++++++
 4 files changed, 41 insertions(+), 42 deletions(-)

diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index b4f3312794dd..713c4a8b46e9 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -319,9 +319,12 @@ int vm_restore_time(struct vm *vm);
 typedef void (*vm_rendezvous_func_t)(struct vcpu *vcpu, void *arg);
 int vm_smp_rendezvous(struct vcpu *vcpu, cpuset_t dest,
     vm_rendezvous_func_t func, void *arg);
+
 cpuset_t vm_active_cpus(struct vm *vm);
 cpuset_t vm_debug_cpus(struct vm *vm);
 cpuset_t vm_suspended_cpus(struct vm *vm);
+cpuset_t vm_start_cpus(struct vm *vm, const cpuset_t *tostart);
+void vm_await_start(struct vm *vm, const cpuset_t *waiting);
 #endif	/* _SYS__CPUSET_H_ */
 
 static __inline int
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
index 1a8b54bba3bf..e13cdcc63d57 100644
--- a/sys/amd64/vmm/io/vlapic.c
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -1039,7 +1039,6 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu)
 	cpuset_t dmask, ipimask;
 	uint64_t icrval;
 	uint32_t dest, vec, mode, shorthand;
-	struct vlapic *vlapic2;
 	struct vcpu *vcpu;
 	struct vm_exit *vmexit;
 	struct LAPIC *lapic;
@@ -1128,14 +1127,9 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu)
 			    i == vlapic->vcpuid)
 				break;
 
-			/*
-			 * Userland which doesn't support the IPI exit
-			 * requires that the boot state is set to SIPI
-			 * here.
-			 */
-			vcpu = vm_vcpu(vlapic->vm, i);
-			vlapic2 = vm_lapic(vcpu);
-			vlapic2->boot_state = BS_SIPI;
+			/* vCPU i is waiting for SIPI. */
+			CPU_SETOF(i, &dmask);
+			vm_await_start(vlapic->vm, &dmask);
 			break;
 		}
 
@@ -1158,11 +1152,10 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu)
 			/*
 			 * Ignore SIPIs in any state other than wait-for-SIPI
 			 */
-			vcpu = vm_vcpu(vlapic->vm, i);
-			vlapic2 = vm_lapic(vcpu);
-			if (vlapic2->boot_state != BS_SIPI)
+			CPU_SETOF(i, &dmask);
+			dmask = vm_start_cpus(vlapic->vm, &dmask);
+			if (CPU_EMPTY(&dmask))
 				break;
-			vlapic2->boot_state = BS_RUNNING;
 
 			vmexit = vm_exitinfo(vlapic->vcpu);
 			vmexit->exitcode = VM_EXITCODE_SPINUP_AP;
@@ -1173,19 +1166,10 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu)
 			break;
 		}
 
-		CPU_FOREACH_ISSET(i, &dmask) {
-			vcpu = vm_vcpu(vlapic->vm, i);
-			vlapic2 = vm_lapic(vcpu);
-
-			/*
-			 * Ignore SIPIs in any state other than wait-for-SIPI
-			 */
-			if (vlapic2->boot_state != BS_SIPI)
-				continue;
-			vlapic2->boot_state = BS_RUNNING;
-			CPU_SET(i, &ipimask);
-		}
-
+		/*
+		 * Ignore SIPIs in any state other than wait-for-SIPI
+		 */
+		ipimask = vm_start_cpus(vlapic->vm, &dmask);
 		break;
 	default:
 		return (1);
@@ -1210,9 +1194,6 @@ vlapic_handle_init(struct vcpu *vcpu, void *arg)
 	struct vlapic *vlapic = vm_lapic(vcpu);
 
 	vlapic_reset(vlapic);
-
-	/* vlapic_reset modifies the boot state. */
-	vlapic->boot_state = BS_SIPI;
 }
 
 int
@@ -1223,6 +1204,7 @@ vm_handle_ipi(struct vcpu *vcpu, struct vm_exit *vme, bool *retu)
 	case APIC_DELMODE_INIT:
 		vm_smp_rendezvous(vcpu, vme->u.ipi.dmask, vlapic_handle_init,
 		    NULL);
+		vm_await_start(vcpu_vm(vcpu), &vme->u.ipi.dmask);
 		break;
 	case APIC_DELMODE_STARTUP:
 		break;
@@ -1598,11 +1580,6 @@ vlapic_reset(struct vlapic *vlapic)
 	lapic->dcr_timer = 0;
 	vlapic_dcr_write_handler(vlapic);
 
-	if (vlapic->vcpuid == 0)
-		vlapic->boot_state = BS_RUNNING;	/* BSP */
-	else
-		vlapic->boot_state = BS_INIT;		/* AP */
-
 	vlapic->svr_last = lapic->svr;
 }
 
@@ -1900,7 +1877,6 @@ vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta)
 				      sizeof(vlapic->isrvec_stk),
 				      meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(vlapic->isrvec_stk_top, meta, ret, done);
-		SNAPSHOT_VAR_OR_LEAVE(vlapic->boot_state, meta, ret, done);
 
 		SNAPSHOT_BUF_OR_LEAVE(vlapic->lvt_last,
 				      sizeof(vlapic->lvt_last),
diff --git a/sys/amd64/vmm/io/vlapic_priv.h b/sys/amd64/vmm/io/vlapic_priv.h
index 2ac0cbf68117..ccae4b748880 100644
--- a/sys/amd64/vmm/io/vlapic_priv.h
+++ b/sys/amd64/vmm/io/vlapic_priv.h
@@ -125,12 +125,6 @@ do {									\
 	VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]);	\
 } while (0)
 
-enum boot_state {
-	BS_INIT,
-	BS_SIPI,
-	BS_RUNNING
-};
-
 /*
  * 16 priority levels with at most one vector injected per level.
  */
@@ -175,7 +169,6 @@ struct vlapic {
 	int		isrvec_stk_top;
 
 	uint64_t	msr_apicbase;
-	enum boot_state	boot_state;
 
 	/*
 	 * Copies of some registers in the virtual APIC page. We do this for
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 89e406efcdb0..ec6504772db3 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -173,6 +173,7 @@ struct vm {
 	struct vrtc	*vrtc;			/* (o) virtual RTC */
 	volatile cpuset_t active_cpus;		/* (i) active vcpus */
 	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for debug */
+	cpuset_t	startup_cpus;		/* (i) [r] waiting for startup */
 	int		suspend;		/* (i) stop VM execution */
 	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
 	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
@@ -486,6 +487,7 @@ vm_init(struct vm *vm, bool create)
 
 	CPU_ZERO(&vm->active_cpus);
 	CPU_ZERO(&vm->debug_cpus);
+	CPU_ZERO(&vm->startup_cpus);
 
 	vm->suspend = 0;
 	CPU_ZERO(&vm->suspended_cpus);
@@ -2421,6 +2423,30 @@ vm_suspended_cpus(struct vm *vm)
 	return (vm->suspended_cpus);
 }
 
+/*
+ * Returns the subset of vCPUs in tostart that are awaiting startup.
+ * These vCPUs are also marked as no longer awaiting startup.
+ */
+cpuset_t
+vm_start_cpus(struct vm *vm, const cpuset_t *tostart)
+{
+	cpuset_t set;
+
+	mtx_lock(&vm->rendezvous_mtx);
+	CPU_AND(&set, &vm->startup_cpus, tostart);
+	CPU_ANDNOT(&vm->startup_cpus, &vm->startup_cpus, &set);
+	mtx_unlock(&vm->rendezvous_mtx);
+	return (set);
+}
+
+void
+vm_await_start(struct vm *vm, const cpuset_t *waiting)
+{
+	mtx_lock(&vm->rendezvous_mtx);
+	CPU_OR(&vm->startup_cpus, &vm->startup_cpus, waiting);
+	mtx_unlock(&vm->rendezvous_mtx);
+}
+
 void *
 vcpu_stats(struct vcpu *vcpu)
 {
@@ -2769,6 +2795,7 @@ vm_snapshot_vm(struct vm *vm, struct vm_snapshot_meta *meta)
 	if (ret != 0)
 		goto done;
 
+	SNAPSHOT_VAR_OR_LEAVE(vm->startup_cpus, meta, ret, done);
 done:
 	return (ret);
 }