git: 0bda8d3e9f7a - main - vmm: permit some IPIs to be handled by userspace
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Fri, 14 Oct 2022 10:03:49 UTC
The branch main has been updated by manu: URL: https://cgit.FreeBSD.org/src/commit/?id=0bda8d3e9f7a5c04881219723436616b23041e5f commit 0bda8d3e9f7a5c04881219723436616b23041e5f Author: Corvin Köhne <CorvinK@beckhoff.com> AuthorDate: 2022-09-07 07:07:03 +0000 Commit: Emmanuel Vadot <manu@FreeBSD.org> CommitDate: 2022-10-14 10:03:05 +0000 vmm: permit some IPIs to be handled by userspace Add VM_EXITCODE_IPI to permit returning unhandled IPIs to userland. INIT and STARTUP IPIs are now returned to userland. Due to backward compatibility reasons, a new capability is added for enabling VM_EXITCODE_IPI. Reviewed by: jhb Differential Revision: https://reviews.freebsd.org/D35623 Sponsored by: Beckhoff Automation GmbH & Co. KG --- sys/amd64/include/vmm.h | 8 ++ sys/amd64/vmm/amd/svm.c | 10 +++ sys/amd64/vmm/intel/vmx.c | 8 ++ sys/amd64/vmm/io/vlapic.c | 192 ++++++++++++++++++++++++++--------------- sys/amd64/vmm/io/vlapic.h | 2 + sys/amd64/vmm/io/vlapic_priv.h | 2 + sys/amd64/vmm/vmm.c | 9 ++ usr.sbin/bhyve/bhyverun.c | 34 ++++++++ usr.sbin/bhyve/spinup_ap.c | 3 + 9 files changed, 198 insertions(+), 70 deletions(-) diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h index dcf862c34264..37a74f053fb3 100644 --- a/sys/amd64/include/vmm.h +++ b/sys/amd64/include/vmm.h @@ -31,6 +31,7 @@ #ifndef _VMM_H_ #define _VMM_H_ +#include <sys/cpuset.h> #include <sys/sdt.h> #include <x86/segments.h> @@ -483,6 +484,7 @@ enum vm_cap_type { VM_CAP_BPT_EXIT, VM_CAP_RDPID, VM_CAP_RDTSCP, + VM_CAP_IPI_EXIT, VM_CAP_MAX }; @@ -630,6 +632,7 @@ enum vm_exitcode { VM_EXITCODE_DEBUG, VM_EXITCODE_VMINSN, VM_EXITCODE_BPT, + VM_EXITCODE_IPI, VM_EXITCODE_MAX }; @@ -737,6 +740,11 @@ struct vm_exit { struct { enum vm_suspend_how how; } suspended; + struct { + uint32_t mode; + uint8_t vector; + cpuset_t dmask; + } ipi; struct vm_task_switch task_switch; } u; }; diff --git a/sys/amd64/vmm/amd/svm.c b/sys/amd64/vmm/amd/svm.c index 35e8d9833d0e..4195cc5bd049 100644 --- a/sys/amd64/vmm/amd/svm.c +++ b/sys/amd64/vmm/amd/svm.c @@ -2315,6 +2315,7 @@ static int svm_setcap(void *arg, int vcpu, int type, int val) { struct svm_softc *sc; + struct vlapic *vlapic; int error; sc = arg; @@ -2333,6 +2334,10 @@ svm_setcap(void *arg, int vcpu, int type, int val) if (val == 0) error = EINVAL; break; + case VM_CAP_IPI_EXIT: + vlapic = vm_lapic(sc->vm, vcpu); + vlapic->ipi_exit = val; + break; default: error = ENOENT; break; @@ -2344,6 +2349,7 @@ static int svm_getcap(void *arg, int vcpu, int type, int *retval) { struct svm_softc *sc; + struct vlapic *vlapic; int error; sc = arg; @@ -2361,6 +2367,10 @@ svm_getcap(void *arg, int vcpu, int type, int *retval) case VM_CAP_UNRESTRICTED_GUEST: *retval = 1; /* unrestricted guest is always enabled */ break; + case VM_CAP_IPI_EXIT: + vlapic = vm_lapic(sc->vm, vcpu); + *retval = vlapic->ipi_exit; + break; default: error = ENOENT; break; diff --git a/sys/amd64/vmm/intel/vmx.c b/sys/amd64/vmm/intel/vmx.c index 64544a6e7955..857028dcd0f1 100644 --- a/sys/amd64/vmm/intel/vmx.c +++ b/sys/amd64/vmm/intel/vmx.c @@ -3504,6 +3504,7 @@ vmx_getcap(void *arg, int vcpu, int type, int *retval) ret = 0; break; case VM_CAP_BPT_EXIT: + case VM_CAP_IPI_EXIT: ret = 0; break; default: @@ -3521,6 +3522,7 @@ vmx_setcap(void *arg, int vcpu, int type, int val) { struct vmx *vmx = arg; struct vmcs *vmcs = &vmx->vmcs[vcpu]; + struct vlapic *vlapic; uint32_t baseval; uint32_t *pptr; int error; @@ -3599,6 +3601,12 @@ vmx_setcap(void *arg, int vcpu, int type, int val) reg = VMCS_EXCEPTION_BITMAP; } break; + case VM_CAP_IPI_EXIT: + retval = 0; + + vlapic = vm_lapic(vmx->vm, vcpu); + vlapic->ipi_exit = val; + break; default: break; } diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c index 9599b4b4e62c..8283c3cb422c 100644 --- a/sys/amd64/vmm/io/vlapic.c +++ b/sys/amd64/vmm/io/vlapic.c @@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$"); static void vlapic_set_error(struct vlapic *, uint32_t, bool); static void vlapic_callout_handler(void *arg); +static void vlapic_reset(struct vlapic *vlapic); static __inline uint32_t vlapic_get_id(struct vlapic *vlapic) @@ -957,13 +958,12 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) { int i; bool phys; - cpuset_t dmask; + cpuset_t dmask, ipimask; uint64_t icrval; - uint32_t dest, vec, mode; + uint32_t dest, vec, mode, shorthand; struct vlapic *vlapic2; struct vm_exit *vmexit; struct LAPIC *lapic; - uint16_t maxcpus; lapic = vlapic->apic_page; lapic->icr_lo &= ~APIC_DELSTAT_PEND; @@ -975,97 +975,147 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) dest = icrval >> (32 + 24); vec = icrval & APIC_VECTOR_MASK; mode = icrval & APIC_DELMODE_MASK; + phys = (icrval & APIC_DESTMODE_LOG) == 0; + shorthand = icrval & APIC_DEST_MASK; - if (mode == APIC_DELMODE_FIXED && vec < 16) { - vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, false); - VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec); - return (0); + VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec); + + switch (shorthand) { + case APIC_DEST_DESTFLD: + vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false, x2apic(vlapic)); + break; + case APIC_DEST_SELF: + CPU_SETOF(vlapic->vcpuid, &dmask); + break; + case APIC_DEST_ALLISELF: + dmask = vm_active_cpus(vlapic->vm); + break; + case APIC_DEST_ALLESELF: + dmask = vm_active_cpus(vlapic->vm); + CPU_CLR(vlapic->vcpuid, &dmask); + break; + default: + __assert_unreachable(); } - VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec); + /* + * ipimask is a set of vCPUs needing userland handling of the current + * IPI. + */ + CPU_ZERO(&ipimask); - if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) { - switch (icrval & APIC_DEST_MASK) { - case APIC_DEST_DESTFLD: - phys = ((icrval & APIC_DESTMODE_LOG) == 0); - vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false, - x2apic(vlapic)); - break; - case APIC_DEST_SELF: - CPU_SETOF(vlapic->vcpuid, &dmask); - break; - case APIC_DEST_ALLISELF: - dmask = vm_active_cpus(vlapic->vm); - break; - case APIC_DEST_ALLESELF: - dmask = vm_active_cpus(vlapic->vm); - CPU_CLR(vlapic->vcpuid, &dmask); - break; - default: - CPU_ZERO(&dmask); /* satisfy gcc */ - break; + switch (mode) { + case APIC_DELMODE_FIXED: + if (vec < 16) { + vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, + false); + VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec); + return (0); } CPU_FOREACH_ISSET(i, &dmask) { - if (mode == APIC_DELMODE_FIXED) { - lapic_intr_edge(vlapic->vm, i, vec); - vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, - IPIS_SENT, i, 1); - VLAPIC_CTR2(vlapic, "vlapic sending ipi %d " - "to vcpuid %d", vec, i); - } else { - vm_inject_nmi(vlapic->vm, i); - VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi " - "to vcpuid %d", i); - } + lapic_intr_edge(vlapic->vm, i, vec); + vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, + IPIS_SENT, i, 1); + VLAPIC_CTR2(vlapic, + "vlapic sending ipi %d to vcpuid %d", vec, i); } - return (0); /* handled completely in the kernel */ - } + break; + case APIC_DELMODE_NMI: + CPU_FOREACH_ISSET(i, &dmask) { + vm_inject_nmi(vlapic->vm, i); + VLAPIC_CTR1(vlapic, + "vlapic sending ipi nmi to vcpuid %d", i); + } - maxcpus = vm_get_maxcpus(vlapic->vm); - if (mode == APIC_DELMODE_INIT) { + break; + case APIC_DELMODE_INIT: if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) - return (0); - - if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) { - vlapic2 = vm_lapic(vlapic->vm, dest); - - /* move from INIT to waiting-for-SIPI state */ - if (vlapic2->boot_state == BS_INIT) { - vlapic2->boot_state = BS_SIPI; - } + break; - return (0); + CPU_FOREACH_ISSET(i, &dmask) { + /* + * Userland which doesn't support the IPI exit requires + * that the boot state is set to SIPI here. + */ + vlapic2 = vm_lapic(vlapic->vm, i); + vlapic2->boot_state = BS_SIPI; + CPU_SET(i, &ipimask); } - } - - if (mode == APIC_DELMODE_STARTUP) { - if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) { - vlapic2 = vm_lapic(vlapic->vm, dest); + break; + case APIC_DELMODE_STARTUP: + CPU_FOREACH_ISSET(i, &dmask) { + vlapic2 = vm_lapic(vlapic->vm, i); /* * Ignore SIPIs in any state other than wait-for-SIPI */ if (vlapic2->boot_state != BS_SIPI) - return (0); - + continue; vlapic2->boot_state = BS_RUNNING; + CPU_SET(i, &ipimask); + } + + break; + default: + return (1); + } - *retu = true; - vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); - vmexit->exitcode = VM_EXITCODE_SPINUP_AP; - vmexit->u.spinup_ap.vcpu = dest; - vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT; + if (!CPU_EMPTY(&ipimask)) { + vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); + vmexit->exitcode = VM_EXITCODE_IPI; + vmexit->u.ipi.mode = mode; + vmexit->u.ipi.vector = vec; + vmexit->u.ipi.dmask = dmask; - return (0); + *retu = true; + + /* + * Old bhyve versions don't support the IPI exit. Translate it + * into the old style. + */ + if (!vlapic->ipi_exit) { + if (mode == APIC_DELMODE_STARTUP) { + vmexit->exitcode = VM_EXITCODE_SPINUP_AP; + vmexit->u.spinup_ap.vcpu = CPU_FFS(&ipimask) - 1; + vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT; + } else { + *retu = false; + } } } - /* - * This will cause a return to userland. - */ - return (1); + return (0); +} + +static void +vlapic_handle_init(struct vm *vm, int vcpuid, void *arg) +{ + struct vlapic *vlapic = vm_lapic(vm, vcpuid); + + vlapic_reset(vlapic); + + /* vlapic_reset modifies the boot state. */ + vlapic->boot_state = BS_SIPI; +} + +int +vm_handle_ipi(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu) +{ + *retu = true; + switch (vme->u.ipi.mode) { + case APIC_DELMODE_INIT: + vm_smp_rendezvous(vm, vcpuid, vme->u.ipi.dmask, + vlapic_handle_init, NULL); + break; + case APIC_DELMODE_STARTUP: + break; + default: + return (1); + } + + return (0); } void @@ -1467,6 +1517,8 @@ vlapic_init(struct vlapic *vlapic) if (vlapic->vcpuid == 0) vlapic->msr_apicbase |= APICBASE_BSP; + vlapic->ipi_exit = false; + vlapic_reset(vlapic); } diff --git a/sys/amd64/vmm/io/vlapic.h b/sys/amd64/vmm/io/vlapic.h index b87657c8bb51..87f3d0c2660f 100644 --- a/sys/amd64/vmm/io/vlapic.h +++ b/sys/amd64/vmm/io/vlapic.h @@ -115,4 +115,6 @@ void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val); int vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta); #endif +int vm_handle_ipi(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu); + #endif /* _VLAPIC_H_ */ diff --git a/sys/amd64/vmm/io/vlapic_priv.h b/sys/amd64/vmm/io/vlapic_priv.h index fe7965cb65d7..4b3e9009e68c 100644 --- a/sys/amd64/vmm/io/vlapic_priv.h +++ b/sys/amd64/vmm/io/vlapic_priv.h @@ -183,6 +183,8 @@ struct vlapic { */ uint32_t svr_last; uint32_t lvt_last[VLAPIC_MAXLVT_INDEX + 1]; + + bool ipi_exit; }; void vlapic_init(struct vlapic *vlapic); diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c index c504d4f26b3a..8daf2ae29737 100644 --- a/sys/amd64/vmm/vmm.c +++ b/sys/amd64/vmm/vmm.c @@ -1818,6 +1818,15 @@ restart: } } + /* + * VM_EXITCODE_INST_EMUL could access the apic which could transform the + * exit code into VM_EXITCODE_IPI. + */ + if (error == 0 && vme->exitcode == VM_EXITCODE_IPI) { + retu = false; + error = vm_handle_ipi(vm, vcpuid, vme, &retu); + } + if (error == 0 && retu == false) goto restart; diff --git a/usr.sbin/bhyve/bhyverun.c b/usr.sbin/bhyve/bhyverun.c index fb6e3b8a13df..0a7e8e252918 100644 --- a/usr.sbin/bhyve/bhyverun.c +++ b/usr.sbin/bhyve/bhyverun.c @@ -46,6 +46,7 @@ __FBSDID("$FreeBSD$"); #endif #include <amd64/vmm/intel/vmcs.h> +#include <x86/apicreg.h> #include <machine/atomic.h> #include <machine/segments.h> @@ -939,6 +940,35 @@ vmexit_breakpoint(struct vmctx *ctx __unused, struct vm_exit *vme, int *pvcpu) return (VMEXIT_CONTINUE); } +static int +vmexit_ipi(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + int error = -1; + int i; + switch (vmexit->u.ipi.mode) { + case APIC_DELMODE_INIT: + CPU_FOREACH_ISSET (i, &vmexit->u.ipi.dmask) { + error = vm_suspend_cpu(ctx, i); + if (error) { + warnx("%s: failed to suspend cpu %d\n", + __func__, i); + break; + } + } + break; + case APIC_DELMODE_STARTUP: + CPU_FOREACH_ISSET (i, &vmexit->u.ipi.dmask) { + spinup_ap(ctx, i, vmexit->u.ipi.vector << PAGE_SHIFT); + } + error = 0; + break; + default: + break; + } + + return (error); +} + static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, [VM_EXITCODE_INOUT_STR] = vmexit_inout, @@ -955,6 +985,7 @@ static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, [VM_EXITCODE_DEBUG] = vmexit_debug, [VM_EXITCODE_BPT] = vmexit_breakpoint, + [VM_EXITCODE_IPI] = vmexit_ipi, }; static void @@ -1155,6 +1186,9 @@ spinup_vcpu(struct vmctx *ctx, int vcpu, bool suspend) error = vm_set_capability(ctx, vcpu, VM_CAP_UNRESTRICTED_GUEST, 1); assert(error == 0); + error = vm_set_capability(ctx, vcpu, VM_CAP_IPI_EXIT, 1); + assert(error == 0); + fbsdrun_addcpu(ctx, vcpu, rip, suspend); } diff --git a/usr.sbin/bhyve/spinup_ap.c b/usr.sbin/bhyve/spinup_ap.c index 2b7e602f8003..438091e564e7 100644 --- a/usr.sbin/bhyve/spinup_ap.c +++ b/usr.sbin/bhyve/spinup_ap.c @@ -98,6 +98,9 @@ spinup_ap(struct vmctx *ctx, int newcpu, uint64_t rip) error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1); assert(error == 0); + error = vm_set_capability(ctx, newcpu, VM_CAP_IPI_EXIT, 1); + assert(error == 0); + spinup_ap_realmode(ctx, newcpu, &rip); vm_resume_cpu(ctx, newcpu);