git: 4b34c91973dd - stable/12 - pvclock: Add vDSO support

From: Konstantin Belousov <kib_at_FreeBSD.org>
Date: Tue, 12 Oct 2021 16:01:46 UTC
The branch stable/12 has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=4b34c91973ddaf594bbb192f84e421598a1b39be

commit 4b34c91973ddaf594bbb192f84e421598a1b39be
Author:     Adam Fenn <adam@fenn.io>
AuthorDate: 2021-08-07 20:10:04 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2021-10-12 16:01:26 +0000

    pvclock: Add vDSO support
    
    Add vDSO support for timekeeping devices that support the KVM/XEN
    paravirtual clock API.
    
    Also, expose, in the userspace-accessible '<machine/pvclock.h>',
    definitions that will be needed by 'libc' to support
    'VDSO_TH_ALGO_X86_PVCLK'.
    
    Sponsored by:   Juniper Networks, Inc.
    Sponsored by:   Klara, Inc.
    Reviewed by:    kib
    Differential Revision:  https://reviews.freebsd.org/D31418
    
    (cherry picked from commit d4b2d3035a23d5dc468d41151487a8299bf45cdc)
---
 sys/dev/acpica/acpi_hpet.c                  |   4 +
 sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c |   2 +
 sys/x86/include/pvclock.h                   |  62 +++++++++++
 sys/x86/include/vdso.h                      |   5 +-
 sys/x86/x86/pvclock.c                       | 165 +++++++++++++++++++---------
 sys/x86/x86/tsc.c                           |   4 +
 6 files changed, 188 insertions(+), 54 deletions(-)

diff --git a/sys/dev/acpica/acpi_hpet.c b/sys/dev/acpica/acpi_hpet.c
index fcb5632d896a..897867a59b35 100644
--- a/sys/dev/acpica/acpi_hpet.c
+++ b/sys/dev/acpica/acpi_hpet.c
@@ -156,6 +156,8 @@ hpet_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc)
 	vdso_th->th_algo = VDSO_TH_ALGO_X86_HPET;
 	vdso_th->th_x86_shift = 0;
 	vdso_th->th_x86_hpet_idx = device_get_unit(sc->dev);
+	vdso_th->th_x86_pvc_last_systime = 0;
+	vdso_th->th_x86_pvc_stable_mask = 0;
 	bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
 	return (sc->mmap_allow != 0);
 }
@@ -171,6 +173,8 @@ hpet_vdso_timehands32(struct vdso_timehands32 *vdso_th32,
 	vdso_th32->th_algo = VDSO_TH_ALGO_X86_HPET;
 	vdso_th32->th_x86_shift = 0;
 	vdso_th32->th_x86_hpet_idx = device_get_unit(sc->dev);
+	vdso_th32->th_x86_pvc_last_systime = 0;
+	vdso_th32->th_x86_pvc_stable_mask = 0;
 	bzero(vdso_th32->th_res, sizeof(vdso_th32->th_res));
 	return (sc->mmap_allow != 0);
 }
diff --git a/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c b/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c
index c08098138805..11d549dc18d2 100644
--- a/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c
+++ b/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c
@@ -128,6 +128,8 @@ hyperv_tsc_vdso_timehands(struct vdso_timehands *vdso_th,
 	vdso_th->th_algo = VDSO_TH_ALGO_X86_HVTSC;
 	vdso_th->th_x86_shift = 0;
 	vdso_th->th_x86_hpet_idx = 0;
+	vdso_th->th_x86_pvc_last_systime = 0;
+	vdso_th->th_x86_pvc_stable_mask = 0;
 	bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
 	return (1);
 }
diff --git a/sys/x86/include/pvclock.h b/sys/x86/include/pvclock.h
index 399017039dd0..023acdb80d9c 100644
--- a/sys/x86/include/pvclock.h
+++ b/sys/x86/include/pvclock.h
@@ -30,7 +30,12 @@
 #define X86_PVCLOCK
 
 #include <sys/types.h>
+
+#ifdef _KERNEL
 #include <sys/timetc.h>
+#endif /* _KERNEL */
+
+#define	PVCLOCK_CDEVNAME		"pvclock"
 
 struct pvclock_vcpu_time_info {
 	uint32_t	version;
@@ -46,6 +51,59 @@ struct pvclock_vcpu_time_info {
 #define PVCLOCK_FLAG_TSC_STABLE		0x01
 #define PVCLOCK_FLAG_GUEST_PASUED	0x02
 
+/*
+ * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline uint64_t
+pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
+{
+	uint64_t product;
+
+	if (shift < 0)
+		delta >>= -shift;
+	else
+		delta <<= shift;
+#if defined(__i386__)
+	{
+		uint32_t tmp1, tmp2;
+
+		/**
+		 * For i386, the formula looks like:
+		 *
+		 *   lower = (mul_frac * (delta & UINT_MAX)) >> 32
+		 *   upper = mul_frac * (delta >> 32)
+		 *   product = lower + upper
+		 */
+		__asm__ (
+			"mul  %5       ; "
+			"mov  %4,%%eax ; "
+			"mov  %%edx,%4 ; "
+			"mul  %5       ; "
+			"xor  %5,%5    ; "
+			"add  %4,%%eax ; "
+			"adc  %5,%%edx ; "
+			: "=A" (product), "=r" (tmp1), "=r" (tmp2)
+			: "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)),
+			  "2" (mul_frac) );
+	}
+#elif defined(__amd64__)
+	{
+		unsigned long tmp;
+
+		__asm__ (
+			"mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
+			: [lo]"=a" (product), [hi]"=d" (tmp)
+			: "0" (delta), [mul_frac]"rm"((uint64_t)mul_frac));
+	}
+#else
+#error "pvclock: unsupported x86 architecture?"
+#endif
+	return (product);
+}
+
+#ifdef _KERNEL
+
 typedef struct pvclock_wall_clock *pvclock_get_wallclock_t(void *arg);
 
 struct pvclock_wall_clock {
@@ -62,7 +120,9 @@ struct pvclock {
 	bool				 stable_flag_supported;
 
 	/* Private; initialized by the 'pvclock' API: */
+	bool				 vdso_force_unstable;
 	struct timecounter		 tc;
+	struct cdev			*cdev;
 };
 
 /*
@@ -81,4 +141,6 @@ void		pvclock_init(struct pvclock *pvc, device_t dev,
 void		pvclock_gettime(struct pvclock *pvc, struct timespec *ts);
 int		pvclock_destroy(struct pvclock *pvc);
 
+#endif /* _KERNEL */
+
 #endif
diff --git a/sys/x86/include/vdso.h b/sys/x86/include/vdso.h
index 97972c660dde..ace63cbe9f62 100644
--- a/sys/x86/include/vdso.h
+++ b/sys/x86/include/vdso.h
@@ -37,11 +37,14 @@
 #define	VDSO_TIMEHANDS_MD			\
 	uint32_t	th_x86_shift;		\
 	uint32_t	th_x86_hpet_idx;	\
-	uint32_t	th_res[6];
+	uint64_t	th_x86_pvc_last_systime;\
+	uint8_t		th_x86_pvc_stable_mask;	\
+	uint8_t		th_res[15];
 
 #define	VDSO_TH_ALGO_X86_TSC	VDSO_TH_ALGO_1
 #define	VDSO_TH_ALGO_X86_HPET	VDSO_TH_ALGO_2
 #define	VDSO_TH_ALGO_X86_HVTSC	VDSO_TH_ALGO_3	/* Hyper-V ref. TSC */
+#define	VDSO_TH_ALGO_X86_PVCLK	VDSO_TH_ALGO_4	/* KVM/XEN paravirtual clock */
 
 #ifdef _KERNEL
 #ifdef COMPAT_FREEBSD32
diff --git a/sys/x86/x86/pvclock.c b/sys/x86/x86/pvclock.c
index e0ad65d906b8..e13366bd4bbd 100644
--- a/sys/x86/x86/pvclock.c
+++ b/sys/x86/x86/pvclock.c
@@ -31,11 +31,22 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/bus.h>
 #include <sys/clock.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
 #include <sys/limits.h>
+#include <sys/mman.h>
 #include <sys/proc.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/vdso.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
 
 #include <machine/atomic.h>
+#include <machine/cpufunc.h>
 #include <machine/md_var.h>
 #include <machine/pvclock.h>
 
@@ -54,6 +65,22 @@ static void		 pvclock_read_time_info(
 static void		 pvclock_read_wall_clock(struct pvclock_wall_clock *wc,
     struct timespec *ts);
 static u_int		 pvclock_tc_get_timecount(struct timecounter *tc);
+static uint32_t		 pvclock_tc_vdso_timehands(
+    struct vdso_timehands *vdso_th, struct timecounter *tc);
+#ifdef COMPAT_FREEBSD32
+static uint32_t		 pvclock_tc_vdso_timehands32(
+    struct vdso_timehands32 *vdso_th, struct timecounter *tc);
+#endif
+
+static d_open_t		 pvclock_cdev_open;
+static d_mmap_t		 pvclock_cdev_mmap;
+
+static struct cdevsw	 pvclock_cdev_cdevsw = {
+	.d_version =	D_VERSION,
+	.d_name =	PVCLOCK_CDEVNAME,
+	.d_open =	pvclock_cdev_open,
+	.d_mmap =	pvclock_cdev_mmap,
+};
 
 void
 pvclock_resume(void)
@@ -74,57 +101,6 @@ pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti)
 	return (freq);
 }
 
-/*
- * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
- * yielding a 64-bit result.
- */
-static inline uint64_t
-pvclock_scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
-{
-	uint64_t product;
-
-	if (shift < 0)
-		delta >>= -shift;
-	else
-		delta <<= shift;
-#if defined(__i386__)
-	{
-		uint32_t tmp1, tmp2;
-
-		/**
-		 * For i386, the formula looks like:
-		 *
-		 *   lower = (mul_frac * (delta & UINT_MAX)) >> 32
-		 *   upper = mul_frac * (delta >> 32)
-		 *   product = lower + upper
-		 */
-		__asm__ (
-			"mul  %5       ; "
-			"mov  %4,%%eax ; "
-			"mov  %%edx,%4 ; "
-			"mul  %5       ; "
-			"xor  %5,%5    ; "
-			"add  %4,%%eax ; "
-			"adc  %5,%%edx ; "
-			: "=A" (product), "=r" (tmp1), "=r" (tmp2)
-			: "a" ((uint32_t)delta), "1" ((uint32_t)(delta >> 32)),
-			  "2" (mul_frac) );
-	}
-#elif defined(__amd64__)
-	{
-		unsigned long tmp;
-
-		__asm__ (
-			"mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
-			: [lo]"=a" (product), [hi]"=d" (tmp)
-			: "0" (delta), [mul_frac]"rm"((uint64_t)mul_frac));
-	}
-#else
-#error "pvclock: unsupported x86 architecture?"
-#endif
-	return (product);
-}
-
 static void
 pvclock_read_time_info(struct pvclock_vcpu_time_info *ti,
     uint64_t *ns, uint8_t *flags)
@@ -213,6 +189,27 @@ pvclock_get_wallclock(struct pvclock_wall_clock *wc, struct timespec *ts)
 	pvclock_read_wall_clock(wc, ts);
 }
 
+static int
+pvclock_cdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+	if (oflags & FWRITE)
+		return (EPERM);
+	return (0);
+}
+
+static int
+pvclock_cdev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
+    int nprot, vm_memattr_t *memattr)
+{
+	if (offset >= mp_ncpus * sizeof(struct pvclock_vcpu_time_info))
+		return (EINVAL);
+	if (nprot != PROT_READ)
+		return (EACCES);
+	*paddr = vtophys((uintptr_t)dev->si_drv1 + offset);
+	*memattr = VM_MEMATTR_DEFAULT;
+	return (0);
+}
+
 static u_int
 pvclock_tc_get_timecount(struct timecounter *tc)
 {
@@ -221,6 +218,42 @@ pvclock_tc_get_timecount(struct timecounter *tc)
 	return (pvclock_getsystime(pvc) & UINT_MAX);
 }
 
+static uint32_t
+pvclock_tc_vdso_timehands(struct vdso_timehands *vdso_th,
+    struct timecounter *tc)
+{
+	struct pvclock *pvc = tc->tc_priv;
+
+	vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK;
+	vdso_th->th_x86_shift = 0;
+	vdso_th->th_x86_hpet_idx = 0;
+	vdso_th->th_x86_pvc_last_systime =
+	    atomic_load_acq_64(&pvclock_last_systime);
+	vdso_th->th_x86_pvc_stable_mask = !pvc->vdso_force_unstable &&
+	    pvc->stable_flag_supported ? PVCLOCK_FLAG_TSC_STABLE : 0;
+	bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
+	return (pvc->cdev != NULL && amd_feature & AMDID_RDTSCP);
+}
+
+#ifdef COMPAT_FREEBSD32
+static uint32_t
+pvclock_tc_vdso_timehands32(struct vdso_timehands32 *vdso_th,
+    struct timecounter *tc)
+{
+	struct pvclock *pvc = tc->tc_priv;
+
+	vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK;
+	vdso_th->th_x86_shift = 0;
+	vdso_th->th_x86_hpet_idx = 0;
+	vdso_th->th_x86_pvc_last_systime =
+	    atomic_load_acq_64(&pvclock_last_systime);
+	vdso_th->th_x86_pvc_stable_mask = !pvc->vdso_force_unstable &&
+	    pvc->stable_flag_supported ? PVCLOCK_FLAG_TSC_STABLE : 0;
+	bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
+	return (pvc->cdev != NULL && amd_feature & AMDID_RDTSCP);
+}
+#endif
+
 void
 pvclock_gettime(struct pvclock *pvc, struct timespec *ts)
 {
@@ -238,9 +271,19 @@ void
 pvclock_init(struct pvclock *pvc, device_t dev, const char *tc_name,
     int tc_quality, u_int tc_flags)
 {
+	struct make_dev_args mda;
+	int err;
+
 	KASSERT(((uintptr_t)pvc->timeinfos & PAGE_MASK) == 0,
 	    ("Specified time info page(s) address is not page-aligned."));
 
+	/* Set up vDSO stable-flag suppression test facility: */
+	pvc->vdso_force_unstable = false;
+	SYSCTL_ADD_BOOL(device_get_sysctl_ctx(dev),
+	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
+	    "vdso_force_unstable", CTLFLAG_RW, &pvc->vdso_force_unstable, 0,
+	    "Forcibly deassert stable flag in vDSO codepath");
+
 	/* Set up timecounter and timecounter-supporting members: */
 	pvc->tc.tc_get_timecount = pvclock_tc_get_timecount;
 	pvc->tc.tc_poll_pps = NULL;
@@ -250,11 +293,27 @@ pvclock_init(struct pvclock *pvc, device_t dev, const char *tc_name,
 	pvc->tc.tc_quality = tc_quality;
 	pvc->tc.tc_flags = tc_flags;
 	pvc->tc.tc_priv = pvc;
-	pvc->tc.tc_fill_vdso_timehands = NULL;
+	pvc->tc.tc_fill_vdso_timehands = pvclock_tc_vdso_timehands;
 #ifdef COMPAT_FREEBSD32
-	pvc->tc.tc_fill_vdso_timehands32 = NULL;
+	pvc->tc.tc_fill_vdso_timehands32 = pvclock_tc_vdso_timehands32;
 #endif
 
+	/* Set up cdev for userspace mmapping of vCPU 0 time info page: */
+	make_dev_args_init(&mda);
+	mda.mda_devsw = &pvclock_cdev_cdevsw;
+	mda.mda_uid = UID_ROOT;
+	mda.mda_gid = GID_WHEEL;
+	mda.mda_mode = 0444;
+	mda.mda_si_drv1 = pvc->timeinfos;
+	err = make_dev_s(&mda, &pvc->cdev, PVCLOCK_CDEVNAME);
+	if (err != 0) {
+		device_printf(dev, "Could not create /dev/%s, error %d. Fast "
+		    "time of day will be unavailable for this timecounter.\n",
+		    PVCLOCK_CDEVNAME, err);
+		KASSERT(pvc->cdev == NULL,
+		    ("Failed make_dev_s() unexpectedly inited cdev."));
+	}
+
 	/* Register timecounter: */
 	tc_init(&pvc->tc);
 
diff --git a/sys/x86/x86/tsc.c b/sys/x86/x86/tsc.c
index f14e0707651c..5b3934e7bb9d 100644
--- a/sys/x86/x86/tsc.c
+++ b/sys/x86/x86/tsc.c
@@ -864,6 +864,8 @@ x86_tsc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc)
 	vdso_th->th_algo = VDSO_TH_ALGO_X86_TSC;
 	vdso_th->th_x86_shift = (int)(intptr_t)tc->tc_priv;
 	vdso_th->th_x86_hpet_idx = 0xffffffff;
+	vdso_th->th_x86_pvc_last_systime = 0;
+	vdso_th->th_x86_pvc_stable_mask = 0;
 	bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
 	return (1);
 }
@@ -877,6 +879,8 @@ x86_tsc_vdso_timehands32(struct vdso_timehands32 *vdso_th32,
 	vdso_th32->th_algo = VDSO_TH_ALGO_X86_TSC;
 	vdso_th32->th_x86_shift = (int)(intptr_t)tc->tc_priv;
 	vdso_th32->th_x86_hpet_idx = 0xffffffff;
+	vdso_th32->th_x86_pvc_last_systime = 0;
+	vdso_th32->th_x86_pvc_stable_mask = 0;
 	bzero(vdso_th32->th_res, sizeof(vdso_th32->th_res));
 	return (1);
 }