git: af93fea71038 - main - timerfd: Move implementation from linux compat to sys/kern

From: Warner Losh <imp_at_FreeBSD.org>
Date: Thu, 24 Aug 2023 20:29:48 UTC
The branch main has been updated by imp:

URL: https://cgit.FreeBSD.org/src/commit/?id=af93fea710385b2b11f0cabd377e7ed6f3d97c34

commit af93fea710385b2b11f0cabd377e7ed6f3d97c34
Author:     Jake Freeland <jfree@freebsd.org>
AuthorDate: 2023-08-24 04:39:54 +0000
Commit:     Warner Losh <imp@FreeBSD.org>
CommitDate: 2023-08-24 20:28:56 +0000

    timerfd: Move implementation from linux compat to sys/kern
    
    Move the timerfd implementation from linux compat code to sys/kern. Use
    it to implement the new system calls for timerfd. Add a hook to kern_tc
    to allow timerfd to know when the system time has stepped. Add kqueue
    support to timerfd. Adjust a few names to be less Linux centric.
    
    RelNotes: YES
    Reviewed by: markj (on irc), imp, kib (with reservations), jhb (slack)
    Differential Revision: https://reviews.freebsd.org/D38459
---
 lib/libc/sys/Symbol.map                        |   3 +
 sys/bsm/audit_kevents.h                        |   1 +
 sys/compat/freebsd32/freebsd32_proto.h         |  14 +
 sys/compat/freebsd32/freebsd32_syscall.h       |   5 +-
 sys/compat/freebsd32/freebsd32_syscalls.c      |   3 +
 sys/compat/freebsd32/freebsd32_sysent.c        |   3 +
 sys/compat/freebsd32/freebsd32_systrace_args.c |  86 ++++
 sys/compat/linux/linux_event.c                 | 443 ++---------------
 sys/compat/linux/linux_event.h                 |  11 -
 sys/conf/files                                 |   1 +
 sys/kern/init_sysent.c                         |   3 +
 sys/kern/kern_descrip.c                        |   4 +-
 sys/kern/kern_tc.c                             |   2 +
 sys/kern/sys_timerfd.c                         | 632 +++++++++++++++++++++++++
 sys/kern/syscalls.c                            |   3 +
 sys/kern/syscalls.master                       |  20 +
 sys/kern/systrace_args.c                       |  86 ++++
 sys/sys/file.h                                 |   2 +-
 sys/sys/syscall.h                              |   5 +-
 sys/sys/syscall.mk                             |   5 +-
 sys/sys/sysproto.h                             |  20 +
 sys/sys/timerfd.h                              |  66 +++
 sys/sys/user.h                                 |   6 +
 23 files changed, 999 insertions(+), 425 deletions(-)

diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map
index 9a07bb457eb8..7937661e3787 100644
--- a/lib/libc/sys/Symbol.map
+++ b/lib/libc/sys/Symbol.map
@@ -421,6 +421,9 @@ FBSD_1.7 {
 	kqueuex;
 	membarrier;
 	swapoff;
+	timerfd_create;
+	timerfd_gettime;
+	timerfd_settime;
 };
 
 FBSDprivate_1.0 {
diff --git a/sys/bsm/audit_kevents.h b/sys/bsm/audit_kevents.h
index a6b50a67ee6a..d06381837aad 100644
--- a/sys/bsm/audit_kevents.h
+++ b/sys/bsm/audit_kevents.h
@@ -661,6 +661,7 @@
 #define	AUE_AIO_WRITEV		43267	/* FreeBSD-specific. */
 #define	AUE_AIO_READV		43268	/* FreeBSD-specific. */
 #define	AUE_FSPACECTL		43269	/* FreeBSD-specific. */
+#define	AUE_TIMERFD		43270	/* FreeBSD/Linux. */
 
 /*
  * Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the
diff --git a/sys/compat/freebsd32/freebsd32_proto.h b/sys/compat/freebsd32/freebsd32_proto.h
index bb333e0321a0..50448b6dce16 100644
--- a/sys/compat/freebsd32/freebsd32_proto.h
+++ b/sys/compat/freebsd32/freebsd32_proto.h
@@ -684,6 +684,16 @@ struct freebsd32_aio_writev_args {
 struct freebsd32_aio_readv_args {
 	char aiocbp_l_[PADL_(struct aiocb32 *)]; struct aiocb32 * aiocbp; char aiocbp_r_[PADR_(struct aiocb32 *)];
 };
+struct freebsd32_timerfd_gettime_args {
+	char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
+	char curr_value_l_[PADL_(struct itimerspec32 *)]; struct itimerspec32 * curr_value; char curr_value_r_[PADR_(struct itimerspec32 *)];
+};
+struct freebsd32_timerfd_settime_args {
+	char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
+	char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)];
+	char new_value_l_[PADL_(const struct itimerspec32 *)]; const struct itimerspec32 * new_value; char new_value_r_[PADR_(const struct itimerspec32 *)];
+	char old_value_l_[PADL_(struct itimerspec32 *)]; struct itimerspec32 * old_value; char old_value_r_[PADR_(struct itimerspec32 *)];
+};
 int	freebsd32_wait4(struct thread *, struct freebsd32_wait4_args *);
 int	freebsd32_ptrace(struct thread *, struct freebsd32_ptrace_args *);
 int	freebsd32_recvmsg(struct thread *, struct freebsd32_recvmsg_args *);
@@ -799,6 +809,8 @@ int	freebsd32_cpuset_setdomain(struct thread *, struct freebsd32_cpuset_setdomai
 int	freebsd32___sysctlbyname(struct thread *, struct freebsd32___sysctlbyname_args *);
 int	freebsd32_aio_writev(struct thread *, struct freebsd32_aio_writev_args *);
 int	freebsd32_aio_readv(struct thread *, struct freebsd32_aio_readv_args *);
+int	freebsd32_timerfd_gettime(struct thread *, struct freebsd32_timerfd_gettime_args *);
+int	freebsd32_timerfd_settime(struct thread *, struct freebsd32_timerfd_settime_args *);
 
 #ifdef COMPAT_43
 
@@ -1292,6 +1304,8 @@ int	freebsd11_freebsd32_fstatat(struct thread *, struct freebsd11_freebsd32_fsta
 #define	FREEBSD32_SYS_AUE_freebsd32___sysctlbyname	AUE_SYSCTL
 #define	FREEBSD32_SYS_AUE_freebsd32_aio_writev	AUE_AIO_WRITEV
 #define	FREEBSD32_SYS_AUE_freebsd32_aio_readv	AUE_AIO_READV
+#define	FREEBSD32_SYS_AUE_freebsd32_timerfd_gettime	AUE_TIMERFD
+#define	FREEBSD32_SYS_AUE_freebsd32_timerfd_settime	AUE_TIMERFD
 
 #undef PAD_
 #undef PADL_
diff --git a/sys/compat/freebsd32/freebsd32_syscall.h b/sys/compat/freebsd32/freebsd32_syscall.h
index c3d8617abf4b..e3777730be1c 100644
--- a/sys/compat/freebsd32/freebsd32_syscall.h
+++ b/sys/compat/freebsd32/freebsd32_syscall.h
@@ -502,4 +502,7 @@
 #define	FREEBSD32_SYS_swapoff	582
 #define	FREEBSD32_SYS_kqueuex	583
 #define	FREEBSD32_SYS_membarrier	584
-#define	FREEBSD32_SYS_MAXSYSCALL	585
+#define	FREEBSD32_SYS_timerfd_create	585
+#define	FREEBSD32_SYS_freebsd32_timerfd_gettime	586
+#define	FREEBSD32_SYS_freebsd32_timerfd_settime	587
+#define	FREEBSD32_SYS_MAXSYSCALL	588
diff --git a/sys/compat/freebsd32/freebsd32_syscalls.c b/sys/compat/freebsd32/freebsd32_syscalls.c
index 19d454743c55..ccc910ee5ca9 100644
--- a/sys/compat/freebsd32/freebsd32_syscalls.c
+++ b/sys/compat/freebsd32/freebsd32_syscalls.c
@@ -590,4 +590,7 @@ const char *freebsd32_syscallnames[] = {
 	"swapoff",			/* 582 = swapoff */
 	"kqueuex",			/* 583 = kqueuex */
 	"membarrier",			/* 584 = membarrier */
+	"timerfd_create",			/* 585 = timerfd_create */
+	"freebsd32_timerfd_gettime",			/* 586 = freebsd32_timerfd_gettime */
+	"freebsd32_timerfd_settime",			/* 587 = freebsd32_timerfd_settime */
 };
diff --git a/sys/compat/freebsd32/freebsd32_sysent.c b/sys/compat/freebsd32/freebsd32_sysent.c
index 971f06a643c5..fec6f4a47bd6 100644
--- a/sys/compat/freebsd32/freebsd32_sysent.c
+++ b/sys/compat/freebsd32/freebsd32_sysent.c
@@ -646,4 +646,7 @@ struct sysent freebsd32_sysent[] = {
 	{ .sy_narg = AS(swapoff_args), .sy_call = (sy_call_t *)sys_swapoff, .sy_auevent = AUE_SWAPOFF, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC },	/* 582 = swapoff */
 	{ .sy_narg = AS(kqueuex_args), .sy_call = (sy_call_t *)sys_kqueuex, .sy_auevent = AUE_KQUEUE, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },	/* 583 = kqueuex */
 	{ .sy_narg = AS(membarrier_args), .sy_call = (sy_call_t *)sys_membarrier, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },	/* 584 = membarrier */
+	{ .sy_narg = AS(timerfd_create_args), .sy_call = (sy_call_t *)sys_timerfd_create, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },	/* 585 = timerfd_create */
+	{ .sy_narg = AS(freebsd32_timerfd_gettime_args), .sy_call = (sy_call_t *)freebsd32_timerfd_gettime, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },	/* 586 = freebsd32_timerfd_gettime */
+	{ .sy_narg = AS(freebsd32_timerfd_settime_args), .sy_call = (sy_call_t *)freebsd32_timerfd_settime, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },	/* 587 = freebsd32_timerfd_settime */
 };
diff --git a/sys/compat/freebsd32/freebsd32_systrace_args.c b/sys/compat/freebsd32/freebsd32_systrace_args.c
index 5dfc82c30b7b..2c26a0ddab2f 100644
--- a/sys/compat/freebsd32/freebsd32_systrace_args.c
+++ b/sys/compat/freebsd32/freebsd32_systrace_args.c
@@ -3336,6 +3336,32 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
 		*n_args = 3;
 		break;
 	}
+	/* timerfd_create */
+	case 585: {
+		struct timerfd_create_args *p = params;
+		iarg[a++] = p->clockid; /* int */
+		iarg[a++] = p->flags; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* freebsd32_timerfd_gettime */
+	case 586: {
+		struct freebsd32_timerfd_gettime_args *p = params;
+		iarg[a++] = p->fd; /* int */
+		uarg[a++] = (intptr_t)p->curr_value; /* struct itimerspec32 * */
+		*n_args = 2;
+		break;
+	}
+	/* freebsd32_timerfd_settime */
+	case 587: {
+		struct freebsd32_timerfd_settime_args *p = params;
+		iarg[a++] = p->fd; /* int */
+		iarg[a++] = p->flags; /* int */
+		uarg[a++] = (intptr_t)p->new_value; /* const struct itimerspec32 * */
+		uarg[a++] = (intptr_t)p->old_value; /* struct itimerspec32 * */
+		*n_args = 4;
+		break;
+	}
 	default:
 		*n_args = 0;
 		break;
@@ -9005,6 +9031,51 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
 			break;
 		};
 		break;
+	/* timerfd_create */
+	case 585:
+		switch (ndx) {
+		case 0:
+			p = "int";
+			break;
+		case 1:
+			p = "int";
+			break;
+		default:
+			break;
+		};
+		break;
+	/* freebsd32_timerfd_gettime */
+	case 586:
+		switch (ndx) {
+		case 0:
+			p = "int";
+			break;
+		case 1:
+			p = "userland struct itimerspec32 *";
+			break;
+		default:
+			break;
+		};
+		break;
+	/* freebsd32_timerfd_settime */
+	case 587:
+		switch (ndx) {
+		case 0:
+			p = "int";
+			break;
+		case 1:
+			p = "int";
+			break;
+		case 2:
+			p = "userland const struct itimerspec32 *";
+			break;
+		case 3:
+			p = "userland struct itimerspec32 *";
+			break;
+		default:
+			break;
+		};
+		break;
 	default:
 		break;
 	};
@@ -10873,6 +10944,21 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
 		if (ndx == 0 || ndx == 1)
 			p = "int";
 		break;
+	/* timerfd_create */
+	case 585:
+		if (ndx == 0 || ndx == 1)
+			p = "int";
+		break;
+	/* freebsd32_timerfd_gettime */
+	case 586:
+		if (ndx == 0 || ndx == 1)
+			p = "int";
+		break;
+	/* freebsd32_timerfd_settime */
+	case 587:
+		if (ndx == 0 || ndx == 1)
+			p = "int";
+		break;
 	default:
 		break;
 	};
diff --git a/sys/compat/linux/linux_event.c b/sys/compat/linux/linux_event.c
index a7db8516e5f0..816c68a90f1d 100644
--- a/sys/compat/linux/linux_event.c
+++ b/sys/compat/linux/linux_event.c
@@ -44,6 +44,7 @@
 #include <sys/specialfd.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
+#include <sys/timerfd.h>
 #include <sys/timespec.h>
 #include <sys/user.h>
 
@@ -99,55 +100,6 @@ struct epoll_copyout_args {
 	int			error;
 };
 
-/* timerfd */
-typedef uint64_t	timerfd_t;
-
-static fo_rdwr_t	timerfd_read;
-static fo_ioctl_t	timerfd_ioctl;
-static fo_poll_t	timerfd_poll;
-static fo_kqfilter_t	timerfd_kqfilter;
-static fo_stat_t	timerfd_stat;
-static fo_close_t	timerfd_close;
-static fo_fill_kinfo_t	timerfd_fill_kinfo;
-
-static struct fileops timerfdops = {
-	.fo_read = timerfd_read,
-	.fo_write = invfo_rdwr,
-	.fo_truncate = invfo_truncate,
-	.fo_ioctl = timerfd_ioctl,
-	.fo_poll = timerfd_poll,
-	.fo_kqfilter = timerfd_kqfilter,
-	.fo_stat = timerfd_stat,
-	.fo_close = timerfd_close,
-	.fo_chmod = invfo_chmod,
-	.fo_chown = invfo_chown,
-	.fo_sendfile = invfo_sendfile,
-	.fo_fill_kinfo = timerfd_fill_kinfo,
-	.fo_flags = DFLAG_PASSABLE
-};
-
-static void	filt_timerfddetach(struct knote *kn);
-static int	filt_timerfdread(struct knote *kn, long hint);
-
-static struct filterops timerfd_rfiltops = {
-	.f_isfd = 1,
-	.f_detach = filt_timerfddetach,
-	.f_event = filt_timerfdread
-};
-
-struct timerfd {
-	clockid_t	tfd_clockid;
-	struct itimerspec tfd_time;
-	struct callout	tfd_callout;
-	timerfd_t	tfd_count;
-	bool		tfd_canceled;
-	struct selinfo	tfd_sel;
-	struct mtx	tfd_lock;
-};
-
-static void	linux_timerfd_expire(void *);
-static void	linux_timerfd_curval(struct timerfd *, struct itimerspec *);
-
 static int
 epoll_create_common(struct thread *td, int flags)
 {
@@ -658,255 +610,14 @@ linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
 int
 linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args)
 {
-	struct timerfd *tfd;
-	struct file *fp;
 	clockid_t clockid;
-	int fflags, fd, error;
-
-	if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0)
-		return (EINVAL);
-
-	error = linux_to_native_clockid(&clockid, args->clockid);
-	if (error != 0)
-		return (error);
-	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
-		return (EINVAL);
-
-	fflags = 0;
-	if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
-		fflags |= O_CLOEXEC;
-
-	error = falloc(td, &fp, &fd, fflags);
-	if (error != 0)
-		return (error);
-
-	tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO);
-	tfd->tfd_clockid = clockid;
-	mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);
-
-	callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
-	knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);
-
-	fflags = FREAD;
-	if ((args->flags & LINUX_O_NONBLOCK) != 0)
-		fflags |= FNONBLOCK;
-
-	finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops);
-	fdrop(fp, td);
-
-	td->td_retval[0] = fd;
-	return (error);
-}
-
-static int
-timerfd_close(struct file *fp, struct thread *td)
-{
-	struct timerfd *tfd;
-
-	tfd = fp->f_data;
-	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
-		return (EINVAL);
-
-	timespecclear(&tfd->tfd_time.it_value);
-	timespecclear(&tfd->tfd_time.it_interval);
-
-	callout_drain(&tfd->tfd_callout);
-
-	seldrain(&tfd->tfd_sel);
-	knlist_destroy(&tfd->tfd_sel.si_note);
-
-	fp->f_ops = &badfileops;
-	mtx_destroy(&tfd->tfd_lock);
-	free(tfd, M_EPOLL);
-
-	return (0);
-}
-
-static int
-timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
-    int flags, struct thread *td)
-{
-	struct timerfd *tfd;
-	timerfd_t count;
-	int error;
-
-	tfd = fp->f_data;
-	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
-		return (EINVAL);
-
-	if (uio->uio_resid < sizeof(timerfd_t))
-		return (EINVAL);
-
-	error = 0;
-	mtx_lock(&tfd->tfd_lock);
-retry:
-	if (tfd->tfd_canceled) {
-		tfd->tfd_count = 0;
-		mtx_unlock(&tfd->tfd_lock);
-		return (ECANCELED);
-	}
-	if (tfd->tfd_count == 0) {
-		if ((fp->f_flag & FNONBLOCK) != 0) {
-			mtx_unlock(&tfd->tfd_lock);
-			return (EAGAIN);
-		}
-		error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0);
-		if (error == 0)
-			goto retry;
-	}
-	if (error == 0) {
-		count = tfd->tfd_count;
-		tfd->tfd_count = 0;
-		mtx_unlock(&tfd->tfd_lock);
-		error = uiomove(&count, sizeof(timerfd_t), uio);
-	} else
-		mtx_unlock(&tfd->tfd_lock);
-
-	return (error);
-}
-
-static int
-timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
-    struct thread *td)
-{
-	struct timerfd *tfd;
-	int revents = 0;
-
-	tfd = fp->f_data;
-	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
-		return (POLLERR);
-
-	mtx_lock(&tfd->tfd_lock);
-	if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0)
-		revents |= events & (POLLIN|POLLRDNORM);
-	if (revents == 0)
-		selrecord(td, &tfd->tfd_sel);
-	mtx_unlock(&tfd->tfd_lock);
-
-	return (revents);
-}
-
-static int
-timerfd_kqfilter(struct file *fp, struct knote *kn)
-{
-	struct timerfd *tfd;
-
-	tfd = fp->f_data;
-	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
-		return (EINVAL);
-
-	if (kn->kn_filter == EVFILT_READ)
-		kn->kn_fop = &timerfd_rfiltops;
-	else
-		return (EINVAL);
-
-	kn->kn_hook = tfd;
-	knlist_add(&tfd->tfd_sel.si_note, kn, 0);
-
-	return (0);
-}
-
-static void
-filt_timerfddetach(struct knote *kn)
-{
-	struct timerfd *tfd = kn->kn_hook;
-
-	mtx_lock(&tfd->tfd_lock);
-	knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
-	mtx_unlock(&tfd->tfd_lock);
-}
-
-static int
-filt_timerfdread(struct knote *kn, long hint)
-{
-	struct timerfd *tfd = kn->kn_hook;
-
-	return (tfd->tfd_count > 0);
-}
-
-static int
-timerfd_ioctl(struct file *fp, u_long cmd, void *data,
-    struct ucred *active_cred, struct thread *td)
-{
-
-	if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD)
-		return (EINVAL);
-
-	switch (cmd) {
-	case FIONBIO:
-	case FIOASYNC:
-		return (0);
-	}
-
-	return (ENOTTY);
-}
-
-static int
-timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred)
-{
-
-	return (ENXIO);
-}
-
-static int
-timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
-{
-
-	kif->kf_type = KF_TYPE_UNKNOWN;
-	return (0);
-}
-
-static void
-linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
-{
-
-	if (tfd->tfd_clockid == CLOCK_REALTIME)
-		getnanotime(ts);
-	else	/* CLOCK_MONOTONIC */
-		getnanouptime(ts);
-}
-
-static void
-linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
-{
-	struct timespec cts;
-
-	linux_timerfd_clocktime(tfd, &cts);
-	*ots = tfd->tfd_time;
-	if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
-		timespecsub(&ots->it_value, &cts, &ots->it_value);
-		if (ots->it_value.tv_sec < 0 ||
-		    (ots->it_value.tv_sec == 0 &&
-		     ots->it_value.tv_nsec == 0)) {
-			ots->it_value.tv_sec  = 0;
-			ots->it_value.tv_nsec = 1;
-		}
-	}
-}
-
-static int
-linux_timerfd_gettime_common(struct thread *td, int fd, struct itimerspec *ots)
-{
-	struct timerfd *tfd;
-	struct file *fp;
 	int error;
 
-	error = fget(td, fd, &cap_read_rights, &fp);
+	error = linux_to_native_clockid(&clockid, args->clockid);
 	if (error != 0)
 		return (error);
-	tfd = fp->f_data;
-	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
-		error = EINVAL;
-		goto out;
-	}
-
-	mtx_lock(&tfd->tfd_lock);
-	linux_timerfd_curval(tfd, ots);
-	mtx_unlock(&tfd->tfd_lock);
 
-out:
-	fdrop(fp, td);
-	return (error);
+	return (kern_timerfd_create(td, clockid, args->flags));
 }
 
 int
@@ -916,84 +627,14 @@ linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args
 	struct itimerspec ots;
 	int error;
 
-	error = linux_timerfd_gettime_common(td, args->fd, &ots);
+	error = kern_timerfd_gettime(td, args->fd, &ots);
 	if (error != 0)
 		return (error);
-	error = native_to_linux_itimerspec(&lots, &ots);
-	if (error == 0)
-		error = copyout(&lots, args->old_value, sizeof(lots));
-	return (error);
-}
-
-#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
-int
-linux_timerfd_gettime64(struct thread *td, struct linux_timerfd_gettime64_args *args)
-{
-	struct l_itimerspec64 lots;
-	struct itimerspec ots;
-	int error;
 
-	error = linux_timerfd_gettime_common(td, args->fd, &ots);
-	if (error != 0)
-		return (error);
-	error = native_to_linux_itimerspec64(&lots, &ots);
+	error = native_to_linux_itimerspec(&lots, &ots);
 	if (error == 0)
 		error = copyout(&lots, args->old_value, sizeof(lots));
-	return (error);
-}
-#endif
-
-static int
-linux_timerfd_settime_common(struct thread *td, int fd, int flags,
-    struct itimerspec *nts, struct itimerspec *oval)
-{
-	struct timespec cts, ts;
-	struct timerfd *tfd;
-	struct timeval tv;
-	struct file *fp;
-	int error;
-
-	if ((flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
-		return (EINVAL);
-
-	error = fget(td, fd, &cap_write_rights, &fp);
-	if (error != 0)
-		return (error);
-	tfd = fp->f_data;
-	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
-		error = EINVAL;
-		goto out;
-	}
-
-	mtx_lock(&tfd->tfd_lock);
-	if (!timespecisset(&nts->it_value))
-		timespecclear(&nts->it_interval);
-	if (oval != NULL)
-		linux_timerfd_curval(tfd, oval);
-
-	bcopy(nts, &tfd->tfd_time, sizeof(*nts));
-	tfd->tfd_count = 0;
-	if (timespecisset(&nts->it_value)) {
-		linux_timerfd_clocktime(tfd, &cts);
-		ts = nts->it_value;
-		if ((flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
-			timespecadd(&tfd->tfd_time.it_value, &cts,
-				&tfd->tfd_time.it_value);
-		} else {
-			timespecsub(&ts, &cts, &ts);
-		}
-		TIMESPEC_TO_TIMEVAL(&tv, &ts);
-		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
-			linux_timerfd_expire, tfd);
-		tfd->tfd_canceled = false;
-	} else {
-		tfd->tfd_canceled = true;
-		callout_stop(&tfd->tfd_callout);
-	}
-	mtx_unlock(&tfd->tfd_lock);
 
-out:
-	fdrop(fp, td);
 	return (error);
 }
 
@@ -1001,7 +642,7 @@ int
 linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
 {
 	struct l_itimerspec lots;
-	struct itimerspec nts, ots, *pots;
+	struct itimerspec nts, ots;
 	int error;
 
 	error = copyin(args->new_value, &lots, sizeof(lots));
@@ -1010,23 +651,43 @@ linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args
 	error = linux_to_native_itimerspec(&nts, &lots);
 	if (error != 0)
 		return (error);
-	pots = (args->old_value != NULL ? &ots : NULL);
-	error = linux_timerfd_settime_common(td, args->fd, args->flags,
-	    &nts, pots);
+	if (args->old_value == NULL)
+		error = kern_timerfd_settime(td, args->fd, args->flags, &nts, NULL);
+	else
+		error = kern_timerfd_settime(td, args->fd, args->flags, &nts, &ots);
 	if (error == 0 && args->old_value != NULL) {
 		error = native_to_linux_itimerspec(&lots, &ots);
 		if (error == 0)
 			error = copyout(&lots, args->old_value, sizeof(lots));
 	}
+
 	return (error);
 }
 
 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
+int
+linux_timerfd_gettime64(struct thread *td, struct linux_timerfd_gettime64_args *args)
+{
+	struct l_itimerspec64 lots;
+	struct itimerspec ots;
+	int error;
+
+	error = kern_timerfd_gettime(td, args->fd, &ots);
+	if (error != 0)
+		return (error);
+
+	error = native_to_linux_itimerspec64(&lots, &ots);
+	if (error == 0)
+		error = copyout(&lots, args->old_value, sizeof(lots));
+
+	return (error);
+}
+
 int
 linux_timerfd_settime64(struct thread *td, struct linux_timerfd_settime64_args *args)
 {
 	struct l_itimerspec64 lots;
-	struct itimerspec nts, ots, *pots;
+	struct itimerspec nts, ots;
 	int error;
 
 	error = copyin(args->new_value, &lots, sizeof(lots));
@@ -1035,50 +696,16 @@ linux_timerfd_settime64(struct thread *td, struct linux_timerfd_settime64_args *
 	error = linux_to_native_itimerspec64(&nts, &lots);
 	if (error != 0)
 		return (error);
-	pots = (args->old_value != NULL ? &ots : NULL);
-	error = linux_timerfd_settime_common(td, args->fd, args->flags,
-	    &nts, pots);
+	if (args->old_value == NULL)
+		error = kern_timerfd_settime(td, args->fd, args->flags, &nts, NULL);
+	else
+		error = kern_timerfd_settime(td, args->fd, args->flags, &nts, &ots);
 	if (error == 0 && args->old_value != NULL) {
 		error = native_to_linux_itimerspec64(&lots, &ots);
 		if (error == 0)
 			error = copyout(&lots, args->old_value, sizeof(lots));
 	}
+
 	return (error);
 }
 #endif
-
-static void
-linux_timerfd_expire(void *arg)
-{
-	struct timespec cts, ts;
-	struct timeval tv;
-	struct timerfd *tfd;
-
-	tfd = (struct timerfd *)arg;
-
-	linux_timerfd_clocktime(tfd, &cts);
-	if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
-		if (timespecisset(&tfd->tfd_time.it_interval))
-			timespecadd(&tfd->tfd_time.it_value,
-				    &tfd->tfd_time.it_interval,
-				    &tfd->tfd_time.it_value);
-		else
-			/* single shot timer */
-			timespecclear(&tfd->tfd_time.it_value);
-		if (timespecisset(&tfd->tfd_time.it_value)) {
-			timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
-			TIMESPEC_TO_TIMEVAL(&tv, &ts);
-			callout_reset(&tfd->tfd_callout, tvtohz(&tv),
-				linux_timerfd_expire, tfd);
-		}
-		tfd->tfd_count++;
-		KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
-		selwakeup(&tfd->tfd_sel);
-		wakeup(&tfd->tfd_count);
-	} else if (timespecisset(&tfd->tfd_time.it_value)) {
-		timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
-		TIMESPEC_TO_TIMEVAL(&tv, &ts);
-		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
-		    linux_timerfd_expire, tfd);
-	}
-}
diff --git a/sys/compat/linux/linux_event.h b/sys/compat/linux/linux_event.h
index 32269b0070bc..fa63371b5170 100644
--- a/sys/compat/linux/linux_event.h
+++ b/sys/compat/linux/linux_event.h
@@ -54,15 +54,4 @@
 
 #define	LINUX_EFD_SEMAPHORE	(1 << 0)
 
-#define	LINUX_TFD_TIMER_ABSTIME	(1 << 0)
-#define	LINUX_TFD_TIMER_CANCEL_ON_SET	(1 << 1)
-#define	LINUX_TFD_CLOEXEC	LINUX_O_CLOEXEC
-#define	LINUX_TFD_NONBLOCK	LINUX_O_NONBLOCK
-
-#define	LINUX_TFD_SHARED_FCNTL_FLAGS	(LINUX_TFD_CLOEXEC		\
-		|LINUX_TFD_NONBLOCK)
-#define	LINUX_TFD_CREATE_FLAGS	LINUX_TFD_SHARED_FCNTL_FLAGS
-#define	LINUX_TFD_SETTIME_FLAGS	(LINUX_TFD_TIMER_ABSTIME		\
-		|LINUX_TFD_TIMER_CANCEL_ON_SET)
-
 #endif	/* !_LINUX_EVENT_H_ */
diff --git a/sys/conf/files b/sys/conf/files
index 3f79ce752c80..8d38b9cc8a2e 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3908,6 +3908,7 @@ kern/sys_pipe.c			standard
 kern/sys_procdesc.c		standard
 kern/sys_process.c		standard
 kern/sys_socket.c		standard
+kern/sys_timerfd.c		standard
 kern/syscalls.c			standard
 kern/sysv_ipc.c			standard
 kern/sysv_msg.c			optional sysvmsg
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
index 1e62c46b8be0..d44fec54fcd7 100644
--- a/sys/kern/init_sysent.c
+++ b/sys/kern/init_sysent.c
@@ -645,4 +645,7 @@ struct sysent sysent[] = {
 	{ .sy_narg = AS(swapoff_args), .sy_call = (sy_call_t *)sys_swapoff, .sy_auevent = AUE_SWAPOFF, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC },	/* 582 = swapoff */
 	{ .sy_narg = AS(kqueuex_args), .sy_call = (sy_call_t *)sys_kqueuex, .sy_auevent = AUE_KQUEUE, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },	/* 583 = kqueuex */
 	{ .sy_narg = AS(membarrier_args), .sy_call = (sy_call_t *)sys_membarrier, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },	/* 584 = membarrier */
+	{ .sy_narg = AS(timerfd_create_args), .sy_call = (sy_call_t *)sys_timerfd_create, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },	/* 585 = timerfd_create */
+	{ .sy_narg = AS(timerfd_gettime_args), .sy_call = (sy_call_t *)sys_timerfd_gettime, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },	/* 586 = timerfd_gettime */
+	{ .sy_narg = AS(timerfd_settime_args), .sy_call = (sy_call_t *)sys_timerfd_settime, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },	/* 587 = timerfd_settime */
 };
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index c5226288afc5..35046c856d54 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -5001,8 +5001,8 @@ file_type_to_name(short type)
 		return ("proc");
 	case DTYPE_EVENTFD:
 		return ("eventfd");
-	case DTYPE_LINUXTFD:
-		return ("ltimer");
+	case DTYPE_TIMERFD:
+		return ("timerfd");
 	default:
 		return ("unkn");
 	}
diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c
index 170f35830923..26f09cb60260 100644
--- a/sys/kern/kern_tc.c
+++ b/sys/kern/kern_tc.c
@@ -34,6 +34,7 @@
 #include <sys/systm.h>
 #include <sys/timeffc.h>
 #include <sys/timepps.h>
+#include <sys/timerfd.h>
 #include <sys/timetc.h>
 #include <sys/timex.h>
 #include <sys/vdso.h>
@@ -1305,6 +1306,7 @@ tc_setclock(struct timespec *ts)
 
 	/* Avoid rtc_generation == 0, since td_rtcgen == 0 is special. */
 	atomic_add_rel_int(&rtc_generation, 2);
+	timerfd_jumped();
 	sleepq_chains_remove_matching(sleeping_on_old_rtc);
 	if (timestepwarnings) {
 		nanotime(&taft);
diff --git a/sys/kern/sys_timerfd.c b/sys/kern/sys_timerfd.c
new file mode 100644
index 000000000000..6948fa059b8c
--- /dev/null
+++ b/sys/kern/sys_timerfd.c
@@ -0,0 +1,632 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2014 Dmitry Chagin <dchagin@FreeBSD.org>
+ * Copyright (c) 2023 Jake Freeland <jfree@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/callout.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/selinfo.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/timerfd.h>
+#include <sys/timespec.h>
+#include <sys/uio.h>
+#include <sys/user.h>
+
+#include <security/audit/audit.h>
+
+#ifdef COMPAT_FREEBSD32
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+#endif
+
+static MALLOC_DEFINE(M_TIMERFD, "timerfd", "timerfd structures");
+static LIST_HEAD(, timerfd) timerfd_head;
+static struct unrhdr64 tfdino_unr;
+
+#define	TFD_NOJUMP	0	/* Realtime clock has not jumped. */
+#define	TFD_READ	1	/* Jumped, tfd has been read since. */
+#define	TFD_ZREAD	2	/* Jumped backwards, CANCEL_ON_SET=false. */
+#define	TFD_CANCELED	4	/* Jumped, CANCEL_ON_SET=true. */
+#define	TFD_JUMPED	(TFD_ZREAD | TFD_CANCELED)
+
+struct timerfd {
+	/* User specified. */
+	struct itimerspec tfd_time;	/* tfd timer */
+	clockid_t	tfd_clockid;	/* timing base */
+	int		tfd_flags;	/* creation flags */
+	int		tfd_timflags;	/* timer flags */
+
+	/* Used internally. */
+	timerfd_t	tfd_count;	/* expiration count since last read */
+	bool		tfd_expired;	/* true upon initial expiration */
+	struct mtx	tfd_lock;	/* mtx lock */
+	struct callout	tfd_callout;	/* expiration notification */
+	struct selinfo	tfd_sel;	/* I/O alerts */
+	struct timespec	tfd_boottim;	/* cached boottime */
+	int		tfd_jumped;	/* timer jump status */
+	LIST_ENTRY(timerfd) entry;	/* entry in list */
+
+	/* For stat(2). */
+	ino_t		tfd_ino;	/* inode number */
+	struct timespec	tfd_atim;	/* time of last read */
+	struct timespec	tfd_mtim;	/* time of last settime */
+	struct timespec tfd_birthtim;	/* creation time */
+};
+
+static void
+timerfd_init(void *data)
+{
+	new_unrhdr64(&tfdino_unr, 1);
+}
+
+SYSINIT(timerfd, SI_SUB_VFS, SI_ORDER_ANY, timerfd_init, NULL);
+
+static inline void
+timerfd_getboottime(struct timespec *ts)
+{
+	struct timeval tv;
+	getboottime(&tv);
+	TIMEVAL_TO_TIMESPEC(&tv, ts);
+}
+
+/*
+ * Call when a discontinuous jump has occurred in CLOCK_REALTIME and
+ * update timerfd's cached boottime. A jump can be triggered using
+ * functions like clock_settime(2) or settimeofday(2).
+ *
+ * Timer is marked TFD_CANCELED if TFD_TIMER_CANCEL_ON_SET is set
*** 850 LINES SKIPPED ***