Re: git: af93fea71038 - main - timerfd: Move implementation from linux compat to sys/kern

From: Konstantin Belousov <kostikbel_at_gmail.com>
Date: Thu, 24 Aug 2023 22:18:18 UTC
On Thu, Aug 24, 2023 at 08:29:48PM +0000, Warner Losh wrote:
> The branch main has been updated by imp:
> 
> URL: https://cgit.FreeBSD.org/src/commit/?id=af93fea710385b2b11f0cabd377e7ed6f3d97c34
> 
> commit af93fea710385b2b11f0cabd377e7ed6f3d97c34
> Author:     Jake Freeland <jfree@freebsd.org>
> AuthorDate: 2023-08-24 04:39:54 +0000
> Commit:     Warner Losh <imp@FreeBSD.org>
> CommitDate: 2023-08-24 20:28:56 +0000
> 
>     timerfd: Move implementation from linux compat to sys/kern
>     
>     Move the timerfd impelemntation from linux compat code to sys/kern. Use
>     it to implement the new system calls for timerfd. Add a hook to kern_tc
>     to allow timerfd to know when the system time has stepped. Add kqueue
>     support to timerfd. Adjust a few names to be less Linux centric.
>     
>     RelNotes: YES
>     Reviewed by: markj (on irc), imp, kib (with reservations), jhb (slack)
>     Differential Revision: https://reviews.freebsd.org/D38459
> ---
>  lib/libc/sys/Symbol.map                        |   3 +
>  sys/bsm/audit_kevents.h                        |   1 +
>  sys/compat/freebsd32/freebsd32_proto.h         |  14 +
>  sys/compat/freebsd32/freebsd32_syscall.h       |   5 +-
>  sys/compat/freebsd32/freebsd32_syscalls.c      |   3 +
>  sys/compat/freebsd32/freebsd32_sysent.c        |   3 +
>  sys/compat/freebsd32/freebsd32_systrace_args.c |  86 ++++
>  sys/compat/linux/linux_event.c                 | 443 ++---------------
>  sys/compat/linux/linux_event.h                 |  11 -
>  sys/conf/files                                 |   1 +
>  sys/kern/init_sysent.c                         |   3 +
>  sys/kern/kern_descrip.c                        |   4 +-
>  sys/kern/kern_tc.c                             |   2 +
>  sys/kern/sys_timerfd.c                         | 632 +++++++++++++++++++++++++
>  sys/kern/syscalls.c                            |   3 +
>  sys/kern/syscalls.master                       |  20 +
>  sys/kern/systrace_args.c                       |  86 ++++
>  sys/sys/file.h                                 |   2 +-
>  sys/sys/syscall.h                              |   5 +-
>  sys/sys/syscall.mk                             |   5 +-
>  sys/sys/sysproto.h                             |  20 +
>  sys/sys/timerfd.h                              |  66 +++
>  sys/sys/user.h                                 |   6 +
>  23 files changed, 999 insertions(+), 425 deletions(-)
> 
> diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map
> index 9a07bb457eb8..7937661e3787 100644
> --- a/lib/libc/sys/Symbol.map
> +++ b/lib/libc/sys/Symbol.map
> @@ -421,6 +421,9 @@ FBSD_1.7 {
>  	kqueuex;
>  	membarrier;
>  	swapoff;
> +	timerfd_create;
> +	timerfd_gettime;
> +	timerfd_settime;
>  };
>  
>  FBSDprivate_1.0 {
> diff --git a/sys/bsm/audit_kevents.h b/sys/bsm/audit_kevents.h
> index a6b50a67ee6a..d06381837aad 100644
> --- a/sys/bsm/audit_kevents.h
> +++ b/sys/bsm/audit_kevents.h
> @@ -661,6 +661,7 @@
>  #define	AUE_AIO_WRITEV		43267	/* FreeBSD-specific. */
>  #define	AUE_AIO_READV		43268	/* FreeBSD-specific. */
>  #define	AUE_FSPACECTL		43269	/* FreeBSD-specific. */
> +#define	AUE_TIMERFD		43270	/* FreeBSD/Linux. */
>  
>  /*
>   * Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the
> diff --git a/sys/compat/freebsd32/freebsd32_proto.h b/sys/compat/freebsd32/freebsd32_proto.h
> index bb333e0321a0..50448b6dce16 100644
> --- a/sys/compat/freebsd32/freebsd32_proto.h
> +++ b/sys/compat/freebsd32/freebsd32_proto.h
> @@ -684,6 +684,16 @@ struct freebsd32_aio_writev_args {
>  struct freebsd32_aio_readv_args {
>  	char aiocbp_l_[PADL_(struct aiocb32 *)]; struct aiocb32 * aiocbp; char aiocbp_r_[PADR_(struct aiocb32 *)];
>  };
> +struct freebsd32_timerfd_gettime_args {
> +	char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
> +	char curr_value_l_[PADL_(struct itimerspec32 *)]; struct itimerspec32 * curr_value; char curr_value_r_[PADR_(struct itimerspec32 *)];
> +};
> +struct freebsd32_timerfd_settime_args {
> +	char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)];
> +	char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)];
> +	char new_value_l_[PADL_(const struct itimerspec32 *)]; const struct itimerspec32 * new_value; char new_value_r_[PADR_(const struct itimerspec32 *)];
> +	char old_value_l_[PADL_(struct itimerspec32 *)]; struct itimerspec32 * old_value; char old_value_r_[PADR_(struct itimerspec32 *)];
> +};
>  int	freebsd32_wait4(struct thread *, struct freebsd32_wait4_args *);
>  int	freebsd32_ptrace(struct thread *, struct freebsd32_ptrace_args *);
>  int	freebsd32_recvmsg(struct thread *, struct freebsd32_recvmsg_args *);
> @@ -799,6 +809,8 @@ int	freebsd32_cpuset_setdomain(struct thread *, struct freebsd32_cpuset_setdomai
>  int	freebsd32___sysctlbyname(struct thread *, struct freebsd32___sysctlbyname_args *);
>  int	freebsd32_aio_writev(struct thread *, struct freebsd32_aio_writev_args *);
>  int	freebsd32_aio_readv(struct thread *, struct freebsd32_aio_readv_args *);
> +int	freebsd32_timerfd_gettime(struct thread *, struct freebsd32_timerfd_gettime_args *);
> +int	freebsd32_timerfd_settime(struct thread *, struct freebsd32_timerfd_settime_args *);
>  
>  #ifdef COMPAT_43
>  
> @@ -1292,6 +1304,8 @@ int	freebsd11_freebsd32_fstatat(struct thread *, struct freebsd11_freebsd32_fsta
>  #define	FREEBSD32_SYS_AUE_freebsd32___sysctlbyname	AUE_SYSCTL
>  #define	FREEBSD32_SYS_AUE_freebsd32_aio_writev	AUE_AIO_WRITEV
>  #define	FREEBSD32_SYS_AUE_freebsd32_aio_readv	AUE_AIO_READV
> +#define	FREEBSD32_SYS_AUE_freebsd32_timerfd_gettime	AUE_TIMERFD
> +#define	FREEBSD32_SYS_AUE_freebsd32_timerfd_settime	AUE_TIMERFD
>  
>  #undef PAD_
>  #undef PADL_
> diff --git a/sys/compat/freebsd32/freebsd32_syscall.h b/sys/compat/freebsd32/freebsd32_syscall.h
> index c3d8617abf4b..e3777730be1c 100644
> --- a/sys/compat/freebsd32/freebsd32_syscall.h
> +++ b/sys/compat/freebsd32/freebsd32_syscall.h
> @@ -502,4 +502,7 @@
>  #define	FREEBSD32_SYS_swapoff	582
>  #define	FREEBSD32_SYS_kqueuex	583
>  #define	FREEBSD32_SYS_membarrier	584
> -#define	FREEBSD32_SYS_MAXSYSCALL	585
> +#define	FREEBSD32_SYS_timerfd_create	585
> +#define	FREEBSD32_SYS_freebsd32_timerfd_gettime	586
> +#define	FREEBSD32_SYS_freebsd32_timerfd_settime	587
> +#define	FREEBSD32_SYS_MAXSYSCALL	588
> diff --git a/sys/compat/freebsd32/freebsd32_syscalls.c b/sys/compat/freebsd32/freebsd32_syscalls.c
> index 19d454743c55..ccc910ee5ca9 100644
> --- a/sys/compat/freebsd32/freebsd32_syscalls.c
> +++ b/sys/compat/freebsd32/freebsd32_syscalls.c
> @@ -590,4 +590,7 @@ const char *freebsd32_syscallnames[] = {
>  	"swapoff",			/* 582 = swapoff */
>  	"kqueuex",			/* 583 = kqueuex */
>  	"membarrier",			/* 584 = membarrier */
> +	"timerfd_create",			/* 585 = timerfd_create */
> +	"freebsd32_timerfd_gettime",			/* 586 = freebsd32_timerfd_gettime */
> +	"freebsd32_timerfd_settime",			/* 587 = freebsd32_timerfd_settime */
>  };
> diff --git a/sys/compat/freebsd32/freebsd32_sysent.c b/sys/compat/freebsd32/freebsd32_sysent.c
> index 971f06a643c5..fec6f4a47bd6 100644
> --- a/sys/compat/freebsd32/freebsd32_sysent.c
> +++ b/sys/compat/freebsd32/freebsd32_sysent.c
> @@ -646,4 +646,7 @@ struct sysent freebsd32_sysent[] = {
>  	{ .sy_narg = AS(swapoff_args), .sy_call = (sy_call_t *)sys_swapoff, .sy_auevent = AUE_SWAPOFF, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC },	/* 582 = swapoff */
>  	{ .sy_narg = AS(kqueuex_args), .sy_call = (sy_call_t *)sys_kqueuex, .sy_auevent = AUE_KQUEUE, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },	/* 583 = kqueuex */
>  	{ .sy_narg = AS(membarrier_args), .sy_call = (sy_call_t *)sys_membarrier, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },	/* 584 = membarrier */
> +	{ .sy_narg = AS(timerfd_create_args), .sy_call = (sy_call_t *)sys_timerfd_create, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },	/* 585 = timerfd_create */
> +	{ .sy_narg = AS(freebsd32_timerfd_gettime_args), .sy_call = (sy_call_t *)freebsd32_timerfd_gettime, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },	/* 586 = freebsd32_timerfd_gettime */
> +	{ .sy_narg = AS(freebsd32_timerfd_settime_args), .sy_call = (sy_call_t *)freebsd32_timerfd_settime, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },	/* 587 = freebsd32_timerfd_settime */
>  };
> diff --git a/sys/compat/freebsd32/freebsd32_systrace_args.c b/sys/compat/freebsd32/freebsd32_systrace_args.c
> index 5dfc82c30b7b..2c26a0ddab2f 100644
> --- a/sys/compat/freebsd32/freebsd32_systrace_args.c
> +++ b/sys/compat/freebsd32/freebsd32_systrace_args.c
> @@ -3336,6 +3336,32 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
>  		*n_args = 3;
>  		break;
>  	}
> +	/* timerfd_create */
> +	case 585: {
> +		struct timerfd_create_args *p = params;
> +		iarg[a++] = p->clockid; /* int */
> +		iarg[a++] = p->flags; /* int */
> +		*n_args = 2;
> +		break;
> +	}
> +	/* freebsd32_timerfd_gettime */
> +	case 586: {
> +		struct freebsd32_timerfd_gettime_args *p = params;
> +		iarg[a++] = p->fd; /* int */
> +		uarg[a++] = (intptr_t)p->curr_value; /* struct itimerspec32 * */
> +		*n_args = 2;
> +		break;
> +	}
> +	/* freebsd32_timerfd_settime */
> +	case 587: {
> +		struct freebsd32_timerfd_settime_args *p = params;
> +		iarg[a++] = p->fd; /* int */
> +		iarg[a++] = p->flags; /* int */
> +		uarg[a++] = (intptr_t)p->new_value; /* const struct itimerspec32 * */
> +		uarg[a++] = (intptr_t)p->old_value; /* struct itimerspec32 * */
> +		*n_args = 4;
> +		break;
> +	}
>  	default:
>  		*n_args = 0;
>  		break;
> @@ -9005,6 +9031,51 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
>  			break;
>  		};
>  		break;
> +	/* timerfd_create */
> +	case 585:
> +		switch (ndx) {
> +		case 0:
> +			p = "int";
> +			break;
> +		case 1:
> +			p = "int";
> +			break;
> +		default:
> +			break;
> +		};
> +		break;
> +	/* freebsd32_timerfd_gettime */
> +	case 586:
> +		switch (ndx) {
> +		case 0:
> +			p = "int";
> +			break;
> +		case 1:
> +			p = "userland struct itimerspec32 *";
> +			break;
> +		default:
> +			break;
> +		};
> +		break;
> +	/* freebsd32_timerfd_settime */
> +	case 587:
> +		switch (ndx) {
> +		case 0:
> +			p = "int";
> +			break;
> +		case 1:
> +			p = "int";
> +			break;
> +		case 2:
> +			p = "userland const struct itimerspec32 *";
> +			break;
> +		case 3:
> +			p = "userland struct itimerspec32 *";
> +			break;
> +		default:
> +			break;
> +		};
> +		break;
>  	default:
>  		break;
>  	};
> @@ -10873,6 +10944,21 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
>  		if (ndx == 0 || ndx == 1)
>  			p = "int";
>  		break;
> +	/* timerfd_create */
> +	case 585:
> +		if (ndx == 0 || ndx == 1)
> +			p = "int";
> +		break;
> +	/* freebsd32_timerfd_gettime */
> +	case 586:
> +		if (ndx == 0 || ndx == 1)
> +			p = "int";
> +		break;
> +	/* freebsd32_timerfd_settime */
> +	case 587:
> +		if (ndx == 0 || ndx == 1)
> +			p = "int";
> +		break;
>  	default:
>  		break;
>  	};
> diff --git a/sys/compat/linux/linux_event.c b/sys/compat/linux/linux_event.c
> index a7db8516e5f0..816c68a90f1d 100644
> --- a/sys/compat/linux/linux_event.c
> +++ b/sys/compat/linux/linux_event.c
> @@ -44,6 +44,7 @@
>  #include <sys/specialfd.h>
>  #include <sys/sx.h>
>  #include <sys/syscallsubr.h>
> +#include <sys/timerfd.h>
>  #include <sys/timespec.h>
>  #include <sys/user.h>
>  
> @@ -99,55 +100,6 @@ struct epoll_copyout_args {
>  	int			error;
>  };
>  
> -/* timerfd */
> -typedef uint64_t	timerfd_t;
> -
> -static fo_rdwr_t	timerfd_read;
> -static fo_ioctl_t	timerfd_ioctl;
> -static fo_poll_t	timerfd_poll;
> -static fo_kqfilter_t	timerfd_kqfilter;
> -static fo_stat_t	timerfd_stat;
> -static fo_close_t	timerfd_close;
> -static fo_fill_kinfo_t	timerfd_fill_kinfo;
> -
> -static struct fileops timerfdops = {
> -	.fo_read = timerfd_read,
> -	.fo_write = invfo_rdwr,
> -	.fo_truncate = invfo_truncate,
> -	.fo_ioctl = timerfd_ioctl,
> -	.fo_poll = timerfd_poll,
> -	.fo_kqfilter = timerfd_kqfilter,
> -	.fo_stat = timerfd_stat,
> -	.fo_close = timerfd_close,
> -	.fo_chmod = invfo_chmod,
> -	.fo_chown = invfo_chown,
> -	.fo_sendfile = invfo_sendfile,
> -	.fo_fill_kinfo = timerfd_fill_kinfo,
> -	.fo_flags = DFLAG_PASSABLE
> -};
> -
> -static void	filt_timerfddetach(struct knote *kn);
> -static int	filt_timerfdread(struct knote *kn, long hint);
> -
> -static struct filterops timerfd_rfiltops = {
> -	.f_isfd = 1,
> -	.f_detach = filt_timerfddetach,
> -	.f_event = filt_timerfdread
> -};
> -
> -struct timerfd {
> -	clockid_t	tfd_clockid;
> -	struct itimerspec tfd_time;
> -	struct callout	tfd_callout;
> -	timerfd_t	tfd_count;
> -	bool		tfd_canceled;
> -	struct selinfo	tfd_sel;
> -	struct mtx	tfd_lock;
> -};
> -
> -static void	linux_timerfd_expire(void *);
> -static void	linux_timerfd_curval(struct timerfd *, struct itimerspec *);
> -
>  static int
>  epoll_create_common(struct thread *td, int flags)
>  {
> @@ -658,255 +610,14 @@ linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
>  int
>  linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args)
>  {
> -	struct timerfd *tfd;
> -	struct file *fp;
>  	clockid_t clockid;
> -	int fflags, fd, error;
> -
> -	if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0)
> -		return (EINVAL);
> -
> -	error = linux_to_native_clockid(&clockid, args->clockid);
> -	if (error != 0)
> -		return (error);
> -	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
> -		return (EINVAL);
> -
> -	fflags = 0;
> -	if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
> -		fflags |= O_CLOEXEC;
> -
> -	error = falloc(td, &fp, &fd, fflags);
> -	if (error != 0)
> -		return (error);
> -
> -	tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO);
> -	tfd->tfd_clockid = clockid;
> -	mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);
> -
> -	callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
> -	knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);
> -
> -	fflags = FREAD;
> -	if ((args->flags & LINUX_O_NONBLOCK) != 0)
> -		fflags |= FNONBLOCK;
> -
> -	finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops);
> -	fdrop(fp, td);
> -
> -	td->td_retval[0] = fd;
> -	return (error);
> -}
> -
> -static int
> -timerfd_close(struct file *fp, struct thread *td)
> -{
> -	struct timerfd *tfd;
> -
> -	tfd = fp->f_data;
> -	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
> -		return (EINVAL);
> -
> -	timespecclear(&tfd->tfd_time.it_value);
> -	timespecclear(&tfd->tfd_time.it_interval);
> -
> -	callout_drain(&tfd->tfd_callout);
> -
> -	seldrain(&tfd->tfd_sel);
> -	knlist_destroy(&tfd->tfd_sel.si_note);
> -
> -	fp->f_ops = &badfileops;
> -	mtx_destroy(&tfd->tfd_lock);
> -	free(tfd, M_EPOLL);
> -
> -	return (0);
> -}
> -
> -static int
> -timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
> -    int flags, struct thread *td)
> -{
> -	struct timerfd *tfd;
> -	timerfd_t count;
> -	int error;
> -
> -	tfd = fp->f_data;
> -	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
> -		return (EINVAL);
> -
> -	if (uio->uio_resid < sizeof(timerfd_t))
> -		return (EINVAL);
> -
> -	error = 0;
> -	mtx_lock(&tfd->tfd_lock);
> -retry:
> -	if (tfd->tfd_canceled) {
> -		tfd->tfd_count = 0;
> -		mtx_unlock(&tfd->tfd_lock);
> -		return (ECANCELED);
> -	}
> -	if (tfd->tfd_count == 0) {
> -		if ((fp->f_flag & FNONBLOCK) != 0) {
> -			mtx_unlock(&tfd->tfd_lock);
> -			return (EAGAIN);
> -		}
> -		error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0);
> -		if (error == 0)
> -			goto retry;
> -	}
> -	if (error == 0) {
> -		count = tfd->tfd_count;
> -		tfd->tfd_count = 0;
> -		mtx_unlock(&tfd->tfd_lock);
> -		error = uiomove(&count, sizeof(timerfd_t), uio);
> -	} else
> -		mtx_unlock(&tfd->tfd_lock);
> -
> -	return (error);
> -}
> -
> -static int
> -timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
> -    struct thread *td)
> -{
> -	struct timerfd *tfd;
> -	int revents = 0;
> -
> -	tfd = fp->f_data;
> -	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
> -		return (POLLERR);
> -
> -	mtx_lock(&tfd->tfd_lock);
> -	if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0)
> -		revents |= events & (POLLIN|POLLRDNORM);
> -	if (revents == 0)
> -		selrecord(td, &tfd->tfd_sel);
> -	mtx_unlock(&tfd->tfd_lock);
> -
> -	return (revents);
> -}
> -
> -static int
> -timerfd_kqfilter(struct file *fp, struct knote *kn)
> -{
> -	struct timerfd *tfd;
> -
> -	tfd = fp->f_data;
> -	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
> -		return (EINVAL);
> -
> -	if (kn->kn_filter == EVFILT_READ)
> -		kn->kn_fop = &timerfd_rfiltops;
> -	else
> -		return (EINVAL);
> -
> -	kn->kn_hook = tfd;
> -	knlist_add(&tfd->tfd_sel.si_note, kn, 0);
> -
> -	return (0);
> -}
> -
> -static void
> -filt_timerfddetach(struct knote *kn)
> -{
> -	struct timerfd *tfd = kn->kn_hook;
> -
> -	mtx_lock(&tfd->tfd_lock);
> -	knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
> -	mtx_unlock(&tfd->tfd_lock);
> -}
> -
> -static int
> -filt_timerfdread(struct knote *kn, long hint)
> -{
> -	struct timerfd *tfd = kn->kn_hook;
> -
> -	return (tfd->tfd_count > 0);
> -}
> -
> -static int
> -timerfd_ioctl(struct file *fp, u_long cmd, void *data,
> -    struct ucred *active_cred, struct thread *td)
> -{
> -
> -	if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD)
> -		return (EINVAL);
> -
> -	switch (cmd) {
> -	case FIONBIO:
> -	case FIOASYNC:
> -		return (0);
> -	}
> -
> -	return (ENOTTY);
> -}
> -
> -static int
> -timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred)
> -{
> -
> -	return (ENXIO);
> -}
> -
> -static int
> -timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
> -{
> -
> -	kif->kf_type = KF_TYPE_UNKNOWN;
> -	return (0);
> -}
> -
> -static void
> -linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
> -{
> -
> -	if (tfd->tfd_clockid == CLOCK_REALTIME)
> -		getnanotime(ts);
> -	else	/* CLOCK_MONOTONIC */
> -		getnanouptime(ts);
> -}
> -
> -static void
> -linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
> -{
> -	struct timespec cts;
> -
> -	linux_timerfd_clocktime(tfd, &cts);
> -	*ots = tfd->tfd_time;
> -	if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
> -		timespecsub(&ots->it_value, &cts, &ots->it_value);
> -		if (ots->it_value.tv_sec < 0 ||
> -		    (ots->it_value.tv_sec == 0 &&
> -		     ots->it_value.tv_nsec == 0)) {
> -			ots->it_value.tv_sec  = 0;
> -			ots->it_value.tv_nsec = 1;
> -		}
> -	}
> -}
> -
> -static int
> -linux_timerfd_gettime_common(struct thread *td, int fd, struct itimerspec *ots)
> -{
> -	struct timerfd *tfd;
> -	struct file *fp;
>  	int error;
>  
> -	error = fget(td, fd, &cap_read_rights, &fp);
> +	error = linux_to_native_clockid(&clockid, args->clockid);
>  	if (error != 0)
>  		return (error);
> -	tfd = fp->f_data;
> -	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
> -		error = EINVAL;
> -		goto out;
> -	}
> -
> -	mtx_lock(&tfd->tfd_lock);
> -	linux_timerfd_curval(tfd, ots);
> -	mtx_unlock(&tfd->tfd_lock);
>  
> -out:
> -	fdrop(fp, td);
> -	return (error);
> +	return (kern_timerfd_create(td, clockid, args->flags));
>  }
>  
>  int
> @@ -916,84 +627,14 @@ linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args
>  	struct itimerspec ots;
>  	int error;
>  
> -	error = linux_timerfd_gettime_common(td, args->fd, &ots);
> +	error = kern_timerfd_gettime(td, args->fd, &ots);
>  	if (error != 0)
>  		return (error);
> -	error = native_to_linux_itimerspec(&lots, &ots);
> -	if (error == 0)
> -		error = copyout(&lots, args->old_value, sizeof(lots));
> -	return (error);
> -}
> -
> -#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
> -int
> -linux_timerfd_gettime64(struct thread *td, struct linux_timerfd_gettime64_args *args)
> -{
> -	struct l_itimerspec64 lots;
> -	struct itimerspec ots;
> -	int error;
>  
> -	error = linux_timerfd_gettime_common(td, args->fd, &ots);
> -	if (error != 0)
> -		return (error);
> -	error = native_to_linux_itimerspec64(&lots, &ots);
> +	error = native_to_linux_itimerspec(&lots, &ots);
>  	if (error == 0)
>  		error = copyout(&lots, args->old_value, sizeof(lots));
> -	return (error);
> -}
> -#endif
> -
> -static int
> -linux_timerfd_settime_common(struct thread *td, int fd, int flags,
> -    struct itimerspec *nts, struct itimerspec *oval)
> -{
> -	struct timespec cts, ts;
> -	struct timerfd *tfd;
> -	struct timeval tv;
> -	struct file *fp;
> -	int error;
> -
> -	if ((flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
> -		return (EINVAL);
> -
> -	error = fget(td, fd, &cap_write_rights, &fp);
> -	if (error != 0)
> -		return (error);
> -	tfd = fp->f_data;
> -	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
> -		error = EINVAL;
> -		goto out;
> -	}
> -
> -	mtx_lock(&tfd->tfd_lock);
> -	if (!timespecisset(&nts->it_value))
> -		timespecclear(&nts->it_interval);
> -	if (oval != NULL)
> -		linux_timerfd_curval(tfd, oval);
> -
> -	bcopy(nts, &tfd->tfd_time, sizeof(*nts));
> -	tfd->tfd_count = 0;
> -	if (timespecisset(&nts->it_value)) {
> -		linux_timerfd_clocktime(tfd, &cts);
> -		ts = nts->it_value;
> -		if ((flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
> -			timespecadd(&tfd->tfd_time.it_value, &cts,
> -				&tfd->tfd_time.it_value);
> -		} else {
> -			timespecsub(&ts, &cts, &ts);
> -		}
> -		TIMESPEC_TO_TIMEVAL(&tv, &ts);
> -		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
> -			linux_timerfd_expire, tfd);
> -		tfd->tfd_canceled = false;
> -	} else {
> -		tfd->tfd_canceled = true;
> -		callout_stop(&tfd->tfd_callout);
> -	}
> -	mtx_unlock(&tfd->tfd_lock);
>  
> -out:
> -	fdrop(fp, td);
>  	return (error);
>  }
>  
> @@ -1001,7 +642,7 @@ int
>  linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
>  {
>  	struct l_itimerspec lots;
> -	struct itimerspec nts, ots, *pots;
> +	struct itimerspec nts, ots;
>  	int error;
>  
>  	error = copyin(args->new_value, &lots, sizeof(lots));
> @@ -1010,23 +651,43 @@ linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args
>  	error = linux_to_native_itimerspec(&nts, &lots);
>  	if (error != 0)
>  		return (error);
> -	pots = (args->old_value != NULL ? &ots : NULL);
> -	error = linux_timerfd_settime_common(td, args->fd, args->flags,
> -	    &nts, pots);
> +	if (args->old_value == NULL)
> +		error = kern_timerfd_settime(td, args->fd, args->flags, &nts, NULL);
> +	else
> +		error = kern_timerfd_settime(td, args->fd, args->flags, &nts, &ots);
>  	if (error == 0 && args->old_value != NULL) {
>  		error = native_to_linux_itimerspec(&lots, &ots);
>  		if (error == 0)
>  			error = copyout(&lots, args->old_value, sizeof(lots));
>  	}
> +
>  	return (error);
>  }
>  
>  #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
> +int
> +linux_timerfd_gettime64(struct thread *td, struct linux_timerfd_gettime64_args *args)
> +{
> +	struct l_itimerspec64 lots;
> +	struct itimerspec ots;
> +	int error;
> +
> +	error = kern_timerfd_gettime(td, args->fd, &ots);
> +	if (error != 0)
> +		return (error);
> +
> +	error = native_to_linux_itimerspec64(&lots, &ots);
> +	if (error == 0)
> +		error = copyout(&lots, args->old_value, sizeof(lots));
> +
> +	return (error);
> +}
> +
>  int
>  linux_timerfd_settime64(struct thread *td, struct linux_timerfd_settime64_args *args)
>  {
>  	struct l_itimerspec64 lots;
> -	struct itimerspec nts, ots, *pots;
> +	struct itimerspec nts, ots;
>  	int error;
>  
>  	error = copyin(args->new_value, &lots, sizeof(lots));
> @@ -1035,50 +696,16 @@ linux_timerfd_settime64(struct thread *td, struct linux_timerfd_settime64_args *
>  	error = linux_to_native_itimerspec64(&nts, &lots);
>  	if (error != 0)
>  		return (error);
> -	pots = (args->old_value != NULL ? &ots : NULL);
> -	error = linux_timerfd_settime_common(td, args->fd, args->flags,
> -	    &nts, pots);
> +	if (args->old_value == NULL)
> +		error = kern_timerfd_settime(td, args->fd, args->flags, &nts, NULL);
> +	else
> +		error = kern_timerfd_settime(td, args->fd, args->flags, &nts, &ots);
>  	if (error == 0 && args->old_value != NULL) {
>  		error = native_to_linux_itimerspec64(&lots, &ots);
>  		if (error == 0)
>  			error = copyout(&lots, args->old_value, sizeof(lots));
>  	}
> +
>  	return (error);
>  }
>  #endif
> -
> -static void
> -linux_timerfd_expire(void *arg)
> -{
> -	struct timespec cts, ts;
> -	struct timeval tv;
> -	struct timerfd *tfd;
> -
> -	tfd = (struct timerfd *)arg;
> -
> -	linux_timerfd_clocktime(tfd, &cts);
> -	if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
> -		if (timespecisset(&tfd->tfd_time.it_interval))
> -			timespecadd(&tfd->tfd_time.it_value,
> -				    &tfd->tfd_time.it_interval,
> -				    &tfd->tfd_time.it_value);
> -		else
> -			/* single shot timer */
> -			timespecclear(&tfd->tfd_time.it_value);
> -		if (timespecisset(&tfd->tfd_time.it_value)) {
> -			timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
> -			TIMESPEC_TO_TIMEVAL(&tv, &ts);
> -			callout_reset(&tfd->tfd_callout, tvtohz(&tv),
> -				linux_timerfd_expire, tfd);
> -		}
> -		tfd->tfd_count++;
> -		KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
> -		selwakeup(&tfd->tfd_sel);
> -		wakeup(&tfd->tfd_count);
> -	} else if (timespecisset(&tfd->tfd_time.it_value)) {
> -		timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
> -		TIMESPEC_TO_TIMEVAL(&tv, &ts);
> -		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
> -		    linux_timerfd_expire, tfd);
> -	}
> -}
> diff --git a/sys/compat/linux/linux_event.h b/sys/compat/linux/linux_event.h
> index 32269b0070bc..fa63371b5170 100644
> --- a/sys/compat/linux/linux_event.h
> +++ b/sys/compat/linux/linux_event.h
> @@ -54,15 +54,4 @@
>  
>  #define	LINUX_EFD_SEMAPHORE	(1 << 0)
>  
> -#define	LINUX_TFD_TIMER_ABSTIME	(1 << 0)
> -#define	LINUX_TFD_TIMER_CANCEL_ON_SET	(1 << 1)
> -#define	LINUX_TFD_CLOEXEC	LINUX_O_CLOEXEC
> -#define	LINUX_TFD_NONBLOCK	LINUX_O_NONBLOCK
> -
> -#define	LINUX_TFD_SHARED_FCNTL_FLAGS	(LINUX_TFD_CLOEXEC		\
> -		|LINUX_TFD_NONBLOCK)
> -#define	LINUX_TFD_CREATE_FLAGS	LINUX_TFD_SHARED_FCNTL_FLAGS
> -#define	LINUX_TFD_SETTIME_FLAGS	(LINUX_TFD_TIMER_ABSTIME		\
> -		|LINUX_TFD_TIMER_CANCEL_ON_SET)
> -
>  #endif	/* !_LINUX_EVENT_H_ */
> diff --git a/sys/conf/files b/sys/conf/files
> index 3f79ce752c80..8d38b9cc8a2e 100644
> --- a/sys/conf/files
> +++ b/sys/conf/files
> @@ -3908,6 +3908,7 @@ kern/sys_pipe.c			standard
>  kern/sys_procdesc.c		standard
>  kern/sys_process.c		standard
>  kern/sys_socket.c		standard
> +kern/sys_timerfd.c		standard
>  kern/syscalls.c			standard
>  kern/sysv_ipc.c			standard
>  kern/sysv_msg.c			optional sysvmsg
> diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
> index 1e62c46b8be0..d44fec54fcd7 100644
> --- a/sys/kern/init_sysent.c
> +++ b/sys/kern/init_sysent.c
> @@ -645,4 +645,7 @@ struct sysent sysent[] = {
>  	{ .sy_narg = AS(swapoff_args), .sy_call = (sy_call_t *)sys_swapoff, .sy_auevent = AUE_SWAPOFF, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC },	/* 582 = swapoff */
>  	{ .sy_narg = AS(kqueuex_args), .sy_call = (sy_call_t *)sys_kqueuex, .sy_auevent = AUE_KQUEUE, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },	/* 583 = kqueuex */
>  	{ .sy_narg = AS(membarrier_args), .sy_call = (sy_call_t *)sys_membarrier, .sy_auevent = AUE_NULL, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },	/* 584 = membarrier */
> +	{ .sy_narg = AS(timerfd_create_args), .sy_call = (sy_call_t *)sys_timerfd_create, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },	/* 585 = timerfd_create */
> +	{ .sy_narg = AS(timerfd_gettime_args), .sy_call = (sy_call_t *)sys_timerfd_gettime, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },	/* 586 = timerfd_gettime */
> +	{ .sy_narg = AS(timerfd_settime_args), .sy_call = (sy_call_t *)sys_timerfd_settime, .sy_auevent = AUE_TIMERFD, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC },	/* 587 = timerfd_settime */
>  };
> diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
> index c5226288afc5..35046c856d54 100644
> --- a/sys/kern/kern_descrip.c
> +++ b/sys/kern/kern_descrip.c
> @@ -5001,8 +5001,8 @@ file_type_to_name(short type)
>  		return ("proc");
>  	case DTYPE_EVENTFD:
>  		return ("eventfd");
> -	case DTYPE_LINUXTFD:
> -		return ("ltimer");
> +	case DTYPE_TIMERFD:
> +		return ("timerfd");
>  	default:
>  		return ("unkn");
>  	}
> diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c
> index 170f35830923..26f09cb60260 100644
> --- a/sys/kern/kern_tc.c
> +++ b/sys/kern/kern_tc.c
> @@ -34,6 +34,7 @@
>  #include <sys/systm.h>
>  #include <sys/timeffc.h>
>  #include <sys/timepps.h>
> +#include <sys/timerfd.h>
>  #include <sys/timetc.h>
>  #include <sys/timex.h>
>  #include <sys/vdso.h>
> @@ -1305,6 +1306,7 @@ tc_setclock(struct timespec *ts)
>  
>  	/* Avoid rtc_generation == 0, since td_rtcgen == 0 is special. */
>  	atomic_add_rel_int(&rtc_generation, 2);
> +	timerfd_jumped();
>  	sleepq_chains_remove_matching(sleeping_on_old_rtc);
>  	if (timestepwarnings) {
>  		nanotime(&taft);
> diff --git a/sys/kern/sys_timerfd.c b/sys/kern/sys_timerfd.c
> new file mode 100644
> index 000000000000..6948fa059b8c
> --- /dev/null
> +++ b/sys/kern/sys_timerfd.c
> @@ -0,0 +1,632 @@
> +/*-
> + * SPDX-License-Identifier: BSD-2-Clause
> + *
> + * Copyright (c) 2014 Dmitry Chagin <dchagin@FreeBSD.org>
> + * Copyright (c) 2023 Jake Freeland <jfree@FreeBSD.org>
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + */
> +
> +#include <sys/param.h>
> +#include <sys/systm.h>
> +#include <sys/callout.h>
> +#include <sys/fcntl.h>
> +#include <sys/file.h>
> +#include <sys/filedesc.h>
> +#include <sys/filio.h>
> +#include <sys/kernel.h>
> +#include <sys/lock.h>
> +#include <sys/malloc.h>
> +#include <sys/mount.h>
> +#include <sys/mutex.h>
> +#include <sys/poll.h>
> +#include <sys/proc.h>
> +#include <sys/queue.h>
> +#include <sys/selinfo.h>
> +#include <sys/stat.h>
> +#include <sys/sysctl.h>
> +#include <sys/sysent.h>
> +#include <sys/sysproto.h>
> +#include <sys/timerfd.h>
> +#include <sys/timespec.h>
> +#include <sys/uio.h>
> +#include <sys/user.h>
> +
> +#include <security/audit/audit.h>
> +
> +#ifdef COMPAT_FREEBSD32
> +#include <compat/freebsd32/freebsd32.h>
> +#include <compat/freebsd32/freebsd32_proto.h>
> +#endif
> +
> +static MALLOC_DEFINE(M_TIMERFD, "timerfd", "timerfd structures");
> +static LIST_HEAD(, timerfd) timerfd_head;
> +static struct unrhdr64 tfdino_unr;
> +
> +#define	TFD_NOJUMP	0	/* Realtime clock has not jumped. */
> +#define	TFD_READ	1	/* Jumped, tfd has been read since. */
> +#define	TFD_ZREAD	2	/* Jumped backwards, CANCEL_ON_SET=false. */
> +#define	TFD_CANCELED	4	/* Jumped, CANCEL_ON_SET=true. */
> +#define	TFD_JUMPED	(TFD_ZREAD | TFD_CANCELED)
> +
> +struct timerfd {
> +	/* User specified. */
> +	struct itimerspec tfd_time;	/* tfd timer */
> +	clockid_t	tfd_clockid;	/* timing base */
> +	int		tfd_flags;	/* creation flags */
> +	int		tfd_timflags;	/* timer flags */
> +
> +	/* Used internally. */
> +	timerfd_t	tfd_count;	/* expiration count since last read */
> +	bool		tfd_expired;	/* true upon initial expiration */
> +	struct mtx	tfd_lock;	/* mtx lock */
> +	struct callout	tfd_callout;	/* expiration notification */
> +	struct selinfo	tfd_sel;	/* I/O alerts */
> +	struct timespec	tfd_boottim;	/* cached boottime */
> +	int		tfd_jumped;	/* timer jump status */
> +	LIST_ENTRY(timerfd) entry;	/* entry in list */
> +
> +	/* For stat(2). */
> +	ino_t		tfd_ino;	/* inode number */
> +	struct timespec	tfd_atim;	/* time of last read */
> +	struct timespec	tfd_mtim;	/* time of last settime */
> +	struct timespec tfd_birthtim;	/* creation time */
> +};
> +
> +static void
> +timerfd_init(void *data)
> +{
> +	new_unrhdr64(&tfdino_unr, 1);
> +}
> +
> +SYSINIT(timerfd, SI_SUB_VFS, SI_ORDER_ANY, timerfd_init, NULL);
> +
> +static inline void
> +timerfd_getboottime(struct timespec *ts)
> +{
> +	struct timeval tv;
> +	getboottime(&tv);
> +	TIMEVAL_TO_TIMESPEC(&tv, ts);
> +}
> +
> +/*
> + * Call when a discontinuous jump has occured in CLOCK_REALTIME and
> + * update timerfd's cached boottime. A jump can be triggered using
> + * functions like clock_settime(2) or settimeofday(2).
> + *
> + * Timer is marked TFD_CANCELED if TFD_TIMER_CANCEL_ON_SET is set
> *** 850 LINES SKIPPED ***

I did a very quick look over the added code.

I do not see any protection for the timerfd_head list manipulation.

It is not clear what is protected by tfd->tfd_lock: e.g. in timerfd_stat()
it covers the reading of fields whose writes are not protected by the mtx
(everything except tfd_atim).
There are no annotations in the timer structure for the locking regime.

stat st_ctim is always zero, which is somewhat strange.

The
	tfd = fp->f_data;
	if (tfd == NULL || fp->f_type != DTYPE_TIMERFD) {
triggers UB when f_type is not DTYPE_TIMERFD.

The compat32 stuff was put into sys/kern instead of sys/compat/freebsd32.
sys/timerfd.h pollutes userspace with sys/proc.h.

Why were the regenerated files put in the same commit as the (probably)
human-written files?