git: 7a202823aa54 - main - Expose eventfd in the native API/ABI using a new __specialfd syscall
Konstantin Belousov
kib at FreeBSD.org
Sun Dec 27 10:57:39 UTC 2020
The branch main has been updated by kib:
URL: https://cgit.FreeBSD.org/src/commit/?id=7a202823aa54ba18c485bdbcf355269bcfee1ab9
commit 7a202823aa54ba18c485bdbcf355269bcfee1ab9
Author: Konstantin Belousov <kib at FreeBSD.org>
AuthorDate: 2020-12-23 14:14:04 +0000
Commit: Konstantin Belousov <kib at FreeBSD.org>
CommitDate: 2020-12-27 10:57:26 +0000
Expose eventfd in the native API/ABI using a new __specialfd syscall
eventfd is a Linux system call that produces special file descriptors
for event notification. When porting Linux software, it is currently
usually emulated by epoll-shim on top of kqueues. Unfortunately, kqueues
are not passable between processes. And, as noted by the author of
epoll-shim, even if they were, the library state would also have to be
passed somehow. This came up when debugging strange HW video decode
failures in Firefox. A native implementation would avoid these problems
and help with porting Linux software.
Since we now already have an eventfd implementation in the kernel (for
the Linuxulator), it's pretty easy to expose it natively, which is what
this patch does.
Submitted by: greg at unrelenting.technology
Reviewed by: markj (previous version)
MFC after: 2 weeks
Differential Revision: https://reviews.freebsd.org/D26668
---
sys/bsm/audit_kevents.h | 1 +
sys/compat/freebsd32/syscalls.master | 2 +
sys/compat/linux/linux_event.c | 368 ++++-------------------------------
sys/conf/files | 1 +
sys/kern/capabilities.conf | 5 +
sys/kern/kern_descrip.c | 4 +-
sys/kern/sys_eventfd.c | 349 +++++++++++++++++++++++++++++++++
sys/kern/sys_generic.c | 63 ++++++
sys/kern/syscalls.master | 7 +
sys/sys/eventfd.h | 54 +++++
sys/sys/file.h | 2 +-
sys/sys/specialfd.h | 42 ++++
sys/sys/syscallsubr.h | 1 +
sys/sys/user.h | 5 +
14 files changed, 574 insertions(+), 330 deletions(-)
diff --git a/sys/bsm/audit_kevents.h b/sys/bsm/audit_kevents.h
index 150ecc1b49ac..5b37329078a1 100644
--- a/sys/bsm/audit_kevents.h
+++ b/sys/bsm/audit_kevents.h
@@ -659,6 +659,7 @@
#define AUE_SHMRENAME 43263 /* FreeBSD-specific. */
#define AUE_REALPATHAT 43264 /* FreeBSD-specific. */
#define AUE_CLOSERANGE 43265 /* FreeBSD-specific. */
+#define AUE_SPECIALFD 43266 /* FreeBSD-specific. */
/*
* Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the
diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master
index 1e3d26d727a1..f4339795781a 100644
--- a/sys/compat/freebsd32/syscalls.master
+++ b/sys/compat/freebsd32/syscalls.master
@@ -1168,5 +1168,7 @@
; 576 is initialised by the krpc code, if present.
576 AUE_NULL NOSTD|NOPROTO { int rpctls_syscall(int op, \
const char *path); }
+577 AUE_SPECIALFD NOPROTO { int __specialfd(int type, const void *req, \
+ size_t len); }
; vim: syntax=off
diff --git a/sys/compat/linux/linux_event.c b/sys/compat/linux/linux_event.c
index c67d62d8aff0..b4b4be1f7b49 100644
--- a/sys/compat/linux/linux_event.c
+++ b/sys/compat/linux/linux_event.c
@@ -51,9 +51,11 @@ __FBSDID("$FreeBSD$");
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/selinfo.h>
+#include <sys/specialfd.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/timespec.h>
+#include <sys/eventfd.h>
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
@@ -124,53 +126,11 @@ struct epoll_copyout_args {
int error;
};
-/* eventfd */
-typedef uint64_t eventfd_t;
-
-static fo_rdwr_t eventfd_read;
-static fo_rdwr_t eventfd_write;
-static fo_ioctl_t eventfd_ioctl;
-static fo_poll_t eventfd_poll;
-static fo_kqfilter_t eventfd_kqfilter;
-static fo_stat_t eventfd_stat;
-static fo_close_t eventfd_close;
-static fo_fill_kinfo_t eventfd_fill_kinfo;
-
-static struct fileops eventfdops = {
- .fo_read = eventfd_read,
- .fo_write = eventfd_write,
- .fo_truncate = invfo_truncate,
- .fo_ioctl = eventfd_ioctl,
- .fo_poll = eventfd_poll,
- .fo_kqfilter = eventfd_kqfilter,
- .fo_stat = eventfd_stat,
- .fo_close = eventfd_close,
- .fo_chmod = invfo_chmod,
- .fo_chown = invfo_chown,
- .fo_sendfile = invfo_sendfile,
- .fo_fill_kinfo = eventfd_fill_kinfo,
- .fo_flags = DFLAG_PASSABLE
-};
-
-static void filt_eventfddetach(struct knote *kn);
-static int filt_eventfdread(struct knote *kn, long hint);
-static int filt_eventfdwrite(struct knote *kn, long hint);
-
-static struct filterops eventfd_rfiltops = {
- .f_isfd = 1,
- .f_detach = filt_eventfddetach,
- .f_event = filt_eventfdread
-};
-static struct filterops eventfd_wfiltops = {
- .f_isfd = 1,
- .f_detach = filt_eventfddetach,
- .f_event = filt_eventfdwrite
-};
-
/* timerfd */
typedef uint64_t timerfd_t;
static fo_rdwr_t timerfd_read;
+static fo_ioctl_t timerfd_ioctl;
static fo_poll_t timerfd_poll;
static fo_kqfilter_t timerfd_kqfilter;
static fo_stat_t timerfd_stat;
@@ -181,7 +141,7 @@ static struct fileops timerfdops = {
.fo_read = timerfd_read,
.fo_write = invfo_rdwr,
.fo_truncate = invfo_truncate,
- .fo_ioctl = eventfd_ioctl,
+ .fo_ioctl = timerfd_ioctl,
.fo_poll = timerfd_poll,
.fo_kqfilter = timerfd_kqfilter,
.fo_stat = timerfd_stat,
@@ -202,13 +162,6 @@ static struct filterops timerfd_rfiltops = {
.f_event = filt_timerfdread
};
-struct eventfd {
- eventfd_t efd_count;
- uint32_t efd_flags;
- struct selinfo efd_sel;
- struct mtx efd_lock;
-};
-
struct timerfd {
clockid_t tfd_clockid;
struct itimerspec tfd_time;
@@ -219,7 +172,6 @@ struct timerfd {
struct mtx tfd_lock;
};
-static int eventfd_create(struct thread *td, uint32_t initval, int flags);
static void linux_timerfd_expire(void *);
static void linux_timerfd_curval(struct timerfd *, struct itimerspec *);
@@ -691,294 +643,39 @@ epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
return (error1 == 0 ? 0 : error2);
}
-static int
-eventfd_create(struct thread *td, uint32_t initval, int flags)
-{
- struct filedesc *fdp;
- struct eventfd *efd;
- struct file *fp;
- int fflags, fd, error;
-
- fflags = 0;
- if ((flags & LINUX_O_CLOEXEC) != 0)
- fflags |= O_CLOEXEC;
-
- fdp = td->td_proc->p_fd;
- error = falloc(td, &fp, &fd, fflags);
- if (error != 0)
- return (error);
-
- efd = malloc(sizeof(*efd), M_EPOLL, M_WAITOK | M_ZERO);
- efd->efd_flags = flags;
- efd->efd_count = initval;
- mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF);
-
- knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock);
-
- fflags = FREAD | FWRITE;
- if ((flags & LINUX_O_NONBLOCK) != 0)
- fflags |= FNONBLOCK;
-
- finit(fp, fflags, DTYPE_LINUXEFD, efd, &eventfdops);
- fdrop(fp, td);
-
- td->td_retval[0] = fd;
- return (error);
-}
-
#ifdef LINUX_LEGACY_SYSCALLS
int
linux_eventfd(struct thread *td, struct linux_eventfd_args *args)
{
+ struct specialfd_eventfd ae;
- return (eventfd_create(td, args->initval, 0));
+ bzero(&ae, sizeof(ae));
+ ae.initval = args->initval;
+ return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
}
#endif
int
linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
{
+ struct specialfd_eventfd ae;
+ int flags;
- if ((args->flags & ~(LINUX_O_CLOEXEC|LINUX_O_NONBLOCK|LINUX_EFD_SEMAPHORE)) != 0)
- return (EINVAL);
-
- return (eventfd_create(td, args->initval, args->flags));
-}
-
-static int
-eventfd_close(struct file *fp, struct thread *td)
-{
- struct eventfd *efd;
-
- efd = fp->f_data;
- if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
- return (EINVAL);
-
- seldrain(&efd->efd_sel);
- knlist_destroy(&efd->efd_sel.si_note);
-
- fp->f_ops = &badfileops;
- mtx_destroy(&efd->efd_lock);
- free(efd, M_EPOLL);
-
- return (0);
-}
-
-static int
-eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
- int flags, struct thread *td)
-{
- struct eventfd *efd;
- eventfd_t count;
- int error;
-
- efd = fp->f_data;
- if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
- return (EINVAL);
-
- if (uio->uio_resid < sizeof(eventfd_t))
- return (EINVAL);
-
- error = 0;
- mtx_lock(&efd->efd_lock);
-retry:
- if (efd->efd_count == 0) {
- if ((fp->f_flag & FNONBLOCK) != 0) {
- mtx_unlock(&efd->efd_lock);
- return (EAGAIN);
- }
- error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "lefdrd", 0);
- if (error == 0)
- goto retry;
- }
- if (error == 0) {
- if ((efd->efd_flags & LINUX_EFD_SEMAPHORE) != 0) {
- count = 1;
- --efd->efd_count;
- } else {
- count = efd->efd_count;
- efd->efd_count = 0;
- }
- KNOTE_LOCKED(&efd->efd_sel.si_note, 0);
- selwakeup(&efd->efd_sel);
- wakeup(&efd->efd_count);
- mtx_unlock(&efd->efd_lock);
- error = uiomove(&count, sizeof(eventfd_t), uio);
- } else
- mtx_unlock(&efd->efd_lock);
-
- return (error);
-}
-
-static int
-eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
- int flags, struct thread *td)
-{
- struct eventfd *efd;
- eventfd_t count;
- int error;
-
- efd = fp->f_data;
- if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
- return (EINVAL);
-
- if (uio->uio_resid < sizeof(eventfd_t))
- return (EINVAL);
-
- error = uiomove(&count, sizeof(eventfd_t), uio);
- if (error != 0)
- return (error);
- if (count == UINT64_MAX)
- return (EINVAL);
-
- mtx_lock(&efd->efd_lock);
-retry:
- if (UINT64_MAX - efd->efd_count <= count) {
- if ((fp->f_flag & FNONBLOCK) != 0) {
- mtx_unlock(&efd->efd_lock);
- /* Do not not return the number of bytes written */
- uio->uio_resid += sizeof(eventfd_t);
- return (EAGAIN);
- }
- error = mtx_sleep(&efd->efd_count, &efd->efd_lock,
- PCATCH, "lefdwr", 0);
- if (error == 0)
- goto retry;
- }
- if (error == 0) {
- efd->efd_count += count;
- KNOTE_LOCKED(&efd->efd_sel.si_note, 0);
- selwakeup(&efd->efd_sel);
- wakeup(&efd->efd_count);
- }
- mtx_unlock(&efd->efd_lock);
-
- return (error);
-}
-
-static int
-eventfd_poll(struct file *fp, int events, struct ucred *active_cred,
- struct thread *td)
-{
- struct eventfd *efd;
- int revents = 0;
-
- efd = fp->f_data;
- if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
- return (POLLERR);
-
- mtx_lock(&efd->efd_lock);
- if ((events & (POLLIN|POLLRDNORM)) && efd->efd_count > 0)
- revents |= events & (POLLIN|POLLRDNORM);
- if ((events & (POLLOUT|POLLWRNORM)) && UINT64_MAX - 1 > efd->efd_count)
- revents |= events & (POLLOUT|POLLWRNORM);
- if (revents == 0)
- selrecord(td, &efd->efd_sel);
- mtx_unlock(&efd->efd_lock);
-
- return (revents);
-}
-
-static int
-eventfd_kqfilter(struct file *fp, struct knote *kn)
-{
- struct eventfd *efd;
-
- efd = fp->f_data;
- if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
- return (EINVAL);
-
- mtx_lock(&efd->efd_lock);
- switch (kn->kn_filter) {
- case EVFILT_READ:
- kn->kn_fop = &eventfd_rfiltops;
- break;
- case EVFILT_WRITE:
- kn->kn_fop = &eventfd_wfiltops;
- break;
- default:
- mtx_unlock(&efd->efd_lock);
- return (EINVAL);
- }
-
- kn->kn_hook = efd;
- knlist_add(&efd->efd_sel.si_note, kn, 1);
- mtx_unlock(&efd->efd_lock);
-
- return (0);
-}
-
-static void
-filt_eventfddetach(struct knote *kn)
-{
- struct eventfd *efd = kn->kn_hook;
-
- mtx_lock(&efd->efd_lock);
- knlist_remove(&efd->efd_sel.si_note, kn, 1);
- mtx_unlock(&efd->efd_lock);
-}
-
-static int
-filt_eventfdread(struct knote *kn, long hint)
-{
- struct eventfd *efd = kn->kn_hook;
- int ret;
-
- mtx_assert(&efd->efd_lock, MA_OWNED);
- ret = (efd->efd_count > 0);
-
- return (ret);
-}
-
-static int
-filt_eventfdwrite(struct knote *kn, long hint)
-{
- struct eventfd *efd = kn->kn_hook;
- int ret;
-
- mtx_assert(&efd->efd_lock, MA_OWNED);
- ret = (UINT64_MAX - 1 > efd->efd_count);
-
- return (ret);
-}
-
-static int
-eventfd_ioctl(struct file *fp, u_long cmd, void *data,
- struct ucred *active_cred, struct thread *td)
-{
-
- if (fp->f_data == NULL || (fp->f_type != DTYPE_LINUXEFD &&
- fp->f_type != DTYPE_LINUXTFD))
+ if ((args->flags & ~(LINUX_O_CLOEXEC | LINUX_O_NONBLOCK |
+ LINUX_EFD_SEMAPHORE)) != 0)
return (EINVAL);
-
- switch (cmd)
- {
- case FIONBIO:
- if ((*(int *)data))
- atomic_set_int(&fp->f_flag, FNONBLOCK);
- else
- atomic_clear_int(&fp->f_flag, FNONBLOCK);
- case FIOASYNC:
- return (0);
- default:
- return (ENXIO);
- }
-}
-
-static int
-eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
- struct thread *td)
-{
-
- return (ENXIO);
-}
-
-static int
-eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
-{
-
- kif->kf_type = KF_TYPE_UNKNOWN;
- return (0);
+ flags = 0;
+ if ((args->flags & LINUX_O_CLOEXEC) != 0)
+ flags |= EFD_CLOEXEC;
+ if ((args->flags & LINUX_O_NONBLOCK) != 0)
+ flags |= EFD_NONBLOCK;
+ if ((args->flags & LINUX_EFD_SEMAPHORE) != 0)
+ flags |= EFD_SEMAPHORE;
+
+ bzero(&ae, sizeof(ae));
+ ae.flags = flags;
+ ae.initval = args->initval;
+ return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
}
int
@@ -1154,6 +851,23 @@ filt_timerfdread(struct knote *kn, long hint)
return (tfd->tfd_count > 0);
}
+static int
+timerfd_ioctl(struct file *fp, u_long cmd, void *data,
+ struct ucred *active_cred, struct thread *td)
+{
+
+ if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD)
+ return (EINVAL);
+
+ switch (cmd) {
+ case FIONBIO:
+ case FIOASYNC:
+ return (0);
+ }
+
+ return (ENOTTY);
+}
+
static int
timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
struct thread *td)
diff --git a/sys/conf/files b/sys/conf/files
index 8e30ae1eded1..0258fca24836 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3926,6 +3926,7 @@ kern/subr_unit.c standard
kern/subr_vmem.c standard
kern/subr_witness.c optional witness
kern/sys_capability.c standard
+kern/sys_eventfd.c standard
kern/sys_generic.c standard
kern/sys_getrandom.c standard
kern/sys_pipe.c standard
diff --git a/sys/kern/capabilities.conf b/sys/kern/capabilities.conf
index 09d09515c816..3d552255d823 100644
--- a/sys/kern/capabilities.conf
+++ b/sys/kern/capabilities.conf
@@ -55,6 +55,11 @@ __mac_get_proc
__mac_set_fd
__mac_set_proc
+##
+## Allow creating special file descriptors like eventfd(2).
+##
+__specialfd
+
##
## Allow sysctl(2) as we scope internal to the call; this is a global
## namespace, but there are several critical sysctls required for almost
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 3c9664c69da6..a510ad90a618 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -4609,8 +4609,8 @@ file_type_to_name(short type)
return ("dev");
case DTYPE_PROCDESC:
return ("proc");
- case DTYPE_LINUXEFD:
- return ("levent");
+ case DTYPE_EVENTFD:
+ return ("eventfd");
case DTYPE_LINUXTFD:
return ("ltimer");
default:
diff --git a/sys/kern/sys_eventfd.c b/sys/kern/sys_eventfd.c
new file mode 100644
index 000000000000..3fdb4afc7850
--- /dev/null
+++ b/sys/kern/sys_eventfd.c
@@ -0,0 +1,349 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2007 Roman Divacky
+ * Copyright (c) 2014 Dmitry Chagin
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/types.h>
+#include <sys/user.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#include <sys/stat.h>
+#include <sys/errno.h>
+#include <sys/event.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/uio.h>
+#include <sys/selinfo.h>
+#include <sys/eventfd.h>
+
+#include <security/audit/audit.h>
+
+_Static_assert(EFD_CLOEXEC == O_CLOEXEC, "Mismatched EFD_CLOEXEC");
+_Static_assert(EFD_NONBLOCK == O_NONBLOCK, "Mismatched EFD_NONBLOCK");
+
+MALLOC_DEFINE(M_EVENTFD, "eventfd", "eventfd structures");
+
+static fo_rdwr_t eventfd_read;
+static fo_rdwr_t eventfd_write;
+static fo_ioctl_t eventfd_ioctl;
+static fo_poll_t eventfd_poll;
+static fo_kqfilter_t eventfd_kqfilter;
+static fo_stat_t eventfd_stat;
+static fo_close_t eventfd_close;
+static fo_fill_kinfo_t eventfd_fill_kinfo;
+
+static struct fileops eventfdops = {
+ .fo_read = eventfd_read,
+ .fo_write = eventfd_write,
+ .fo_truncate = invfo_truncate,
+ .fo_ioctl = eventfd_ioctl,
+ .fo_poll = eventfd_poll,
+ .fo_kqfilter = eventfd_kqfilter,
+ .fo_stat = eventfd_stat,
+ .fo_close = eventfd_close,
+ .fo_chmod = invfo_chmod,
+ .fo_chown = invfo_chown,
+ .fo_sendfile = invfo_sendfile,
+ .fo_fill_kinfo = eventfd_fill_kinfo,
+ .fo_flags = DFLAG_PASSABLE
+};
+
+static void filt_eventfddetach(struct knote *kn);
+static int filt_eventfdread(struct knote *kn, long hint);
+static int filt_eventfdwrite(struct knote *kn, long hint);
+
+static struct filterops eventfd_rfiltops = {
+ .f_isfd = 1,
+ .f_detach = filt_eventfddetach,
+ .f_event = filt_eventfdread
+};
+
+static struct filterops eventfd_wfiltops = {
+ .f_isfd = 1,
+ .f_detach = filt_eventfddetach,
+ .f_event = filt_eventfdwrite
+};
+
+struct eventfd {
+ eventfd_t efd_count;
+ uint32_t efd_flags;
+ struct selinfo efd_sel;
+ struct mtx efd_lock;
+};
+
+int
+eventfd_create_file(struct thread *td, struct file *fp, uint32_t initval,
+ int flags)
+{
+ struct eventfd *efd;
+ int fflags;
+
+ AUDIT_ARG_FFLAGS(flags);
+ AUDIT_ARG_VALUE(initval);
+
+ efd = malloc(sizeof(*efd), M_EVENTFD, M_WAITOK | M_ZERO);
+ efd->efd_flags = flags;
+ efd->efd_count = initval;
+ mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF);
+ knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock);
+
+ fflags = FREAD | FWRITE;
+ if ((flags & EFD_NONBLOCK) != 0)
+ fflags |= FNONBLOCK;
+ finit(fp, fflags, DTYPE_EVENTFD, efd, &eventfdops);
+
+ return (0);
+}
+
+static int
+eventfd_close(struct file *fp, struct thread *td)
+{
+ struct eventfd *efd;
+
+ efd = fp->f_data;
+ seldrain(&efd->efd_sel);
+ knlist_destroy(&efd->efd_sel.si_note);
+ mtx_destroy(&efd->efd_lock);
+ free(efd, M_EVENTFD);
+ return (0);
+}
+
+static int
+eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ struct eventfd *efd;
+ eventfd_t count;
+ int error;
+
+ if (uio->uio_resid < sizeof(eventfd_t))
+ return (EINVAL);
+
+ error = 0;
+ efd = fp->f_data;
+ mtx_lock(&efd->efd_lock);
+ while (error == 0 && efd->efd_count == 0) {
+ if ((fp->f_flag & FNONBLOCK) != 0) {
+ mtx_unlock(&efd->efd_lock);
+ return (EAGAIN);
+ }
+ error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH,
+ "efdrd", 0);
+ }
+ if (error == 0) {
+ MPASS(efd->efd_count > 0);
+ if ((efd->efd_flags & EFD_SEMAPHORE) != 0) {
+ count = 1;
+ --efd->efd_count;
+ } else {
+ count = efd->efd_count;
+ efd->efd_count = 0;
+ }
+ KNOTE_LOCKED(&efd->efd_sel.si_note, 0);
+ selwakeup(&efd->efd_sel);
+ wakeup(&efd->efd_count);
+ mtx_unlock(&efd->efd_lock);
+ error = uiomove(&count, sizeof(eventfd_t), uio);
+ } else
+ mtx_unlock(&efd->efd_lock);
+
+ return (error);
+}
+
+static int
+eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ struct eventfd *efd;
+ eventfd_t count;
+ int error;
+
+ if (uio->uio_resid < sizeof(eventfd_t))
+ return (EINVAL);
+
+ error = uiomove(&count, sizeof(eventfd_t), uio);
+ if (error != 0)
+ return (error);
+ if (count == UINT64_MAX)
+ return (EINVAL);
+
+ efd = fp->f_data;
+ mtx_lock(&efd->efd_lock);
+retry:
+ if (UINT64_MAX - efd->efd_count <= count) {
+ if ((fp->f_flag & FNONBLOCK) != 0) {
+ mtx_unlock(&efd->efd_lock);
+ /* Do not not return the number of bytes written */
+ uio->uio_resid += sizeof(eventfd_t);
+ return (EAGAIN);
+ }
+ error = mtx_sleep(&efd->efd_count, &efd->efd_lock,
+ PCATCH, "efdwr", 0);
+ if (error == 0)
+ goto retry;
+ }
+ if (error == 0) {
+ MPASS(UINT64_MAX - efd->efd_count > count);
+ efd->efd_count += count;
+ KNOTE_LOCKED(&efd->efd_sel.si_note, 0);
+ selwakeup(&efd->efd_sel);
+ wakeup(&efd->efd_count);
+ }
+ mtx_unlock(&efd->efd_lock);
+
+ return (error);
+}
+
+static int
+eventfd_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct eventfd *efd;
+ int revents;
+
+ efd = fp->f_data;
+ revents = 0;
+ mtx_lock(&efd->efd_lock);
+ if ((events & (POLLIN | POLLRDNORM)) != 0 && efd->efd_count > 0)
+ revents |= events & (POLLIN | POLLRDNORM);
+ if ((events & (POLLOUT | POLLWRNORM)) != 0 && UINT64_MAX - 1 >
+ efd->efd_count)
+ revents |= events & (POLLOUT | POLLWRNORM);
+ if (revents == 0)
+ selrecord(td, &efd->efd_sel);
+ mtx_unlock(&efd->efd_lock);
+
+ return (revents);
+}
+
+static int
+eventfd_kqfilter(struct file *fp, struct knote *kn)
+{
+ struct eventfd *efd = fp->f_data;
+
+ mtx_lock(&efd->efd_lock);
+ switch (kn->kn_filter) {
+ case EVFILT_READ:
+ kn->kn_fop = &eventfd_rfiltops;
+ break;
+ case EVFILT_WRITE:
+ kn->kn_fop = &eventfd_wfiltops;
+ break;
+ default:
+ mtx_unlock(&efd->efd_lock);
+ return (EINVAL);
+ }
+
+ kn->kn_hook = efd;
+ knlist_add(&efd->efd_sel.si_note, kn, 1);
+ mtx_unlock(&efd->efd_lock);
+
+ return (0);
+}
+
+static void
+filt_eventfddetach(struct knote *kn)
+{
+ struct eventfd *efd = kn->kn_hook;
+
+ mtx_lock(&efd->efd_lock);
+ knlist_remove(&efd->efd_sel.si_note, kn, 1);
+ mtx_unlock(&efd->efd_lock);
+}
+
+static int
+filt_eventfdread(struct knote *kn, long hint)
+{
+ struct eventfd *efd = kn->kn_hook;
+ int ret;
+
+ mtx_assert(&efd->efd_lock, MA_OWNED);
+ kn->kn_data = (int64_t)efd->efd_count;
+ ret = efd->efd_count > 0;
+
+ return (ret);
+}
+
+static int
+filt_eventfdwrite(struct knote *kn, long hint)
+{
+ struct eventfd *efd = kn->kn_hook;
+ int ret;
+
+ mtx_assert(&efd->efd_lock, MA_OWNED);
+ kn->kn_data = (int64_t)(UINT64_MAX - 1 - efd->efd_count);
+ ret = UINT64_MAX - 1 > efd->efd_count;
+
+ return (ret);
+}
+
+static int
+eventfd_ioctl(struct file *fp, u_long cmd, void *data,
+ struct ucred *active_cred, struct thread *td)
+{
+ switch (cmd) {
+ case FIONBIO:
+ case FIOASYNC:
+ return (0);
+ }
+
+ return (ENOTTY);
+}
+
+static int
+eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
+ struct thread *td)
+{
+ bzero((void *)st, sizeof *st);
+ st->st_mode = S_IFIFO;
+ return (0);
+}
+
+static int
+eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
+{
+ struct eventfd *efd = fp->f_data;
+
+ kif->kf_type = KF_TYPE_EVENTFD;
+ mtx_lock(&efd->efd_lock);
+ kif->kf_un.kf_eventfd.kf_eventfd_value = efd->efd_count;
+ kif->kf_un.kf_eventfd.kf_eventfd_flags = efd->efd_flags;
+ mtx_unlock(&efd->efd_lock);
+ return (0);
+}
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index fa9436d1f9f9..a055f4a9b597 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$");
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
+#include <sys/eventfd.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
@@ -63,6 +64,7 @@ __FBSDID("$FreeBSD$");
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/sleepqueue.h>
+#include <sys/specialfd.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
@@ -859,6 +861,67 @@ kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
return (error);
}
+int
+kern_specialfd(struct thread *td, int type, void *arg)
+{
+ struct file *fp;
+ struct specialfd_eventfd *ae;
+ int error, fd, fflags;
+
+ fflags = 0;
+ error = falloc_noinstall(td, &fp);
+ if (error != 0)
+ return (error);
+
+ switch (type) {
+ case SPECIALFD_EVENTFD:
+ ae = arg;
+ if ((ae->flags & EFD_CLOEXEC) != 0)
+ fflags |= O_CLOEXEC;
+ error = eventfd_create_file(td, fp, ae->initval, ae->flags);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ if (error == 0)
+ error = finstall(td, fp, &fd, fflags, NULL);
+ fdrop(fp, td);
+ if (error == 0)
+ td->td_retval[0] = fd;
+ return (error);
+}
+
+int
+sys___specialfd(struct thread *td, struct __specialfd_args *args)
+{
+ struct specialfd_eventfd ae;
+ int error;
+
+ switch (args->type) {
+ case SPECIALFD_EVENTFD:
+ if (args->len != sizeof(struct specialfd_eventfd)) {
+ error = EINVAL;
+ break;
+ }
+ error = copyin(args->req, &ae, sizeof(ae));
+ if (error != 0)
+ break;
+ if ((ae.flags & ~(EFD_CLOEXEC | EFD_NONBLOCK |
+ EFD_SEMAPHORE)) != 0) {
+ error = EINVAL;
+ break;
+ }
+ error = kern_specialfd(td, args->type, &ae);
+ break;
*** 184 LINES SKIPPED ***
More information about the dev-commits-src-main
mailing list