git: b2deba043c0c - stable/13 - linux(4): Rework Linux ppoll system call.

From: Dmitry Chagin <dchagin_at_FreeBSD.org>
Date: Fri, 17 Jun 2022 19:32:01 UTC
The branch stable/13 has been updated by dchagin:

URL: https://cgit.FreeBSD.org/src/commit/?id=b2deba043c0ce5e89db934ff81dad753535eafa8

commit b2deba043c0ce5e89db934ff81dad753535eafa8
Author:     Dmitry Chagin <dchagin@FreeBSD.org>
AuthorDate: 2021-06-22 05:06:05 +0000
Commit:     Dmitry Chagin <dchagin@FreeBSD.org>
CommitDate: 2022-06-17 19:30:19 +0000

    linux(4): Rework Linux ppoll system call.
    
    For now the Linux emulation layer uses the in-kernel ppoll(2) without
    converting the user-supplied fd 'events', and does not convert the
    kernel-supplied fd 'revents'.
    
    At least POLLRDHUP is handled by FreeBSD differently than by
    Linux. It seems that Linux silently ignores POLLRDHUP on non-socket
    fds, unlike FreeBSD, which checks more strictly and fails.
    
    Rework the Linux ppoll, using kern_poll and converting 'events'
    and 'revents' values.
    While here, move the poll event defines to the MI part of the code,
    as they are mostly identical on all arches except arm.
    
    Differential Revision:  https://reviews.freebsd.org/D30716
    MFC after:              2 weeks
    
    (cherry picked from commit 26795a0378b58c3e26b68577a4cc446ab527e8b5)
---
 sys/amd64/linux/linux.h         | 21 ---------
 sys/amd64/linux32/linux.h       | 21 ---------
 sys/compat/linux/linux.c        | 96 +++++++++++++++++++++++++++++++++++++++++
 sys/compat/linux/linux.h        | 17 ++++++++
 sys/compat/linux/linux_common.h |  4 ++
 sys/compat/linux/linux_misc.c   | 68 ++++++++++++++++++++++++++++-
 sys/i386/linux/linux.h          | 21 ---------
 7 files changed, 184 insertions(+), 64 deletions(-)

diff --git a/sys/amd64/linux/linux.h b/sys/amd64/linux/linux.h
index a9ed66689b64..4e736cc11c22 100644
--- a/sys/amd64/linux/linux.h
+++ b/sys/amd64/linux/linux.h
@@ -407,27 +407,6 @@ struct l_ifconf {
 #define	ifc_buf		ifc_ifcu.ifcu_buf
 #define	ifc_req		ifc_ifcu.ifcu_req
 
-/*
- * poll()
- */
-#define	LINUX_POLLIN		0x0001
-#define	LINUX_POLLPRI		0x0002
-#define	LINUX_POLLOUT		0x0004
-#define	LINUX_POLLERR		0x0008
-#define	LINUX_POLLHUP		0x0010
-#define	LINUX_POLLNVAL		0x0020
-#define	LINUX_POLLRDNORM	0x0040
-#define	LINUX_POLLRDBAND	0x0080
-#define	LINUX_POLLWRNORM	0x0100
-#define	LINUX_POLLWRBAND	0x0200
-#define	LINUX_POLLMSG		0x0400
-
-struct l_pollfd {
-	l_int		fd;
-	l_short		events;
-	l_short		revents;
-};
-
 #define LINUX_ARCH_SET_GS		0x1001
 #define LINUX_ARCH_SET_FS		0x1002
 #define LINUX_ARCH_GET_FS		0x1003
diff --git a/sys/amd64/linux32/linux.h b/sys/amd64/linux32/linux.h
index 50a4efed1709..a95545619640 100644
--- a/sys/amd64/linux32/linux.h
+++ b/sys/amd64/linux32/linux.h
@@ -515,27 +515,6 @@ struct l_ifconf {
 #define	ifc_buf		ifc_ifcu.ifcu_buf
 #define	ifc_req		ifc_ifcu.ifcu_req
 
-/*
- * poll()
- */
-#define	LINUX_POLLIN		0x0001
-#define	LINUX_POLLPRI		0x0002
-#define	LINUX_POLLOUT		0x0004
-#define	LINUX_POLLERR		0x0008
-#define	LINUX_POLLHUP		0x0010
-#define	LINUX_POLLNVAL		0x0020
-#define	LINUX_POLLRDNORM	0x0040
-#define	LINUX_POLLRDBAND	0x0080
-#define	LINUX_POLLWRNORM	0x0100
-#define	LINUX_POLLWRBAND	0x0200
-#define	LINUX_POLLMSG		0x0400
-
-struct l_pollfd {
-	l_int		fd;
-	l_short		events;
-	l_short		revents;
-};
-
 struct l_user_desc {
 	l_uint		entry_number;
 	l_uint		base_addr;
diff --git a/sys/compat/linux/linux.c b/sys/compat/linux/linux.c
index a8c5e2baddc4..350d2c1abaf9 100644
--- a/sys/compat/linux/linux.c
+++ b/sys/compat/linux/linux.c
@@ -33,9 +33,13 @@ __FBSDID("$FreeBSD$");
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/ctype.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
 #include <sys/jail.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
@@ -50,6 +54,7 @@ __FBSDID("$FreeBSD$");
 
 #include <compat/linux/linux.h>
 #include <compat/linux/linux_common.h>
+#include <compat/linux/linux_mib.h>
 #include <compat/linux/linux_util.h>
 
 struct futex_list futex_list;
@@ -627,3 +632,94 @@ linux_to_bsd_bits_(int value, struct bsd_to_linux_bitmap *bitmap,
 		return (no_value);
 	return (bsd_ret);
 }
+
+void	/* Translate Linux poll events (lev) into FreeBSD ones (*bev). */
+linux_to_bsd_poll_events(struct thread *td, int fd, short lev,
+    short *bev)
+{
+	struct proc *p = td->td_proc;
+	struct filedesc *fdp;
+	struct file *fp;
+	int error;
+	short bits = 0;	/* FreeBSD event bits collected for *bev */
+
+	if (lev & LINUX_POLLIN)
+		bits |= POLLIN;
+	if (lev & LINUX_POLLPRI)
+		bits |=	POLLPRI;
+	if (lev & LINUX_POLLOUT)
+		bits |= POLLOUT;
+	if (lev & LINUX_POLLERR)
+		bits |= POLLERR;
+	if (lev & LINUX_POLLHUP)
+		bits |= POLLHUP;
+	if (lev & LINUX_POLLNVAL)
+		bits |= POLLNVAL;
+	if (lev & LINUX_POLLRDNORM)
+		bits |= POLLRDNORM;
+	if (lev & LINUX_POLLRDBAND)
+		bits |= POLLRDBAND;
+	if (lev & LINUX_POLLWRBAND)
+		bits |= POLLWRBAND;
+	if (lev & LINUX_POLLWRNORM)
+		bits |= POLLWRNORM;
+
+	if (lev & LINUX_POLLRDHUP) {
+		/*
+		 * It seems that Linux silently ignores POLLRDHUP on
+		 * non-socket file descriptors, unlike FreeBSD, where
+		 * event bits are checked more strictly (POLLSTANDARD).
+		 */
+		fdp = p->p_fd;
+		error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
+		if (error == 0) {	/* on failure POLLRDHUP is just dropped */
+			/*
+			 * XXX. On FreeBSD POLLRDHUP applies only to stream
+			 * sockets, but only the file type is checked here.
+			 */
+			if (fp->f_type == DTYPE_SOCKET)
+				bits |= POLLRDHUP;
+			fdrop(fp, td);
+		}
+	}
+
+	if (lev & LINUX_POLLMSG)	/* no FreeBSD bit is set for it */
+		LINUX_RATELIMIT_MSG_OPT1("unsupported POLLMSG, events(%d)", lev);
+	if (lev & LINUX_POLLREMOVE)	/* no FreeBSD bit is set for it */
+		LINUX_RATELIMIT_MSG_OPT1("unsupported POLLREMOVE, events(%d)", lev);
+
+	*bev = bits;
+}
+
+void	/* Translate FreeBSD poll events (bev) into Linux ones (*lev). */
+bsd_to_linux_poll_events(short bev, short *lev)
+{
+	short bits = 0;	/* Linux event bits collected for *lev */
+
+	if (bev & POLLIN)
+		bits |= LINUX_POLLIN;
+	if (bev & POLLPRI)
+		bits |=	LINUX_POLLPRI;
+	if (bev & (POLLOUT | POLLWRNORM))
+		/*
+		 * POLLWRNORM is equal to POLLOUT on FreeBSD, but not on
+		 * Linux, so both are reported as LINUX_POLLOUT only.
+		 */
+		bits |= LINUX_POLLOUT;
+	if (bev & POLLERR)
+		bits |= LINUX_POLLERR;
+	if (bev & POLLHUP)
+		bits |= LINUX_POLLHUP;
+	if (bev & POLLNVAL)
+		bits |= LINUX_POLLNVAL;
+	if (bev & POLLRDNORM)
+		bits |= LINUX_POLLRDNORM;
+	if (bev & POLLRDBAND)
+		bits |= LINUX_POLLRDBAND;
+	if (bev & POLLWRBAND)
+		bits |= LINUX_POLLWRBAND;
+	if (bev & POLLRDHUP)
+		bits |= LINUX_POLLRDHUP;
+
+	*lev = bits;
+}
diff --git a/sys/compat/linux/linux.h b/sys/compat/linux/linux.h
index ba7a96e1aa79..2548f7d50a97 100644
--- a/sys/compat/linux/linux.h
+++ b/sys/compat/linux/linux.h
@@ -31,6 +31,23 @@
 
 #include <sys/queue.h>
 
+/*
+ * poll() event bits as seen by Linux binaries.
+ */
+#define	LINUX_POLLIN		0x0001
+#define	LINUX_POLLPRI		0x0002
+#define	LINUX_POLLOUT		0x0004
+#define	LINUX_POLLERR		0x0008
+#define	LINUX_POLLHUP		0x0010
+#define	LINUX_POLLNVAL		0x0020
+#define	LINUX_POLLRDNORM	0x0040
+#define	LINUX_POLLRDBAND	0x0080
+#define	LINUX_POLLWRNORM	0x0100
+#define	LINUX_POLLWRBAND	0x0200
+#define	LINUX_POLLMSG		0x0400
+#define	LINUX_POLLREMOVE	0x1000
+#define	LINUX_POLLRDHUP		0x2000
+
 #define	LINUX_IFHWADDRLEN	6
 #define	LINUX_IFNAMSIZ		16
 
diff --git a/sys/compat/linux/linux_common.h b/sys/compat/linux/linux_common.h
index a306bb1eb859..b0e3408e42df 100644
--- a/sys/compat/linux/linux_common.h
+++ b/sys/compat/linux/linux_common.h
@@ -41,5 +41,9 @@ int		bsd_to_linux_sockaddr(const struct sockaddr *sa,
 		    struct l_sockaddr **lsa, socklen_t len);
 int		linux_to_bsd_sockaddr(const struct l_sockaddr *lsa,
 		    struct sockaddr **sap, socklen_t *len);
+void		linux_to_bsd_poll_events(struct thread *td, int fd,
+		    short lev, short *bev);
+void		bsd_to_linux_poll_events(short bev, short *lev);
+
 
 #endif /* _LINUX_COMMON_H_ */
diff --git a/sys/compat/linux/linux_misc.c b/sys/compat/linux/linux_misc.c
index 22d44416c1b1..90a89578fc8f 100644
--- a/sys/compat/linux/linux_misc.c
+++ b/sys/compat/linux/linux_misc.c
@@ -50,6 +50,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
+#include <sys/poll.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/procctl.h>
@@ -89,6 +90,7 @@ __FBSDID("$FreeBSD$");
 #include <machine/../linux/linux_proto.h>
 #endif
 
+#include <compat/linux/linux_common.h>
 #include <compat/linux/linux_dtrace.h>
 #include <compat/linux/linux_file.h>
 #include <compat/linux/linux_mib.h>
@@ -144,6 +146,10 @@ static int	linux_common_pselect6(struct thread *, l_int,
 static int	linux_common_ppoll(struct thread *, struct pollfd *,
 			uint32_t, struct timespec *, l_sigset_t *,
 			l_size_t);
+static int	linux_pollin(struct thread *, struct pollfd *,
+			struct pollfd *, u_int);
+static int	linux_pollout(struct thread *, struct pollfd *,
+			struct pollfd *, u_int);
 
 int
 linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args)
@@ -2528,11 +2534,15 @@ linux_common_ppoll(struct thread *td, struct pollfd *fds, uint32_t nfds,
     struct timespec *tsp, l_sigset_t *sset, l_size_t ssize)
 {
 	struct timespec ts0, ts1;
+	struct pollfd stackfds[32];
+	struct pollfd *kfds;
  	l_sigset_t l_ss;
  	sigset_t *ssp;
  	sigset_t ss;
  	int error;
 
+	if (kern_poll_maxfds(nfds))
+		return (EINVAL);
 	if (sset != NULL) {
 		if (ssize != sizeof(l_ss))
 			return (EINVAL);
@@ -2546,7 +2556,17 @@ linux_common_ppoll(struct thread *td, struct pollfd *fds, uint32_t nfds,
 	if (tsp != NULL)
 		nanotime(&ts0);
 
-	error = kern_poll(td, fds, nfds, tsp, ssp);
+	if (nfds > nitems(stackfds))
+		kfds = mallocarray(nfds, sizeof(*kfds), M_TEMP, M_WAITOK);
+	else
+		kfds = stackfds;
+	error = linux_pollin(td, kfds, fds, nfds);
+	if (error != 0)
+		goto out;
+
+	error = kern_poll_kfds(td, kfds, nfds, tsp, ssp);
+	if (error == 0)
+		error = linux_pollout(td, kfds, fds, nfds);
 
 	if (error == 0 && tsp != NULL) {
 		if (td->td_retval[0]) {
@@ -2558,6 +2578,10 @@ linux_common_ppoll(struct thread *td, struct pollfd *fds, uint32_t nfds,
 		} else
 			timespecclear(tsp);
 	}
+
+out:
+	if (nfds > nitems(stackfds))
+		free(kfds, M_TEMP);
 	return (error);
 }
 
@@ -2592,6 +2616,48 @@ linux_ppoll_time64(struct thread *td, struct linux_ppoll_time64_args *args)
 }
 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
 
+static int	/* Copy in ufds and convert each 'events' to FreeBSD bits. */
+linux_pollin(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd)
+{
+	int error;
+	u_int i;
+
+	error = copyin(ufds, fds, nfd * sizeof(*fds));	/* whole array at once */
+	if (error != 0)
+		return (error);
+
+	for (i = 0; i < nfd; i++) {
+		if (fds->events != 0)	/* nothing to convert otherwise */
+			linux_to_bsd_poll_events(td, fds->fd,
+			    fds->events, &fds->events);	/* converted in place */
+		fds++;
+	}
+	return (0);
+}
+
+static int	/* Convert each 'revents' to Linux bits and copy it out. */
+linux_pollout(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd)
+{
+	int error = 0;
+	u_int i, n = 0;	/* n counts entries with non-zero revents */
+
+	for (i = 0; i < nfd; i++) {
+		if (fds->revents != 0) {
+			bsd_to_linux_poll_events(fds->revents,
+			    &fds->revents);	/* converted in place */
+			n++;
+		}
+		error = copyout(&fds->revents, &ufds->revents,
+		    sizeof(ufds->revents));	/* only the revents field */
+		if (error)
+			return (error);
+		fds++;
+		ufds++;
+	}
+	td->td_retval[0] = n;	/* count of entries with non-zero revents */
+	return (0);
+}
+
 int
 linux_sched_rr_get_interval(struct thread *td,
     struct linux_sched_rr_get_interval_args *uap)
diff --git a/sys/i386/linux/linux.h b/sys/i386/linux/linux.h
index 1bb76d8e41d0..8dff1313c598 100644
--- a/sys/i386/linux/linux.h
+++ b/sys/i386/linux/linux.h
@@ -478,27 +478,6 @@ struct l_ifreq {
 #define	ifr_hwaddr	ifr_ifru.ifru_hwaddr	/* MAC address */
 #define	ifr_ifindex	ifr_ifru.ifru_ivalue	/* Interface index */
 
-/*
- * poll()
- */
-#define	LINUX_POLLIN		0x0001
-#define	LINUX_POLLPRI		0x0002
-#define	LINUX_POLLOUT		0x0004
-#define	LINUX_POLLERR		0x0008
-#define	LINUX_POLLHUP		0x0010
-#define	LINUX_POLLNVAL		0x0020
-#define	LINUX_POLLRDNORM	0x0040
-#define	LINUX_POLLRDBAND	0x0080
-#define	LINUX_POLLWRNORM	0x0100
-#define	LINUX_POLLWRBAND	0x0200
-#define	LINUX_POLLMSG		0x0400
-
-struct l_pollfd {
-	l_int		fd;
-	l_short		events;
-	l_short		revents;
-};
-
 struct l_user_desc {
 	l_uint		entry_number;
 	l_uint		base_addr;