git: bb66c5975383 - main - linux(4): Add sendfile fallback for non-socket fds

From: Dmitry Chagin <dchagin_at_FreeBSD.org>
Date: Thu, 17 Aug 2023 19:58:39 UTC
The branch main has been updated by dchagin:

URL: https://cgit.FreeBSD.org/src/commit/?id=bb66c59753836cd8abb596fe316dcdb77ea66999

commit bb66c59753836cd8abb596fe316dcdb77ea66999
Author:     James McLaughlin <james_mclgh.net>
AuthorDate: 2023-08-17 19:57:17 +0000
Commit:     Dmitry Chagin <dchagin@FreeBSD.org>
CommitDate: 2023-08-17 19:57:17 +0000

    linux(4): Add sendfile fallback for non-socket fds
    
    Before Linux 2.6.33, out_fd must refer to a socket. Since Linux 2.6.33
    it can be any file.
    The patch was originally provided by James McLaughlin and adapted by me
    for copy_file_range.
    
    PR:                     262535
    Differential revision:  https://reviews.freebsd.org/D34555
    MFC after:              1 month
---
 sys/compat/linux/linux_socket.c | 209 ++++++++++++++++++++++++++++++++++------
 1 file changed, 177 insertions(+), 32 deletions(-)

diff --git a/sys/compat/linux/linux_socket.c b/sys/compat/linux/linux_socket.c
index 45b94cb2f994..f768392be546 100644
--- a/sys/compat/linux/linux_socket.c
+++ b/sys/compat/linux/linux_socket.c
@@ -36,10 +36,12 @@
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
+#include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysproto.h>
+#include <sys/vnode.h>
 #include <sys/un.h>
 #include <sys/unistd.h>
 
@@ -2374,57 +2376,200 @@ out:
 	return (error);
 }
 
+/*
+ * Based on sendfile_getsock from kern_sendfile.c
+ * Determines whether a file is a stream socket that can be used
+ * with FreeBSD sendfile.
+ */
+static bool
+is_stream_socket(struct file *fp)
+{
+	struct socket *so;
+
+	/*
+	 * The file must be a SOCK_STREAM socket that is not listening.
+	 */
+	if (fp->f_type != DTYPE_SOCKET)
+		return (false);
+	so = fp->f_data;
+	if (so->so_type != SOCK_STREAM)
+		return (false);
+	/*
+	 * SCTP one-to-one style sockets currently don't work with
+	 * sendfile().
+	 */
+	if (so->so_proto->pr_protocol == IPPROTO_SCTP)
+		return (false);
+	return (!SOLISTENING(so));
+}
+
+static bool
+is_regular_file(struct file *fp)
+{
+	/* True only for a vnode-backed file whose vnode is of type VREG. */
+	return (fp->f_type == DTYPE_VNODE && fp->f_vnode != NULL &&
+	    fp->f_vnode->v_type == VREG);
+}
+
 static int
-linux_sendfile_common(struct thread *td, l_int out, l_int in,
-    off_t *offset, l_size_t count)
+sendfile_fallback(struct thread *td, struct file *fp, l_int out,
+    off_t *offset, l_size_t count, off_t *sbytes)
 {
-	off_t bytes_read;
-	int error;
-	l_loff_t current_offset;
-	struct file *fp;
+	off_t current_offset, out_offset, to_send;
+	l_size_t bytes_sent, n_read;
+	struct file *ofp;
+	struct iovec aiov;
+	struct uio auio;
+	bool seekable;
+	size_t bufsz;
+	void *buf;	/* kernel bounce buffer for the copy loop */
+	int flags, error;
 
-	AUDIT_ARG_FD(in);
-	error = fget_read(td, in, &cap_pread_rights, &fp);
+	if (offset == NULL) {	/* no explicit offset: start at fp's position */
+		if ((error = fo_seek(fp, 0, SEEK_CUR, td)) != 0)
+			return (error);
+		current_offset = td->td_uretoff.tdu_off;
+	} else {
+		if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0)
+			return (ESPIPE);
+		current_offset = *offset;
+	}
+	error = fget_write(td, out, &cap_pwrite_rights, &ofp);
 	if (error != 0)
 		return (error);
-
-	if (offset != NULL) {
-		current_offset = *offset;
-	} else {
-		error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
-		    fo_seek(fp, 0, SEEK_CUR, td) : ESPIPE;
-		if (error != 0)
+	seekable = (ofp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0;
+	if (seekable) {
+		if ((error = fo_seek(ofp, 0, SEEK_CUR, td)) != 0)
 			goto drop;
+		out_offset = td->td_uretoff.tdu_off;
+	} else
+		out_offset = 0;
+	/* I/O uses explicit uio_offset; positions are updated after the loop. */
+	flags = FOF_OFFSET | FOF_NOUPDATE;
+	bufsz = min(count, MAXPHYS);
+	buf = malloc(bufsz, M_LINUX, M_WAITOK);
+	bytes_sent = 0;
+	while (bytes_sent < count) {
+		to_send = min(count - bytes_sent, bufsz);
+		aiov.iov_base = buf;
+		aiov.iov_len = bufsz;	/* NOTE(review): may exceed uio_resid */
+		auio.uio_iov = &aiov;
+		auio.uio_iovcnt = 1;
+		auio.uio_segflg = UIO_SYSSPACE;
+		auio.uio_td = td;
+		auio.uio_rw = UIO_READ;
+		auio.uio_offset = current_offset;
+		auio.uio_resid = to_send;
+		error = fo_read(fp, &auio, fp->f_cred, flags, td);
+		if (error != 0)
+			break;
+		n_read = to_send - auio.uio_resid;
+		if (n_read == 0)	/* nothing was read; presumably EOF */
+			break;
+		aiov.iov_base = buf;
+		aiov.iov_len = bufsz;	/* NOTE(review): may exceed uio_resid */
+		auio.uio_iov = &aiov;
+		auio.uio_iovcnt = 1;
+		auio.uio_segflg = UIO_SYSSPACE;
+		auio.uio_td = td;
+		auio.uio_rw = UIO_WRITE;
+		auio.uio_offset = (seekable) ? out_offset : 0;
+		auio.uio_resid = n_read;
+		error = fo_write(ofp, &auio, ofp->f_cred, flags, td);
+		if (error != 0)
+			break;
+		bytes_sent += n_read;	/* NOTE(review): write's uio_resid unchecked */
+		current_offset += n_read;
+		out_offset += n_read;
+	}
+	free(buf, M_LINUX);
+	/* NOTE(review): on error *sbytes stays unset even if bytes_sent > 0. */
+	if (error == 0) {
+		*sbytes = bytes_sent;
+		if (offset != NULL)
+			*offset = current_offset;
+		else
+			error = fo_seek(fp, current_offset, SEEK_SET, td);
+	}
+	if (error == 0 && seekable)
+		error = fo_seek(ofp, out_offset, SEEK_SET, td);
+
+drop:
+	fdrop(ofp, td);
+	return (error);
+}
+
+static int
+sendfile_sendfile(struct thread *td, struct file *fp, l_int out,
+    off_t *offset, l_size_t count, off_t *sbytes)
+{
+	off_t current_offset;
+	int error;
+
+	if (offset == NULL) {	/* use, and later update, fp's own position */
+		if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0)
+			return (ESPIPE);
+		if ((error = fo_seek(fp, 0, SEEK_CUR, td)) != 0)
+			return (error);
 		current_offset = td->td_uretoff.tdu_off;
+	} else	/* caller-supplied offset; written back on success */
+		current_offset = *offset;
+	error = fo_sendfile(fp, out, NULL, NULL, current_offset, count,
+	    sbytes, 0, td);
+	if (error == 0) {
+		current_offset += *sbytes;	/* advance by the reported count */
+		if (offset != NULL)
+			*offset = current_offset;
+		else
+			error = fo_seek(fp, current_offset, SEEK_SET, td);
 	}
+	return (error);
+}
 
-	bytes_read = 0;
+static int
+linux_sendfile_common(struct thread *td, l_int out, l_int in,
+    off_t *offset, l_size_t count)
+{
+	struct file *fp, *ofp;	/* ofp: looked up for type dispatch only */
+	off_t sbytes;
+	int error;
 
 	/* Linux cannot have 0 count. */
-	if (count <= 0 || current_offset < 0) {
+	if (count <= 0 || (offset != NULL && *offset < 0))
+		return (EINVAL);
+	/* The input must be an SHM object or a regular file (see below). */
+	AUDIT_ARG_FD(in);
+	error = fget_read(td, in, &cap_pread_rights, &fp);
+	if (error != 0)
+		return (error);
+	if ((fp->f_type != DTYPE_SHM && fp->f_type != DTYPE_VNODE) ||
+	    (fp->f_type == DTYPE_VNODE &&
+	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VREG))) {
 		error = EINVAL;
 		goto drop;
 	}
-
-	error = fo_sendfile(fp, out, NULL, NULL, current_offset, count,
-	    &bytes_read, 0, td);
+	error = fget_unlocked(td, out, &cap_no_rights, &ofp);
 	if (error != 0)
 		goto drop;
-	current_offset += bytes_read;
 
-	if (offset != NULL) {
-		*offset = current_offset;
+	if (is_regular_file(fp) && is_regular_file(ofp)) {
+		error = kern_copy_file_range(td, in, offset, out, NULL, count,
+		    0);	/* presumably sets td_retval itself; confirm */
 	} else {
-		error = fo_seek(fp, current_offset, SEEK_SET, td);
-		if (error != 0)
-			goto drop;
+		sbytes = 0;
+		if (is_stream_socket(ofp))
+			error = sendfile_sendfile(td, fp, out, offset, count,
+			    &sbytes);
+		else
+			error = sendfile_fallback(td, fp, out, offset, count,
+			    &sbytes);
+		if (error == 0)
+			td->td_retval[0] = sbytes;
 	}
+	fdrop(ofp, td);
 
-	td->td_retval[0] = (ssize_t)bytes_read;
 drop:
 	fdrop(fp, td);
-	if (error == ENOTSOCK)
-		error = EINVAL;
 	return (error);
 }
 
@@ -2434,10 +2579,10 @@ linux_sendfile(struct thread *td, struct linux_sendfile_args *arg)
 	/*
 	 * Differences between FreeBSD and Linux sendfile:
 	 * - Linux doesn't send anything when count is 0 (FreeBSD uses 0 to
-	 *   mean send the whole file.)  In linux_sendfile given fds are still
-	 *   checked for validity when the count is 0.
+	 *   mean send the whole file).
 	 * - Linux can send to any fd whereas FreeBSD only supports sockets.
-	 *   The same restriction follows for linux_sendfile.
+	 *   We therefore use FreeBSD sendfile where possible for performance,
+	 *   but fall back on a manual copy (sendfile_fallback).
 	 * - Linux doesn't have an equivalent for FreeBSD's flags and sf_hdtr.
 	 * - Linux takes an offset pointer and updates it to the read location.
 	 *   FreeBSD takes in an offset and a 'bytes read' parameter which is