RFC: Sendfile patch

Mike Silbersack silby at silby.com
Sat Jan 3 19:06:49 PST 2004


The attached patch (also available from the following url):
http://www.silby.com/patches/raw/sendfile-wip4.patch
modifies sendfile to ensure that the header is sent in the same packet as
the data from the file.  This greatly helps performance when dealing with
http servers, as it allows small files to fit in one packet, instead of
two.

In order to keep the implementation relatively simple, I have created two
helper functions, iov_to_uio, and m_uiotombuf.  You'll note that
iov_to_uio contains the body of readv/writev almost exactly, and
m_uiotombuf does much the same thing as sosend.  (In the future, I may
attempt to make readv/writev use iov_to_uio, and have sosend use
m_uiotombuf, but that's a project for another day.)

I'm going to be gone for two weeks, so that should allow plenty of time
for testing / review of the patch.  The location(s) of iov_to_uio and
m_uiotombuf in both source and header files sounds like an easy thing to
bikeshed about, so please don't bring that up.  Such matters can be
discussed when I get back, because I commit it.  What I am interested in
are reviews of the actual code, specifically regarding any possible
security holes which I may be introducing. <g>  Also, I would appreciate
it if someone could verify that I have the old sendfile compatibility part
working correctly, as it is quite messy.

If you're not in the mood to review the patch, please feel free to test
it.  I have only tested with apache2 and thttpd (with alfred's sf patches
from the port enabled), there are undoubtedly many other programs which
make use of sendfile and should be tested.  If you have any sort of web
benchmark which helps to show the performance increase given by this
patch, please run it; it's always great to include statistics in a commit
message.

Naturally, the patch is against -current.  If you have any questions,
please ask quickly, or prepare to wait a long time for an answer. :)

Mike "Silby" Silbersack
-------------- next part --------------
diff -u -r /usr/src/sys.old/kern/uipc_mbuf.c /usr/src/sys/kern/uipc_mbuf.c
--- /usr/src/sys.old/kern/uipc_mbuf.c	Sat Jan  3 19:51:29 2004
+++ /usr/src/sys/kern/uipc_mbuf.c	Sat Jan  3 20:14:03 2004
@@ -43,6 +43,7 @@
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mac.h>
 #include <sys/malloc.h>
@@ -50,6 +51,7 @@
 #include <sys/sysctl.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
+#include <sys/uio.h>
 
 int	max_linkhdr;
 int	max_protohdr;
@@ -1028,3 +1030,99 @@
 }
 
 #endif
+
+struct mbuf *
+m_uiotombuf(struct uio *uio, int how, int len)
+{
+	struct mbuf	*m_new = NULL, *m_final = NULL;
+	int		progress = 0, error = 0, length, total;
+
+	if (len > 0)
+		total = min(uio->uio_resid, len);
+	else
+		total = uio->uio_resid;
+
+	if (total > MHLEN)
+		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
+	else
+		m_final = m_gethdr(how, MT_DATA);
+
+	if (m_final == NULL)
+		goto nospace;
+
+	m_new = m_final;
+
+	while (progress < total) {
+		length = total - progress;
+		if (length > MCLBYTES)
+			length = MCLBYTES;
+
+		if (m_new == NULL) {
+			if (length > MLEN)
+				m_new = m_getcl(how, MT_DATA, 0);
+			else
+				m_new = m_get(how, MT_DATA);
+			if (m_new == NULL)
+				goto nospace;
+		}
+
+		error = uiomove(mtod(m_new, void *), length, uio);
+		if (error)
+			goto nospace;
+		progress += length;
+		m_new->m_len = length;
+		if (m_new != m_final)
+			m_cat(m_final, m_new);
+		m_new = NULL;
+	}
+	m_fixhdr(m_final);
+	return (m_final);
+nospace:
+	if (m_new)
+		m_free(m_new);
+	if (m_final)
+		m_freem(m_final);
+	return (NULL);
+}
+
+int
+iov_to_uio(struct iovec *iovp, u_int iovcnt, struct uio *auio)
+{
+	int error = 0, i;
+	u_int iovlen;
+	struct iovec *iov = NULL;
+
+	if (iovcnt < 0)
+		panic("iovcnt < 0!\n");
+
+        /* note: can't use iovlen until iovcnt is validated */
+        iovlen = iovcnt * sizeof (struct iovec);
+        if (iovcnt > UIO_MAXIOV) {
+               error = EINVAL;
+               goto done;
+        }
+        MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
+        auio->uio_iov = iov;
+        auio->uio_iovcnt = iovcnt;
+        auio->uio_segflg = UIO_USERSPACE;
+        auio->uio_offset = -1;
+        if ((error = copyin(iovp, iov, iovlen)))
+                goto done;
+        auio->uio_resid = 0;
+        for (i = 0; i < iovcnt; i++) {
+                if (iov->iov_len > INT_MAX - auio->uio_resid) {
+                        error = EINVAL;
+                        goto done;
+                }
+                auio->uio_resid += iov->iov_len;
+                iov++;
+        }
+
+done:
+	if (error && auio->uio_iov) {
+		FREE(auio->uio_iov, M_IOV);
+		auio->uio_iov = NULL;
+	}
+	return (error);
+
+}
diff -u -r /usr/src/sys.old/kern/uipc_syscalls.c /usr/src/sys/kern/uipc_syscalls.c
--- /usr/src/sys.old/kern/uipc_syscalls.c	Sat Jan  3 19:51:29 2004
+++ /usr/src/sys/kern/uipc_syscalls.c	Sat Jan  3 20:15:08 2004
@@ -1667,13 +1667,15 @@
 	struct vnode *vp;
 	struct vm_object *obj;
 	struct socket *so = NULL;
-	struct mbuf *m;
+	struct mbuf *m, *m_header = NULL;
 	struct sf_buf *sf;
 	struct vm_page *pg;
 	struct writev_args nuap;
 	struct sf_hdtr hdtr;
+	struct uio hdr_uio;
 	off_t off, xfsize, hdtr_size, sbytes = 0;
-	int error, s;
+	int error, s, headersize = 0, headersent = 0;
+	struct iovec *hdr_iov = NULL;
 
 	mtx_lock(&Giant);
 
@@ -1721,19 +1723,25 @@
 		if (error)
 			goto done;
 		/*
-		 * Send any headers. Wimp out and use writev(2).
+		 * Send any headers.
 		 */
 		if (hdtr.headers != NULL) {
-			nuap.fd = uap->s;
-			nuap.iovp = hdtr.headers;
-			nuap.iovcnt = hdtr.hdr_cnt;
-			error = writev(td, &nuap);
+			hdr_uio.uio_td = td;
+			hdr_uio.uio_rw = UIO_WRITE;
+			error = iov_to_uio(hdtr.headers, hdtr.hdr_cnt,
+				&hdr_uio);
 			if (error)
 				goto done;
-			if (compat)
-				sbytes += td->td_retval[0];
-			else
-				hdtr_size += td->td_retval[0];
+			/* Cache hdr_iov, m_uiotombuf may change it. */
+			hdr_iov = hdr_uio.uio_iov;
+			if (hdr_uio.uio_resid > 0) {
+				m_header = m_uiotombuf(&hdr_uio, M_DONTWAIT, 0);
+				if (m_header == NULL)
+					goto done;
+				headersize = m_header->m_pkthdr.len;
+				if (compat)
+					sbytes += headersize;
+			}
 		}
 	}
 
@@ -1890,7 +1898,10 @@
 		/*
 		 * Get an mbuf header and set it up as having external storage.
 		 */
-		MGETHDR(m, M_TRYWAIT, MT_DATA);
+		if (m_header)
+			MGET(m, M_TRYWAIT, MT_DATA);
+		else
+			MGETHDR(m, M_TRYWAIT, MT_DATA);
 		if (m == NULL) {
 			error = ENOBUFS;
 			sf_buf_free((void *)sf_buf_kva(sf), sf);
@@ -1904,6 +1915,14 @@
 		    EXT_SFBUF);
 		m->m_data = (char *)sf_buf_kva(sf) + pgoff;
 		m->m_pkthdr.len = m->m_len = xfsize;
+
+		if (m_header) {
+			m_cat(m_header, m);
+			m = m_header;
+			m_header = NULL;
+			m_fixhdr(m);
+		}
+
 		/*
 		 * Add the buffer to the socket buffer chain.
 		 */
@@ -1965,6 +1984,7 @@
 			sbunlock(&so->so_snd);
 			goto done;
 		}
+		headersent = 1;
 	}
 	sbunlock(&so->so_snd);
 
@@ -1985,6 +2005,13 @@
 	}
 
 done:
+	if (headersent) {
+		if (!compat)
+			hdtr_size += headersize;
+	} else {
+		if (compat)
+			sbytes -= headersize;
+	}
 	/*
 	 * If there was no error we have to clear td->td_retval[0]
 	 * because it may have been set by writev.
@@ -2001,6 +2028,10 @@
 		vrele(vp);
 	if (so)
 		fputsock(so);
+	if (hdr_iov)
+		FREE(hdr_iov, M_IOV);
+	if (m_header)
+		m_freem(m_header);
 
 	mtx_unlock(&Giant);
 
Only in /usr/src/sys/kern: uipc_syscalls.c.rej.orig
diff -u -r /usr/src/sys.old/sys/uio.h /usr/src/sys/sys/uio.h
--- /usr/src/sys.old/sys/uio.h	Sat Jan  3 19:51:30 2004
+++ /usr/src/sys/sys/uio.h	Sat Jan  3 20:16:52 2004
@@ -94,6 +94,9 @@
 int	uiomove_frombuf(void *buf, int buflen, struct uio *uio);
 int	uiomoveco(void *cp, int n, struct uio *uio, struct vm_object *obj,
 	    int disposable);
+struct mbuf *
+	m_uiotombuf(struct uio *uio, int how, int len);
+int	iov_to_uio(struct iovec *iovp, u_int iovcnt, struct uio *auio);
 
 #else /* !_KERNEL */
 


More information about the freebsd-arch mailing list