svn commit: r261809 - projects/sendfile/sys/kern
Gleb Smirnoff
glebius at FreeBSD.org
Wed Feb 12 20:06:27 UTC 2014
Author: glebius
Date: Wed Feb 12 20:06:26 2014
New Revision: 261809
URL: http://svnweb.freebsd.org/changeset/base/261809
Log:
Make the sendfile(2) call non-blocking on disk I/O; unlike SF_NODISKIO,
the I/O is still performed. The new call is a drop-in replacement for
the older sendfile(2), so applications such as web servers do not need
to be recompiled or reconfigured to get the benefit.
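For illustration, the caller side stays exactly as before. A minimal
sketch of a typical FreeBSD caller, assuming fd is a regular file and
sock a connected stream socket; send_file is a hypothetical helper, not
part of this change:

  #include <sys/types.h>
  #include <sys/socket.h>
  #include <sys/uio.h>

  /*
   * Unchanged sendfile(2) invocation: with this change the kernel no
   * longer blocks the calling thread on disk reads behind this call.
   */
  static int
  send_file(int fd, int sock, off_t off, size_t len)
  {
          off_t sbytes;

          return (sendfile(fd, sock, off, len, NULL, &sbytes, 0));
  }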
The mechanics of the change are as follows:
o We grab enough pages to fill the socket buffer.
o We iterate through the pages and request I/O on those that are not
  valid. The I/O is requested via VOP_GETPAGES_ASYNC(), so it doesn't
  block.
o If we initiated any I/Os, we append the data to the socket buffer
  marked as not ready (M_NOTREADY), since the I/Os are still in
  progress, and return.
o Once the last I/O completes, we mark the data in the socket buffer
  as ready, and if we were the one blocking the socket, we initiate
  the send.
The code still has quite a few rough spots, but it has already been
tested at Netflix with positive results.
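The completion counting described above can be modeled in isolation:
the counter starts at 1 for the dispatching thread, each asynchronous
I/O takes a reference, and whoever drops the last reference (an I/O
callback, or the dispatcher itself if no I/O was needed) runs the
"ready" path. A simplified standalone model in C11 follows; it is not
the kernel code, and struct tracker, tracker_release() and on_all_done
are hypothetical names:

  #include <stdatomic.h>
  #include <stdio.h>
  #include <stdlib.h>

  struct tracker {
          atomic_uint nios;       /* 1 for dispatcher + 1 per I/O */
          void (*on_all_done)(struct tracker *);
  };

  /* Drop one reference; the last dropper fires the completion hook. */
  static void
  tracker_release(struct tracker *t)
  {
          if (atomic_fetch_sub(&t->nios, 1) == 1)
                  t->on_all_done(t);
  }

  static void
  all_done(struct tracker *t)
  {
          printf("all I/O done; mark data ready, kick the socket\n");
          free(t);
  }

  int
  main(void)
  {
          struct tracker *t = malloc(sizeof(*t));

          if (t == NULL)
                  return (1);
          atomic_init(&t->nios, 1);       /* dispatcher's reference */
          t->on_all_done = all_done;

          for (int i = 0; i < 3; i++) {
                  atomic_fetch_add(&t->nios, 1);  /* ref per async I/O */
                  /* An I/O completion callback would call: */
                  tracker_release(t);
          }

          tracker_release(t);     /* dispatcher drops its reference */
          return (0);
  }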
Sponsored by: Netflix
Sponsored by: Nginx, Inc.
Modified:
projects/sendfile/sys/kern/uipc_syscalls.c
Modified: projects/sendfile/sys/kern/uipc_syscalls.c
==============================================================================
--- projects/sendfile/sys/kern/uipc_syscalls.c Wed Feb 12 19:59:30 2014 (r261808)
+++ projects/sendfile/sys/kern/uipc_syscalls.c Wed Feb 12 20:06:26 2014 (r261809)
@@ -132,9 +132,6 @@ static int filt_sfsync(struct knote *kn,
*/
static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0,
"sendfile(2) tunables");
-static int sfreadahead = 1;
-SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW,
- &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks");
#ifdef SFSYNC_DEBUG
static int sf_sync_debug = 0;
@@ -2651,11 +2648,53 @@ vmoff(int i, off_t off)
return (trunc_page(off + i * PAGE_SIZE));
}
+struct sf_io {
+ u_int nios;
+ int npages;
+ struct file *sock_fp;
+ struct mbuf *m;
+ vm_page_t pa[];
+};
+
static void
-sendfile_swapin(vm_object_t obj, vm_page_t *pa, int npages, off_t off,
- off_t len)
+sf_io_done(void *arg)
{
- int rv;
+ struct sf_io *sfio = arg;
+ struct socket *so;
+
+ if (!refcount_release(&sfio->nios))
+ return;
+
+ so = sfio->sock_fp->f_data;
+
+ if (sbready(&so->so_snd, sfio->m, sfio->npages) == 0) {
+ struct mbuf *m;
+
+ m = m_get(M_NOWAIT, MT_DATA);
+ if (m == NULL) {
+ panic("XXXGL");
+ }
+ m->m_len = 0;
+ CURVNET_SET(so->so_vnet);
+ /* XXXGL: curthread */
+ (void )(so->so_proto->pr_usrreqs->pru_send)
+ (so, 0, m, NULL, NULL, curthread);
+ CURVNET_RESTORE();
+ }
+
+ /* XXXGL: curthread */
+ fdrop(sfio->sock_fp, curthread);
+ free(sfio, M_TEMP);
+}
+
+static int
+sendfile_swapin(vm_object_t obj, struct sf_io *sfio, off_t off, off_t len)
+{
+ vm_page_t *pa = sfio->pa;
+ int npages = sfio->npages;
+ int nios, rv;
+
+ nios = 0;
VM_OBJECT_WLOCK(obj);
for (int i = 0; i < npages; i++)
@@ -2687,13 +2726,16 @@ sendfile_swapin(vm_object_t obj, vm_page
if (i == j)
continue;
- rv = vm_pager_get_pages(obj, pa + i, min(a + 1, npages - i), 0);
+ refcount_acquire(&sfio->nios);
+ rv = vm_pager_get_pages_async(obj, pa + i,
+ min(a + 1, npages - i), 0, &sf_io_done, sfio);
KASSERT(rv == VM_PAGER_OK, ("%s: pager fail obj %p page %p",
__func__, obj, pa[i]));
- vm_page_xunbusy(pa[i]);
SFSTAT_INC(sf_iocnt);
+ nios++;
+
i += a;
for (j = i - a; a > 0 && j < npages; a--, j++)
KASSERT(pa[j] == vm_page_lookup(obj,
@@ -2702,14 +2744,9 @@ sendfile_swapin(vm_object_t obj, vm_page
vm_page_lookup(obj, OFF_TO_IDX(vmoff(j, off)))));
}
- for (int i = 0; i < npages; i++)
- KASSERT((pa[i]->wire_count > 0 && vm_page_is_valid(pa[i],
- vmoff(i, off) & PAGE_MASK, xfsize(i, npages, off, len))),
- ("wrong page %p state off 0x%jx len 0x%jx",
- pa[i], (uintmax_t)vmoff(i, off),
- (uintmax_t)xfsize(i, npages, off, len)));
-
VM_OBJECT_WUNLOCK(obj);
+
+ return (nios);
}
static int
@@ -2905,9 +2942,10 @@ vn_sendfile(struct file *fp, int sockfd,
* and takes care of the overall progress.
*/
for (off = offset; rem > 0; ) {
+ struct sf_io *sfio;
vm_page_t *pa;
struct mbuf *mtail;
- int space, npages;
+ int nios, space, npages;
mtail = NULL;
/*
@@ -3002,17 +3040,22 @@ retry_space:
(PAGE_SIZE - (off & PAGE_MASK)), PAGE_SIZE);
else
npages = howmany(space, PAGE_SIZE);
- pa = malloc(npages * sizeof(vm_page_t), M_TEMP, mwait);
- if (pa == NULL) {
+ sfio = malloc(sizeof(struct sf_io) +
+ npages * sizeof(vm_page_t), M_TEMP, mwait);
+ if (sfio == NULL) {
error = merror;
goto done;
}
- sendfile_swapin(obj, pa, npages, off, space);
+ refcount_init(&sfio->nios, 1);
+ sfio->npages = npages;
+
+ nios = sendfile_swapin(obj, sfio, off, space);
/*
* Loop and construct maximum sized mbuf chain to be bulk
* dumped into socket buffer.
*/
+ pa = sfio->pa;
for (int i = 0; i < npages; i++) {
struct mbuf *m0;
@@ -3065,6 +3108,10 @@ retry_space:
m0->m_data = (char *)sf_buf_kva(sf) +
(vmoff(i, off) & PAGE_MASK);
m0->m_len = xfsize(i, npages, off, space);
+ m0->m_flags |= M_NOTREADY;
+
+ if (i == 0)
+ sfio->m = m0;
/* Append to mbuf chain. */
if (mtail != NULL)
@@ -3081,14 +3128,12 @@ retry_space:
sf_sync_ref(sfs);
}
- /* Keep track of bytes processed. */
- off += space;
- rem -= space;
-
if (vp != NULL)
VOP_UNLOCK(vp, 0);
- free(pa, M_TEMP);
+ /* Keep track of bytes processed. */
+ off += space;
+ rem -= space;
/* Prepend header, if any. */
if (hdrlen) {
@@ -3096,26 +3141,30 @@ retry_space:
m = mh;
}
- if (error)
- break;
+ if (error) {
+ free(sfio, M_TEMP);
+ goto done;
+ }
/* Add the buffer chain to the socket buffer. */
KASSERT(m_length(m, NULL) == space + hdrlen,
("%s: mlen %u space %d hdrlen %d",
__func__, m_length(m, NULL), space, hdrlen));
- SOCKBUF_LOCK(&so->so_snd);
- if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
- error = EPIPE;
- SOCKBUF_UNLOCK(&so->so_snd);
- goto done;
- }
- SOCKBUF_UNLOCK(&so->so_snd);
CURVNET_SET(so->so_vnet);
- /* Avoid error aliasing. */
- serror = (*so->so_proto->pr_usrreqs->pru_send)
+ if (nios == 0) {
+ free(sfio, M_TEMP);
+ serror = (*so->so_proto->pr_usrreqs->pru_send)
(so, 0, m, NULL, NULL, td);
+ } else {
+ sfio->sock_fp = sock_fp;
+ fhold(sock_fp);
+ serror = (*so->so_proto->pr_usrreqs->pru_send)
+ (so, PRUS_NOTREADY, m, NULL, NULL, td);
+ sf_io_done(sfio);
+ }
CURVNET_RESTORE();
+
if (serror == 0) {
sbytes += space + hdrlen;
if (hdrlen)