svn commit: r239787 - in stable/9: lib/libc/sys sys/kern sys/sys
John Baldwin
jhb at FreeBSD.org
Tue Aug 28 18:44:56 UTC 2012
Author: jhb
Date: Tue Aug 28 18:44:56 2012
New Revision: 239787
URL: http://svn.freebsd.org/changeset/base/239787
Log:
MFC 230782,237274:
Refine the implementation of POSIX_FADV_NOREUSE to perform
POSIX_FADV_DONTNEED requests on the currently accessed portion of the
file on each read(2) or write(2) rather than using direct I/O. This
gives much better performance including read-ahead and write clustering
similar to normal read(2) and write(2) calls.
If subsequent read(2) and write(2) calls are sequential, then the
POSIX_FADV_DONTNEED requests will cover the entire sequentially-accessed
range.
Modified:
stable/9/lib/libc/sys/posix_fadvise.2
stable/9/sys/kern/vfs_syscalls.c
stable/9/sys/kern/vfs_vnops.c
stable/9/sys/sys/file.h
Directory Properties:
stable/9/lib/libc/ (props changed)
stable/9/lib/libc/stdtime/ (props changed)
stable/9/lib/libc/sys/ (props changed)
stable/9/lib/libc/uuid/ (props changed)
stable/9/sys/ (props changed)
stable/9/sys/amd64/include/xen/ (props changed)
stable/9/sys/boot/ (props changed)
stable/9/sys/boot/i386/efi/ (props changed)
stable/9/sys/boot/ia64/efi/ (props changed)
stable/9/sys/boot/ia64/ski/ (props changed)
stable/9/sys/boot/powerpc/boot1.chrp/ (props changed)
stable/9/sys/boot/powerpc/ofw/ (props changed)
stable/9/sys/cddl/contrib/opensolaris/ (props changed)
stable/9/sys/conf/ (props changed)
stable/9/sys/contrib/dev/acpica/ (props changed)
stable/9/sys/contrib/octeon-sdk/ (props changed)
stable/9/sys/contrib/pf/ (props changed)
stable/9/sys/contrib/x86emu/ (props changed)
stable/9/sys/dev/ (props changed)
stable/9/sys/dev/e1000/ (props changed)
stable/9/sys/dev/isp/ (props changed)
stable/9/sys/dev/ixgbe/ (props changed)
stable/9/sys/fs/ (props changed)
stable/9/sys/fs/ntfs/ (props changed)
stable/9/sys/modules/ (props changed)
Modified: stable/9/lib/libc/sys/posix_fadvise.2
==============================================================================
--- stable/9/lib/libc/sys/posix_fadvise.2 Tue Aug 28 18:33:12 2012 (r239786)
+++ stable/9/lib/libc/sys/posix_fadvise.2 Tue Aug 28 18:44:56 2012 (r239787)
@@ -28,7 +28,7 @@
.\" @(#)madvise.2 8.1 (Berkeley) 6/9/93
.\" $FreeBSD$
.\"
-.Dd February 25, 2012
+.Dd June 19, 2012
.Dt POSIX_FADVISE 2
.Os
.Sh NAME
@@ -84,10 +84,9 @@ specified range and future access to thi
.It Dv POSIX_FADV_NOREUSE
Tells the system that the specified data will only be accessed once and
then not reused.
-Accesses to data within the specified range are treated as if the file
-descriptor has the
-.Dv O_DIRECT
-flag enabled.
+The system may decrease the in-memory priority of data once it has been
+read or written.
+Future access to this data may require a read operation.
.El
.Pp
.Sh RETURN VALUES
Modified: stable/9/sys/kern/vfs_syscalls.c
==============================================================================
--- stable/9/sys/kern/vfs_syscalls.c Tue Aug 28 18:33:12 2012 (r239786)
+++ stable/9/sys/kern/vfs_syscalls.c Tue Aug 28 18:44:56 2012 (r239787)
@@ -4953,6 +4953,8 @@ kern_posix_fadvise(struct thread *td, in
new->fa_advice = advice;
new->fa_start = offset;
new->fa_end = end;
+ new->fa_prevstart = 0;
+ new->fa_prevend = 0;
fp->f_advice = new;
new = fa;
}
Modified: stable/9/sys/kern/vfs_vnops.c
==============================================================================
--- stable/9/sys/kern/vfs_vnops.c Tue Aug 28 18:33:12 2012 (r239786)
+++ stable/9/sys/kern/vfs_vnops.c Tue Aug 28 18:44:56 2012 (r239787)
@@ -519,6 +519,7 @@ vn_read(fp, uio, active_cred, flags, td)
int error, ioflag;
struct mtx *mtxp;
int advice, vfslocked;
+ off_t offset, start, end;
KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
uio->uio_td, td));
@@ -558,19 +559,14 @@ vn_read(fp, uio, active_cred, flags, td)
switch (advice) {
case POSIX_FADV_NORMAL:
case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_NOREUSE:
ioflag |= sequential_heuristic(uio, fp);
break;
case POSIX_FADV_RANDOM:
/* Disable read-ahead for random I/O. */
break;
- case POSIX_FADV_NOREUSE:
- /*
- * Request the underlying FS to discard the buffers
- * and pages after the I/O is complete.
- */
- ioflag |= IO_DIRECT;
- break;
}
+ offset = uio->uio_offset;
#ifdef MAC
error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
@@ -587,6 +583,39 @@ vn_read(fp, uio, active_cred, flags, td)
}
fp->f_nextoff = uio->uio_offset;
VOP_UNLOCK(vp, 0);
+ if (error == 0 && advice == POSIX_FADV_NOREUSE &&
+ offset != uio->uio_offset) {
+ /*
+ * Use POSIX_FADV_DONTNEED to flush clean pages and
+ * buffers for the backing file after a
+ * POSIX_FADV_NOREUSE read(2). To optimize the common
+ * case of using POSIX_FADV_NOREUSE with sequential
+ * access, track the previous implicit DONTNEED
+ * request and grow this request to include the
+ * current read(2) in addition to the previous
+ * DONTNEED. With purely sequential access this will
+ * cause the DONTNEED requests to continuously grow to
+ * cover all of the previously read regions of the
+ * file. This allows filesystem blocks that are
+ * accessed by multiple calls to read(2) to be flushed
+ * once the last read(2) finishes.
+ */
+ start = offset;
+ end = uio->uio_offset - 1;
+ mtx_lock(mtxp);
+ if (fp->f_advice != NULL &&
+ fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
+ if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
+ start = fp->f_advice->fa_prevstart;
+ else if (fp->f_advice->fa_prevstart != 0 &&
+ fp->f_advice->fa_prevstart == end + 1)
+ end = fp->f_advice->fa_prevend;
+ fp->f_advice->fa_prevstart = start;
+ fp->f_advice->fa_prevend = end;
+ }
+ mtx_unlock(mtxp);
+ error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
+ }
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -607,6 +636,7 @@ vn_write(fp, uio, active_cred, flags, td
int error, ioflag, lock_flags;
struct mtx *mtxp;
int advice, vfslocked;
+ off_t offset, start, end;
KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
uio->uio_td, td));
@@ -641,6 +671,7 @@ vn_write(fp, uio, active_cred, flags, td
if ((flags & FOF_OFFSET) == 0)
uio->uio_offset = fp->f_offset;
advice = POSIX_FADV_NORMAL;
+ mtxp = NULL;
if (fp->f_advice != NULL) {
mtxp = mtx_pool_find(mtxpool_sleep, fp);
mtx_lock(mtxp);
@@ -653,19 +684,14 @@ vn_write(fp, uio, active_cred, flags, td
switch (advice) {
case POSIX_FADV_NORMAL:
case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_NOREUSE:
ioflag |= sequential_heuristic(uio, fp);
break;
case POSIX_FADV_RANDOM:
/* XXX: Is this correct? */
break;
- case POSIX_FADV_NOREUSE:
- /*
- * Request the underlying FS to discard the buffers
- * and pages after the I/O is complete.
- */
- ioflag |= IO_DIRECT;
- break;
}
+ offset = uio->uio_offset;
#ifdef MAC
error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
@@ -678,6 +704,55 @@ vn_write(fp, uio, active_cred, flags, td
VOP_UNLOCK(vp, 0);
if (vp->v_type != VCHR)
vn_finished_write(mp);
+ if (error == 0 && advice == POSIX_FADV_NOREUSE &&
+ offset != uio->uio_offset) {
+ /*
+ * Use POSIX_FADV_DONTNEED to flush clean pages and
+ * buffers for the backing file after a
+ * POSIX_FADV_NOREUSE write(2). To optimize the
+ * common case of using POSIX_FADV_NOREUSE with
+ * sequential access, track the previous implicit
+ * DONTNEED request and grow this request to include
+ * the current write(2) in addition to the previous
+ * DONTNEED. With purely sequential access this will
+ * cause the DONTNEED requests to continuously grow to
+ * cover all of the previously written regions of the
+ * file.
+ *
+ * Note that the blocks just written are almost
+ * certainly still dirty, so this only works when
+ * VOP_ADVISE() calls from subsequent writes push out
+ * the data written by this write(2) once the backing
+ * buffers are clean. However, as compared to forcing
+ * IO_DIRECT, this gives much saner behavior. Write
+ * clustering is still allowed, and clean pages are
+ * merely moved to the cache page queue rather than
+ * outright thrown away. This means a subsequent
+ * read(2) can still avoid hitting the disk if the
+ * pages have not been reclaimed.
+ *
+ * This does make POSIX_FADV_NOREUSE largely useless
+ * with non-sequential access. However, sequential
+ * access is the more common use case and the flag is
+ * merely advisory.
+ */
+ start = offset;
+ end = uio->uio_offset - 1;
+ mtx_lock(mtxp);
+ if (fp->f_advice != NULL &&
+ fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
+ if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
+ start = fp->f_advice->fa_prevstart;
+ else if (fp->f_advice->fa_prevstart != 0 &&
+ fp->f_advice->fa_prevstart == end + 1)
+ end = fp->f_advice->fa_prevend;
+ fp->f_advice->fa_prevstart = start;
+ fp->f_advice->fa_prevend = end;
+ }
+ mtx_unlock(mtxp);
+ error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
+ }
+
unlock:
VFS_UNLOCK_GIANT(vfslocked);
return (error);
Modified: stable/9/sys/sys/file.h
==============================================================================
--- stable/9/sys/sys/file.h Tue Aug 28 18:33:12 2012 (r239786)
+++ stable/9/sys/sys/file.h Tue Aug 28 18:44:56 2012 (r239787)
@@ -126,6 +126,8 @@ struct fadvise_info {
int fa_advice; /* (f) FADV_* type. */
off_t fa_start; /* (f) Region start. */
off_t fa_end; /* (f) Region end. */
+ off_t fa_prevstart; /* (f) Previous NOREUSE start. */
+ off_t fa_prevend; /* (f) Previous NOREUSE end. */
};
struct file {
More information about the svn-src-stable-9
mailing list