svn commit: r307626 - head/sys/ufs/ffs
Adrian Chadd
adrian.chadd at gmail.com
Sat Nov 12 23:19:15 UTC 2016
hi!
This broke freebsd on mips24k.
BAD_PAGE_FAULT: pid 1 tid 100001 (init), uid 0: pc 0x4002a4 got a read
fault (type 0x2) at 0
Trapframe Register Dump:
zero: 0 at: 0 v0: 0 v1: 0
a0: 0x7fffeecc a1: 0 a2: 0 a3: 0
t0: 0 t1: 0 t2: 0 t3: 0
t4: 0 t5: 0 t6: 0 t7: 0
t8: 0 t9: 0x400260 s0: 0x10 s1: 0x2
s2: 0x7fffeed0 s3: 0 s4: 0 s5: 0
s6: 0 s7: 0 k0: 0 k1: 0
gp: 0x4d55d0 sp: 0x7ffeee90 s8: 0 ra: 0
sr: 0xfc13 mullo: 0 mulhi: 0 badvaddr: 0
cause: 0x8 pc: 0x4002a4
Page table info for pc address 0x4002a4: pde = 0x809be000, pte = 0xa001acda
Dumping 4 words starting at pc address 0x4002a4:
8c420000 14400003 00908021 8f828024
Page table info for bad address 0: pde = 0, pte = 0
.. and yes, I've spent three days bisecting everything to get to this
particular commit.
-adrian
On 19 October 2016 at 04:09, Konstantin Belousov <kib at freebsd.org> wrote:
> Author: kib
> Date: Wed Oct 19 11:09:29 2016
> New Revision: 307626
> URL: https://svnweb.freebsd.org/changeset/base/307626
>
> Log:
> Add FFS pager, which uses buffer cache read operation to validate pages.
> See the comments for more detailed description of the algorithm.
>
> The pager is used unconditionally when the block size of the
> underlying device is larger than the machine page size, since local
> vnode pager cannot handle the configuration [1]. Otherwise, the
> vfs.ffs.use_buf_pager sysctl allows to switch to the local pager.
>
> Measurements demonstrated no regression in the ever-important
> buildworld benchmark, and small (~5%) throughput improvements in the
> special microbenchmark configuration for dbench over swap-backed
> md(4).
>
> Code can be generalized and reused for other filesystems which use
> buffer cache.
>
> Reported by: Anton Yuzhaninov <citrin at citrin.ru> [1]
> Tested by: pho
> Benchmarked by: mjg, pho
> Reviewed by: alc, markj, mckusick (previous version)
> Sponsored by: The FreeBSD Foundation
> MFC after: 2 weeks
> Differential revision: https://reviews.freebsd.org/D8198
>
> Modified:
> head/sys/ufs/ffs/ffs_vnops.c
>
> Modified: head/sys/ufs/ffs/ffs_vnops.c
> ==============================================================================
> --- head/sys/ufs/ffs/ffs_vnops.c Wed Oct 19 10:01:04 2016 (r307625)
> +++ head/sys/ufs/ffs/ffs_vnops.c Wed Oct 19 11:09:29 2016 (r307626)
> @@ -77,6 +77,7 @@ __FBSDID("$FreeBSD$");
> #include <sys/priv.h>
> #include <sys/rwlock.h>
> #include <sys/stat.h>
> +#include <sys/sysctl.h>
> #include <sys/vmmeter.h>
> #include <sys/vnode.h>
>
> @@ -86,6 +87,7 @@ __FBSDID("$FreeBSD$");
> #include <vm/vm_object.h>
> #include <vm/vm_page.h>
> #include <vm/vm_pager.h>
> +#include <vm/vm_pageout.h>
> #include <vm/vnode_pager.h>
>
> #include <ufs/ufs/extattr.h>
> @@ -102,8 +104,9 @@ __FBSDID("$FreeBSD$");
> #ifdef DIRECTIO
> extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
> #endif
> -static vop_fsync_t ffs_fsync;
> static vop_fdatasync_t ffs_fdatasync;
> +static vop_fsync_t ffs_fsync;
> +static vop_getpages_t ffs_getpages;
> static vop_lock1_t ffs_lock;
> static vop_read_t ffs_read;
> static vop_write_t ffs_write;
> @@ -119,13 +122,12 @@ static vop_openextattr_t ffs_openextattr
> static vop_setextattr_t ffs_setextattr;
> static vop_vptofh_t ffs_vptofh;
>
> -
> /* Global vfs data structures for ufs. */
> struct vop_vector ffs_vnodeops1 = {
> .vop_default = &ufs_vnodeops,
> .vop_fsync = ffs_fsync,
> .vop_fdatasync = ffs_fdatasync,
> - .vop_getpages = vnode_pager_local_getpages,
> + .vop_getpages = ffs_getpages,
> .vop_getpages_async = vnode_pager_local_getpages_async,
> .vop_lock1 = ffs_lock,
> .vop_read = ffs_read,
> @@ -147,7 +149,7 @@ struct vop_vector ffs_vnodeops2 = {
> .vop_default = &ufs_vnodeops,
> .vop_fsync = ffs_fsync,
> .vop_fdatasync = ffs_fdatasync,
> - .vop_getpages = vnode_pager_local_getpages,
> + .vop_getpages = ffs_getpages,
> .vop_getpages_async = vnode_pager_local_getpages_async,
> .vop_lock1 = ffs_lock,
> .vop_read = ffs_read,
> @@ -1784,3 +1786,165 @@ vop_vptofh {
> ufhp->ufid_gen = ip->i_gen;
> return (0);
> }
> +
> +SYSCTL_DECL(_vfs_ffs);
> +static int use_buf_pager = 1;
> +SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
> + "Always use buffer pager instead of bmap");
> +static int buf_pager_relbuf;
> +SYSCTL_INT(_vfs_ffs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN,
> + &buf_pager_relbuf, 0,
> + "Make buffer pager release buffers after reading");
> +
> +/*
> + * The FFS pager. It uses buffer reads to validate pages.
> + *
> + * In contrast to the generic local pager from vm/vnode_pager.c, this
> + * pager correctly and easily handles volumes where the underlying
> + * device block size is greater than the machine page size. The
> + * buffer cache transparently extends the requested page run to be
> + * aligned at the block boundary, and does the necessary bogus page
> + * replacements in the addends to avoid obliterating already valid
> + * pages.
> + *
> + * The only non-trivial issue is that the exclusive busy state for
> + * pages, which is assumed by the vm_pager_getpages() interface, is
> + * incompatible with the VMIO buffer cache's desire to share-busy the
> + * pages. This function performs a trivial downgrade of the pages'
> + * state before reading buffers, and a less trivial upgrade from the
> + * shared-busy to excl-busy state after the read.
> + */
> +static int
> +ffs_getpages(struct vop_getpages_args *ap)
> +{
> + struct vnode *vp;
> + vm_page_t *ma, m;
> + vm_object_t object;
> + struct buf *bp;
> + struct ufsmount *um;
> + ufs_lbn_t lbn, lbnp;
> + vm_ooffset_t la, lb;
> + long bsize;
> + int bo_bs, count, error, i;
> + bool redo, lpart;
> +
> + vp = ap->a_vp;
> + ma = ap->a_m;
> + count = ap->a_count;
> +
> + um = VFSTOUFS(ap->a_vp->v_mount);
> + bo_bs = um->um_devvp->v_bufobj.bo_bsize;
> + if (!use_buf_pager && bo_bs <= PAGE_SIZE)
> + return (vnode_pager_generic_getpages(vp, ma, count,
> + ap->a_rbehind, ap->a_rahead, NULL, NULL));
> +
> + object = vp->v_object;
> + la = IDX_TO_OFF(ma[count - 1]->pindex);
> + if (la >= object->un_pager.vnp.vnp_size)
> + return (VM_PAGER_BAD);
> + lpart = la + PAGE_SIZE > object->un_pager.vnp.vnp_size;
> + if (ap->a_rbehind != NULL) {
> + lb = IDX_TO_OFF(ma[0]->pindex);
> + *ap->a_rbehind = OFF_TO_IDX(lb - rounddown2(lb, bo_bs));
> + }
> + if (ap->a_rahead != NULL) {
> + *ap->a_rahead = OFF_TO_IDX(roundup2(la, bo_bs) - la);
> + if (la + IDX_TO_OFF(*ap->a_rahead) >=
> + object->un_pager.vnp.vnp_size) {
> + *ap->a_rahead = OFF_TO_IDX(roundup2(object->un_pager.
> + vnp.vnp_size, PAGE_SIZE) - la);
> + }
> + }
> + VM_OBJECT_WLOCK(object);
> +again:
> + for (i = 0; i < count; i++)
> + vm_page_busy_downgrade(ma[i]);
> + VM_OBJECT_WUNLOCK(object);
> +
> + lbnp = -1;
> + for (i = 0; i < count; i++) {
> + m = ma[i];
> +
> + /*
> + * Pages are shared busy and the object lock is not
> + * owned, which together allow for the pages'
> + * invalidation. The racy test for validity avoids
> + * useless creation of the buffer for the most typical
> + * case when invalidation is not used in redo or for
> + * parallel read. The shared->excl upgrade loop at
> + * the end of the function catches the race in a
> + * reliable way (protected by the object lock).
> + */
> + if (m->valid == VM_PAGE_BITS_ALL)
> + continue;
> +
> + lbn = lblkno(um->um_fs, IDX_TO_OFF(m->pindex));
> + if (lbn != lbnp) {
> + bsize = blksize(um->um_fs, VTOI(vp), lbn);
> + error = bread_gb(vp, lbn, bsize, NOCRED, GB_UNMAPPED,
> + &bp);
> + if (error != 0)
> + break;
> + KASSERT(1 /* racy, enable for debugging */ ||
> + m->valid == VM_PAGE_BITS_ALL || i == count - 1,
> + ("buf %d %p invalid", i, m));
> + if (i == count - 1 && lpart) {
> + VM_OBJECT_WLOCK(object);
> + if (m->valid != 0 &&
> + m->valid != VM_PAGE_BITS_ALL)
> + vm_page_zero_invalid(m, TRUE);
> + VM_OBJECT_WUNLOCK(object);
> + }
> + if (LIST_EMPTY(&bp->b_dep)) {
> + /*
> + * Invalidation clears m->valid, but
> + * may leave B_CACHE flag if the
> + * buffer existed at the invalidation
> + * time. In this case, recycle the
> + * buffer to do real read on next
> + * bread() after redo.
> + *
> + * Otherwise B_RELBUF is not strictly
> + * necessary, enable to reduce buf
> + * cache pressure.
> + */
> + if (buf_pager_relbuf ||
> + m->valid != VM_PAGE_BITS_ALL)
> + bp->b_flags |= B_RELBUF;
> +
> + bp->b_flags &= ~B_NOCACHE;
> + brelse(bp);
> + } else {
> + bqrelse(bp);
> + }
> + lbnp = lbn;
> + }
> + }
> +
> + VM_OBJECT_WLOCK(object);
> + redo = false;
> + for (i = 0; i < count; i++) {
> + vm_page_sunbusy(ma[i]);
> + ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL);
> +
> + /*
> + * Since the pages were only sbusy while neither the
> + * buffer nor the object lock was held by us, or
> + * reallocated while vm_page_grab() slept for busy
> + * relinguish, they could have been invalidated.
> + * Recheck the valid bits and re-read as needed.
> + *
> + * Note that the last page is made fully valid in the
> + * read loop, and partial validity for the page at
> + * index count - 1 could mean that the page was
> + * invalidated or removed, so we must restart for
> + * safety as well.
> + */
> + if (ma[i]->valid != VM_PAGE_BITS_ALL)
> + redo = true;
> + }
> + if (redo && error == 0)
> + goto again;
> + VM_OBJECT_WUNLOCK(object);
> + return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
> +}
>
More information about the freebsd-mips
mailing list