svn commit: r289279 - in head/sys: kern vm
Adrian Chadd
adrian.chadd at gmail.com
Mon Nov 2 03:20:15 UTC 2015
hiya jeff,
This broke low-memory, no-swap boards (e.g., MIPS).
On a MIPS board (carambola2) with 32MB of RAM, just scp'ing a kernel
into the rootfs on USB hangs the system. After doing some digging, I
found this:
INTERNAL: Allocating one item from buf free cache(0x83fea7e0)
uma_zalloc_arg: Bucketzone returned NULL
INTERNAL: Allocating one item from buf free cache(0x83fea7e0)
uma_zalloc_arg: Bucketzone returned NULL
... and it was just stuck in a loop, trying to allocate them, failing,
and trying again.
I'll see if I can reproduce this in qemu with sufficiently low RAM, so you
don't need a MIPS router to see it.
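Probably something along these lines (rough sketch; the malta kernel and
rootfs image names are placeholders, I haven't actually tried this yet):

  # boot a MIPS malta instance with 32MB of RAM and a disk-backed rootfs
  qemu-system-mips -M malta -m 32 -nographic \
      -kernel kernel.MALTA -hda mips-rootfs.img

Then scp something large into the guest and see whether it wedges the
same way.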
It's sufficient to just start the scp; it runs out of RAM within a
couple of seconds.
Any ideas?
-adrian
On 13 October 2015 at 19:10, Jeff Roberson <jeff at freebsd.org> wrote:
> Author: jeff
> Date: Wed Oct 14 02:10:07 2015
> New Revision: 289279
> URL: https://svnweb.freebsd.org/changeset/base/289279
>
> Log:
> Parallelize the buffer cache and rewrite getnewbuf(). This results in an
> 8x performance improvement in a micro benchmark on a 4-socket machine.
>
> - Get buffer headers from a per-cpu uma cache that sits in front of the
> free queue.
> - Use a per-cpu quantum cache in vmem to eliminate contention for kva.
> - Use multiple clean queues according to buffer cache size to eliminate
> clean queue lock contention.
> - Introduce a bufspace daemon that attempts to prevent getnewbuf() callers
> from blocking or doing direct recycling.
> - Close some bufspace allocation races that could lead to endless
> recycling.
> - Further the transition to a more modern style of small functions grouped
> by prefix in order to manage growing complexity.
>
> Sponsored by: EMC / Isilon
> Reviewed by: kib
> Tested by: pho
>
> Modified:
> head/sys/kern/vfs_bio.c
> head/sys/vm/vm_init.c
>
> Modified: head/sys/kern/vfs_bio.c
> ==============================================================================
> --- head/sys/kern/vfs_bio.c Wed Oct 14 00:43:29 2015 (r289278)
> +++ head/sys/kern/vfs_bio.c Wed Oct 14 02:10:07 2015 (r289279)
> @@ -63,6 +63,7 @@ __FBSDID("$FreeBSD$");
> #include <sys/proc.h>
> #include <sys/resourcevar.h>
> #include <sys/rwlock.h>
> +#include <sys/smp.h>
> #include <sys/sysctl.h>
> #include <sys/sysproto.h>
> #include <sys/vmem.h>
> @@ -100,6 +101,7 @@ caddr_t unmapped_buf;
>
> /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
> struct proc *bufdaemonproc;
> +struct proc *bufspacedaemonproc;
>
> static int inmem(struct vnode *vp, daddr_t blkno);
> static void vm_hold_free_pages(struct buf *bp, int newbsize);
> @@ -116,11 +118,18 @@ static void vfs_vmio_extend(struct buf *
> static int vfs_bio_clcheck(struct vnode *vp, int size,
> daddr_t lblkno, daddr_t blkno);
> static int buf_flush(struct vnode *vp, int);
> +static int buf_recycle(bool);
> +static int buf_scan(bool);
> static int flushbufqueues(struct vnode *, int, int);
> static void buf_daemon(void);
> static void bremfreel(struct buf *bp);
> static __inline void bd_wakeup(void);
> static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
> +static void bufkva_reclaim(vmem_t *, int);
> +static void bufkva_free(struct buf *);
> +static int buf_import(void *, void **, int, int);
> +static void buf_release(void *, void **, int);
> +
> #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
> defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
> static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
> @@ -145,23 +154,23 @@ static long bufkvaspace;
> SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0,
> "Kernel virtual memory used for buffers");
> static long maxbufspace;
> -SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
> - "Maximum allowed value of bufspace (including buf_daemon)");
> +SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0,
> + "Maximum allowed value of bufspace (including metadata)");
> static long bufmallocspace;
> SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
> "Amount of malloced memory for buffers");
> static long maxbufmallocspace;
> -SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
> - "Maximum amount of malloced memory for buffers");
> +SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace,
> + 0, "Maximum amount of malloced memory for buffers");
> static long lobufspace;
> -SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
> +SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0,
> "Minimum amount of buffers we want to have");
> long hibufspace;
> -SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
> - "Maximum allowed value of bufspace (excluding buf_daemon)");
> -static int bufreusecnt;
> -SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
> - "Number of times we have reused a buffer");
> +SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0,
> + "Maximum allowed value of bufspace (excluding metadata)");
> +long bufspacethresh;
> +SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh,
> + 0, "Bufspace consumed before waking the daemon to free some");
> static int buffreekvacnt;
> SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
> "Number of times we have freed the KVA space from some buffer");
> @@ -205,10 +214,10 @@ SYSCTL_INT(_vfs, OID_AUTO, numfreebuffer
> "Number of free buffers");
> static int lofreebuffers;
> SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
> - "XXX Unused");
> + "Target number of free buffers");
> static int hifreebuffers;
> SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
> - "XXX Complicatedly unused");
> + "Threshold for clean buffer recycling");
> static int getnewbufcalls;
> SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
> "Number of calls to getnewbuf");
> @@ -219,6 +228,9 @@ static int mappingrestarts;
> SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
> "Number of times getblk has had to restart a buffer mapping for "
> "unmapped buffer");
> +static int numbufallocfails;
> +SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0,
> + "Number of times buffer allocations failed");
> static int flushbufqtarget = 100;
> SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
> "Amount of work to do in flushbufqueues when helping bufdaemon");
> @@ -233,16 +245,6 @@ SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_
> "Permit the use of the unmapped i/o");
>
> /*
> - * Lock for the non-dirty bufqueues
> - */
> -static struct mtx_padalign bqclean;
> -
> -/*
> - * Lock for the dirty queue.
> - */
> -static struct mtx_padalign bqdirty;
> -
> -/*
> * This lock synchronizes access to bd_request.
> */
> static struct mtx_padalign bdlock;
> @@ -271,6 +273,11 @@ static struct mtx_padalign bdirtylock;
> static int bd_request;
>
> /*
> + * Request/wakeup point for the bufspace daemon.
> + */
> +static int bufspace_request;
> +
> +/*
> * Request for the buf daemon to write more buffers than is indicated by
> * lodirtybuf. This may be necessary to push out excess dependencies or
> * defragment the address space where a simple count of the number of dirty
> @@ -298,7 +305,7 @@ static int runningbufreq;
> * Synchronization (sleep/wakeup) variable for buffer requests.
> * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
> * by and/or.
> - * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(),
> + * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(),
> * getnewbuf(), and getblk().
> */
> static volatile int needsbuffer;
> @@ -311,14 +318,21 @@ static int bdirtywait;
> /*
> * Definitions for the buffer free lists.
> */
> -#define BUFFER_QUEUES 4 /* number of free buffer queues */
> -
> #define QUEUE_NONE 0 /* on no queue */
> -#define QUEUE_CLEAN 1 /* non-B_DELWRI buffers */
> +#define QUEUE_EMPTY 1 /* empty buffer headers */
> #define QUEUE_DIRTY 2 /* B_DELWRI buffers */
> -#define QUEUE_EMPTY 3 /* empty buffer headers */
> +#define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */
> #define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */
>
> +/* Maximum number of clean buffer queues. */
> +#define CLEAN_QUEUES 16
> +
> +/* Configured number of clean queues. */
> +static int clean_queues;
> +
> +/* Maximum number of buffer queues. */
> +#define BUFFER_QUEUES (QUEUE_CLEAN + CLEAN_QUEUES)
> +
> /* Queues for free buffers with various properties */
> static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
> #ifdef INVARIANTS
> @@ -326,15 +340,21 @@ static int bq_len[BUFFER_QUEUES];
> #endif
>
> /*
> + * Lock for each bufqueue
> + */
> +static struct mtx_padalign bqlocks[BUFFER_QUEUES];
> +
> +/*
> + * per-cpu empty buffer cache.
> + */
> +uma_zone_t buf_zone;
> +
> +/*
> * Single global constant for BUF_WMESG, to avoid getting multiple references.
> * buf_wmesg is referred from macros.
> */
> const char *buf_wmesg = BUF_WMESG;
>
> -#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */
> -#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */
> -#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
> -
> static int
> sysctl_runningspace(SYSCTL_HANDLER_ARGS)
> {
> @@ -382,6 +402,21 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS)
> }
> #endif
>
> +static int
> +bqcleanq(void)
> +{
> + static int nextq;
> +
> + return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN);
> +}
> +
> +static int
> +bqisclean(int qindex)
> +{
> +
> + return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES);
> +}
> +
> /*
> * bqlock:
> *
> @@ -391,9 +426,7 @@ static inline struct mtx *
> bqlock(int qindex)
> {
>
> - if (qindex == QUEUE_DIRTY)
> - return (struct mtx *)(&bqdirty);
> - return (struct mtx *)(&bqclean);
> + return (struct mtx *)&bqlocks[qindex];
> }
>
> /*
> @@ -447,62 +480,255 @@ bdirtyadd(void)
> }
>
> /*
> - * bufspacewakeup:
> + * bufspace_wakeup:
> *
> * Called when buffer space is potentially available for recovery.
> * getnewbuf() will block on this flag when it is unable to free
> * sufficient buffer space. Buffer space becomes recoverable when
> * bp's get placed back in the queues.
> */
> -static __inline void
> -bufspacewakeup(void)
> +static void
> +bufspace_wakeup(void)
> {
> - int need_wakeup, on;
>
> /*
> - * If someone is waiting for bufspace, wake them up. Even
> - * though we may not have freed the kva space yet, the waiting
> - * process will be able to now.
> + * If someone is waiting for bufspace, wake them up.
> + *
> + * Since needsbuffer is set prior to doing an additional queue
> + * scan it is safe to check for the flag prior to acquiring the
> + * lock. The thread that is preparing to scan again before
> + * blocking would discover the buf we released.
> */
> + if (needsbuffer) {
> + rw_rlock(&nblock);
> + if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1)
> + wakeup(__DEVOLATILE(void *, &needsbuffer));
> + rw_runlock(&nblock);
> + }
> +}
> +
> +/*
> + * bufspace_daemonwakeup:
> + *
> + * Wakeup the daemon responsible for freeing clean bufs.
> + */
> +static void
> +bufspace_daemonwakeup(void)
> +{
> rw_rlock(&nblock);
> - for (;;) {
> - need_wakeup = 0;
> - on = needsbuffer;
> - if ((on & VFS_BIO_NEED_BUFSPACE) == 0)
> - break;
> - need_wakeup = 1;
> - if (atomic_cmpset_rel_int(&needsbuffer, on,
> - on & ~VFS_BIO_NEED_BUFSPACE))
> - break;
> + if (bufspace_request == 0) {
> + bufspace_request = 1;
> + wakeup(&bufspace_request);
> }
> - if (need_wakeup)
> - wakeup(__DEVOLATILE(void *, &needsbuffer));
> rw_runlock(&nblock);
> }
>
> /*
> - * bufspaceadjust:
> + * bufspace_adjust:
> *
> * Adjust the reported bufspace for a KVA managed buffer, possibly
> * waking any waiters.
> */
> static void
> -bufspaceadjust(struct buf *bp, int bufsize)
> +bufspace_adjust(struct buf *bp, int bufsize)
> {
> + long space;
> int diff;
>
> KASSERT((bp->b_flags & B_MALLOC) == 0,
> - ("bufspaceadjust: malloc buf %p", bp));
> + ("bufspace_adjust: malloc buf %p", bp));
> diff = bufsize - bp->b_bufsize;
> if (diff < 0) {
> atomic_subtract_long(&bufspace, -diff);
> - bufspacewakeup();
> - } else
> - atomic_add_long(&bufspace, diff);
> + bufspace_wakeup();
> + } else {
> + space = atomic_fetchadd_long(&bufspace, diff);
> + /* Wake up the daemon on the transition. */
> + if (space < bufspacethresh && space + diff >= bufspacethresh)
> + bufspace_daemonwakeup();
> + }
> bp->b_bufsize = bufsize;
> }
>
> /*
> + * bufspace_reserve:
> + *
> + * Reserve bufspace before calling allocbuf(). metadata has a
> + * different space limit than data.
> + */
> +static int
> +bufspace_reserve(int size, bool metadata)
> +{
> + long limit;
> + long space;
> +
> + if (metadata)
> + limit = maxbufspace;
> + else
> + limit = hibufspace;
> + do {
> + space = bufspace;
> + if (space + size > limit)
> + return (ENOSPC);
> + } while (atomic_cmpset_long(&bufspace, space, space + size) == 0);
> +
> + /* Wake up the daemon on the transition. */
> + if (space < bufspacethresh && space + size >= bufspacethresh)
> + bufspace_daemonwakeup();
> +
> + return (0);
> +}
> +
> +/*
> + * bufspace_release:
> + *
> + * Release reserved bufspace after bufspace_adjust() has consumed it.
> + */
> +static void
> +bufspace_release(int size)
> +{
> + atomic_subtract_long(&bufspace, size);
> + bufspace_wakeup();
> +}
> +
> +/*
> + * bufspace_wait:
> + *
> + * Wait for bufspace, acting as the buf daemon if a locked vnode is
> + * supplied. needsbuffer must be set in a safe fashion prior to
> + * polling for space. The operation must be re-tried on return.
> + */
> +static void
> +bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo)
> +{
> + struct thread *td;
> + int error, fl, norunbuf;
> +
> + if ((gbflags & GB_NOWAIT_BD) != 0)
> + return;
> +
> + td = curthread;
> + rw_wlock(&nblock);
> + while (needsbuffer != 0) {
> + if (vp != NULL && vp->v_type != VCHR &&
> + (td->td_pflags & TDP_BUFNEED) == 0) {
> + rw_wunlock(&nblock);
> + /*
> + * getblk() is called with a vnode locked, and
> + * some majority of the dirty buffers may as
> + * well belong to the vnode. Flushing the
> + * buffers there would make a progress that
> + * cannot be achieved by the buf_daemon, that
> + * cannot lock the vnode.
> + */
> + norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
> + (td->td_pflags & TDP_NORUNNINGBUF);
> +
> + /*
> + * Play bufdaemon. The getnewbuf() function
> + * may be called while the thread owns lock
> + * for another dirty buffer for the same
> + * vnode, which makes it impossible to use
> + * VOP_FSYNC() there, due to the buffer lock
> + * recursion.
> + */
> + td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
> + fl = buf_flush(vp, flushbufqtarget);
> + td->td_pflags &= norunbuf;
> + rw_wlock(&nblock);
> + if (fl != 0)
> + continue;
> + if (needsbuffer == 0)
> + break;
> + }
> + error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
> + (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
> + if (error != 0)
> + break;
> + }
> + rw_wunlock(&nblock);
> +}
> +
> +
> +/*
> + * bufspace_daemon:
> + *
> + * buffer space management daemon. Tries to maintain some marginal
> + * amount of free buffer space so that requesting processes neither
> + * block nor work to reclaim buffers.
> + */
> +static void
> +bufspace_daemon(void)
> +{
> + for (;;) {
> + kproc_suspend_check(bufspacedaemonproc);
> +
> + /*
> + * Free buffers from the clean queue until we meet our
> + * targets.
> + *
> + * Theory of operation: The buffer cache is most efficient
> + * when some free buffer headers and space are always
> + * available to getnewbuf(). This daemon attempts to prevent
> + * the excessive blocking and synchronization associated
> + * with shortfall. It goes through three phases according
> + * demand:
> + *
> + * 1) The daemon wakes up voluntarily once per-second
> + * during idle periods when the counters are below
> + * the wakeup thresholds (bufspacethresh, lofreebuffers).
> + *
> + * 2) The daemon wakes up as we cross the thresholds
> + * ahead of any potential blocking. This may bounce
> + * slightly according to the rate of consumption and
> + * release.
> + *
> + * 3) The daemon and consumers are starved for working
> + * clean buffers. This is the 'bufspace' sleep below
> + * which will inefficiently trade bufs with bqrelse
> + * until we return to condition 2.
> + */
> + while (bufspace > lobufspace ||
> + numfreebuffers < hifreebuffers) {
> + if (buf_recycle(false) != 0) {
> + atomic_set_int(&needsbuffer, 1);
> + if (buf_recycle(false) != 0) {
> + rw_wlock(&nblock);
> + if (needsbuffer)
> + rw_sleep(__DEVOLATILE(void *,
> + &needsbuffer), &nblock,
> + PRIBIO|PDROP, "bufspace",
> + hz/10);
> + else
> + rw_wunlock(&nblock);
> + }
> + }
> + maybe_yield();
> + }
> +
> + /*
> + * Re-check our limits under the exclusive nblock.
> + */
> + rw_wlock(&nblock);
> + if (bufspace < bufspacethresh &&
> + numfreebuffers > lofreebuffers) {
> + bufspace_request = 0;
> + rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP,
> + "-", hz);
> + } else
> + rw_wunlock(&nblock);
> + }
> +}
> +
> +static struct kproc_desc bufspace_kp = {
> + "bufspacedaemon",
> + bufspace_daemon,
> + &bufspacedaemonproc
> +};
> +SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start,
> + &bufspace_kp);
> +
> +/*
> * bufmallocadjust:
> *
> * Adjust the reported bufspace for a malloc managed buffer, possibly
> @@ -516,10 +742,9 @@ bufmallocadjust(struct buf *bp, int bufs
> KASSERT((bp->b_flags & B_MALLOC) != 0,
> ("bufmallocadjust: non-malloc buf %p", bp));
> diff = bufsize - bp->b_bufsize;
> - if (diff < 0) {
> + if (diff < 0)
> atomic_subtract_long(&bufmallocspace, -diff);
> - bufspacewakeup();
> - } else
> + else
> atomic_add_long(&bufmallocspace, diff);
> bp->b_bufsize = bufsize;
> }
> @@ -571,67 +796,6 @@ runningbufwakeup(struct buf *bp)
> }
>
> /*
> - * bufcountadd:
> - *
> - * Called when a buffer has been added to one of the free queues to
> - * account for the buffer and to wakeup anyone waiting for free buffers.
> - * This typically occurs when large amounts of metadata are being handled
> - * by the buffer cache ( else buffer space runs out first, usually ).
> - */
> -static __inline void
> -bufcountadd(struct buf *bp)
> -{
> - int mask, need_wakeup, old, on;
> -
> - KASSERT((bp->b_flags & B_INFREECNT) == 0,
> - ("buf %p already counted as free", bp));
> - bp->b_flags |= B_INFREECNT;
> - old = atomic_fetchadd_int(&numfreebuffers, 1);
> - KASSERT(old >= 0 && old < nbuf,
> - ("numfreebuffers climbed to %d", old + 1));
> - mask = VFS_BIO_NEED_ANY;
> - if (numfreebuffers >= hifreebuffers)
> - mask |= VFS_BIO_NEED_FREE;
> - rw_rlock(&nblock);
> - for (;;) {
> - need_wakeup = 0;
> - on = needsbuffer;
> - if (on == 0)
> - break;
> - need_wakeup = 1;
> - if (atomic_cmpset_rel_int(&needsbuffer, on, on & ~mask))
> - break;
> - }
> - if (need_wakeup)
> - wakeup(__DEVOLATILE(void *, &needsbuffer));
> - rw_runlock(&nblock);
> -}
> -
> -/*
> - * bufcountsub:
> - *
> - * Decrement the numfreebuffers count as needed.
> - */
> -static void
> -bufcountsub(struct buf *bp)
> -{
> - int old;
> -
> - /*
> - * Fixup numfreebuffers count. If the buffer is invalid or not
> - * delayed-write, the buffer was free and we must decrement
> - * numfreebuffers.
> - */
> - if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
> - KASSERT((bp->b_flags & B_INFREECNT) != 0,
> - ("buf %p not counted in numfreebuffers", bp));
> - bp->b_flags &= ~B_INFREECNT;
> - old = atomic_fetchadd_int(&numfreebuffers, -1);
> - KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
> - }
> -}
> -
> -/*
> * waitrunningbufspace()
> *
> * runningbufspace is a measure of the amount of I/O currently
> @@ -847,8 +1011,10 @@ bufinit(void)
> int i;
>
> CTASSERT(MAXBCACHEBUF >= MAXBSIZE);
> - mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
> - mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
> + mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF);
> + mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF);
> + for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++)
> + mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF);
> mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
> rw_init(&nblock, "needsbuffer lock");
> mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
> @@ -864,7 +1030,7 @@ bufinit(void)
> for (i = 0; i < nbuf; i++) {
> bp = &buf[i];
> bzero(bp, sizeof *bp);
> - bp->b_flags = B_INVAL | B_INFREECNT;
> + bp->b_flags = B_INVAL;
> bp->b_rcred = NOCRED;
> bp->b_wcred = NOCRED;
> bp->b_qindex = QUEUE_EMPTY;
> @@ -881,18 +1047,19 @@ bufinit(void)
> /*
> * maxbufspace is the absolute maximum amount of buffer space we are
> * allowed to reserve in KVM and in real terms. The absolute maximum
> - * is nominally used by buf_daemon. hibufspace is the nominal maximum
> - * used by most other processes. The differential is required to
> - * ensure that buf_daemon is able to run when other processes might
> - * be blocked waiting for buffer space.
> + * is nominally used by metadata. hibufspace is the nominal maximum
> + * used by most other requests. The differential is required to
> + * ensure that metadata deadlocks don't occur.
> *
> * maxbufspace is based on BKVASIZE. Allocating buffers larger then
> * this may result in KVM fragmentation which is not handled optimally
> - * by the system.
> + * by the system. XXX This is less true with vmem. We could use
> + * PAGE_SIZE.
> */
> maxbufspace = (long)nbuf * BKVASIZE;
> hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10);
> - lobufspace = hibufspace - MAXBCACHEBUF;
> + lobufspace = (hibufspace / 20) * 19; /* 95% */
> + bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;
>
> /*
> * Note: The 16 MiB upper limit for hirunningspace was chosen
> @@ -906,44 +1073,61 @@ bufinit(void)
> 16 * 1024 * 1024), 1024 * 1024);
> lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF);
>
> -/*
> - * Limit the amount of malloc memory since it is wired permanently into
> - * the kernel space. Even though this is accounted for in the buffer
> - * allocation, we don't want the malloced region to grow uncontrolled.
> - * The malloc scheme improves memory utilization significantly on average
> - * (small) directories.
> - */
> + /*
> + * Limit the amount of malloc memory since it is wired permanently into
> + * the kernel space. Even though this is accounted for in the buffer
> + * allocation, we don't want the malloced region to grow uncontrolled.
> + * The malloc scheme improves memory utilization significantly on
> + * average (small) directories.
> + */
> maxbufmallocspace = hibufspace / 20;
>
> -/*
> - * Reduce the chance of a deadlock occuring by limiting the number
> - * of delayed-write dirty buffers we allow to stack up.
> - */
> + /*
> + * Reduce the chance of a deadlock occuring by limiting the number
> + * of delayed-write dirty buffers we allow to stack up.
> + */
> hidirtybuffers = nbuf / 4 + 20;
> dirtybufthresh = hidirtybuffers * 9 / 10;
> numdirtybuffers = 0;
> -/*
> - * To support extreme low-memory systems, make sure hidirtybuffers cannot
> - * eat up all available buffer space. This occurs when our minimum cannot
> - * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming
> - * BKVASIZE'd buffers.
> - */
> + /*
> + * To support extreme low-memory systems, make sure hidirtybuffers
> + * cannot eat up all available buffer space. This occurs when our
> + * minimum cannot be met. We try to size hidirtybuffers to 3/4 our
> + * buffer space assuming BKVASIZE'd buffers.
> + */
> while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
> hidirtybuffers >>= 1;
> }
> lodirtybuffers = hidirtybuffers / 2;
>
> -/*
> - * Try to keep the number of free buffers in the specified range,
> - * and give special processes (e.g. like buf_daemon) access to an
> - * emergency reserve.
> - */
> - lofreebuffers = nbuf / 18 + 5;
> - hifreebuffers = 2 * lofreebuffers;
> + /*
> + * lofreebuffers should be sufficient to avoid stalling waiting on
> + * buf headers under heavy utilization. The bufs in per-cpu caches
> + * are counted as free but will be unavailable to threads executing
> + * on other cpus.
> + *
> + * hifreebuffers is the free target for the bufspace daemon. This
> + * should be set appropriately to limit work per-iteration.
> + */
> + lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus);
> + hifreebuffers = (3 * lofreebuffers) / 2;
> numfreebuffers = nbuf;
>
> bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
> VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
> +
> + /* Setup the kva and free list allocators. */
> + vmem_set_reclaim(buffer_arena, bufkva_reclaim);
> + buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf),
> + NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0);
> +
> + /*
> + * Size the clean queue according to the amount of buffer space.
> + * One queue per-256mb up to the max. More queues gives better
> + * concurrency but less accurate LRU.
> + */
> + clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES);
> +
> }
>
> #ifdef INVARIANTS
> @@ -1129,10 +1313,25 @@ binsfree(struct buf *bp, int qindex)
> {
> struct mtx *olock, *nlock;
>
> - BUF_ASSERT_XLOCKED(bp);
> + if (qindex != QUEUE_EMPTY) {
> + BUF_ASSERT_XLOCKED(bp);
> + }
> +
> + /*
> + * Stick to the same clean queue for the lifetime of the buf to
> + * limit locking below. Otherwise pick ont sequentially.
> + */
> + if (qindex == QUEUE_CLEAN) {
> + if (bqisclean(bp->b_qindex))
> + qindex = bp->b_qindex;
> + else
> + qindex = bqcleanq();
> + }
>
> + /*
> + * Handle delayed bremfree() processing.
> + */
> nlock = bqlock(qindex);
> - /* Handle delayed bremfree() processing. */
> if (bp->b_flags & B_REMFREE) {
> olock = bqlock(bp->b_qindex);
> mtx_lock(olock);
> @@ -1156,15 +1355,263 @@ binsfree(struct buf *bp, int qindex)
> bq_len[bp->b_qindex]++;
> #endif
> mtx_unlock(nlock);
> +}
> +
> +/*
> + * buf_free:
> + *
> + * Free a buffer to the buf zone once it no longer has valid contents.
> + */
> +static void
> +buf_free(struct buf *bp)
> +{
> +
> + if (bp->b_flags & B_REMFREE)
> + bremfreef(bp);
> + if (bp->b_vflags & BV_BKGRDINPROG)
> + panic("losing buffer 1");
> + if (bp->b_rcred != NOCRED) {
> + crfree(bp->b_rcred);
> + bp->b_rcred = NOCRED;
> + }
> + if (bp->b_wcred != NOCRED) {
> + crfree(bp->b_wcred);
> + bp->b_wcred = NOCRED;
> + }
> + if (!LIST_EMPTY(&bp->b_dep))
> + buf_deallocate(bp);
> + bufkva_free(bp);
> + BUF_UNLOCK(bp);
> + uma_zfree(buf_zone, bp);
> + atomic_add_int(&numfreebuffers, 1);
> + bufspace_wakeup();
> +}
> +
> +/*
> + * buf_import:
> + *
> + * Import bufs into the uma cache from the buf list. The system still
> + * expects a static array of bufs and much of the synchronization
> + * around bufs assumes type stable storage. As a result, UMA is used
> + * only as a per-cpu cache of bufs still maintained on a global list.
> + */
> +static int
> +buf_import(void *arg, void **store, int cnt, int flags)
> +{
> + struct buf *bp;
> + int i;
> +
> + mtx_lock(&bqlocks[QUEUE_EMPTY]);
> + for (i = 0; i < cnt; i++) {
> + bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
> + if (bp == NULL)
> + break;
> + bremfreel(bp);
> + store[i] = bp;
> + }
> + mtx_unlock(&bqlocks[QUEUE_EMPTY]);
> +
> + return (i);
> +}
> +
> +/*
> + * buf_release:
> + *
> + * Release bufs from the uma cache back to the buffer queues.
> + */
> +static void
> +buf_release(void *arg, void **store, int cnt)
> +{
> + int i;
> +
> + for (i = 0; i < cnt; i++)
> + binsfree(store[i], QUEUE_EMPTY);
> +}
> +
> +/*
> + * buf_alloc:
> + *
> + * Allocate an empty buffer header.
> + */
> +static struct buf *
> +buf_alloc(void)
> +{
> + struct buf *bp;
> +
> + bp = uma_zalloc(buf_zone, M_NOWAIT);
> + if (bp == NULL) {
> + bufspace_daemonwakeup();
> + atomic_add_int(&numbufallocfails, 1);
> + return (NULL);
> + }
> +
> + /*
> + * Wake-up the bufspace daemon on transition.
> + */
> + if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers)
> + bufspace_daemonwakeup();
> +
> + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
> + panic("getnewbuf_empty: Locked buf %p on free queue.", bp);
> +
> + KASSERT(bp->b_vp == NULL,
> + ("bp: %p still has vnode %p.", bp, bp->b_vp));
> + KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
> + ("invalid buffer %p flags %#x", bp, bp->b_flags));
> + KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
> + ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
> + KASSERT(bp->b_npages == 0,
> + ("bp: %p still has %d vm pages\n", bp, bp->b_npages));
> + KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
> + KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
> +
> + bp->b_flags = 0;
> + bp->b_ioflags = 0;
> + bp->b_xflags = 0;
> + bp->b_vflags = 0;
> + bp->b_vp = NULL;
> + bp->b_blkno = bp->b_lblkno = 0;
> + bp->b_offset = NOOFFSET;
> + bp->b_iodone = 0;
> + bp->b_error = 0;
> + bp->b_resid = 0;
> + bp->b_bcount = 0;
> + bp->b_npages = 0;
> + bp->b_dirtyoff = bp->b_dirtyend = 0;
> + bp->b_bufobj = NULL;
> + bp->b_pin_count = 0;
> + bp->b_data = bp->b_kvabase = unmapped_buf;
> + bp->b_fsprivate1 = NULL;
> + bp->b_fsprivate2 = NULL;
> + bp->b_fsprivate3 = NULL;
> + LIST_INIT(&bp->b_dep);
> +
> + return (bp);
> +}
> +
> +/*
> + * buf_qrecycle:
> + *
> + * Free a buffer from the given bufqueue. kva controls whether the
> + * freed buf must own some kva resources. This is used for
> + * defragmenting.
> + */
> +static int
> +buf_qrecycle(int qindex, bool kva)
> +{
> + struct buf *bp, *nbp;
> +
> + if (kva)
> + atomic_add_int(&bufdefragcnt, 1);
> + nbp = NULL;
> + mtx_lock(&bqlocks[qindex]);
> + nbp = TAILQ_FIRST(&bufqueues[qindex]);
> +
> + /*
> + * Run scan, possibly freeing data and/or kva mappings on the fly
> + * depending.
> + */
> + while ((bp = nbp) != NULL) {
> + /*
> + * Calculate next bp (we can only use it if we do not
> + * release the bqlock).
> + */
> + nbp = TAILQ_NEXT(bp, b_freelist);
> +
> + /*
> + * If we are defragging then we need a buffer with
> + * some kva to reclaim.
> + */
> + if (kva && bp->b_kvasize == 0)
> + continue;
> +
> + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
> + continue;
> +
> + /*
> + * Skip buffers with background writes in progress.
> + */
> + if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
> + BUF_UNLOCK(bp);
> + continue;
> + }
> +
> + KASSERT(bp->b_qindex == qindex,
> + ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
> + /*
> + * NOTE: nbp is now entirely invalid. We can only restart
> + * the scan from this point on.
> + */
> + bremfreel(bp);
> + mtx_unlock(&bqlocks[qindex]);
> +
> + /*
> + * Requeue the background write buffer with error and
> + * restart the scan.
> + */
> + if ((bp->b_vflags & BV_BKGRDERR) != 0) {
> + bqrelse(bp);
> + mtx_lock(&bqlocks[qindex]);
> + nbp = TAILQ_FIRST(&bufqueues[qindex]);
> + continue;
> + }
> + bp->b_flags |= B_INVAL;
> + brelse(bp);
> + return (0);
> + }
> + mtx_unlock(&bqlocks[qindex]);
> +
> + return (ENOBUFS);
> +}
> +
> +/*
> + * buf_recycle:
> + *
> + * Iterate through all clean queues until we find a buf to recycle or
> + * exhaust the search.
> + */
> +static int
> +buf_recycle(bool kva)
> +{
> + int qindex, first_qindex;
> +
> + qindex = first_qindex = bqcleanq();
> + do {
> + if (buf_qrecycle(qindex, kva) == 0)
> + return (0);
> + if (++qindex == QUEUE_CLEAN + clean_queues)
> + qindex = QUEUE_CLEAN;
> + } while (qindex != first_qindex);
> +
> + return (ENOBUFS);
> +}
> +
> +/*
> + * buf_scan:
> + *
> + * Scan the clean queues looking for a buffer to recycle. needsbuffer
> + * is set on failure so that the caller may optionally bufspace_wait()
> + * in a race-free fashion.
> + */
> +static int
> +buf_scan(bool defrag)
> +{
> + int error;
>
> /*
> - * Something we can maybe free or reuse.
> - */
> - if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
> - bufspacewakeup();
> -
> - if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
> - bufcountadd(bp);
> + * To avoid heavy synchronization and wakeup races we set
> + * needsbuffer and re-poll before failing. This ensures that
> + * no frees can be missed between an unsuccessful poll and
> + * going to sleep in a synchronized fashion.
> + */
>
> *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
>