svn commit: r289279 - in head/sys: kern vm
Jeff Roberson
jeff at FreeBSD.org
Wed Oct 14 02:10:08 UTC 2015
Author: jeff
Date: Wed Oct 14 02:10:07 2015
New Revision: 289279
URL: https://svnweb.freebsd.org/changeset/base/289279
Log:
Parallelize the buffer cache and rewrite getnewbuf(). This results in an
8x performance improvement in a microbenchmark on a 4-socket machine.
- Get buffer headers from a per-cpu uma cache that sits in front of the
free queue (see the sketch of this pattern after the file list below).
- Use a per-cpu quantum cache in vmem to eliminate contention for kva
(see the illustration after the diff).
- Use multiple clean queues according to buffer cache size to eliminate
clean queue lock contention.
- Introduce a bufspace daemon that attempts to prevent getnewbuf() callers
from blocking or doing direct recycling.
- Close some bufspace allocation races that could lead to endless
recycling.
- Further the transition to a more modern style of small functions grouped
by prefix in order to manage growing complexity.
Sponsored by: EMC / Isilon
Reviewed by: kib
Tested by: pho
Modified:
head/sys/kern/vfs_bio.c
head/sys/vm/vm_init.c
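
[The core of the buffer-header change is a UMA cache zone (buf_zone) layered
over the existing static buf array: buf_import() pulls headers off the global
QUEUE_EMPTY list into per-CPU buckets and buf_release() returns them via
binsfree(), both visible in the diff below. A minimal sketch of the same
pattern with a hypothetical item type; the struct, list, lock, and function
names here are illustrative, not from the commit:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <vm/uma.h>

struct item {
	TAILQ_ENTRY(item) i_link;
	/* ... payload ... */
};

static TAILQ_HEAD(, item) itemq = TAILQ_HEAD_INITIALIZER(itemq);
static struct mtx itemq_mtx;
static uma_zone_t item_zone;

/* Pull up to 'cnt' items off the global list into a per-CPU bucket. */
static int
item_import(void *arg, void **store, int cnt, int flags)
{
	struct item *ip;
	int i;

	mtx_lock(&itemq_mtx);
	for (i = 0; i < cnt; i++) {
		ip = TAILQ_FIRST(&itemq);
		if (ip == NULL)
			break;
		TAILQ_REMOVE(&itemq, ip, i_link);
		store[i] = ip;
	}
	mtx_unlock(&itemq_mtx);

	return (i);
}

/* Return items from a per-CPU bucket back to the global list. */
static void
item_release(void *arg, void **store, int cnt)
{
	int i;

	mtx_lock(&itemq_mtx);
	for (i = 0; i < cnt; i++)
		TAILQ_INSERT_HEAD(&itemq, (struct item *)store[i], i_link);
	mtx_unlock(&itemq_mtx);
}

static void
item_cache_init(void)
{

	mtx_init(&itemq_mtx, "itemq", NULL, MTX_DEF);
	/* Cache-only zone: storage stays on the global list, UMA only buckets it. */
	item_zone = uma_zcache_create("item cache", sizeof(struct item),
	    NULL, NULL, NULL, NULL, item_import, item_release, NULL, 0);
}

Because uma_zcache_create() creates a cache-only zone, UMA never allocates
backing storage itself; it simply keeps per-CPU buckets of pointers in front
of the import/release callbacks, which is what keeps the type-stable static
buf array intact while removing most free-list lock traffic.]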
Modified: head/sys/kern/vfs_bio.c
==============================================================================
--- head/sys/kern/vfs_bio.c Wed Oct 14 00:43:29 2015 (r289278)
+++ head/sys/kern/vfs_bio.c Wed Oct 14 02:10:07 2015 (r289279)
@@ -63,6 +63,7 @@ __FBSDID("$FreeBSD$");
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
+#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmem.h>
@@ -100,6 +101,7 @@ caddr_t unmapped_buf;
/* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
struct proc *bufdaemonproc;
+struct proc *bufspacedaemonproc;
static int inmem(struct vnode *vp, daddr_t blkno);
static void vm_hold_free_pages(struct buf *bp, int newbsize);
@@ -116,11 +118,18 @@ static void vfs_vmio_extend(struct buf *
static int vfs_bio_clcheck(struct vnode *vp, int size,
daddr_t lblkno, daddr_t blkno);
static int buf_flush(struct vnode *vp, int);
+static int buf_recycle(bool);
+static int buf_scan(bool);
static int flushbufqueues(struct vnode *, int, int);
static void buf_daemon(void);
static void bremfreel(struct buf *bp);
static __inline void bd_wakeup(void);
static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
+static void bufkva_reclaim(vmem_t *, int);
+static void bufkva_free(struct buf *);
+static int buf_import(void *, void **, int, int);
+static void buf_release(void *, void **, int);
+
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
@@ -145,23 +154,23 @@ static long bufkvaspace;
SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0,
"Kernel virtual memory used for buffers");
static long maxbufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
- "Maximum allowed value of bufspace (including buf_daemon)");
+SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0,
+ "Maximum allowed value of bufspace (including metadata)");
static long bufmallocspace;
SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
"Amount of malloced memory for buffers");
static long maxbufmallocspace;
-SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
- "Maximum amount of malloced memory for buffers");
+SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace,
+ 0, "Maximum amount of malloced memory for buffers");
static long lobufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
+SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0,
"Minimum amount of buffers we want to have");
long hibufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
- "Maximum allowed value of bufspace (excluding buf_daemon)");
-static int bufreusecnt;
-SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
- "Number of times we have reused a buffer");
+SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0,
+ "Maximum allowed value of bufspace (excluding metadata)");
+long bufspacethresh;
+SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh,
+ 0, "Bufspace consumed before waking the daemon to free some");
static int buffreekvacnt;
SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
"Number of times we have freed the KVA space from some buffer");
@@ -205,10 +214,10 @@ SYSCTL_INT(_vfs, OID_AUTO, numfreebuffer
"Number of free buffers");
static int lofreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
- "XXX Unused");
+ "Target number of free buffers");
static int hifreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
- "XXX Complicatedly unused");
+ "Threshold for clean buffer recycling");
static int getnewbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
"Number of calls to getnewbuf");
@@ -219,6 +228,9 @@ static int mappingrestarts;
SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
"Number of times getblk has had to restart a buffer mapping for "
"unmapped buffer");
+static int numbufallocfails;
+SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0,
+ "Number of times buffer allocations failed");
static int flushbufqtarget = 100;
SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
"Amount of work to do in flushbufqueues when helping bufdaemon");
@@ -233,16 +245,6 @@ SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_
"Permit the use of the unmapped i/o");
/*
- * Lock for the non-dirty bufqueues
- */
-static struct mtx_padalign bqclean;
-
-/*
- * Lock for the dirty queue.
- */
-static struct mtx_padalign bqdirty;
-
-/*
* This lock synchronizes access to bd_request.
*/
static struct mtx_padalign bdlock;
@@ -271,6 +273,11 @@ static struct mtx_padalign bdirtylock;
static int bd_request;
/*
+ * Request/wakeup point for the bufspace daemon.
+ */
+static int bufspace_request;
+
+/*
* Request for the buf daemon to write more buffers than is indicated by
* lodirtybuf. This may be necessary to push out excess dependencies or
* defragment the address space where a simple count of the number of dirty
@@ -298,7 +305,7 @@ static int runningbufreq;
* Synchronization (sleep/wakeup) variable for buffer requests.
* Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
* by and/or.
- * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(),
+ * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(),
* getnewbuf(), and getblk().
*/
static volatile int needsbuffer;
@@ -311,14 +318,21 @@ static int bdirtywait;
/*
* Definitions for the buffer free lists.
*/
-#define BUFFER_QUEUES 4 /* number of free buffer queues */
-
#define QUEUE_NONE 0 /* on no queue */
-#define QUEUE_CLEAN 1 /* non-B_DELWRI buffers */
+#define QUEUE_EMPTY 1 /* empty buffer headers */
#define QUEUE_DIRTY 2 /* B_DELWRI buffers */
-#define QUEUE_EMPTY 3 /* empty buffer headers */
+#define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */
#define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */
+/* Maximum number of clean buffer queues. */
+#define CLEAN_QUEUES 16
+
+/* Configured number of clean queues. */
+static int clean_queues;
+
+/* Maximum number of buffer queues. */
+#define BUFFER_QUEUES (QUEUE_CLEAN + CLEAN_QUEUES)
+
/* Queues for free buffers with various properties */
static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
#ifdef INVARIANTS
@@ -326,15 +340,21 @@ static int bq_len[BUFFER_QUEUES];
#endif
/*
+ * Lock for each bufqueue
+ */
+static struct mtx_padalign bqlocks[BUFFER_QUEUES];
+
+/*
+ * per-cpu empty buffer cache.
+ */
+uma_zone_t buf_zone;
+
+/*
* Single global constant for BUF_WMESG, to avoid getting multiple references.
* buf_wmesg is referred from macros.
*/
const char *buf_wmesg = BUF_WMESG;
-#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */
-#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */
-#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
-
static int
sysctl_runningspace(SYSCTL_HANDLER_ARGS)
{
@@ -382,6 +402,21 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS)
}
#endif
+static int
+bqcleanq(void)
+{
+ static int nextq;
+
+ return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN);
+}
+
+static int
+bqisclean(int qindex)
+{
+
+ return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES);
+}
+
/*
* bqlock:
*
@@ -391,9 +426,7 @@ static inline struct mtx *
bqlock(int qindex)
{
- if (qindex == QUEUE_DIRTY)
- return (struct mtx *)(&bqdirty);
- return (struct mtx *)(&bqclean);
+ return (struct mtx *)&bqlocks[qindex];
}
/*
@@ -447,62 +480,255 @@ bdirtyadd(void)
}
/*
- * bufspacewakeup:
+ * bufspace_wakeup:
*
* Called when buffer space is potentially available for recovery.
* getnewbuf() will block on this flag when it is unable to free
* sufficient buffer space. Buffer space becomes recoverable when
* bp's get placed back in the queues.
*/
-static __inline void
-bufspacewakeup(void)
+static void
+bufspace_wakeup(void)
{
- int need_wakeup, on;
/*
- * If someone is waiting for bufspace, wake them up. Even
- * though we may not have freed the kva space yet, the waiting
- * process will be able to now.
+ * If someone is waiting for bufspace, wake them up.
+ *
+ * Since needsbuffer is set prior to doing an additional queue
+ * scan it is safe to check for the flag prior to acquiring the
+ * lock. The thread that is preparing to scan again before
+ * blocking would discover the buf we released.
*/
+ if (needsbuffer) {
+ rw_rlock(&nblock);
+ if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1)
+ wakeup(__DEVOLATILE(void *, &needsbuffer));
+ rw_runlock(&nblock);
+ }
+}
+
+/*
+ * bufspace_daemonwakeup:
+ *
+ * Wakeup the daemon responsible for freeing clean bufs.
+ */
+static void
+bufspace_daemonwakeup(void)
+{
rw_rlock(&nblock);
- for (;;) {
- need_wakeup = 0;
- on = needsbuffer;
- if ((on & VFS_BIO_NEED_BUFSPACE) == 0)
- break;
- need_wakeup = 1;
- if (atomic_cmpset_rel_int(&needsbuffer, on,
- on & ~VFS_BIO_NEED_BUFSPACE))
- break;
+ if (bufspace_request == 0) {
+ bufspace_request = 1;
+ wakeup(&bufspace_request);
}
- if (need_wakeup)
- wakeup(__DEVOLATILE(void *, &needsbuffer));
rw_runlock(&nblock);
}
/*
- * bufspaceadjust:
+ * bufspace_adjust:
*
* Adjust the reported bufspace for a KVA managed buffer, possibly
* waking any waiters.
*/
static void
-bufspaceadjust(struct buf *bp, int bufsize)
+bufspace_adjust(struct buf *bp, int bufsize)
{
+ long space;
int diff;
KASSERT((bp->b_flags & B_MALLOC) == 0,
- ("bufspaceadjust: malloc buf %p", bp));
+ ("bufspace_adjust: malloc buf %p", bp));
diff = bufsize - bp->b_bufsize;
if (diff < 0) {
atomic_subtract_long(&bufspace, -diff);
- bufspacewakeup();
- } else
- atomic_add_long(&bufspace, diff);
+ bufspace_wakeup();
+ } else {
+ space = atomic_fetchadd_long(&bufspace, diff);
+ /* Wake up the daemon on the transition. */
+ if (space < bufspacethresh && space + diff >= bufspacethresh)
+ bufspace_daemonwakeup();
+ }
bp->b_bufsize = bufsize;
}
/*
+ * bufspace_reserve:
+ *
+ * Reserve bufspace before calling allocbuf(). metadata has a
+ * different space limit than data.
+ */
+static int
+bufspace_reserve(int size, bool metadata)
+{
+ long limit;
+ long space;
+
+ if (metadata)
+ limit = maxbufspace;
+ else
+ limit = hibufspace;
+ do {
+ space = bufspace;
+ if (space + size > limit)
+ return (ENOSPC);
+ } while (atomic_cmpset_long(&bufspace, space, space + size) == 0);
+
+ /* Wake up the daemon on the transition. */
+ if (space < bufspacethresh && space + size >= bufspacethresh)
+ bufspace_daemonwakeup();
+
+ return (0);
+}
+
+/*
+ * bufspace_release:
+ *
+ * Release reserved bufspace after bufspace_adjust() has consumed it.
+ */
+static void
+bufspace_release(int size)
+{
+ atomic_subtract_long(&bufspace, size);
+ bufspace_wakeup();
+}
+
+/*
+ * bufspace_wait:
+ *
+ * Wait for bufspace, acting as the buf daemon if a locked vnode is
+ * supplied. needsbuffer must be set in a safe fashion prior to
+ * polling for space. The operation must be re-tried on return.
+ */
+static void
+bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo)
+{
+ struct thread *td;
+ int error, fl, norunbuf;
+
+ if ((gbflags & GB_NOWAIT_BD) != 0)
+ return;
+
+ td = curthread;
+ rw_wlock(&nblock);
+ while (needsbuffer != 0) {
+ if (vp != NULL && vp->v_type != VCHR &&
+ (td->td_pflags & TDP_BUFNEED) == 0) {
+ rw_wunlock(&nblock);
+ /*
+ * getblk() is called with a vnode locked, and
+ * some majority of the dirty buffers may as
+ * well belong to the vnode. Flushing the
+ * buffers there would make a progress that
+ * cannot be achieved by the buf_daemon, that
+ * cannot lock the vnode.
+ */
+ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
+ (td->td_pflags & TDP_NORUNNINGBUF);
+
+ /*
+ * Play bufdaemon. The getnewbuf() function
+ * may be called while the thread owns lock
+ * for another dirty buffer for the same
+ * vnode, which makes it impossible to use
+ * VOP_FSYNC() there, due to the buffer lock
+ * recursion.
+ */
+ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
+ fl = buf_flush(vp, flushbufqtarget);
+ td->td_pflags &= norunbuf;
+ rw_wlock(&nblock);
+ if (fl != 0)
+ continue;
+ if (needsbuffer == 0)
+ break;
+ }
+ error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
+ (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
+ if (error != 0)
+ break;
+ }
+ rw_wunlock(&nblock);
+}
+
+
+/*
+ * bufspace_daemon:
+ *
+ * buffer space management daemon. Tries to maintain some marginal
+ * amount of free buffer space so that requesting processes neither
+ * block nor work to reclaim buffers.
+ */
+static void
+bufspace_daemon(void)
+{
+ for (;;) {
+ kproc_suspend_check(bufspacedaemonproc);
+
+ /*
+ * Free buffers from the clean queue until we meet our
+ * targets.
+ *
+ * Theory of operation: The buffer cache is most efficient
+ * when some free buffer headers and space are always
+ * available to getnewbuf(). This daemon attempts to prevent
+ * the excessive blocking and synchronization associated
+ * with shortfall. It goes through three phases according to
+ * demand:
+ *
+ * 1) The daemon wakes up voluntarily once per-second
+ * during idle periods when the counters are below
+ * the wakeup thresholds (bufspacethresh, lofreebuffers).
+ *
+ * 2) The daemon wakes up as we cross the thresholds
+ * ahead of any potential blocking. This may bounce
+ * slightly according to the rate of consumption and
+ * release.
+ *
+ * 3) The daemon and consumers are starved for working
+ * clean buffers. This is the 'bufspace' sleep below
+ * which will inefficiently trade bufs with bqrelse
+ * until we return to condition 2.
+ */
+ while (bufspace > lobufspace ||
+ numfreebuffers < hifreebuffers) {
+ if (buf_recycle(false) != 0) {
+ atomic_set_int(&needsbuffer, 1);
+ if (buf_recycle(false) != 0) {
+ rw_wlock(&nblock);
+ if (needsbuffer)
+ rw_sleep(__DEVOLATILE(void *,
+ &needsbuffer), &nblock,
+ PRIBIO|PDROP, "bufspace",
+ hz/10);
+ else
+ rw_wunlock(&nblock);
+ }
+ }
+ maybe_yield();
+ }
+
+ /*
+ * Re-check our limits under the exclusive nblock.
+ */
+ rw_wlock(&nblock);
+ if (bufspace < bufspacethresh &&
+ numfreebuffers > lofreebuffers) {
+ bufspace_request = 0;
+ rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP,
+ "-", hz);
+ } else
+ rw_wunlock(&nblock);
+ }
+}
+
+static struct kproc_desc bufspace_kp = {
+ "bufspacedaemon",
+ bufspace_daemon,
+ &bufspacedaemonproc
+};
+SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start,
+ &bufspace_kp);
+
+/*
* bufmallocadjust:
*
* Adjust the reported bufspace for a malloc managed buffer, possibly
@@ -516,10 +742,9 @@ bufmallocadjust(struct buf *bp, int bufs
KASSERT((bp->b_flags & B_MALLOC) != 0,
("bufmallocadjust: non-malloc buf %p", bp));
diff = bufsize - bp->b_bufsize;
- if (diff < 0) {
+ if (diff < 0)
atomic_subtract_long(&bufmallocspace, -diff);
- bufspacewakeup();
- } else
+ else
atomic_add_long(&bufmallocspace, diff);
bp->b_bufsize = bufsize;
}
@@ -571,67 +796,6 @@ runningbufwakeup(struct buf *bp)
}
/*
- * bufcountadd:
- *
- * Called when a buffer has been added to one of the free queues to
- * account for the buffer and to wakeup anyone waiting for free buffers.
- * This typically occurs when large amounts of metadata are being handled
- * by the buffer cache ( else buffer space runs out first, usually ).
- */
-static __inline void
-bufcountadd(struct buf *bp)
-{
- int mask, need_wakeup, old, on;
-
- KASSERT((bp->b_flags & B_INFREECNT) == 0,
- ("buf %p already counted as free", bp));
- bp->b_flags |= B_INFREECNT;
- old = atomic_fetchadd_int(&numfreebuffers, 1);
- KASSERT(old >= 0 && old < nbuf,
- ("numfreebuffers climbed to %d", old + 1));
- mask = VFS_BIO_NEED_ANY;
- if (numfreebuffers >= hifreebuffers)
- mask |= VFS_BIO_NEED_FREE;
- rw_rlock(&nblock);
- for (;;) {
- need_wakeup = 0;
- on = needsbuffer;
- if (on == 0)
- break;
- need_wakeup = 1;
- if (atomic_cmpset_rel_int(&needsbuffer, on, on & ~mask))
- break;
- }
- if (need_wakeup)
- wakeup(__DEVOLATILE(void *, &needsbuffer));
- rw_runlock(&nblock);
-}
-
-/*
- * bufcountsub:
- *
- * Decrement the numfreebuffers count as needed.
- */
-static void
-bufcountsub(struct buf *bp)
-{
- int old;
-
- /*
- * Fixup numfreebuffers count. If the buffer is invalid or not
- * delayed-write, the buffer was free and we must decrement
- * numfreebuffers.
- */
- if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
- KASSERT((bp->b_flags & B_INFREECNT) != 0,
- ("buf %p not counted in numfreebuffers", bp));
- bp->b_flags &= ~B_INFREECNT;
- old = atomic_fetchadd_int(&numfreebuffers, -1);
- KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
- }
-}
-
-/*
* waitrunningbufspace()
*
* runningbufspace is a measure of the amount of I/O currently
@@ -847,8 +1011,10 @@ bufinit(void)
int i;
CTASSERT(MAXBCACHEBUF >= MAXBSIZE);
- mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
- mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
+ mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF);
+ mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF);
+ for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++)
+ mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF);
mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
rw_init(&nblock, "needsbuffer lock");
mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
@@ -864,7 +1030,7 @@ bufinit(void)
for (i = 0; i < nbuf; i++) {
bp = &buf[i];
bzero(bp, sizeof *bp);
- bp->b_flags = B_INVAL | B_INFREECNT;
+ bp->b_flags = B_INVAL;
bp->b_rcred = NOCRED;
bp->b_wcred = NOCRED;
bp->b_qindex = QUEUE_EMPTY;
@@ -881,18 +1047,19 @@ bufinit(void)
/*
* maxbufspace is the absolute maximum amount of buffer space we are
* allowed to reserve in KVM and in real terms. The absolute maximum
- * is nominally used by buf_daemon. hibufspace is the nominal maximum
- * used by most other processes. The differential is required to
- * ensure that buf_daemon is able to run when other processes might
- * be blocked waiting for buffer space.
+ * is nominally used by metadata. hibufspace is the nominal maximum
+ * used by most other requests. The differential is required to
+ * ensure that metadata deadlocks don't occur.
*
* maxbufspace is based on BKVASIZE. Allocating buffers larger then
* this may result in KVM fragmentation which is not handled optimally
- * by the system.
+ * by the system. XXX This is less true with vmem. We could use
+ * PAGE_SIZE.
*/
maxbufspace = (long)nbuf * BKVASIZE;
hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10);
- lobufspace = hibufspace - MAXBCACHEBUF;
+ lobufspace = (hibufspace / 20) * 19; /* 95% */
+ bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;
/*
* Note: The 16 MiB upper limit for hirunningspace was chosen
@@ -906,44 +1073,61 @@ bufinit(void)
16 * 1024 * 1024), 1024 * 1024);
lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF);
-/*
- * Limit the amount of malloc memory since it is wired permanently into
- * the kernel space. Even though this is accounted for in the buffer
- * allocation, we don't want the malloced region to grow uncontrolled.
- * The malloc scheme improves memory utilization significantly on average
- * (small) directories.
- */
+ /*
+ * Limit the amount of malloc memory since it is wired permanently into
+ * the kernel space. Even though this is accounted for in the buffer
+ * allocation, we don't want the malloced region to grow uncontrolled.
+ * The malloc scheme improves memory utilization significantly on
+ * average (small) directories.
+ */
maxbufmallocspace = hibufspace / 20;
-/*
- * Reduce the chance of a deadlock occuring by limiting the number
- * of delayed-write dirty buffers we allow to stack up.
- */
+ /*
+ * Reduce the chance of a deadlock occuring by limiting the number
+ * of delayed-write dirty buffers we allow to stack up.
+ */
hidirtybuffers = nbuf / 4 + 20;
dirtybufthresh = hidirtybuffers * 9 / 10;
numdirtybuffers = 0;
-/*
- * To support extreme low-memory systems, make sure hidirtybuffers cannot
- * eat up all available buffer space. This occurs when our minimum cannot
- * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming
- * BKVASIZE'd buffers.
- */
+ /*
+ * To support extreme low-memory systems, make sure hidirtybuffers
+ * cannot eat up all available buffer space. This occurs when our
+ * minimum cannot be met. We try to size hidirtybuffers to 3/4 our
+ * buffer space assuming BKVASIZE'd buffers.
+ */
while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
hidirtybuffers >>= 1;
}
lodirtybuffers = hidirtybuffers / 2;
-/*
- * Try to keep the number of free buffers in the specified range,
- * and give special processes (e.g. like buf_daemon) access to an
- * emergency reserve.
- */
- lofreebuffers = nbuf / 18 + 5;
- hifreebuffers = 2 * lofreebuffers;
+ /*
+ * lofreebuffers should be sufficient to avoid stalling waiting on
+ * buf headers under heavy utilization. The bufs in per-cpu caches
+ * are counted as free but will be unavailable to threads executing
+ * on other cpus.
+ *
+ * hifreebuffers is the free target for the bufspace daemon. This
+ * should be set appropriately to limit work per-iteration.
+ */
+ lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus);
+ hifreebuffers = (3 * lofreebuffers) / 2;
numfreebuffers = nbuf;
bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
+
+ /* Setup the kva and free list allocators. */
+ vmem_set_reclaim(buffer_arena, bufkva_reclaim);
+ buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf),
+ NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0);
+
+ /*
+ * Size the clean queue according to the amount of buffer space.
+ * One queue per-256mb up to the max. More queues gives better
+ * concurrency but less accurate LRU.
+ */
+ clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES);
+
}
#ifdef INVARIANTS
@@ -1129,10 +1313,25 @@ binsfree(struct buf *bp, int qindex)
{
struct mtx *olock, *nlock;
- BUF_ASSERT_XLOCKED(bp);
+ if (qindex != QUEUE_EMPTY) {
+ BUF_ASSERT_XLOCKED(bp);
+ }
+
+ /*
+ * Stick to the same clean queue for the lifetime of the buf to
+ * limit locking below. Otherwise pick one sequentially.
+ */
+ if (qindex == QUEUE_CLEAN) {
+ if (bqisclean(bp->b_qindex))
+ qindex = bp->b_qindex;
+ else
+ qindex = bqcleanq();
+ }
+ /*
+ * Handle delayed bremfree() processing.
+ */
nlock = bqlock(qindex);
- /* Handle delayed bremfree() processing. */
if (bp->b_flags & B_REMFREE) {
olock = bqlock(bp->b_qindex);
mtx_lock(olock);
@@ -1156,15 +1355,263 @@ binsfree(struct buf *bp, int qindex)
bq_len[bp->b_qindex]++;
#endif
mtx_unlock(nlock);
+}
+
+/*
+ * buf_free:
+ *
+ * Free a buffer to the buf zone once it no longer has valid contents.
+ */
+static void
+buf_free(struct buf *bp)
+{
+
+ if (bp->b_flags & B_REMFREE)
+ bremfreef(bp);
+ if (bp->b_vflags & BV_BKGRDINPROG)
+ panic("losing buffer 1");
+ if (bp->b_rcred != NOCRED) {
+ crfree(bp->b_rcred);
+ bp->b_rcred = NOCRED;
+ }
+ if (bp->b_wcred != NOCRED) {
+ crfree(bp->b_wcred);
+ bp->b_wcred = NOCRED;
+ }
+ if (!LIST_EMPTY(&bp->b_dep))
+ buf_deallocate(bp);
+ bufkva_free(bp);
+ BUF_UNLOCK(bp);
+ uma_zfree(buf_zone, bp);
+ atomic_add_int(&numfreebuffers, 1);
+ bufspace_wakeup();
+}
+
+/*
+ * buf_import:
+ *
+ * Import bufs into the uma cache from the buf list. The system still
+ * expects a static array of bufs and much of the synchronization
+ * around bufs assumes type stable storage. As a result, UMA is used
+ * only as a per-cpu cache of bufs still maintained on a global list.
+ */
+static int
+buf_import(void *arg, void **store, int cnt, int flags)
+{
+ struct buf *bp;
+ int i;
+
+ mtx_lock(&bqlocks[QUEUE_EMPTY]);
+ for (i = 0; i < cnt; i++) {
+ bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+ if (bp == NULL)
+ break;
+ bremfreel(bp);
+ store[i] = bp;
+ }
+ mtx_unlock(&bqlocks[QUEUE_EMPTY]);
+
+ return (i);
+}
+
+/*
+ * buf_release:
+ *
+ * Release bufs from the uma cache back to the buffer queues.
+ */
+static void
+buf_release(void *arg, void **store, int cnt)
+{
+ int i;
+
+ for (i = 0; i < cnt; i++)
+ binsfree(store[i], QUEUE_EMPTY);
+}
+
+/*
+ * buf_alloc:
+ *
+ * Allocate an empty buffer header.
+ */
+static struct buf *
+buf_alloc(void)
+{
+ struct buf *bp;
+
+ bp = uma_zalloc(buf_zone, M_NOWAIT);
+ if (bp == NULL) {
+ bufspace_daemonwakeup();
+ atomic_add_int(&numbufallocfails, 1);
+ return (NULL);
+ }
+
+ /*
+ * Wake-up the bufspace daemon on transition.
+ */
+ if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers)
+ bufspace_daemonwakeup();
+
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
+ panic("getnewbuf_empty: Locked buf %p on free queue.", bp);
+
+ KASSERT(bp->b_vp == NULL,
+ ("bp: %p still has vnode %p.", bp, bp->b_vp));
+ KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
+ ("invalid buffer %p flags %#x", bp, bp->b_flags));
+ KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
+ ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
+ KASSERT(bp->b_npages == 0,
+ ("bp: %p still has %d vm pages\n", bp, bp->b_npages));
+ KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
+ KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
+
+ bp->b_flags = 0;
+ bp->b_ioflags = 0;
+ bp->b_xflags = 0;
+ bp->b_vflags = 0;
+ bp->b_vp = NULL;
+ bp->b_blkno = bp->b_lblkno = 0;
+ bp->b_offset = NOOFFSET;
+ bp->b_iodone = 0;
+ bp->b_error = 0;
+ bp->b_resid = 0;
+ bp->b_bcount = 0;
+ bp->b_npages = 0;
+ bp->b_dirtyoff = bp->b_dirtyend = 0;
+ bp->b_bufobj = NULL;
+ bp->b_pin_count = 0;
+ bp->b_data = bp->b_kvabase = unmapped_buf;
+ bp->b_fsprivate1 = NULL;
+ bp->b_fsprivate2 = NULL;
+ bp->b_fsprivate3 = NULL;
+ LIST_INIT(&bp->b_dep);
+
+ return (bp);
+}
+
+/*
+ * buf_qrecycle:
+ *
+ * Free a buffer from the given bufqueue. kva controls whether the
+ * freed buf must own some kva resources. This is used for
+ * defragmenting.
+ */
+static int
+buf_qrecycle(int qindex, bool kva)
+{
+ struct buf *bp, *nbp;
+
+ if (kva)
+ atomic_add_int(&bufdefragcnt, 1);
+ nbp = NULL;
+ mtx_lock(&bqlocks[qindex]);
+ nbp = TAILQ_FIRST(&bufqueues[qindex]);
+
+ /*
+ * Run scan, possibly freeing data and/or kva mappings on the fly
+ * depending.
+ */
+ while ((bp = nbp) != NULL) {
+ /*
+ * Calculate next bp (we can only use it if we do not
+ * release the bqlock).
+ */
+ nbp = TAILQ_NEXT(bp, b_freelist);
+
+ /*
+ * If we are defragging then we need a buffer with
+ * some kva to reclaim.
+ */
+ if (kva && bp->b_kvasize == 0)
+ continue;
+
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
+ continue;
+
+ /*
+ * Skip buffers with background writes in progress.
+ */
+ if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
+
+ KASSERT(bp->b_qindex == qindex,
+ ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
+ /*
+ * NOTE: nbp is now entirely invalid. We can only restart
+ * the scan from this point on.
+ */
+ bremfreel(bp);
+ mtx_unlock(&bqlocks[qindex]);
+
+ /*
+ * Requeue the background write buffer with error and
+ * restart the scan.
+ */
+ if ((bp->b_vflags & BV_BKGRDERR) != 0) {
+ bqrelse(bp);
+ mtx_lock(&bqlocks[qindex]);
+ nbp = TAILQ_FIRST(&bufqueues[qindex]);
+ continue;
+ }
+ bp->b_flags |= B_INVAL;
+ brelse(bp);
+ return (0);
+ }
+ mtx_unlock(&bqlocks[qindex]);
+
+ return (ENOBUFS);
+}
+
+/*
+ * buf_recycle:
+ *
+ * Iterate through all clean queues until we find a buf to recycle or
+ * exhaust the search.
+ */
+static int
+buf_recycle(bool kva)
+{
+ int qindex, first_qindex;
+
+ qindex = first_qindex = bqcleanq();
+ do {
+ if (buf_qrecycle(qindex, kva) == 0)
+ return (0);
+ if (++qindex == QUEUE_CLEAN + clean_queues)
+ qindex = QUEUE_CLEAN;
+ } while (qindex != first_qindex);
+
+ return (ENOBUFS);
+}
+
+/*
+ * buf_scan:
+ *
+ * Scan the clean queues looking for a buffer to recycle. needsbuffer
+ * is set on failure so that the caller may optionally bufspace_wait()
+ * in a race-free fashion.
+ */
+static int
+buf_scan(bool defrag)
+{
+ int error;
/*
- * Something we can maybe free or reuse.
- */
- if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
- bufspacewakeup();
-
- if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
- bufcountadd(bp);
+ * To avoid heavy synchronization and wakeup races we set
+ * needsbuffer and re-poll before failing. This ensures that
+ * no frees can be missed between an unsuccessful poll and
+ * going to sleep in a synchronized fashion.
+ */
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
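
[The vm_init.c side of the change, which gives buffer_arena a per-CPU vmem
quantum cache, falls in the truncated portion of the diff. For illustration
only: a vmem arena's quantum cache is enabled by passing a non-zero
qcache_max to vmem_create(). The arena name, base, size, and cache depth
below are placeholders, not the values committed for buffer_arena:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vmem.h>

static vmem_t *example_arena;

static void
example_arena_init(vmem_addr_t base, vmem_size_t size)
{

	/*
	 * With a non-zero qcache_max (here 8 quanta), allocations up to
	 * that size are served from per-CPU magazines rather than
	 * contending on the arena's shared lock.
	 */
	example_arena = vmem_create("example arena", base, size,
	    PAGE_SIZE, 8 * PAGE_SIZE, M_WAITOK);
}

Subsequent vmem_alloc()/vmem_free() calls of at most qcache_max bytes then
hit the per-CPU quantum cache, which is how the commit removes contention
on buffer kva allocation.]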