[RFC] vfs_bio additions, motivated by XFS for FreeBSD project
Craig Rodrigues
rodrigc at crodrigues.org
Sat Nov 12 10:29:06 PST 2005
Hi,
Now that FreeBSD 6.0 is released, I would like to work
on integrating code from the XFS for FreeBSD project into
FreeBSD-CURRENT.
Alexander Kabaev made some changes to vfs_bio.c which are
needed by the XFS for FreeBSD code. In addition to some
new functions, this patch adds three new filesystem-private fields
to struct buf (b_fsprivate1, b_fsprivate2, b_fsprivate3), as well as
a b_pin_count field used by the new bpin()/bunpin()/bunpin_wait() functions.
You don't see the b_fsprivate fields being used here, but in the XFS for
FreeBSD code (which you can get from http://people.freebsd.org/~rodrigc/xfs/ ),
they are used to cache certain information.
Comments?
--- //depot/vendor/freebsd/src/sys/kern/vfs_bio.c 2005/10/08 15:01:11
+++ //depot/projects/src/sys/kern/vfs_bio.c 2005/10/08 16:09:54
@@ -216,7 +216,7 @@
*/
static struct mtx rbreqlock;
-/*
+/*
* Synchronization (sleep/wakeup) variable for buffer requests.
* Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
* by and/or.
@@ -233,8 +233,12 @@
/*
* Lock that protects against bwait()/bdone()/B_DONE races.
*/
+static struct mtx bdonelock;
-static struct mtx bdonelock;
+/*
+ * Lock that serializes b_pin_count updates and bunpin_wait() sleeps.
+ */
+static struct mtx bpinlock;
/*
* Definitions for the buffer free lists.
@@ -523,6 +527,7 @@
mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
mtx_init(&bdonelock, "bdone lock", NULL, MTX_DEF);
+ mtx_init(&bpinlock, "bpin lock", NULL, MTX_DEF);
/* next, make a null set of free lists */
for (i = 0; i < BUFFER_QUEUES; i++)
@@ -636,7 +641,7 @@
* bremfree:
*
* Mark the buffer for removal from the appropriate free list in brelse.
- *
+ *
*/
void
bremfree(struct buf *bp)
@@ -720,18 +725,51 @@
}
/*
+ * Attempt to initiate asynchronous I/O on read-ahead blocks. We must
+ * clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE is set,
+ * the buffer is valid and we do not have to do anything.
+ */
+void
+breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
+ int cnt, struct ucred * cred)
+{
+ struct buf *rabp;
+ int i;
+
+ for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
+ if (inmem(vp, *rablkno))
+ continue;
+ rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
+
+ if ((rabp->b_flags & B_CACHE) == 0) {
+ if (curthread != PCPU_GET(idlethread))
+ curthread->td_proc->p_stats->p_ru.ru_inblock++;
+ rabp->b_flags |= B_ASYNC;
+ rabp->b_flags &= ~B_INVAL;
+ rabp->b_ioflags &= ~BIO_ERROR;
+ rabp->b_iocmd = BIO_READ;
+ if (rabp->b_rcred == NOCRED && cred != NOCRED)
+ rabp->b_rcred = crhold(cred);
+ vfs_busy_pages(rabp, 0);
+ BUF_KERNPROC(rabp);
+ rabp->b_iooffset = dbtob(rabp->b_blkno);
+ bstrategy(rabp);
+ } else {
+ brelse(rabp);
+ }
+ }
+}
+
+/*
* Operates like bread, but also starts asynchronous I/O on
- * read-ahead blocks. We must clear BIO_ERROR and B_INVAL prior
- * to initiating I/O . If B_CACHE is set, the buffer is valid
- * and we do not have to do anything.
+ * read-ahead blocks.
*/
int
breadn(struct vnode * vp, daddr_t blkno, int size,
daddr_t * rablkno, int *rabsize,
int cnt, struct ucred * cred, struct buf **bpp)
{
- struct buf *bp, *rabp;
- int i;
+ struct buf *bp;
int rv = 0, readwait = 0;
CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
@@ -752,29 +790,8 @@
++readwait;
}
- for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
- if (inmem(vp, *rablkno))
- continue;
- rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
+ breada(vp, rablkno, rabsize, cnt, cred);
- if ((rabp->b_flags & B_CACHE) == 0) {
- if (curthread != PCPU_GET(idlethread))
- curthread->td_proc->p_stats->p_ru.ru_inblock++;
- rabp->b_flags |= B_ASYNC;
- rabp->b_flags &= ~B_INVAL;
- rabp->b_ioflags &= ~BIO_ERROR;
- rabp->b_iocmd = BIO_READ;
- if (rabp->b_rcred == NOCRED && cred != NOCRED)
- rabp->b_rcred = crhold(cred);
- vfs_busy_pages(rabp, 0);
- BUF_KERNPROC(rabp);
- rabp->b_iooffset = dbtob(rabp->b_blkno);
- bstrategy(rabp);
- } else {
- brelse(rabp);
- }
- }
-
if (readwait) {
rv = bufwait(bp);
}
@@ -807,6 +824,10 @@
if (BUF_REFCNT(bp) == 0)
panic("bufwrite: buffer is not busy???");
+
+ if (bp->b_pin_count > 0)
+ bunpin_wait(bp);
+
KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
("FFS background buffer should not get here %p", bp));
@@ -1117,6 +1138,11 @@
KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
+ if (bp->b_flags & B_MANAGED) {
+ bqrelse(bp);
+ return;
+ }
+
if (bp->b_iocmd == BIO_WRITE &&
(bp->b_ioflags & BIO_ERROR) &&
!(bp->b_flags & B_INVAL)) {
@@ -1286,7 +1312,7 @@
}
}
-
+
if (BUF_REFCNT(bp) > 1) {
/* do not release to free list */
BUF_UNLOCK(bp);
@@ -1394,6 +1420,18 @@
BUF_UNLOCK(bp);
return;
}
+
+ if (bp->b_flags & B_MANAGED) {
+ if (bp->b_flags & B_REMFREE) {
+ mtx_lock(&bqlock);
+ bremfreel(bp);
+ mtx_unlock(&bqlock);
+ }
+ bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
+ BUF_UNLOCK(bp);
+ return;
+ }
+
mtx_lock(&bqlock);
/* Handle delayed bremfree() processing. */
if (bp->b_flags & B_REMFREE)
@@ -1821,6 +1859,10 @@
bp->b_npages = 0;
bp->b_dirtyoff = bp->b_dirtyend = 0;
bp->b_bufobj = NULL;
+ bp->b_pin_count = 0;
+ bp->b_fsprivate1 = NULL;
+ bp->b_fsprivate2 = NULL;
+ bp->b_fsprivate3 = NULL;
LIST_INIT(&bp->b_dep);
@@ -2059,6 +2101,10 @@
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
continue;
+ if (bp->b_pin_count > 0) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
BO_LOCK(bp->b_bufobj);
if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
(bp->b_flags & B_DELWRI) == 0) {
@@ -2393,6 +2439,19 @@
if ((bp->b_flags & B_VMIO) == 0 ||
(size > bp->b_kvasize)) {
if (bp->b_flags & B_DELWRI) {
+ /*
+ * If buffer is pinned and caller does
+ * not want to sleep waiting for it to be
+ * unpinned, bail out.
+ */
+ if (bp->b_pin_count > 0) {
+ if (flags & GB_LOCK_NOWAIT) {
+ bqrelse(bp);
+ return (NULL);
+ } else {
+ bunpin_wait(bp);
+ }
+ }
bp->b_flags |= B_NOCACHE;
bwrite(bp);
} else {
@@ -3034,11 +3093,11 @@
struct bufobj *dropobj;
void (*biodone)(struct buf *);
-
CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
dropobj = NULL;
- KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp)));
+ KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp,
+ BUF_REFCNT(bp)));
KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
runningbufwakeup(bp);
@@ -3053,6 +3112,19 @@
bufobj_wdrop(dropobj);
return;
}
+
+ bufdone_finish(bp);
+
+ if (dropobj)
+ bufobj_wdrop(dropobj);
+}
+
+void
+bufdone_finish(struct buf *bp)
+{
+ KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp,
+ BUF_REFCNT(bp)));
+
if (LIST_FIRST(&bp->b_dep) != NULL)
buf_complete(bp);
@@ -3118,7 +3190,8 @@
if (m == NULL)
panic("biodone: page disappeared!");
bp->b_pages[i] = m;
- pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+ bp->b_pages, bp->b_npages);
}
#if defined(VFS_BIO_DEBUG)
if (OFF_TO_IDX(foff) != m->pindex) {
@@ -3130,7 +3203,7 @@
/*
* In the write case, the valid and clean bits are
- * already changed correctly ( see bdwrite() ), so we
+ * already changed correctly ( see bdwrite() ), so we
* only need to do this here in the read case.
*/
if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
@@ -3185,8 +3258,6 @@
bqrelse(bp);
} else
bdone(bp);
- if (dropobj)
- bufobj_wdrop(dropobj);
}
/*
@@ -3742,6 +3813,32 @@
return (error);
}
+void
+bpin(struct buf *bp)
+{
+ mtx_lock(&bpinlock);
+ bp->b_pin_count ++;
+ mtx_unlock(&bpinlock);
+}
+
+void
+bunpin(struct buf *bp)
+{
+ mtx_lock(&bpinlock);
+ if ( --bp->b_pin_count == 0)
+ wakeup(bp);
+ mtx_unlock(&bpinlock);
+}
+
+void
+bunpin_wait(struct buf *bp)
+{
+ mtx_lock(&bpinlock);
+ while (bp->b_pin_count > 0)
+ msleep(bp, &bpinlock, PRIBIO, "bwunpin", 0);
+ mtx_unlock(&bpinlock);
+}
+
#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>
@@ -3794,3 +3891,4 @@
}
}
#endif /* DDB */
+
--- //depot/vendor/freebsd/src/sys/kern/vfs_cluster.c 2005/08/14 09:53:08
+++ //depot/projects/src/sys/kern/vfs_cluster.c 2005/08/14 10:01:58
@@ -765,6 +765,12 @@
--len;
continue;
}
+ if (tbp->b_pin_count > 0) {
+ BUF_UNLOCK(tbp);
+ ++start_lbn;
+ --len;
+ continue;
+ }
bremfree(tbp);
tbp->b_flags &= ~B_DONE;
@@ -868,6 +874,15 @@
BUF_UNLOCK(tbp);
break;
}
+
+ /*
+ * Do not pull in pinned buffers.
+ */
+ if (tbp->b_pin_count > 0) {
+ BUF_UNLOCK(tbp);
+ break;
+ }
+
/*
* Ok, it's passed all the tests,
* so remove it from the free list
@@ -979,3 +994,4 @@
buflist->bs_nchildren = i + 1;
return (buflist);
}
+
--- //depot/vendor/freebsd/src/sys/sys/buf.h 2005/10/08 15:01:11
+++ //depot/projects/src/sys/sys/buf.h 2005/10/08 16:09:54
@@ -135,6 +135,10 @@
struct vm_page *b_pages[btoc(MAXPHYS)];
int b_npages;
struct workhead b_dep; /* (D) List of filesystem dependencies. */
+ void *b_fsprivate1;
+ void *b_fsprivate2;
+ void *b_fsprivate3;
+ int b_pin_count;
};
#define b_object b_bufobj->bo_object
@@ -214,7 +218,7 @@
#define B_01000000 0x01000000 /* Available flag. */
#define B_02000000 0x02000000 /* Available flag. */
#define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */
-#define B_08000000 0x08000000 /* Available flag. */
+#define B_MANAGED 0x08000000 /* Managed by FS. */
#define B_RAM 0x10000000 /* Read ahead mark (flag) */
#define B_VMIO 0x20000000 /* VMIO flag */
#define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */
@@ -486,6 +490,7 @@
void bremfree(struct buf *);
void bremfreef(struct buf *); /* XXX Force bremfree, only for nfs. */
int bread(struct vnode *, daddr_t, int, struct ucred *, struct buf **);
+void breada(struct vnode *, daddr_t *, int *, int, struct ucred *);
int breadn(struct vnode *, daddr_t, int, daddr_t *, int *, int,
struct ucred *, struct buf **);
void bdwrite(struct buf *);
@@ -504,6 +509,7 @@
int bufwait(struct buf *);
int bufwrite(struct buf *);
void bufdone(struct buf *);
+void bufdone_finish(struct buf *);
int cluster_read(struct vnode *, u_quad_t, daddr_t, long,
struct ucred *, long, int, struct buf **);
@@ -527,7 +533,11 @@
struct buf *trypbuf(int *);
void bwait(struct buf *, u_char, const char *);
void bdone(struct buf *);
+void bpin(struct buf *);
+void bunpin(struct buf *);
+void bunpin_wait(struct buf *);
#endif /* _KERNEL */
#endif /* !_SYS_BUF_H_ */
+
--
Craig Rodrigues
rodrigc at crodrigues.org
More information about the freebsd-arch
mailing list