NFS client/buffer cache deadlock
Brian Fundakowski Feldman
green at freebsd.org
Thu Apr 14 22:07:37 PDT 2005
I'll spare you a lengthy write-up because I think the patch documents it well
enough. It certainly appears to fix the deadlock here when doing writes with
very large block sizes, but it also reduces throughput at those block
sizes. (I don't think there should be any difference when using reasonable
block sizes.)
Would anyone care to take a shot at fixing it in a more elegant manner?
Index: sys/buf.h
===================================================================
RCS file: /export/ncvs/src/sys/sys/buf.h,v
retrieving revision 1.167.2.1
diff -u -r1.167.2.1 buf.h
--- sys/buf.h 31 Jan 2005 23:26:55 -0000 1.167.2.1
+++ sys/buf.h 15 Apr 2005 02:00:44 -0000
@@ -469,6 +469,7 @@
extern int maxswzone; /* Max KVA for swap structures */
extern int maxbcache; /* Max KVA for buffer cache */
extern int runningbufspace;
+extern int hibufspace;
extern int buf_maxio; /* nominal maximum I/O for buffer */
extern struct buf *buf; /* The buffer headers. */
extern char *buffers; /* The buffer contents. */
Index: kern/vfs_bio.c
===================================================================
RCS file: /export/ncvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.444.2.2
diff -u -r1.444.2.2 vfs_bio.c
--- kern/vfs_bio.c 31 Jan 2005 23:26:18 -0000 1.444.2.2
+++ kern/vfs_bio.c 15 Apr 2005 01:59:38 -0000
@@ -113,7 +113,7 @@
static int lobufspace;
SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
"Minimum amount of buffers we want to have");
-static int hibufspace;
+int hibufspace;
SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
"Maximum allowed value of bufspace (excluding buf_daemon)");
static int bufreusecnt;
Index: nfsclient/nfs_bio.c
===================================================================
RCS file: /export/ncvs/src/sys/nfsclient/nfs_bio.c,v
retrieving revision 1.133.2.2
diff -u -r1.133.2.2 nfs_bio.c
--- nfsclient/nfs_bio.c 31 Jan 2005 23:26:46 -0000 1.133.2.2
+++ nfsclient/nfs_bio.c 15 Apr 2005 04:41:13 -0000
@@ -726,6 +726,7 @@
struct vattr vattr;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
daddr_t lbn;
+ off_t commitleft;
int bcount;
int n, on, error = 0;
int haverslock = 0;
@@ -755,6 +756,7 @@
*/
if (ioflag & (IO_APPEND | IO_SYNC)) {
if (np->n_flag & NMODIFIED) {
+flush_and_restart:
np->n_attrstamp = 0;
error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
if (error)
@@ -832,12 +834,65 @@
}
biosize = vp->v_mount->mnt_stat.f_iosize;
+ commitleft = 0;
+ /*
+ * If there are possible modifications, then there may be some
+ * B_NEEDCOMMIT buffers. Total those up here and force a flush
+ * before starting to write if our writes can exceed the local
+ * maximum per-file write commit size.
+ *
+ * If there are no possible pending modifications, we still need
+ * to limit our write to that size.
+ */
+ if ((ioflag & (IO_SYNC | IO_INVAL)) != (IO_SYNC | IO_INVAL)) {
+ commitleft = nmp->nm_wcommitsize;
+ if (np->n_flag & NMODIFIED) {
+ int wouldcommit = 0;
+ VI_LOCK(vp);
+ TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
+ if (bp->b_flags & B_NEEDCOMMIT)
+ wouldcommit += bp->b_bcount;
+ }
+ VI_UNLOCK(vp);
+ /*
+ * This is not a synchronous write that bypasses the
+ * buffer cache, so we keep holding all of these
+ * buffers until they are committed, whether they
+ * have been transmitted or not. If not limited, this
+ * will lead to the buffer cache deadlocking,
+ * as no one else can flush our uncommitted buffers.
+ */
+ wouldcommit += uio->uio_resid;
+ /*
+ * If we would initially exceed the maximum
+ * outstanding write commit size, flush and restart.
+ */
+ if (wouldcommit > commitleft) {
+ if (haverslock) {
+ nfs_rsunlock(np, td);
+ haverslock = 0;
+ }
+ goto flush_and_restart;
+ }
+ } else {
+ /*
+ * With no outstanding commits, we are limited only
+ * by commitleft as to how far we can go.
+ */
+ }
+ }
do {
nfsstats.biocache_writes++;
lbn = uio->uio_offset / biosize;
on = uio->uio_offset & (biosize-1);
n = min((unsigned)(biosize - on), uio->uio_resid);
+ /* Always allow at least one write. */
+ if (commitleft > 0) {
+ commitleft -= n;
+ if (commitleft == 0)
+ commitleft = -1;
+ }
again:
/*
* Handle direct append and file extension cases, calculate
@@ -932,12 +987,6 @@
break;
}
}
- if (!bp) {
- error = nfs_sigintr(nmp, NULL, td);
- if (!error)
- error = EINTR;
- break;
- }
if (bp->b_wcred == NOCRED)
bp->b_wcred = crhold(cred);
np->n_flag |= NMODIFIED;
@@ -1036,7 +1085,7 @@
} else {
bdwrite(bp);
}
- } while (uio->uio_resid > 0 && n > 0);
+ } while (uio->uio_resid > 0 && n > 0 && commitleft >= 0);
if (haverslock)
nfs_rsunlock(np, td);
Index: nfsclient/nfs_vfsops.c
===================================================================
RCS file: /export/ncvs/src/sys/nfsclient/nfs_vfsops.c,v
retrieving revision 1.158.2.3
diff -u -r1.158.2.3 nfs_vfsops.c
--- nfsclient/nfs_vfsops.c 31 Jan 2005 23:26:46 -0000 1.158.2.3
+++ nfsclient/nfs_vfsops.c 15 Apr 2005 02:03:05 -0000
@@ -41,6 +41,8 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
@@ -625,6 +627,12 @@
else
nmp->nm_readahead = NFS_MAXRAHEAD;
}
+ if ((argp->flags & NFSMNT_WCOMMITSIZE) && argp->wcommitsize >= 0) {
+ if (argp->wcommitsize < nmp->nm_wsize)
+ nmp->nm_wcommitsize = nmp->nm_wsize;
+ else
+ nmp->nm_wcommitsize = argp->wcommitsize;
+ }
if ((argp->flags & NFSMNT_DEADTHRESH) && argp->deadthresh >= 0) {
if (argp->deadthresh <= NFS_MAXDEADTHRESH)
nmp->nm_deadthresh = argp->deadthresh;
@@ -785,6 +793,7 @@
nmp->nm_wsize = NFS_WSIZE;
nmp->nm_rsize = NFS_RSIZE;
}
+ nmp->nm_wcommitsize = hibufspace / (desiredvnodes / 1000);
nmp->nm_readdirsize = NFS_READDIRSIZE;
nmp->nm_numgrps = NFS_MAXGRPS;
nmp->nm_readahead = NFS_DEFRAHEAD;
Index: nfsclient/nfsargs.h
===================================================================
RCS file: /export/ncvs/src/sys/nfsclient/nfsargs.h,v
retrieving revision 1.66.2.1
diff -u -r1.66.2.1 nfsargs.h
--- nfsclient/nfsargs.h 31 Jan 2005 23:26:46 -0000 1.66.2.1
+++ nfsclient/nfsargs.h 15 Apr 2005 01:33:08 -0000
@@ -56,7 +56,7 @@
int retrans; /* times to retry send */
int maxgrouplist; /* Max. size of group list */
int readahead; /* # of blocks to readahead */
- int __pad1; /* was "leaseterm" */
+ int wcommitsize; /* Max. write commit size in bytes */
int deadthresh; /* Retrans threshold */
char *hostname; /* server's name */
int acregmin; /* cache attrs for reg files min time */
@@ -80,7 +80,7 @@
#define NFSMNT_NFSV3 0x00000200 /* Use NFS Version 3 protocol */
/* 0x400 free, was NFSMNT_KERB */
#define NFSMNT_DUMBTIMR 0x00000800 /* Don't estimate rtt dynamically */
-/* 0x1000 free, was NFSMNT_LEASETERM */
+#define NFSMNT_WCOMMITSIZE 0x00001000 /* set max write commit size */
#define NFSMNT_READAHEAD 0x00002000 /* set read ahead */
#define NFSMNT_DEADTHRESH 0x00004000 /* set dead server retry thresh */
#define NFSMNT_RESVPORT 0x00008000 /* Allocate a reserved port */
Index: nfsclient/nfsmount.h
===================================================================
RCS file: /export/ncvs/src/sys/nfsclient/nfsmount.h,v
retrieving revision 1.27.2.1
diff -u -r1.27.2.1 nfsmount.h
--- nfsclient/nfsmount.h 31 Jan 2005 23:26:46 -0000 1.27.2.1
+++ nfsclient/nfsmount.h 15 Apr 2005 01:21:57 -0000
@@ -66,6 +66,7 @@
int nm_wsize; /* Max size of write rpc */
int nm_readdirsize; /* Size of a readdir rpc */
int nm_readahead; /* Num. of blocks to readahead */
+ int nm_wcommitsize; /* Max size of commit for write */
int nm_acdirmin; /* Directory attr cache min lifetime */
int nm_acdirmax; /* Directory attr cache max lifetime */
int nm_acregmin; /* Reg file attr cache min lifetime */
--
Brian Fundakowski Feldman \'[ FreeBSD ]''''''''''\
<> green at FreeBSD.org \ The Power to Serve! \
Opinions expressed are my own. \,,,,,,,,,,,,,,,,,,,,,,\
More information about the freebsd-hackers
mailing list