atomic v_usecount and v_holdcnt
Mateusz Guzik
mjguzik at gmail.com
Tue Mar 17 01:44:19 UTC 2015
On Mon, Mar 16, 2015 at 11:46:43AM +0200, Konstantin Belousov wrote:
> On Sat, Mar 14, 2015 at 11:52:26PM +0100, Mateusz Guzik wrote:
> > On Mon, Nov 24, 2014 at 11:52:52AM +0200, Konstantin Belousov wrote:
> > > On Sat, Nov 22, 2014 at 10:11:47PM +0100, Mateusz Guzik wrote:
> > > > On Sat, Nov 22, 2014 at 11:25:27AM +0200, Konstantin Belousov wrote:
> > > > > On Sat, Nov 22, 2014 at 01:28:12AM +0100, Mateusz Guzik wrote:
> > > > > > The idea is that we don't need an interlock as long as we don't
> > > > > > transition either counter 1->0 or 0->1.
> > > > > I already said that something along the lines of the patch should work.
> > > > > In fact, you need vnode lock when hold count changes between 0 and 1,
> > > > > and probably the same for use count.
> > > > >
> > > >
> > > > I don't see why this would be required (not that I'm a VFS expert).
> > > > vnode recycling seems to be protected with the interlock.
> > > >
> > > > In fact I would argue that if this is really needed, current code is
> > > > buggy.
> > > Yes, it is already (somewhat) buggy.
> > >
> > > Most need of the lock is for the case of counts coming from 1 to 0.
> > > The reason is the handling of the active vnode list, which is used
> > > for limiting the amount of vnode list walking in syncer. When hold
> > > count is decremented to 0, vnode is removed from the active list.
> > > When use count is decremented to 0, vnode is supposedly inactivated,
> > > and vinactive() cleans the cached pages belonging to vnode. In other
> > > words, VI_OWEINACT for a dirty vnode is sort of a bug.
> > >
> >
> > Modified the patch to no longer have the usecount + interlock dropped +
> > VI_OWEINACT set window.
> >
> > Extended 0->1 hold count + vnode not locked window remains. I can fix
> > that if it is really necessary by having _vhold return with interlock
> > held if it did such transition.
>
> In v_upgrade_usecount(), you call v_incr_devcount() without the interlock
> held. What prevents the devfs vnode from being recycled, in particular,
> from invalidation of v_rdev pointer ?
>
Right, that was buggy. Fixed in the patch below.
> I think that refcount_acquire_if_greater() KPI is excessive. You always
> call acquire with val == 0, and release with val == 1.
>
Yeah, I noted in my previous e-mail that it should be changed (see below).
I replaced them with refcount_acquire_if_not_zero and
refcount_release_if_not_last.
> WRT _refcount_release_lock, why can't the lock_object->lc_lock/lc_unlock KPI
> be used? This would allow making refcount_release_lock() a function
> instead of a gcc extension macro. Not to mention that the macro is unused.
These were supposed to be used by other code; I forgot to remove them from
the patch I sent here.
We can discuss this in another thread.
Strictly speaking, we could use it here for the vnode interlock, but I did
not want to get around the VI_LOCK macro (which right now is just a
mtx_lock, but this may change).
Updated patch is below:
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
index 83f29c1..b587ebd 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
@@ -99,6 +99,6 @@ vn_rele_async(vnode_t *vp, taskq_t *taskq)
(task_func_t *)vn_rele_inactive, vp, TQ_SLEEP) != 0);
return;
}
- vp->v_usecount--;
+ refcount_release(&vp->v_usecount);
vdropl(vp);
}
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index d57dc90..f3f8339 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -665,12 +665,12 @@ success:
ltype = VOP_ISLOCKED(dvp);
VOP_UNLOCK(dvp, 0);
}
- VI_LOCK(*vpp);
+ vhold(*vpp);
if (wlocked)
CACHE_WUNLOCK();
else
CACHE_RUNLOCK();
- error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, cnp->cn_thread);
+ error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread);
if (cnp->cn_flags & ISDOTDOT) {
vn_lock(dvp, ltype | LK_RETRY);
if (dvp->v_iflag & VI_DOOMED) {
@@ -1368,9 +1368,9 @@ vn_dir_dd_ino(struct vnode *vp)
if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
continue;
ddvp = ncp->nc_dvp;
- VI_LOCK(ddvp);
+ vhold(ddvp);
CACHE_RUNLOCK();
- if (vget(ddvp, LK_INTERLOCK | LK_SHARED | LK_NOWAIT, curthread))
+ if (vget(ddvp, LK_SHARED | LK_NOWAIT | LK_VNHELD, curthread))
return (NULL);
return (ddvp);
}
diff --git a/sys/kern/vfs_hash.c b/sys/kern/vfs_hash.c
index 930fca1..48601e7 100644
--- a/sys/kern/vfs_hash.c
+++ b/sys/kern/vfs_hash.c
@@ -84,9 +84,9 @@ vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, s
continue;
if (fn != NULL && fn(vp, arg))
continue;
- VI_LOCK(vp);
+ vhold(vp);
rw_runlock(&vfs_hash_lock);
- error = vget(vp, flags | LK_INTERLOCK, td);
+ error = vget(vp, flags | LK_VNHELD, td);
if (error == ENOENT && (flags & LK_NOWAIT) == 0)
break;
if (error)
@@ -128,9 +128,9 @@ vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, stru
continue;
if (fn != NULL && fn(vp2, arg))
continue;
- VI_LOCK(vp2);
+ vhold(vp2);
rw_wunlock(&vfs_hash_lock);
- error = vget(vp2, flags | LK_INTERLOCK, td);
+ error = vget(vp2, flags | LK_VNHELD, td);
if (error == ENOENT && (flags & LK_NOWAIT) == 0)
break;
rw_wlock(&vfs_hash_lock);
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index fda80c9..c2cd91e 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -68,6 +68,7 @@ __FBSDID("$FreeBSD$");
#include <sys/pctrie.h>
#include <sys/priv.h>
#include <sys/reboot.h>
+#include <sys/refcount.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepqueue.h>
@@ -102,9 +103,9 @@ static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
static void syncer_shutdown(void *arg, int howto);
static int vtryrecycle(struct vnode *vp);
static void v_incr_usecount(struct vnode *);
-static void v_decr_usecount(struct vnode *);
-static void v_decr_useonly(struct vnode *);
static void v_upgrade_usecount(struct vnode *);
+static void v_incr_devcount(struct vnode *, bool locked);
+static void v_decr_devcount(struct vnode *);
static void vnlru_free(int);
static void vgonel(struct vnode *);
static void vfs_knllock(void *arg);
@@ -863,7 +864,7 @@ vnlru_free(int count)
*/
freevnodes--;
vp->v_iflag &= ~VI_FREE;
- vp->v_holdcnt++;
+ refcount_acquire(&vp->v_holdcnt);
mtx_unlock(&vnode_free_list_mtx);
VI_UNLOCK(vp);
@@ -2082,70 +2083,60 @@ static void
v_incr_usecount(struct vnode *vp)
{
+ ASSERT_VI_UNLOCKED(vp, __func__);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
- vholdl(vp);
- vp->v_usecount++;
- if (vp->v_type == VCHR && vp->v_rdev != NULL) {
- dev_lock();
- vp->v_rdev->si_usecount++;
- dev_unlock();
- }
+ vhold(vp);
+ v_upgrade_usecount(vp);
}
/*
- * Turn a holdcnt into a use+holdcnt such that only one call to
- * v_decr_usecount is needed.
+ * Turn a holdcnt into a use+holdcnt.
*/
static void
v_upgrade_usecount(struct vnode *vp)
{
+ bool locked = false;
+ ASSERT_VI_UNLOCKED(vp, __func__);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
- vp->v_usecount++;
- if (vp->v_type == VCHR && vp->v_rdev != NULL) {
- dev_lock();
- vp->v_rdev->si_usecount++;
- dev_unlock();
+ if (!refcount_acquire_if_not_zero(&vp->v_usecount)) {
+ VI_LOCK(vp);
+ locked = true;
+ refcount_acquire(&vp->v_usecount);
}
+ v_incr_devcount(vp, locked);
+ if (locked)
+ VI_UNLOCK(vp);
}
/*
- * Decrement the vnode use and hold count along with the driver's usecount
- * if this is a chardev. The vdropl() below releases the vnode interlock
- * as it may free the vnode.
+ * Increment si_usecount of the associated device, if any.
*/
static void
-v_decr_usecount(struct vnode *vp)
+v_incr_devcount(struct vnode *vp, bool locked)
{
- ASSERT_VI_LOCKED(vp, __FUNCTION__);
- VNASSERT(vp->v_usecount > 0, vp,
- ("v_decr_usecount: negative usecount"));
- CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
- vp->v_usecount--;
+ ASSERT_VI_LOCK(vp, __func__, locked);
+ if (vp->v_type != VCHR)
+ return;
+ if (!locked)
+ VI_LOCK(vp);
if (vp->v_type == VCHR && vp->v_rdev != NULL) {
dev_lock();
- vp->v_rdev->si_usecount--;
+ vp->v_rdev->si_usecount++;
dev_unlock();
}
- vdropl(vp);
+ if (!locked)
+ VI_UNLOCK(vp);
}
/*
- * Decrement only the use count and driver use count. This is intended to
- * be paired with a follow on vdropl() to release the remaining hold count.
- * In this way we may vgone() a vnode with a 0 usecount without risk of
- * having it end up on a free list because the hold count is kept above 0.
+ * Decrement si_usecount of the associated device, if any.
*/
static void
-v_decr_useonly(struct vnode *vp)
+v_decr_devcount(struct vnode *vp)
{
- ASSERT_VI_LOCKED(vp, __FUNCTION__);
- VNASSERT(vp->v_usecount > 0, vp,
- ("v_decr_useonly: negative usecount"));
- CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
- vp->v_usecount--;
if (vp->v_type == VCHR && vp->v_rdev != NULL) {
dev_lock();
vp->v_rdev->si_usecount--;
@@ -2163,17 +2154,22 @@ v_decr_useonly(struct vnode *vp)
int
vget(struct vnode *vp, int flags, struct thread *td)
{
- int error;
+ int error, oweinact;
- error = 0;
VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
("vget: invalid lock operation"));
+
+ ASSERT_VI_LOCK(vp, __func__, (flags & LK_INTERLOCK) != 0);
+ if ((flags & LK_VNHELD) != 0)
+ VNASSERT((vp->v_holdcnt > 0), vp,
+ ("vget: LK_VNHELD passed but vnode not held"));
+
CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
- if ((flags & LK_INTERLOCK) == 0)
- VI_LOCK(vp);
- vholdl(vp);
- if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) {
+ if ((flags & LK_VNHELD) == 0)
+ _vhold(vp, (flags & LK_INTERLOCK) != 0);
+
+ if ((error = vn_lock(vp, flags)) != 0) {
vdrop(vp);
CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
vp);
@@ -2181,22 +2177,33 @@ vget(struct vnode *vp, int flags, struct thread *td)
}
if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
panic("vget: vn_lock failed to return ENOENT\n");
- VI_LOCK(vp);
- /* Upgrade our holdcnt to a usecount. */
- v_upgrade_usecount(vp);
/*
* We don't guarantee that any particular close will
* trigger inactive processing so just make a best effort
* here at preventing a reference to a removed file. If
* we don't succeed no harm is done.
+ *
+ * Upgrade our holdcnt to a usecount.
*/
- if (vp->v_iflag & VI_OWEINACT) {
- if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
+ if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
+ v_incr_devcount(vp, false);
+ VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
+ ("vget: vnode with VI_OWEINACT and usecount"));
+ } else {
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_OWEINACT) == 0) {
+ oweinact = 0;
+ } else {
+ oweinact = 1;
+ vp->v_iflag &= ~VI_OWEINACT;
+ }
+ refcount_acquire(&vp->v_usecount);
+ v_incr_devcount(vp, true);
+ if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
(flags & LK_NOWAIT) == 0)
vinactive(vp, td);
- vp->v_iflag &= ~VI_OWEINACT;
+ VI_UNLOCK(vp);
}
- VI_UNLOCK(vp);
return (0);
}
@@ -2208,9 +2215,7 @@ vref(struct vnode *vp)
{
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
- VI_LOCK(vp);
v_incr_usecount(vp);
- VI_UNLOCK(vp);
}
/*
@@ -2225,13 +2230,8 @@ vref(struct vnode *vp)
int
vrefcnt(struct vnode *vp)
{
- int usecnt;
-
- VI_LOCK(vp);
- usecnt = vp->v_usecount;
- VI_UNLOCK(vp);
- return (usecnt);
+ return (vp->v_usecount);
}
#define VPUTX_VRELE 1
@@ -2250,33 +2250,43 @@ vputx(struct vnode *vp, int func)
ASSERT_VOP_LOCKED(vp, "vput");
else
KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
+ ASSERT_VI_UNLOCKED(vp, __func__);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
- VI_LOCK(vp);
- /* Skip this v_writecount check if we're going to panic below. */
- VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
- ("vputx: missed vn_close"));
- error = 0;
-
- if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
- vp->v_usecount == 1)) {
+ if (refcount_release_if_not_last(&vp->v_usecount)) {
if (func == VPUTX_VPUT)
VOP_UNLOCK(vp, 0);
- v_decr_usecount(vp);
+ v_decr_devcount(vp);
+ vdrop(vp);
return;
}
- if (vp->v_usecount != 1) {
- vprint("vputx: negative ref count", vp);
- panic("vputx: negative ref cnt");
- }
- CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
+ VI_LOCK(vp);
+
/*
* We want to hold the vnode until the inactive finishes to
* prevent vgone() races. We drop the use count here and the
* hold count below when we're done.
*/
- v_decr_useonly(vp);
+ if (!refcount_release(&vp->v_usecount) || (vp->v_iflag & VI_DOINGINACT)) {
+ if (func == VPUTX_VPUT)
+ VOP_UNLOCK(vp, 0);
+ v_decr_devcount(vp);
+ vdropl(vp);
+ return;
+ }
+
+ v_decr_devcount(vp);
+
+ error = 0;
+
+ if (vp->v_usecount != 0) {
+ vprint("vputx: negative ref count", vp);
+ panic("vputx: negative ref cnt");
+ }
+
+ CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
+
/*
* We must call VOP_INACTIVE with the node locked. Mark
* as VI_DOINGINACT to avoid recursion.
@@ -2346,38 +2356,29 @@ vunref(struct vnode *vp)
}
/*
- * Somebody doesn't want the vnode recycled.
- */
-void
-vhold(struct vnode *vp)
-{
-
- VI_LOCK(vp);
- vholdl(vp);
- VI_UNLOCK(vp);
-}
-
-/*
* Increase the hold count and activate if this is the first reference.
*/
void
-vholdl(struct vnode *vp)
+_vhold(struct vnode *vp, bool locked)
{
struct mount *mp;
+ ASSERT_VI_LOCK(vp, __func__, locked);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
-#ifdef INVARIANTS
- /* getnewvnode() calls v_incr_usecount() without holding interlock. */
- if (vp->v_type != VNON || vp->v_data != NULL) {
- ASSERT_VI_LOCKED(vp, "vholdl");
- VNASSERT(vp->v_holdcnt > 0 || (vp->v_iflag & VI_FREE) != 0,
- vp, ("vholdl: free vnode is held"));
+ if (refcount_acquire_if_not_zero(&vp->v_holdcnt)) {
+ VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
+ ("_vhold: vnode with holdcnt is free"));
+ return;
}
-#endif
- vp->v_holdcnt++;
- if ((vp->v_iflag & VI_FREE) == 0)
+ if (!locked)
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_FREE) == 0) {
+ refcount_acquire(&vp->v_holdcnt);
+ if (!locked)
+ VI_UNLOCK(vp);
return;
- VNASSERT(vp->v_holdcnt == 1, vp, ("vholdl: wrong hold count"));
+ }
+ VNASSERT(vp->v_holdcnt == 0, vp, ("vholdl: wrong hold count"));
VNASSERT(vp->v_op != NULL, vp, ("vholdl: vnode already reclaimed."));
/*
* Remove a vnode from the free list, mark it as in use,
@@ -2394,18 +2395,9 @@ vholdl(struct vnode *vp)
TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
mp->mnt_activevnodelistsize++;
mtx_unlock(&vnode_free_list_mtx);
-}
-
-/*
- * Note that there is one less who cares about this vnode.
- * vdrop() is the opposite of vhold().
- */
-void
-vdrop(struct vnode *vp)
-{
-
- VI_LOCK(vp);
- vdropl(vp);
+ refcount_acquire(&vp->v_holdcnt);
+ if (!locked)
+ VI_UNLOCK(vp);
}
/*
@@ -2414,20 +2406,25 @@ vdrop(struct vnode *vp)
* (marked VI_DOOMED) in which case we will free it.
*/
void
-vdropl(struct vnode *vp)
+_vdrop(struct vnode *vp, bool locked)
{
struct bufobj *bo;
struct mount *mp;
int active;
- ASSERT_VI_LOCKED(vp, "vdropl");
+ ASSERT_VI_LOCK(vp, __func__, locked);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
if (vp->v_holdcnt <= 0)
panic("vdrop: holdcnt %d", vp->v_holdcnt);
- vp->v_holdcnt--;
- VNASSERT(vp->v_holdcnt >= vp->v_usecount, vp,
- ("hold count less than use count"));
- if (vp->v_holdcnt > 0) {
+ if (refcount_release_if_not_last(&vp->v_holdcnt)) {
+ if (locked)
+ VI_UNLOCK(vp);
+ return;
+ }
+
+ if (!locked)
+ VI_LOCK(vp);
+ if (refcount_release(&vp->v_holdcnt) == 0) {
VI_UNLOCK(vp);
return;
}
diff --git a/sys/sys/lockmgr.h b/sys/sys/lockmgr.h
index ff0473d..a74d5f5 100644
--- a/sys/sys/lockmgr.h
+++ b/sys/sys/lockmgr.h
@@ -159,6 +159,7 @@ _lockmgr_args_rw(struct lock *lk, u_int flags, struct rwlock *ilk,
#define LK_SLEEPFAIL 0x000800
#define LK_TIMELOCK 0x001000
#define LK_NODDLKTREAT 0x002000
+#define LK_VNHELD 0x004000
/*
* Operations for lockmgr().
diff --git a/sys/sys/refcount.h b/sys/sys/refcount.h
index 4611664..74c51c9 100644
--- a/sys/sys/refcount.h
+++ b/sys/sys/refcount.h
@@ -64,4 +64,32 @@ refcount_release(volatile u_int *count)
return (old == 1);
}
+static __inline int
+refcount_acquire_if_not_zero(volatile u_int *count)
+{
+ int old;
+
+ for (;;) {
+ old = *count;
+ if (old == 0)
+ return (0);
+ if (atomic_cmpset_int(count, old, old + 1))
+ return (1);
+ }
+}
+
+static __inline int
+refcount_release_if_not_last(volatile u_int *count)
+{
+ int old;
+
+ for (;;) {
+ old = *count;
+ if (old == 1)
+ return (0);
+ if (atomic_cmpset_int(count, old, old - 1))
+ return (1);
+ }
+}
+
#endif /* ! __SYS_REFCOUNT_H__ */
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index e1f912e..c695b52 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -547,6 +547,15 @@ void assert_vop_unlocked(struct vnode *vp, const char *str);
#define ASSERT_VOP_UNLOCKED(vp, str) ((void)0)
#endif /* DEBUG_VFS_LOCKS */
+static __inline void
+ASSERT_VI_LOCK(struct vnode *vp, const char *str, bool locked)
+{
+
+ if (locked)
+ ASSERT_VI_LOCKED(vp, str);
+ else
+ ASSERT_VI_UNLOCKED(vp, str);
+}
/*
* This call works for vnodes in the kernel.
@@ -649,13 +658,15 @@ int vaccess_acl_posix1e(enum vtype type, uid_t file_uid,
struct ucred *cred, int *privused);
void vattr_null(struct vattr *vap);
int vcount(struct vnode *vp);
-void vdrop(struct vnode *);
-void vdropl(struct vnode *);
+#define vdrop(vp) _vdrop((vp), 0)
+#define vdropl(vp) _vdrop((vp), 1)
+void _vdrop(struct vnode *, bool);
int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td);
int vget(struct vnode *vp, int lockflag, struct thread *td);
void vgone(struct vnode *vp);
-void vhold(struct vnode *);
-void vholdl(struct vnode *);
+#define vhold(vp) _vhold((vp), 0)
+#define vholdl(vp) _vhold((vp), 1)
+void _vhold(struct vnode *, bool);
void vinactive(struct vnode *, struct thread *);
int vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo);
int vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length,
--
Mateusz Guzik <mjguzik gmail.com>
More information about the freebsd-fs
mailing list