svn commit: r306224 - head/sys/kern
Adrian Chadd
adrian.chadd at gmail.com
Sat Nov 12 18:26:52 UTC 2016
This causes the AR9331 / Carambola2 kernel to just plain not start. :(
-adrian
On 22 September 2016 at 21:45, Mateusz Guzik <mjg at freebsd.org> wrote:
> Author: mjg
> Date: Fri Sep 23 04:45:11 2016
> New Revision: 306224
> URL: https://svnweb.freebsd.org/changeset/base/306224
>
> Log:
> cache: get rid of the global lock
>
> Add a table of vnode locks and use them along with bucketlocks to provide
> concurrent modification support. The approach taken is to preserve the
> current behaviour of the namecache and just lock all relevant parts before
> any changes are made.
>
> Lookups still require the relevant bucket to be locked.
>
> Discussed with: kib
> Tested by: pho
>
> Modified:
> head/sys/kern/subr_witness.c
> head/sys/kern/vfs_cache.c
>
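A note on the approach before the diff: the change replaces the single
global rwlock with lock striping. Object addresses are hashed into fixed
tables of locks, one table for vnodes and one for hash buckets, so
unrelated vnodes contend on different locks. A minimal userland sketch of
the pattern in pthreads terms; locktbl, NLOCKS, locktbl_init and vp2lock
are illustrative names, not from the commit:

    #include <pthread.h>
    #include <stdint.h>

    #define NLOCKS 64        /* illustrative stripe count, a power of two */

    static pthread_mutex_t locktbl[NLOCKS];

    static void
    locktbl_init(void)
    {
            int i;

            for (i = 0; i < NLOCKS; i++)
                    pthread_mutex_init(&locktbl[i], NULL);
    }

    /*
     * Map an object address to its stripe, mirroring VP2VNODELOCK() in
     * the diff below. The low bits are shifted off because heap
     * allocations are aligned, so those bits carry almost no entropy.
     */
    static pthread_mutex_t *
    vp2lock(const void *vp)
    {

            return (&locktbl[((uintptr_t)vp >> 8) % NLOCKS]);
    }

Two threads working on the same vnode always meet at the same stripe;
threads working on different vnodes usually do not, which is what removes
the global choke point.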
> Modified: head/sys/kern/subr_witness.c
> ==============================================================================
> --- head/sys/kern/subr_witness.c Fri Sep 23 03:21:40 2016 (r306223)
> +++ head/sys/kern/subr_witness.c Fri Sep 23 04:45:11 2016 (r306224)
> @@ -625,7 +625,7 @@ static struct witness_order_list_entry o
> /*
> * VFS namecache
> */
> - { "ncglobal", &lock_class_rw },
> + { "ncvn", &lock_class_mtx_sleep },
> { "ncbuc", &lock_class_rw },
> { "vnode interlock", &lock_class_mtx_sleep },
> { "ncneg", &lock_class_mtx_sleep },
>
> Modified: head/sys/kern/vfs_cache.c
> ==============================================================================
> --- head/sys/kern/vfs_cache.c Fri Sep 23 03:21:40 2016 (r306223)
> +++ head/sys/kern/vfs_cache.c Fri Sep 23 04:45:11 2016 (r306224)
> @@ -151,21 +151,35 @@ struct namecache_ts {
> * name is located in the cache, it will be dropped.
> *
> * These locks are used (in the order in which they can be taken):
> - * NAME TYPE ROLE
> - * cache_lock rwlock global, needed for all modifications
> - * bucketlock rwlock for access to given hash bucket
> - * ncneg_mtx mtx negative entry LRU management
> + * NAME TYPE ROLE
> + * vnodelock mtx vnode lists and v_cache_dd field protection
> + * bucketlock rwlock for access to given set of hash buckets
> + * ncneg_mtx mtx negative entry LRU management
> *
> - * A name -> vnode lookup can be safely performed by either locking cache_lock
> - * or the relevant hash bucket.
> + * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
> + * shrinking the LRU list.
> *
> - * ".." and vnode -> name lookups require cache_lock.
> + * It is legal to take multiple vnodelock and bucketlock locks. The locking
> + * order is lower address first. Both are recursive.
> *
> - * Modifications require both cache_lock and relevant bucketlock taken for
> - * writing.
> + * "." lookups are lockless.
> *
> - * Negative entry LRU management requires ncneg_mtx taken on top of either
> - * cache_lock or bucketlock.
> + * ".." and vnode -> name lookups require vnodelock.
> + *
> + * name -> vnode lookup requires the relevant bucketlock to be held for reading.
> + *
> + * Insertions and removals of entries require involved vnodes and bucketlocks
> + * to be write-locked to prevent other threads from seeing the entry.
> + *
> + * Some lookups result in removal of the found entry (e.g. getting rid of a
> + * negative entry with the intent to create a positive one), which poses a
> + * problem when multiple threads reach this state concurrently. Similarly,
> + * two different threads can purge two different vnodes and try to remove
> + * the same name.
> + *
> + * If the already held vnode lock is lower than the second required lock, we
> + * can just take the other lock. However, in the opposite case this could
> + * deadlock. It is resolved by trylocking; if that fails, the first lock is
> + * dropped, everything is locked in order and the state is revalidated.
> */
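The trylock-and-back-off resolution described in the last paragraph above
is a standard out-of-order locking pattern. A compact sketch of just that
step in userland pthreads terms; lock_second is a hypothetical name, the
kernel code open-codes this dance:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdint.h>

    /*
     * With 'held' already owned, acquire 'wanted' too without violating
     * the lower-address-first order. Returns false when both locks had
     * to be dropped and retaken, meaning the caller must revalidate any
     * state it derived while holding only 'held'.
     */
    static bool
    lock_second(pthread_mutex_t *held, pthread_mutex_t *wanted)
    {

            if ((uintptr_t)held < (uintptr_t)wanted) {
                    /* In-order acquisition: blocking is safe. */
                    pthread_mutex_lock(wanted);
                    return (true);
            }
            /* Out of order: only a trylock is deadlock-safe. */
            if (pthread_mutex_trylock(wanted) == 0)
                    return (true);
            /* Back off, then relock both in address order. */
            pthread_mutex_unlock(held);
            pthread_mutex_lock(wanted);
            pthread_mutex_lock(held);
            return (false);
    }

This is why cache_zap_locked_vnode_kl2() further down returns a bool:
false means the locks were dropped along the way and the caller has to
revalidate and retry.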
>
> /*
> @@ -196,15 +210,9 @@ SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor
>
> struct nchstats nchstats; /* cache effectiveness statistics */
>
> -static struct rwlock cache_lock;
> -RW_SYSINIT(vfscache, &cache_lock, "ncglobal");
> -
> -#define CACHE_TRY_WLOCK() rw_try_wlock(&cache_lock)
> -#define CACHE_UPGRADE_LOCK() rw_try_upgrade(&cache_lock)
> -#define CACHE_RLOCK() rw_rlock(&cache_lock)
> -#define CACHE_RUNLOCK() rw_runlock(&cache_lock)
> -#define CACHE_WLOCK() rw_wlock(&cache_lock)
> -#define CACHE_WUNLOCK() rw_wunlock(&cache_lock)
> +static struct mtx ncneg_shrink_lock;
> +MTX_SYSINIT(vfscache_shrink_neg, &ncneg_shrink_lock, "Name Cache shrink neg",
> + MTX_DEF);
>
> static struct mtx_padalign ncneg_mtx;
> MTX_SYSINIT(vfscache_neg, &ncneg_mtx, "ncneg", MTX_DEF);
> @@ -214,6 +222,19 @@ static struct rwlock_padalign *bucketlo
> #define HASH2BUCKETLOCK(hash) \
> ((struct rwlock *)(&bucketlocks[((hash) % numbucketlocks)]))
>
> +static u_int numvnodelocks;
> +static struct mtx *vnodelocks;
> +static inline struct mtx *
> +VP2VNODELOCK(struct vnode *vp)
> +{
> + struct mtx *vlp;
> +
> + if (vp == NULL)
> + return (NULL);
> + vlp = &vnodelocks[(((uintptr_t)(vp) >> 8) % numvnodelocks)];
> + return (vlp);
> +}
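The >> 8 above is worth a remark: vnode addresses come from an aligned
allocator, so their low bits carry little entropy; shifting them off before
the modulo keeps neighbouring vnodes from piling onto the same stripe.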
> +
> /*
> * UMA zones for the VFS cache.
> *
> @@ -329,19 +350,49 @@ STATNODE_COUNTER(numfullpathfail2,
> "Number of fullpath search errors (VOP_VPTOCNP failures)");
> STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
> STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
> -static long numupgrades; STATNODE_ULONG(numupgrades,
> - "Number of updates of the cache after lookup (write lock + retry)");
> static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
> - "Number of times bucketlocked zap_and_exit case failed to writelock");
> + "Number of times zap_and_exit failed to lock");
> +static long cache_lock_vnodes_cel_3_failures;
> +STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
> + "Number of times 3-way vnode locking failed");
>
> -static void cache_zap(struct namecache *ncp);
> -static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
> - u_int *buflen);
> +static void cache_zap_locked(struct namecache *ncp, bool neg_locked);
> static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
> char *buf, char **retbuf, u_int buflen);
>
> static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
>
> +static int cache_yield;
> +SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
> + "Number of times cache called yield");
> +
> +static void
> +cache_maybe_yield(void)
> +{
> +
> + if (should_yield()) {
> + cache_yield++;
> + kern_yield(PRI_USER);
> + }
> +}
> +
> +static inline void
> +cache_assert_vlp_locked(struct mtx *vlp)
> +{
> +
> + if (vlp != NULL)
> + mtx_assert(vlp, MA_OWNED);
> +}
> +
> +static inline void
> +cache_assert_vnode_locked(struct vnode *vp)
> +{
> + struct mtx *vlp;
> +
> + vlp = VP2VNODELOCK(vp);
> + cache_assert_vlp_locked(vlp);
> +}
> +
> static uint32_t
> cache_get_hash(char *name, u_char len, struct vnode *dvp)
> {
> @@ -352,21 +403,41 @@ cache_get_hash(char *name, u_char len, s
> return (hash);
> }
>
> +static inline struct rwlock *
> +NCP2BUCKETLOCK(struct namecache *ncp)
> +{
> + uint32_t hash;
> +
> + hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
> + return (HASH2BUCKETLOCK(hash));
> +}
> +
> #ifdef INVARIANTS
> static void
> cache_assert_bucket_locked(struct namecache *ncp, int mode)
> {
> - struct rwlock *bucketlock;
> - uint32_t hash;
> + struct rwlock *blp;
>
> - hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
> - bucketlock = HASH2BUCKETLOCK(hash);
> - rw_assert(bucketlock, mode);
> + blp = NCP2BUCKETLOCK(ncp);
> + rw_assert(blp, mode);
> }
> #else
> #define cache_assert_bucket_locked(x, y) do { } while (0)
> #endif
>
> +#define cache_sort(x, y) _cache_sort((void **)(x), (void **)(y))
> +static void
> +_cache_sort(void **p1, void **p2)
> +{
> + void *tmp;
> +
> + if (*p1 > *p2) {
> + tmp = *p2;
> + *p2 = *p1;
> + *p1 = tmp;
> + }
> +}
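cache_sort() is what makes the "lower address first" rule from the header
comment enforceable: every thread orders a pair of lock pointers the same
way before acquiring them, so any two locks are always taken in one global
order and the classic two-lock deadlock cannot occur on the fast path.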
> +
> static void
> cache_lock_all_buckets(void)
> {
> @@ -385,6 +456,56 @@ cache_unlock_all_buckets(void)
> rw_wunlock(&bucketlocks[i]);
> }
>
> +static void
> +cache_lock_all_vnodes(void)
> +{
> + u_int i;
> +
> + for (i = 0; i < numvnodelocks; i++)
> + mtx_lock(&vnodelocks[i]);
> +}
> +
> +static void
> +cache_unlock_all_vnodes(void)
> +{
> + u_int i;
> +
> + for (i = 0; i < numvnodelocks; i++)
> + mtx_unlock(&vnodelocks[i]);
> +}
> +
> +static int
> +cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
> +{
> +
> + cache_sort(&vlp1, &vlp2);
> + MPASS(vlp2 != NULL);
> +
> + if (vlp1 != NULL) {
> + if (!mtx_trylock(vlp1))
> + return (EAGAIN);
> + }
> + if (!mtx_trylock(vlp2)) {
> + if (vlp1 != NULL)
> + mtx_unlock(vlp1);
> + return (EAGAIN);
> + }
> +
> + return (0);
> +}
> +
> +static void
> +cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
> +{
> +
> + MPASS(vlp1 != NULL || vlp2 != NULL);
> +
> + if (vlp1 != NULL)
> + mtx_unlock(vlp1);
> + if (vlp2 != NULL)
> + mtx_unlock(vlp2);
> +}
> +
> static int
> sysctl_nchstats(SYSCTL_HANDLER_ARGS)
> {
> @@ -426,9 +547,9 @@ retry:
> if (req->oldptr == NULL)
> return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
> cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
> - CACHE_RLOCK();
> + cache_lock_all_buckets();
> if (n_nchash != nchash + 1) {
> - CACHE_RUNLOCK();
> + cache_unlock_all_buckets();
> free(cntbuf, M_TEMP);
> goto retry;
> }
> @@ -436,7 +557,7 @@ retry:
> for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
> LIST_FOREACH(ncp, ncpp, nc_hash)
> cntbuf[i]++;
> - CACHE_RUNLOCK();
> + cache_unlock_all_buckets();
> for (error = 0, i = 0; i < n_nchash; i++)
> if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
> break;
> @@ -459,7 +580,7 @@ sysctl_debug_hashstat_nchash(SYSCTL_HAND
> if (!req->oldptr)
> return SYSCTL_OUT(req, 0, 4 * sizeof(int));
>
> - CACHE_RLOCK();
> + cache_lock_all_buckets();
> n_nchash = nchash + 1; /* nchash is max index, not count */
> used = 0;
> maxlength = 0;
> @@ -476,7 +597,7 @@ sysctl_debug_hashstat_nchash(SYSCTL_HAND
> maxlength = count;
> }
> n_nchash = nchash + 1;
> - CACHE_RUNLOCK();
> + cache_unlock_all_buckets();
> pct = (used * 100) / (n_nchash / 100);
> error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
> if (error)
> @@ -504,6 +625,7 @@ static void
> cache_negative_hit(struct namecache *ncp)
> {
>
> + MPASS(ncp->nc_vp == NULL);
> mtx_lock(&ncneg_mtx);
> TAILQ_REMOVE(&ncneg, ncp, nc_dst);
> TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
> @@ -514,9 +636,8 @@ static void
> cache_negative_insert(struct namecache *ncp)
> {
>
> - rw_assert(&cache_lock, RA_WLOCKED);
> - cache_assert_bucket_locked(ncp, RA_WLOCKED);
> MPASS(ncp->nc_vp == NULL);
> + cache_assert_bucket_locked(ncp, RA_WLOCKED);
> mtx_lock(&ncneg_mtx);
> TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
> numneg++;
> @@ -524,43 +645,74 @@ cache_negative_insert(struct namecache *
> }
>
> static void
> -cache_negative_remove(struct namecache *ncp)
> +cache_negative_remove(struct namecache *ncp, bool neg_locked)
> {
>
> - rw_assert(&cache_lock, RA_WLOCKED);
> - cache_assert_bucket_locked(ncp, RA_WLOCKED);
> MPASS(ncp->nc_vp == NULL);
> - mtx_lock(&ncneg_mtx);
> + cache_assert_bucket_locked(ncp, RA_WLOCKED);
> + if (!neg_locked)
> + mtx_lock(&ncneg_mtx);
> + else
> + mtx_assert(&ncneg_mtx, MA_OWNED);
> TAILQ_REMOVE(&ncneg, ncp, nc_dst);
> numneg--;
> - mtx_unlock(&ncneg_mtx);
> + if (!neg_locked)
> + mtx_unlock(&ncneg_mtx);
> }
>
> -static struct namecache *
> +static void
> cache_negative_zap_one(void)
> {
> - struct namecache *ncp;
> + struct namecache *ncp, *ncp2;
> + struct mtx *dvlp;
> + struct rwlock *blp;
> +
> + if (!mtx_trylock(&ncneg_shrink_lock))
> + return;
>
> - rw_assert(&cache_lock, RA_WLOCKED);
> + mtx_lock(&ncneg_mtx);
> ncp = TAILQ_FIRST(&ncneg);
> - KASSERT(ncp->nc_vp == NULL, ("ncp %p vp %p on ncneg",
> - ncp, ncp->nc_vp));
> - cache_zap(ncp);
> - return (ncp);
> + if (ncp == NULL) {
> + mtx_unlock(&ncneg_mtx);
> + goto out;
> + }
> + MPASS(ncp->nc_vp == NULL);
> + dvlp = VP2VNODELOCK(ncp->nc_dvp);
> + blp = NCP2BUCKETLOCK(ncp);
> + mtx_unlock(&ncneg_mtx);
> + mtx_lock(dvlp);
> + rw_wlock(blp);
> + mtx_lock(&ncneg_mtx);
> + ncp2 = TAILQ_FIRST(&ncneg);
> + if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
> + blp != NCP2BUCKETLOCK(ncp2) || ncp2->nc_vp != NULL) {
> + ncp = NULL;
> + goto out_unlock_all;
> + }
> + cache_zap_locked(ncp, true);
> +out_unlock_all:
> + mtx_unlock(&ncneg_mtx);
> + rw_wunlock(blp);
> + mtx_unlock(dvlp);
> +out:
> + mtx_unlock(&ncneg_shrink_lock);
> + cache_free(ncp);
> }
>
> /*
> - * cache_zap():
> + * cache_zap_locked():
> *
> * Removes a namecache entry from cache, whether it contains an actual
> * pointer to a vnode or if it is just a negative cache entry.
> */
> static void
> -cache_zap_locked(struct namecache *ncp)
> +cache_zap_locked(struct namecache *ncp, bool neg_locked)
> {
>
> - rw_assert(&cache_lock, RA_WLOCKED);
> + cache_assert_vnode_locked(ncp->nc_vp);
> + cache_assert_vnode_locked(ncp->nc_dvp);
> cache_assert_bucket_locked(ncp, RA_WLOCKED);
> +
> CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
> if (ncp->nc_vp != NULL) {
> SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
> @@ -577,7 +729,7 @@ cache_zap_locked(struct namecache *ncp)
> LIST_REMOVE(ncp, nc_src);
> if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
> ncp->nc_flag |= NCF_DVDROP;
> - numcachehv--;
> + atomic_subtract_rel_long(&numcachehv, 1);
> }
> }
> if (ncp->nc_vp) {
> @@ -585,24 +737,198 @@ cache_zap_locked(struct namecache *ncp)
> if (ncp == ncp->nc_vp->v_cache_dd)
> ncp->nc_vp->v_cache_dd = NULL;
> } else {
> - cache_negative_remove(ncp);
> + cache_negative_remove(ncp, neg_locked);
> }
> - numcache--;
> + atomic_subtract_rel_long(&numcache, 1);
> }
>
> static void
> -cache_zap(struct namecache *ncp)
> +cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
> {
> - struct rwlock *bucketlock;
> - uint32_t hash;
> + struct rwlock *blp;
>
> - rw_assert(&cache_lock, RA_WLOCKED);
> + MPASS(ncp->nc_dvp == vp);
> + MPASS(ncp->nc_vp == NULL);
> + cache_assert_vnode_locked(vp);
>
> - hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
> - bucketlock = HASH2BUCKETLOCK(hash);
> - rw_wlock(bucketlock);
> - cache_zap_locked(ncp);
> - rw_wunlock(bucketlock);
> + blp = NCP2BUCKETLOCK(ncp);
> + rw_wlock(blp);
> + cache_zap_locked(ncp, false);
> + rw_wunlock(blp);
> +}
> +
> +static bool
> +cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
> + struct mtx **vlpp)
> +{
> + struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
> + struct rwlock *blp;
> +
> + MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
> + cache_assert_vnode_locked(vp);
> +
> + if (ncp->nc_vp == NULL) {
> + if (*vlpp != NULL) {
> + mtx_unlock(*vlpp);
> + *vlpp = NULL;
> + }
> + cache_zap_negative_locked_vnode_kl(ncp, vp);
> + return (true);
> + }
> +
> + pvlp = VP2VNODELOCK(vp);
> + blp = NCP2BUCKETLOCK(ncp);
> + vlp1 = VP2VNODELOCK(ncp->nc_dvp);
> + vlp2 = VP2VNODELOCK(ncp->nc_vp);
> +
> + if (*vlpp == vlp1 || *vlpp == vlp2) {
> + to_unlock = *vlpp;
> + *vlpp = NULL;
> + } else {
> + if (*vlpp != NULL) {
> + mtx_unlock(*vlpp);
> + *vlpp = NULL;
> + }
> + cache_sort(&vlp1, &vlp2);
> + if (vlp1 == pvlp) {
> + mtx_lock(vlp2);
> + to_unlock = vlp2;
> + } else {
> + if (!mtx_trylock(vlp1))
> + goto out_relock;
> + to_unlock = vlp1;
> + }
> + }
> + rw_wlock(blp);
> + cache_zap_locked(ncp, false);
> + rw_wunlock(blp);
> + if (to_unlock != NULL)
> + mtx_unlock(to_unlock);
> + return (true);
> +
> +out_relock:
> + mtx_unlock(vlp2);
> + mtx_lock(vlp1);
> + mtx_lock(vlp2);
> + MPASS(*vlpp == NULL);
> + *vlpp = vlp1;
> + return (false);
> +}
> +
> +static int
> +cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
> +{
> + struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
> + struct rwlock *blp;
> + int error = 0;
> +
> + MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
> + cache_assert_vnode_locked(vp);
> +
> + pvlp = VP2VNODELOCK(vp);
> + if (ncp->nc_vp == NULL) {
> + cache_zap_negative_locked_vnode_kl(ncp, vp);
> + goto out;
> + }
> +
> + blp = NCP2BUCKETLOCK(ncp);
> + vlp1 = VP2VNODELOCK(ncp->nc_dvp);
> + vlp2 = VP2VNODELOCK(ncp->nc_vp);
> + cache_sort(&vlp1, &vlp2);
> + if (vlp1 == pvlp) {
> + mtx_lock(vlp2);
> + to_unlock = vlp2;
> + } else {
> + if (!mtx_trylock(vlp1)) {
> + error = EAGAIN;
> + goto out;
> + }
> + to_unlock = vlp1;
> + }
> + rw_wlock(blp);
> + cache_zap_locked(ncp, false);
> + rw_wunlock(blp);
> + mtx_unlock(to_unlock);
> +out:
> + mtx_unlock(pvlp);
> + return (error);
> +}
> +
> +static int
> +cache_zap_rlocked_bucket(struct namecache *ncp, struct rwlock *blp)
> +{
> + struct mtx *dvlp, *vlp;
> +
> + cache_assert_bucket_locked(ncp, RA_RLOCKED);
> +
> + dvlp = VP2VNODELOCK(ncp->nc_dvp);
> + vlp = VP2VNODELOCK(ncp->nc_vp);
> + if (cache_trylock_vnodes(dvlp, vlp) == 0) {
> + rw_runlock(blp);
> + rw_wlock(blp);
> + cache_zap_locked(ncp, false);
> + rw_wunlock(blp);
> + cache_unlock_vnodes(dvlp, vlp);
> + return (0);
> + }
> +
> + rw_runlock(blp);
> + return (EAGAIN);
> +}
> +
> +static int
> +cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
> + struct mtx **vlpp1, struct mtx **vlpp2)
> +{
> + struct mtx *dvlp, *vlp;
> +
> + cache_assert_bucket_locked(ncp, RA_WLOCKED);
> +
> + dvlp = VP2VNODELOCK(ncp->nc_dvp);
> + vlp = VP2VNODELOCK(ncp->nc_vp);
> + cache_sort(&dvlp, &vlp);
> +
> + if (*vlpp1 == dvlp && *vlpp2 == vlp) {
> + cache_zap_locked(ncp, false);
> + cache_unlock_vnodes(dvlp, vlp);
> + *vlpp1 = NULL;
> + *vlpp2 = NULL;
> + return (0);
> + }
> +
> + if (*vlpp1 != NULL)
> + mtx_unlock(*vlpp1);
> + if (*vlpp2 != NULL)
> + mtx_unlock(*vlpp2);
> + *vlpp1 = NULL;
> + *vlpp2 = NULL;
> +
> + if (cache_trylock_vnodes(dvlp, vlp) == 0) {
> + cache_zap_locked(ncp, false);
> + cache_unlock_vnodes(dvlp, vlp);
> + return (0);
> + }
> +
> + rw_wunlock(blp);
> + *vlpp1 = dvlp;
> + *vlpp2 = vlp;
> + if (*vlpp1 != NULL)
> + mtx_lock(*vlpp1);
> + mtx_lock(*vlpp2);
> + rw_wlock(blp);
> + return (EAGAIN);
> +}
> +
> +static void
> +cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
> +{
> +
> + if (blp != NULL) {
> + rw_runlock(blp);
> + mtx_assert(vlp, MA_NOTOWNED);
> + } else {
> + mtx_unlock(vlp);
> + }
> }
>
> /*
> @@ -622,44 +948,26 @@ cache_zap(struct namecache *ncp)
> * not recursively acquired.
> */
>
> -enum { UNLOCKED, WLOCKED, RLOCKED };
> -
> -static void
> -cache_unlock(int cache_locked)
> -{
> -
> - switch (cache_locked) {
> - case UNLOCKED:
> - break;
> - case WLOCKED:
> - CACHE_WUNLOCK();
> - break;
> - case RLOCKED:
> - CACHE_RUNLOCK();
> - break;
> - }
> -}
> -
> int
> cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
> struct timespec *tsp, int *ticksp)
> {
> - struct rwlock *bucketlock;
> struct namecache *ncp;
> + struct rwlock *blp;
> + struct mtx *dvlp, *dvlp2;
> uint32_t hash;
> - int error, ltype, cache_locked;
> + int error, ltype;
>
> if (!doingcache) {
> cnp->cn_flags &= ~MAKEENTRY;
> return (0);
> }
> retry:
> - bucketlock = NULL;
> - cache_locked = UNLOCKED;
> + blp = NULL;
> + dvlp = VP2VNODELOCK(dvp);
> error = 0;
> counter_u64_add(numcalls, 1);
>
> -retry_wlocked:
> if (cnp->cn_nameptr[0] == '.') {
> if (cnp->cn_namelen == 1) {
> *vpp = dvp;
> @@ -693,32 +1001,37 @@ retry_wlocked:
> }
> if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
> counter_u64_add(dotdothits, 1);
> - if (cache_locked == UNLOCKED) {
> - CACHE_RLOCK();
> - cache_locked = RLOCKED;
> - }
> -
> - if (dvp->v_cache_dd == NULL) {
> + dvlp2 = NULL;
> + mtx_lock(dvlp);
> +retry_dotdot:
> + ncp = dvp->v_cache_dd;
> + if (ncp == NULL) {
> SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
> "..", NULL);
> - goto unlock;
> + mtx_unlock(dvlp);
> + return (0);
> }
> if ((cnp->cn_flags & MAKEENTRY) == 0) {
> - if (cache_locked != WLOCKED &&
> - !CACHE_UPGRADE_LOCK())
> - goto wlock;
> - ncp = NULL;
> - if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT) {
> - ncp = dvp->v_cache_dd;
> - cache_zap(ncp);
> + if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
> + if (ncp->nc_dvp != dvp)
> + panic("dvp %p v_cache_dd %p\n", dvp, ncp);
> + if (!cache_zap_locked_vnode_kl2(ncp,
> + dvp, &dvlp2))
> + goto retry_dotdot;
> + MPASS(dvp->v_cache_dd == NULL);
> + mtx_unlock(dvlp);
> + if (dvlp2 != NULL)
> + mtx_unlock(dvlp2);
> + cache_free(ncp);
> + } else {
> + dvp->v_cache_dd = NULL;
> + mtx_unlock(dvlp);
> + if (dvlp2 != NULL)
> + mtx_unlock(dvlp2);
> }
> - dvp->v_cache_dd = NULL;
> - CACHE_WUNLOCK();
> - cache_free(ncp);
> return (0);
> }
> - ncp = dvp->v_cache_dd;
> - if (ncp->nc_flag & NCF_ISDOTDOT)
> + if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
> *vpp = ncp->nc_vp;
> else
> *vpp = ncp->nc_dvp;
> @@ -739,10 +1052,8 @@ retry_wlocked:
> }
>
> hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
> - if (cache_locked == UNLOCKED) {
> - bucketlock = HASH2BUCKETLOCK(hash);
> - rw_rlock(bucketlock);
> - }
> + blp = HASH2BUCKETLOCK(hash);
> + rw_rlock(blp);
>
> LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
> counter_u64_add(numchecks, 1);
> @@ -795,24 +1106,9 @@ negative_success:
> SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
> nc_get_name(ncp));
> cache_out_ts(ncp, tsp, ticksp);
> - MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
> - if (bucketlock != NULL)
> - rw_runlock(bucketlock);
> - cache_unlock(cache_locked);
> + cache_lookup_unlock(blp, dvlp);
> return (ENOENT);
>
> -wlock:
> - /*
> - * We need to update the cache after our lookup, so upgrade to
> - * a write lock and retry the operation.
> - */
> - CACHE_RUNLOCK();
> -wlock_unlocked:
> - CACHE_WLOCK();
> - numupgrades++;
> - cache_locked = WLOCKED;
> - goto retry_wlocked;
> -
> success:
> /*
> * On success we return a locked and ref'd vnode as per the lookup
> @@ -825,10 +1121,7 @@ success:
> VOP_UNLOCK(dvp, 0);
> }
> vhold(*vpp);
> - MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
> - if (bucketlock != NULL)
> - rw_runlock(bucketlock);
> - cache_unlock(cache_locked);
> + cache_lookup_unlock(blp, dvlp);
> error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread);
> if (cnp->cn_flags & ISDOTDOT) {
> vn_lock(dvp, ltype | LK_RETRY);
> @@ -850,32 +1143,232 @@ success:
> return (-1);
>
> unlock:
> - MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
> - if (bucketlock != NULL)
> - rw_runlock(bucketlock);
> - cache_unlock(cache_locked);
> + cache_lookup_unlock(blp, dvlp);
> return (0);
>
> zap_and_exit:
> - if (bucketlock != NULL) {
> - rw_assert(&cache_lock, RA_UNLOCKED);
> - if (!CACHE_TRY_WLOCK()) {
> - rw_runlock(bucketlock);
> - bucketlock = NULL;
> - zap_and_exit_bucket_fail++;
> - goto wlock_unlocked;
> - }
> - cache_locked = WLOCKED;
> - rw_runlock(bucketlock);
> - bucketlock = NULL;
> - } else if (cache_locked != WLOCKED && !CACHE_UPGRADE_LOCK())
> - goto wlock;
> - cache_zap(ncp);
> - CACHE_WUNLOCK();
> + if (blp != NULL)
> + error = cache_zap_rlocked_bucket(ncp, blp);
> + else
> + error = cache_zap_locked_vnode(ncp, dvp);
> + if (error != 0) {
> + zap_and_exit_bucket_fail++;
> + cache_maybe_yield();
> + goto retry;
> + }
> cache_free(ncp);
> return (0);
> }
>
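Note the new retry logic in the zap_and_exit path just above: where the
old code upgraded the global lock and tried again, a failed zap now simply
restarts the whole lookup, with cache_maybe_yield() periodically yielding
the CPU so the retry loop cannot hog it.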
> +struct celockstate {
> + struct mtx *vlp[3];
> + struct rwlock *blp[2];
> +};
> +CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
> +CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
> +
> +static inline void
> +cache_celockstate_init(struct celockstate *cel)
> +{
> +
> + bzero(cel, sizeof(*cel));
> +}
> +
> +static void
> +cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
> + struct vnode *dvp)
> +{
> + struct mtx *vlp1, *vlp2;
> +
> + MPASS(cel->vlp[0] == NULL);
> + MPASS(cel->vlp[1] == NULL);
> + MPASS(cel->vlp[2] == NULL);
> +
> + MPASS(vp != NULL || dvp != NULL);
> +
> + vlp1 = VP2VNODELOCK(vp);
> + vlp2 = VP2VNODELOCK(dvp);
> + cache_sort(&vlp1, &vlp2);
> +
> + if (vlp1 != NULL) {
> + mtx_lock(vlp1);
> + cel->vlp[0] = vlp1;
> + }
> + mtx_lock(vlp2);
> + cel->vlp[1] = vlp2;
> +}
> +
> +static void
> +cache_unlock_vnodes_cel(struct celockstate *cel)
> +{
> +
> + MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
> +
> + if (cel->vlp[0] != NULL)
> + mtx_unlock(cel->vlp[0]);
> + if (cel->vlp[1] != NULL)
> + mtx_unlock(cel->vlp[1]);
> + if (cel->vlp[2] != NULL)
> + mtx_unlock(cel->vlp[2]);
> +}
> +
> +static bool
> +cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
> +{
> + struct mtx *vlp;
> + bool ret;
> +
> + cache_assert_vlp_locked(cel->vlp[0]);
> + cache_assert_vlp_locked(cel->vlp[1]);
> + MPASS(cel->vlp[2] == NULL);
> +
> + vlp = VP2VNODELOCK(vp);
> + if (vlp == NULL)
> + return (true);
> +
> + ret = true;
> + if (vlp >= cel->vlp[1]) {
> + mtx_lock(vlp);
> + } else {
> + if (mtx_trylock(vlp))
> + goto out;
> + cache_lock_vnodes_cel_3_failures++;
> + cache_unlock_vnodes_cel(cel);
> + if (vlp < cel->vlp[0]) {
> + mtx_lock(vlp);
> + mtx_lock(cel->vlp[0]);
> + mtx_lock(cel->vlp[1]);
> + } else {
> + if (cel->vlp[0] != NULL)
> + mtx_lock(cel->vlp[0]);
> + mtx_lock(vlp);
> + mtx_lock(cel->vlp[1]);
> + }
> + ret = false;
> + }
> +out:
> + cel->vlp[2] = vlp;
> + return (ret);
> +}
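cache_lock_vnodes_cel_3() is the same back-off idea one level up: a third
stripe can be locked directly only if it sorts after the highest lock
already held; otherwise it is trylocked, and on failure everything is
dropped and retaken in address order, with the
cache_lock_vnodes_cel_3_failures counter recording how often that slow
path fires.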
> +
> +static void
> +cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
> + struct rwlock *blp2)
> +{
> +
> + MPASS(cel->blp[0] == NULL);
> + MPASS(cel->blp[1] == NULL);
> +
> + cache_sort(&blp1, &blp2);
> +
> + if (blp1 != NULL) {
> + rw_wlock(blp1);
> + cel->blp[0] = blp1;
> + }
> + rw_wlock(blp2);
> + cel->blp[1] = blp2;
> +}
> +
> +static void
> +cache_unlock_buckets_cel(struct celockstate *cel)
> +{
> +
> + if (cel->blp[0] != NULL)
> + rw_wunlock(cel->blp[0]);
> + rw_wunlock(cel->blp[1]);
> +}
> +
> +/*
> + * Lock part of the cache affected by the insertion.
> + *
> + * This means vnodelocks for dvp, vp and the relevant bucketlock.
> + * However, insertion can result in removal of an old entry. In this
> + * case we have an additional vnode and bucketlock pair to lock. If the
> + * entry is negative, ncelock is locked instead of the vnode.
> + *
> + * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
> + * preserving the locking order (smaller address first).
> + */
> +static void
> +cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
> + uint32_t hash)
> +{
> + struct namecache *ncp;
> + struct rwlock *blps[2];
> +
> + blps[0] = HASH2BUCKETLOCK(hash);
> + for (;;) {
> + blps[1] = NULL;
> + cache_lock_vnodes_cel(cel, dvp, vp);
> + if (vp == NULL || vp->v_type != VDIR)
> + break;
> + ncp = vp->v_cache_dd;
> + if (ncp == NULL)
> + break;
> + if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
> + break;
> + MPASS(ncp->nc_dvp == vp);
> + blps[1] = NCP2BUCKETLOCK(ncp);
> + if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
> + break;
> + /*
> + * All vnodes got re-locked. Re-validate the state and if
> + * nothing changed we are done. Otherwise restart.
> + */
> + if (ncp == vp->v_cache_dd &&
> + (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
> + blps[1] == NCP2BUCKETLOCK(ncp) &&
> + VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
> + break;
> + cache_unlock_vnodes_cel(cel);
> + cel->vlp[0] = NULL;
> + cel->vlp[1] = NULL;
> + cel->vlp[2] = NULL;
> + }
> + cache_lock_buckets_cel(cel, blps[0], blps[1]);
> +}
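cache_enter_lock() above ties the pieces together as the big comment
promises: take the dvp/vp stripes, peek at v_cache_dd, pull in the third
stripe through the back-off helper, and if the revalidation shows the ".."
entry changed while the locks were being juggled, unlock and go around
again. Only once the snapshot is stable are the bucketlocks taken.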
> +
> +static void
> +cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
> + uint32_t hash)
> +{
> + struct namecache *ncp;
> + struct rwlock *blps[2];
> +
> + blps[0] = HASH2BUCKETLOCK(hash);
>
> *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
>