git: 206dd9d1a82d - stable/14 - vfs: drop one vnode list lock trip during vnlru free recycle

From: Mateusz Guzik <mjg_at_FreeBSD.org>
Date: Fri, 22 Sep 2023 21:51:14 UTC
The branch stable/14 has been updated by mjg:

URL: https://cgit.FreeBSD.org/src/commit/?id=206dd9d1a82df140a6071545a2dc558e8d9f5ad0

commit 206dd9d1a82df140a6071545a2dc558e8d9f5ad0
Author:     Mateusz Guzik <mjg@FreeBSD.org>
AuthorDate: 2023-09-14 14:35:40 +0000
Commit:     Mateusz Guzik <mjg@FreeBSD.org>
CommitDate: 2023-09-22 21:50:46 +0000

    vfs: drop one vnode list lock trip during vnlru free recycle
    
    vnlru_free_impl would take the lock prior to returning even though most
    frequent caller does not need it.
    
    Unsurprisingly vnode_list mtx is the primary bottleneck when recycling
    and avoiding the useless lock trip helps.
    
    Setting maxvnodes to 400000 and running 20 parallel finds each with a
    dedicated directory tree of 1 million vnodes in total:
    before: 4.50s user 1225.71s system 1979% cpu 1:02.14 total
    after:  4.20s user 806.23s system 1973% cpu 41.059 total
    
    That's 34% reduction in total real time.
    
    With this the block *remains* the primary bottleneck when running on
    ZFS.
    
    (cherry picked from commit 74be676d87745eb727642f6f8329236c848929d5)
---
 sys/kern/vfs_subr.c | 43 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 35 insertions(+), 8 deletions(-)

diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 1c6827ba0587..80ec15f78028 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -1290,13 +1290,14 @@ vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp)
 	mtx_assert(&vnode_list_mtx, MA_OWNED);
 	if (count > max_vnlru_free)
 		count = max_vnlru_free;
+	if (count == 0) {
+		mtx_unlock(&vnode_list_mtx);
+		return (0);
+	}
 	ocount = count;
 	retried = false;
 	vp = mvp;
 	for (;;) {
-		if (count == 0) {
-			break;
-		}
 		vp = TAILQ_NEXT(vp, v_vnodelist);
 		if (__predict_false(vp == NULL)) {
 			/*
@@ -1319,6 +1320,7 @@ vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp)
 			 */
 			TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 			TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist);
+			mtx_unlock(&vnode_list_mtx);
 			break;
 		}
 		if (__predict_false(vp->v_type == VMARKER))
@@ -1366,18 +1368,28 @@ vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp)
 		 */
 		vtryrecycle(vp);
 		count--;
+		if (count == 0) {
+			break;
+		}
 		mtx_lock(&vnode_list_mtx);
 		vp = mvp;
 	}
+	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
 	return (ocount - count);
 }
 
+/*
+ * XXX: returns without vnode_list_mtx locked!
+ */
 static int
 vnlru_free_locked(int count)
 {
+	int ret;
 
 	mtx_assert(&vnode_list_mtx, MA_OWNED);
-	return (vnlru_free_impl(count, NULL, vnode_list_free_marker));
+	ret = vnlru_free_impl(count, NULL, vnode_list_free_marker);
+	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
+	return (ret);
 }
 
 void
@@ -1389,7 +1401,7 @@ vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp)
 	VNPASS(mvp->v_type == VMARKER, mvp);
 	mtx_lock(&vnode_list_mtx);
 	vnlru_free_impl(count, mnt_op, mvp);
-	mtx_unlock(&vnode_list_mtx);
+	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
 }
 
 struct vnode *
@@ -1534,7 +1546,7 @@ vnlru_under_unlocked(u_long rnumvnodes, u_long limit)
 }
 
 static void
-vnlru_kick(void)
+vnlru_kick_locked(void)
 {
 
 	mtx_assert(&vnode_list_mtx, MA_OWNED);
@@ -1544,6 +1556,15 @@ vnlru_kick(void)
 	}
 }
 
+static void
+vnlru_kick(void)
+{
+
+	mtx_lock(&vnode_list_mtx);
+	vnlru_kick_locked();
+	mtx_unlock(&vnode_list_mtx);
+}
+
 static void
 vnlru_proc(void)
 {
@@ -1574,6 +1595,7 @@ vnlru_proc(void)
 		 */
 		if (rnumvnodes > desiredvnodes) {
 			vnlru_free_locked(rnumvnodes - desiredvnodes);
+			mtx_lock(&vnode_list_mtx);
 			rnumvnodes = atomic_load_long(&numvnodes);
 		}
 		/*
@@ -1751,6 +1773,7 @@ vn_alloc_hard(struct mount *mp)
 	rnumvnodes = atomic_load_long(&numvnodes);
 	if (rnumvnodes + 1 < desiredvnodes) {
 		vn_alloc_cyclecount = 0;
+		mtx_unlock(&vnode_list_mtx);
 		goto alloc;
 	}
 	rfreevnodes = vnlru_read_freevnodes();
@@ -1770,22 +1793,26 @@ vn_alloc_hard(struct mount *mp)
 	 */
 	if (vnlru_free_locked(1) > 0)
 		goto alloc;
+	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
 	if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
 		/*
 		 * Wait for space for a new vnode.
 		 */
-		vnlru_kick();
+		mtx_lock(&vnode_list_mtx);
+		vnlru_kick_locked();
 		vn_alloc_sleeps++;
 		msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz);
 		if (atomic_load_long(&numvnodes) + 1 > desiredvnodes &&
 		    vnlru_read_freevnodes() > 1)
 			vnlru_free_locked(1);
+		else
+			mtx_unlock(&vnode_list_mtx);
 	}
 alloc:
+	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
 	rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
 	if (vnlru_under(rnumvnodes, vlowat))
 		vnlru_kick();
-	mtx_unlock(&vnode_list_mtx);
 	return (uma_zalloc_smr(vnode_zone, M_WAITOK));
 }