git: a521cee3322f - stable/13 - vfs: try harder to find free vnodes when recycling
Date: Sun, 27 Aug 2023 22:45:16 UTC
The branch stable/13 has been updated by mjg:

URL: https://cgit.FreeBSD.org/src/commit/?id=a521cee3322f979b1caade7ec000bf4f7509246b

commit a521cee3322f979b1caade7ec000bf4f7509246b
Author:     Mateusz Guzik <mjg@FreeBSD.org>
AuthorDate: 2023-08-24 05:34:08 +0000
Commit:     Mateusz Guzik <mjg@FreeBSD.org>
CommitDate: 2023-08-27 22:44:12 +0000

    vfs: try harder to find free vnodes when recycling

    The free vnode marker can slide past eligible entries.

    Artificially reducing the vnode limit to 300k and spawning 104 workers,
    each creating a million files, results in all of them trying to
    recycle, which often fails when it should not have to. Because of the
    excessive traffic in this scenario, the trylock to requeue is virtually
    guaranteed to fail, meaning nothing gets pushed forward. Since no
    vnodes were found, the most unfortunate sleep for 1 second is induced
    (see vn_alloc_hard, the "vlruwk" msleep).

    Without the fix the machine is mostly idle, with almost everyone stuck
    off CPU waiting for the sleep to finish. With the fix it is busy
    creating files.

    Unrelated to the above problem, the marker could have landed in a
    similarly problematic spot because of any failure in vtryrecycle.

    Originally reported as poudriere builders stalling in a
    vnode-count-restricted setup.

    Fixes: 138a5dafba31 ("vfs: trylock vnode requeue")
    Reported by: Mark Millard

    (cherry picked from commit c1d85ac3df82df721e3d33b292579c4de491488e)
---
 sys/kern/vfs_subr.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index c1c474b6724d..484ad75b243e 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -195,6 +195,10 @@ static counter_u64_t recycles_free_count;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count,
     "Number of free vnodes recycled to meet vnode cache targets");
 
+static counter_u64_t vnode_skipped_requeues;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnode_skipped_requeues, CTLFLAG_RD, &vnode_skipped_requeues,
+    "Number of times LRU requeue was skipped due to lock contention");
+
 static u_long deferred_inact;
 SYSCTL_ULONG(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD,
     &deferred_inact, 0, "Number of times inactive processing was deferred");
@@ -724,6 +728,7 @@ vntblinit(void *dummy __unused)
 	vnodes_created = counter_u64_alloc(M_WAITOK);
 	recycles_count = counter_u64_alloc(M_WAITOK);
 	recycles_free_count = counter_u64_alloc(M_WAITOK);
+	vnode_skipped_requeues = counter_u64_alloc(M_WAITOK);
 
 	/*
 	 * Initialize the filesystem syncer.
@@ -1268,11 +1273,13 @@ vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp)
 	struct vnode *vp;
 	struct mount *mp;
 	int ocount;
+	bool retried;
 
 	mtx_assert(&vnode_list_mtx, MA_OWNED);
 	if (count > max_vnlru_free)
 		count = max_vnlru_free;
 	ocount = count;
+	retried = false;
 	vp = mvp;
 	for (;;) {
 		if (count == 0) {
@@ -1280,6 +1287,24 @@ vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp)
 		}
 		vp = TAILQ_NEXT(vp, v_vnodelist);
 		if (__predict_false(vp == NULL)) {
+			/*
+			 * The free vnode marker can be past eligible vnodes:
+			 * 1. if vdbatch_process trylock failed
+			 * 2. if vtryrecycle failed
+			 *
+			 * If so, start the scan from scratch.
+			 */
+			if (!retried && vnlru_read_freevnodes() > 0) {
+				TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
+				TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist);
+				vp = mvp;
+				retried = true;
+				continue;
+			}
+
+			/*
+			 * Give up
+			 */
 			TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
 			TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist);
 			break;
@@ -3533,6 +3558,17 @@ vdbatch_process(struct vdbatch *vd)
 	MPASS(curthread->td_pinned > 0);
 	MPASS(vd->index == VDBATCH_SIZE);
 
+	/*
+	 * Attempt to requeue the passed batch, but give up easily.
+	 *
+	 * Despite batching the mechanism is prone to transient *significant*
+	 * lock contention, where vnode_list_mtx becomes the primary bottleneck
+	 * if multiple CPUs get here (one real-world example is highly parallel
+	 * do-nothing make, which will stat *tons* of vnodes). Since it is
+	 * quasi-LRU (read: not that great even if fully honoured) just dodge
+	 * the problem. Parties which don't like it are welcome to implement
+	 * something better.
+	 */
 	critical_enter();
 	if (mtx_trylock(&vnode_list_mtx)) {
 		for (i = 0; i < VDBATCH_SIZE; i++) {
@@ -3545,6 +3581,8 @@ vdbatch_process(struct vdbatch *vd)
 		}
 		mtx_unlock(&vnode_list_mtx);
 	} else {
+		counter_u64_add(vnode_skipped_requeues, 1);
+
 		for (i = 0; i < VDBATCH_SIZE; i++) {
 			vp = vd->tab[i];
 			vd->tab[i] = NULL;
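
A note on observability: the counter added above is exported read-only as
vfs.vnode_skipped_requeues (per the SYSCTL_COUNTER_U64 declaration in the
first hunk), so skipped requeues can be inspected with sysctl(8). Below is
a minimal userland sketch for reading it programmatically; it assumes only
the standard sysctlbyname(3) interface and a kernel that includes this
change:

/*
 * Minimal sketch: read vfs.vnode_skipped_requeues from userland.
 * On a kernel without the change above, sysctlbyname(3) fails with
 * ENOENT.
 */
#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t skipped;
	size_t len;

	len = sizeof(skipped);
	if (sysctlbyname("vfs.vnode_skipped_requeues", &skipped, &len,
	    NULL, 0) == -1)
		err(1, "sysctlbyname");
	printf("skipped LRU requeues: %ju\n", (uintmax_t)skipped);
	return (0);
}

A value that keeps growing under load would indicate vdbatch_process is
losing the mtx_trylock on vnode_list_mtx and taking the new skip path.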