git: e2414d91d33f - main - vfs_subr: maintain sorted tailq

From: Doug Moore <dougm_at_FreeBSD.org>
Date: Tue, 22 Oct 2024 21:58:42 UTC
The branch main has been updated by dougm:

URL: https://cgit.FreeBSD.org/src/commit/?id=e2414d91d33f31d6f2c9f49eef7a1553b5798c9e

commit e2414d91d33f31d6f2c9f49eef7a1553b5798c9e
Author:     Doug Moore <dougm@FreeBSD.org>
AuthorDate: 2024-10-22 21:54:34 +0000
Commit:     Doug Moore <dougm@FreeBSD.org>
CommitDate: 2024-10-22 21:54:34 +0000

    vfs_subr: maintain sorted tailq
    
    Pctries are based on unsigned index values. Type daddr_t is
    signed. Using daddr_t as an index type for a pctrie works, except that
    the pctrie considers negative values greater than nonnegative
    ones. Building a sorted tailq of bufs, based on pctrie results, sorts
    negative daddr_ts larger than nonnegative ones, and makes code that
    depends on the tailq being actually sorted broken.
    
    Write wrappers for the functions that do pctrie operations that depend
    on index ordering that fix the order problem, and use them in place of
    direct pctrie operations.
    
    PR:             282134
    Reported by:    pho
    Reviewed by:    kib, markj
    Tested by:      pho
    Fixes: 2c8caa4b3925aa7335 vfs_subr: optimize inval_buf_range
    Differential Revision:  https://reviews.freebsd.org/D47200
---
 sys/kern/vfs_subr.c | 56 +++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 44 insertions(+), 12 deletions(-)

diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index ff18c50546dd..3b00fdbe93b4 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -537,6 +537,42 @@ buf_trie_free(struct pctrie *ptree, void *node)
 PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free,
     buf_trie_smr);
 
+/*
+ * Lookup the next element greater than or equal to lblkno, accounting for the
+ * fact that, for pctries, negative values are greater than nonnegative ones.
+ */
+static struct buf *
+buf_lookup_ge(struct bufv *bv, daddr_t lblkno)
+{
+	struct buf *bp;
+
+	bp = BUF_PCTRIE_LOOKUP_GE(&bv->bv_root, lblkno);
+	if (bp == NULL && lblkno < 0)
+		bp = BUF_PCTRIE_LOOKUP_GE(&bv->bv_root, 0);
+	if (bp != NULL && bp->b_lblkno < lblkno)
+		bp = NULL;
+	return (bp);
+}
+
+/*
+ * Insert bp, and find the next element smaller than bp, accounting for the fact
+ * that, for pctries, negative values are greater than nonnegative ones.
+ */
+static int
+buf_insert_lookup_le(struct bufv *bv, struct buf *bp, struct buf **n)
+{
+	int error;
+
+	error = BUF_PCTRIE_INSERT_LOOKUP_LE(&bv->bv_root, bp, n);
+	if (error != EEXIST) {
+		if (*n == NULL && bp->b_lblkno >= 0)
+			*n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, ~0L);
+		if (*n != NULL && (*n)->b_lblkno >= bp->b_lblkno)
+			*n = NULL;
+	}
+	return (error);
+}
+
 /*
  * Initialize the vnode management data structures.
  *
@@ -2489,9 +2525,8 @@ bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn)
 
 	for (lblkno = startn;;) {
 again:
-		bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno);
-		if (bp == NULL || bp->b_lblkno >= endn ||
-		    bp->b_lblkno < startn)
+		bp = buf_lookup_ge(bufv, lblkno);
+		if (bp == NULL || bp->b_lblkno >= endn)
 			break;
 		error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
 		    LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0);
@@ -2628,9 +2663,8 @@ v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
 	clean = true;
 	do {
 		bv = clean ? &bo->bo_clean : &bo->bo_dirty;
-		bp = BUF_PCTRIE_LOOKUP_GE(&bv->bv_root, startlbn);
-		if (bp == NULL || bp->b_lblkno >= endlbn ||
-		    bp->b_lblkno < startlbn)
+		bp = buf_lookup_ge(bv, startlbn);
+		if (bp == NULL)
 			continue;
 		TAILQ_FOREACH_FROM_SAFE(bp, &bv->bv_hd, b_bobufs, nbp) {
 			if (bp->b_lblkno >= endlbn)
@@ -2709,13 +2743,13 @@ buf_vlist_find_or_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
 	else
 		bv = &bo->bo_clean;
 
-	error = BUF_PCTRIE_INSERT_LOOKUP_LE(&bv->bv_root, bp, &n);
+	error = buf_insert_lookup_le(bv, bp, &n);
 	if (n == NULL) {
 		KASSERT(error != EEXIST,
 		    ("buf_vlist_add: EEXIST but no existing buf found: bp %p",
 		    bp));
 	} else {
-		KASSERT((uint64_t)n->b_lblkno <= (uint64_t)bp->b_lblkno,
+		KASSERT(n->b_lblkno <= bp->b_lblkno,
 		    ("buf_vlist_add: out of order insert/lookup: bp %p n %p",
 		    bp, n));
 		KASSERT((n->b_lblkno == bp->b_lblkno) == (error == EEXIST),
@@ -2728,16 +2762,14 @@ buf_vlist_find_or_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
 	/* Keep the list ordered. */
 	if (n == NULL) {
 		KASSERT(TAILQ_EMPTY(&bv->bv_hd) ||
-		    (uint64_t)bp->b_lblkno <
-		    (uint64_t)TAILQ_FIRST(&bv->bv_hd)->b_lblkno,
+		    bp->b_lblkno < TAILQ_FIRST(&bv->bv_hd)->b_lblkno,
 		    ("buf_vlist_add: queue order: "
 		    "%p should be before first %p",
 		    bp, TAILQ_FIRST(&bv->bv_hd)));
 		TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
 	} else {
 		KASSERT(TAILQ_NEXT(n, b_bobufs) == NULL ||
-		    (uint64_t)bp->b_lblkno <
-		    (uint64_t)TAILQ_NEXT(n, b_bobufs)->b_lblkno,
+		    bp->b_lblkno < TAILQ_NEXT(n, b_bobufs)->b_lblkno,
 		    ("buf_vlist_add: queue order: "
 		    "%p should be before next %p",
 		    bp, TAILQ_NEXT(n, b_bobufs)));