git: d0b225d16418 - main - swap_pager: use iterators in swp_pager_meta_build

From: Doug Moore <dougm_at_FreeBSD.org>
Date: Tue, 08 Oct 2024 08:33:14 UTC
The branch main has been updated by dougm:

URL: https://cgit.FreeBSD.org/src/commit/?id=d0b225d16418e224b5d30d13f45515aa70ad47a3

commit d0b225d16418e224b5d30d13f45515aa70ad47a3
Author:     Doug Moore <dougm@FreeBSD.org>
AuthorDate: 2024-10-08 08:31:16 +0000
Commit:     Doug Moore <dougm@FreeBSD.org>
CommitDate: 2024-10-08 08:31:16 +0000

    swap_pager: use iterators in swp_pager_meta_build
    
    Add a method to use an iterator for pctrie insertion; this should
    improve performance when the last search ended near the place where
    the new item will be inserted.
    
    Add an iterator argument to swp_pager_meta_build, so that the lookups
    and insertions it does can be faster in the common case when keys are
    bunched close together, or appear in sequence.
    
    Reviewed by:    alc
    Differential Revision:  https://reviews.freebsd.org/D46848
---
 sys/kern/subr_pctrie.c |  36 +++++++++++++++++
 sys/sys/pctrie.h       |  20 +++++++++
 sys/vm/swap_pager.c    | 107 ++++++++++++++++++++++++++++++++-----------------
 3 files changed, 126 insertions(+), 37 deletions(-)

diff --git a/sys/kern/subr_pctrie.c b/sys/kern/subr_pctrie.c
index a7b487166054..50216287845f 100644
--- a/sys/kern/subr_pctrie.c
+++ b/sys/kern/subr_pctrie.c
@@ -594,6 +594,42 @@ pctrie_iter_lookup(struct pctrie_iter *it, uint64_t index)
 	return (_pctrie_iter_lookup(it, index, NULL, PCTRIE_LOCKED));
 }
 
+/*
+ * Insert the val in the trie, starting search with iterator.  Return a pointer
+ * to indicate where a new node must be allocated to complete insertion.
+ * Assumes access is externally synchronized by a lock.
+ */
+void *
+pctrie_iter_insert_lookup(struct pctrie_iter *it, uint64_t *val)
+{
+	struct pctrie_node *node;
+
+	it->index = *val;
+	node = _pctrie_iter_lookup_node(it, *val, NULL, PCTRIE_LOCKED);
+	if (node == PCTRIE_NULL) {
+		if (it->top == 0)
+			pctrie_root_store(it->ptree,
+			    pctrie_toleaf(val), PCTRIE_LOCKED);
+		else
+			pctrie_addnode(it->path[it->top - 1], it->index,
+			    pctrie_toleaf(val), PCTRIE_LOCKED);
+		return (NULL);
+	}
+	if (__predict_false(pctrie_match_value(node, it->index) != NULL))
+		panic("%s: key %jx is already present", __func__,
+		    (uintmax_t)it->index);
+
+	/*
+	 * 'node' must be replaced in the tree with a new branch node, with
+	 * children 'node' and 'val'. Return the place that points to 'node'
+	 * now, and will point to the new branching node later.
+	 */
+	if (it->top == 0)
+		return ((smr_pctnode_t *)&it->ptree->pt_root);
+	node = it->path[it->top - 1];
+	return (&node->pn_child[pctrie_slot(node, it->index)]);
+}
+
 /*
  * Returns the value stored at a fixed offset from the current index value,
  * possibly NULL.
diff --git a/sys/sys/pctrie.h b/sys/sys/pctrie.h
index 4d5450c25079..d71290c8cf23 100644
--- a/sys/sys/pctrie.h
+++ b/sys/sys/pctrie.h
@@ -239,6 +239,24 @@ name##_PCTRIE_RECLAIM_CALLBACK(struct pctrie *ptree,			\
 		freefn(ptree, freenode);				\
 }									\
 									\
+static __inline __unused int						\
+name##_PCTRIE_ITER_INSERT(struct pctrie_iter *it, struct type *ptr)	\
+{									\
+	struct pctrie_node *parent;					\
+	void *parentp;							\
+	uint64_t *val = name##_PCTRIE_PTR2VAL(ptr);			\
+									\
+	parentp = pctrie_iter_insert_lookup(it, val);			\
+	if (parentp == NULL)						\
+		return (0);						\
+	parent = allocfn(it->ptree);					\
+	if (__predict_false(parent == NULL))				\
+		return (ENOMEM);					\
+	pctrie_insert_node(parentp, parent, val);			\
+	it->path[it->top++] = parent;					\
+	return (0);							\
+}									\
+									\
 static __inline __unused struct type *					\
 name##_PCTRIE_ITER_LOOKUP(struct pctrie_iter *it, uint64_t index)	\
 {									\
@@ -369,6 +387,8 @@ uint64_t	*pctrie_iter_lookup(struct pctrie_iter *it, uint64_t index);
 uint64_t	*pctrie_iter_stride(struct pctrie_iter *it, int stride);
 uint64_t	*pctrie_iter_next(struct pctrie_iter *it);
 uint64_t	*pctrie_iter_prev(struct pctrie_iter *it);
+void		*pctrie_iter_insert_lookup(struct pctrie_iter *it,
+		    uint64_t *val);
 uint64_t	*pctrie_lookup_ge(struct pctrie *ptree, uint64_t key);
 uint64_t	*pctrie_subtree_lookup_gt(struct pctrie_node *node,
 		    uint64_t key);
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index 807215647a93..f4db46a32dee 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -486,8 +486,8 @@ static daddr_t	swp_pager_getswapspace(int *npages);
 /*
  * Metadata functions
  */
-static daddr_t swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t,
-	bool);
+static daddr_t swp_pager_meta_build(struct pctrie_iter *, vm_object_t object,
+	vm_pindex_t, daddr_t, bool);
 static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t,
     vm_size_t *);
 static void swp_pager_meta_transfer(vm_object_t src, vm_object_t dst,
@@ -551,29 +551,39 @@ swblk_lookup_remove(vm_object_t object, struct swblk *sb)
 	SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p);
 }
 
-static int
-swblk_lookup_insert(vm_object_t object, struct swblk *sb)
-{
-	return (SWAP_PCTRIE_INSERT(&object->un_pager.swp.swp_blks, sb));
-}
-
 static bool
 swblk_is_empty(vm_object_t object)
 {
 	return (pctrie_is_empty(&object->un_pager.swp.swp_blks));
 }
 
-static struct swblk *
-swblk_iter_init(struct pctrie_iter *blks, vm_object_t object,
-    vm_pindex_t pindex)
+static void
+swblk_iter_init_only(struct pctrie_iter *blks, vm_object_t object)
 {
 	VM_OBJECT_ASSERT_LOCKED(object);
 	MPASS((object->flags & OBJ_SWAP) != 0);
 	pctrie_iter_init(blks, &object->un_pager.swp.swp_blks);
+}
+
+
+static struct swblk *
+swblk_iter_init(struct pctrie_iter *blks, vm_object_t object,
+    vm_pindex_t pindex)
+{
+	swblk_iter_init_only(blks, object);
 	return (SWAP_PCTRIE_ITER_LOOKUP_GE(blks,
 	    rounddown(pindex, SWAP_META_PAGES)));
 }
 
+static struct swblk *
+swblk_iter_reinit(struct pctrie_iter *blks, vm_object_t object,
+    vm_pindex_t pindex)
+{
+	swblk_iter_init_only(blks, object);
+	return (SWAP_PCTRIE_ITER_LOOKUP(blks,
+	    rounddown(pindex, SWAP_META_PAGES)));
+}
+
 static struct swblk *
 swblk_iter_limit_init(struct pctrie_iter *blks, vm_object_t object,
     vm_pindex_t pindex, vm_pindex_t limit)
@@ -591,6 +601,19 @@ swblk_iter_next(struct pctrie_iter *blks)
 	return (SWAP_PCTRIE_ITER_JUMP_GE(blks, SWAP_META_PAGES));
 }
 
+static struct swblk *
+swblk_iter_lookup(struct pctrie_iter *blks, vm_pindex_t pindex)
+{
+	return (SWAP_PCTRIE_ITER_LOOKUP(blks,
+	    rounddown(pindex, SWAP_META_PAGES)));
+}
+
+static int
+swblk_iter_insert(struct pctrie_iter *blks, struct swblk *sb)
+{
+	return (SWAP_PCTRIE_ITER_INSERT(blks, sb));
+}
+
 static void
 swblk_iter_remove(struct pctrie_iter *blks)
 {
@@ -1081,6 +1104,7 @@ swap_pager_freespace_pgo(vm_object_t object, vm_pindex_t start, vm_size_t size)
 int
 swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_pindex_t size)
 {
+	struct pctrie_iter blks;
 	struct page_range range;
 	daddr_t addr, blk;
 	vm_pindex_t i, j;
@@ -1088,6 +1112,7 @@ swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_pindex_t size)
 
 	swp_pager_init_freerange(&range);
 	VM_OBJECT_WLOCK(object);
+	swblk_iter_init_only(&blks, object);
 	for (i = 0; i < size; i += n) {
 		n = MIN(size - i, INT_MAX);
 		blk = swp_pager_getswapspace(&n);
@@ -1097,7 +1122,7 @@ swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_pindex_t size)
 			return (-1);
 		}
 		for (j = 0; j < n; ++j) {
-			addr = swp_pager_meta_build(object,
+			addr = swp_pager_meta_build(&blks, object,
 			    start + i + j, blk + j, false);
 			if (addr != SWAPBLK_NONE)
 				swp_pager_update_freerange(&range, addr);
@@ -1535,6 +1560,7 @@ static void
 swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count,
     int flags, int *rtvals)
 {
+	struct pctrie_iter blks;
 	struct page_range range;
 	struct buf *bp;
 	daddr_t addr, blk;
@@ -1580,11 +1606,15 @@ swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count,
 			continue;
 		}
 		VM_OBJECT_WLOCK(object);
+		swblk_iter_init_only(&blks, object);
 		for (j = 0; j < n; ++j) {
 			mreq = ma[i + j];
 			vm_page_aflag_clear(mreq, PGA_SWAP_FREE);
-			addr = swp_pager_meta_build(mreq->object, mreq->pindex,
-			    blk + j, false);
+			KASSERT(mreq->object == object,
+			    ("%s: object mismatch %p/%p",
+			    __func__, mreq->object, object));
+			addr = swp_pager_meta_build(&blks, object,
+			    mreq->pindex, blk + j, false);
 			if (addr != SWAPBLK_NONE)
 				swp_pager_update_freerange(&range, addr);
 			MPASS(mreq->dirty == VM_PAGE_BITS_ALL);
@@ -2102,19 +2132,18 @@ swp_pager_free_empty_swblk(vm_object_t object, struct swblk *sb)
  *	any.
  */
 static daddr_t
-swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk,
-    bool nowait_noreplace)
+swp_pager_meta_build(struct pctrie_iter *blks, vm_object_t object,
+    vm_pindex_t pindex, daddr_t swapblk, bool nowait_noreplace)
 {
 	static volatile int swblk_zone_exhausted, swpctrie_zone_exhausted;
 	struct swblk *sb, *sb1;
-	vm_pindex_t modpi, rdpi;
+	vm_pindex_t modpi;
 	daddr_t prev_swapblk;
 	int error, i;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
-	rdpi = rounddown(pindex, SWAP_META_PAGES);
-	sb = swblk_lookup(object, rdpi);
+	sb = swblk_iter_lookup(blks, pindex);
 	if (sb == NULL) {
 		if (swapblk == SWAPBLK_NONE)
 			return (SWAPBLK_NONE);
@@ -2122,7 +2151,7 @@ swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk,
 			sb = uma_zalloc(swblk_zone, M_NOWAIT | (curproc ==
 			    pageproc ? M_USE_RESERVE : 0));
 			if (sb != NULL) {
-				sb->p = rdpi;
+				sb->p = rounddown(pindex, SWAP_META_PAGES);
 				for (i = 0; i < SWAP_META_PAGES; i++)
 					sb->d[i] = SWAPBLK_NONE;
 				if (atomic_cmpset_int(&swblk_zone_exhausted,
@@ -2143,17 +2172,17 @@ swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk,
 			} else
 				uma_zwait(swblk_zone);
 			VM_OBJECT_WLOCK(object);
-			sb = swblk_lookup(object, rdpi);
+			sb = swblk_iter_reinit(blks, object, pindex);
 			if (sb != NULL)
 				/*
 				 * Somebody swapped out a nearby page,
-				 * allocating swblk at the rdpi index,
+				 * allocating swblk at the pindex index,
 				 * while we dropped the object lock.
 				 */
 				goto allocated;
 		}
 		for (;;) {
-			error = swblk_lookup_insert(object, sb);
+			error = swblk_iter_insert(blks, sb);
 			if (error == 0) {
 				if (atomic_cmpset_int(&swpctrie_zone_exhausted,
 				    1, 0))
@@ -2175,7 +2204,7 @@ swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk,
 			} else
 				uma_zwait(swpctrie_zone);
 			VM_OBJECT_WLOCK(object);
-			sb1 = swblk_lookup(object, rdpi);
+			sb1 = swblk_iter_reinit(blks, object, pindex);
 			if (sb1 != NULL) {
 				uma_zfree(swblk_zone, sb);
 				sb = sb1;
@@ -2184,7 +2213,7 @@ swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk,
 		}
 	}
 allocated:
-	MPASS(sb->p == rdpi);
+	MPASS(sb->p == rounddown(pindex, SWAP_META_PAGES));
 
 	modpi = pindex % SWAP_META_PAGES;
 	/* Return prior contents of metadata. */
@@ -2196,8 +2225,11 @@ allocated:
 		/*
 		 * Free the swblk if we end up with the empty page run.
 		 */
-		if (swapblk == SWAPBLK_NONE)
-			swp_pager_free_empty_swblk(object, sb);
+		if (swapblk == SWAPBLK_NONE &&
+		    swp_pager_swblk_empty(sb, 0, SWAP_META_PAGES)) {
+			swblk_iter_remove(blks);
+			uma_zfree(swblk_zone, sb);
+		}
 	}
 	return (prev_swapblk);
 }
@@ -2213,7 +2245,7 @@ static void
 swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject,
     vm_pindex_t pindex, vm_pindex_t count)
 {
-	struct pctrie_iter blks;
+	struct pctrie_iter dstblks, srcblks;
 	struct page_range range;
 	struct swblk *sb;
 	daddr_t blk, d[SWAP_META_PAGES];
@@ -2231,15 +2263,16 @@ swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject,
 	swp_pager_init_freerange(&range);
 	d_mask = 0;
 	last = pindex + count;
-	for (sb = swblk_iter_limit_init(&blks, srcobject, pindex, last),
+	swblk_iter_init_only(&dstblks, dstobject);
+	for (sb = swblk_iter_limit_init(&srcblks, srcobject, pindex, last),
 	    start = swblk_start(sb, pindex);
-	    sb != NULL; sb = swblk_iter_next(&blks), start = 0) {
-		limit = MIN(last - blks.index, SWAP_META_PAGES);
+	    sb != NULL; sb = swblk_iter_next(&srcblks), start = 0) {
+		limit = MIN(last - srcblks.index, SWAP_META_PAGES);
 		for (i = start; i < limit; i++) {
 			if (sb->d[i] == SWAPBLK_NONE)
 				continue;
-			blk = swp_pager_meta_build(dstobject,
-			    blks.index + i - pindex, sb->d[i], true);
+			blk = swp_pager_meta_build(&dstblks, dstobject,
+			    srcblks.index + i - pindex, sb->d[i], true);
 			if (blk == sb->d[i]) {
 				/*
 				 * Failed memory allocation stopped transfer;
@@ -2256,7 +2289,7 @@ swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject,
 		}
 		if (swp_pager_swblk_empty(sb, 0, start) &&
 		    swp_pager_swblk_empty(sb, limit, SWAP_META_PAGES)) {
-			swblk_iter_remove(&blks);
+			swblk_iter_remove(&srcblks);
 			uma_zfree(swblk_zone, sb);
 		}
 		if (d_mask != 0) {
@@ -2264,8 +2297,8 @@ swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject,
 			VM_OBJECT_WUNLOCK(srcobject);
 			do {
 				i = ffs(d_mask) - 1;
-				swp_pager_meta_build(dstobject,
-				    blks.index + i - pindex, d[i], false);
+				swp_pager_meta_build(&dstblks, dstobject,
+				    srcblks.index + i - pindex, d[i], false);
 				d_mask &= ~(1 << i);
 			} while (d_mask != 0);
 			VM_OBJECT_WLOCK(srcobject);
@@ -2274,7 +2307,7 @@ swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject,
 			 * While the lock was not held, the iterator path could
 			 * have become stale, so discard it.
 			 */
-			pctrie_iter_reset(&blks);
+			pctrie_iter_reset(&srcblks);
 		}
 	}
 	swp_pager_freeswapspace(&range);