git: 52b35140528c - main - swap_pager: examine swblks with pctrie iterators

From: Doug Moore <dougm_at_FreeBSD.org>
Date: Sun, 29 Sep 2024 16:47:27 UTC
The branch main has been updated by dougm:

URL: https://cgit.FreeBSD.org/src/commit/?id=52b35140528c3ada05cccbd9246d80d9148b9f38

commit 52b35140528c3ada05cccbd9246d80d9148b9f38
Author:     Doug Moore <dougm@FreeBSD.org>
AuthorDate: 2024-09-29 16:38:53 +0000
Commit:     Doug Moore <dougm@FreeBSD.org>
CommitDate: 2024-09-29 16:38:53 +0000

    swap_pager: examine swblks with pctrie iterators
    
    Replace calls to pctrie lookup and remove functions, which always
    begin with a search from the pctrie root, with the use of pctrie
    iterators to traverse and remove items from the pctrie of swapblks
    without duplicating searches.  Take care to reset iterators after
    reacquiring an object lock, since with the lock released they could be
    invalidated.
    
    Reviewed by:    alc, markj, kib
    Tested by:      pho (previous versions)
    Differential Revision:  https://reviews.freebsd.org/D46620
---
 sys/vm/swap_pager.c | 275 ++++++++++++++++++++++++++++------------------------
 1 file changed, 148 insertions(+), 127 deletions(-)

diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index fbf2967aaf17..3a8c7e0f5777 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -529,6 +529,13 @@ swblk_trie_free(struct pctrie *ptree, void *node)
 	uma_zfree(swpctrie_zone, node);
 }
 
+static int
+swblk_start(struct swblk *sb, vm_pindex_t pindex)
+{
+	return (sb == NULL || sb->p >= pindex ?
+	    0 : pindex % SWAP_META_PAGES);
+}
+
 PCTRIE_DEFINE(SWAP, swblk, p, swblk_trie_alloc, swblk_trie_free);
 
 static struct swblk *
@@ -538,50 +545,56 @@ swblk_lookup(vm_object_t object, vm_pindex_t pindex)
 	    rounddown(pindex, SWAP_META_PAGES)));
 }
 
-static struct swblk *
-swblk_start(vm_object_t object, vm_pindex_t pindex)
+static void
+swblk_lookup_remove(vm_object_t object, struct swblk *sb)
 {
-	return (SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
-	    rounddown(pindex, SWAP_META_PAGES)));
+	SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p);
 }
 
-static struct swblk *
-swblk_next(vm_object_t object, struct swblk *sb)
+static int
+swblk_lookup_insert(vm_object_t object, struct swblk *sb)
 {
-	return (swblk_start(object, sb->p + SWAP_META_PAGES));
+	return (SWAP_PCTRIE_INSERT(&object->un_pager.swp.swp_blks, sb));
 }
 
-static struct swblk *
-swblk_start_limit(vm_object_t object, vm_pindex_t pindex, vm_pindex_t limit)
+static bool
+swblk_is_empty(vm_object_t object)
 {
-	struct swblk *sb = swblk_start(object, pindex);
-	if (sb != NULL && sb->p < limit)
-		return (sb);
-	return (NULL);
+	return (pctrie_is_empty(&object->un_pager.swp.swp_blks));
 }
 
 static struct swblk *
-swblk_next_limit(vm_object_t object, struct swblk *sb, vm_pindex_t limit)
+swblk_iter(struct pctrie_iter *blks, vm_object_t object,
+    vm_pindex_t pindex)
 {
-	return (swblk_start_limit(object, sb->p + SWAP_META_PAGES, limit));
+	VM_OBJECT_ASSERT_LOCKED(object);
+	MPASS((object->flags & OBJ_SWAP) != 0);
+	pctrie_iter_init(blks, &object->un_pager.swp.swp_blks);
+	return (SWAP_PCTRIE_ITER_LOOKUP_GE(blks,
+	    rounddown(pindex, SWAP_META_PAGES)));
 }
 
-static void
-swblk_lookup_remove(vm_object_t object, struct swblk *sb)
+static struct swblk *
+swblk_iter_limit(struct pctrie_iter *blks, vm_object_t object,
+    vm_pindex_t pindex, vm_pindex_t limit)
 {
-	SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p);
+	VM_OBJECT_ASSERT_LOCKED(object);
+	MPASS((object->flags & OBJ_SWAP) != 0);
+	pctrie_iter_limit_init(blks, &object->un_pager.swp.swp_blks, limit);
+	return (SWAP_PCTRIE_ITER_LOOKUP_GE(blks,
+	    rounddown(pindex, SWAP_META_PAGES)));
 }
 
-static int
-swblk_lookup_insert(vm_object_t object, struct swblk *sb)
+static struct swblk *
+swblk_iter_next(struct pctrie_iter *blks)
 {
-	return (SWAP_PCTRIE_INSERT(&object->un_pager.swp.swp_blks, sb));
+	return (SWAP_PCTRIE_ITER_JUMP_GE(blks, SWAP_META_PAGES));
 }
 
-static bool
-swblk_is_empty(vm_object_t object)
+static void
+swblk_iter_remove(struct pctrie_iter *blks)
 {
-	return (pctrie_is_empty(&object->un_pager.swp.swp_blks));
+	SWAP_PCTRIE_ITER_REMOVE(blks);
 }
 
 /*
@@ -1827,6 +1840,7 @@ swp_pager_force_dirty(struct page_range *range, vm_page_t m, daddr_t *blk)
 u_long
 swap_pager_swapped_pages(vm_object_t object)
 {
+	struct pctrie_iter blks;
 	struct swblk *sb;
 	u_long res;
 	int i;
@@ -1837,8 +1851,8 @@ swap_pager_swapped_pages(vm_object_t object)
 		return (0);
 
 	res = 0;
-	for (sb = swblk_start(object, 0); sb != NULL;
-	    sb = swblk_next(object, sb)) {
+	for (sb = swblk_iter(&blks, object, 0); sb != NULL;
+	    sb = swblk_iter_next(&blks)) {
 		for (i = 0; i < SWAP_META_PAGES; i++) {
 			if (sb->d[i] != SWAPBLK_NONE)
 				res++;
@@ -1856,25 +1870,24 @@ swap_pager_swapped_pages(vm_object_t object)
 static void
 swap_pager_swapoff_object(struct swdevt *sp, vm_object_t object)
 {
-	struct pctrie_iter pages;
+	struct pctrie_iter blks, pages;
 	struct page_range range;
 	struct swblk *sb;
 	vm_page_t m;
-	vm_pindex_t pi;
-	daddr_t blk;
 	int i, rahead, rv;
 	bool sb_empty;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((object->flags & OBJ_SWAP) != 0,
 	    ("%s: Object not swappable", __func__));
+	KASSERT((sp->sw_flags & SW_CLOSING) != 0,
+	    ("%s: Device not blocking further allocations", __func__));
 
-	pi = 0;
-	i = 0;
 	vm_page_iter_init(&pages, object);
 	swp_pager_init_freerange(&range);
-	for (;;) {
-		if (i == 0 && (object->flags & OBJ_DEAD) != 0) {
+	sb = swblk_iter(&blks, object, 0);
+	while (sb != NULL) {
+		if ((object->flags & OBJ_DEAD) != 0) {
 			/*
 			 * Make sure that pending writes finish before
 			 * returning.
@@ -1883,78 +1896,83 @@ swap_pager_swapoff_object(struct swdevt *sp, vm_object_t object)
 			swp_pager_meta_free_all(object);
 			break;
 		}
+		sb_empty = true;
+		for (i = 0; i < SWAP_META_PAGES; i++) {
+			/* Skip an invalid block. */
+			if (sb->d[i] == SWAPBLK_NONE)
+				continue;
+			/* Skip a block not of this device. */
+			if (!swp_pager_isondev(sb->d[i], sp)) {
+				sb_empty = false;
+				continue;
+			}
 
-		if (i == SWAP_META_PAGES) {
-			pi = sb->p + SWAP_META_PAGES;
-			if (sb_empty) {
-				swblk_lookup_remove(object, sb);
-				uma_zfree(swblk_zone, sb);
+			/*
+			 * Look for a page corresponding to this block. If the
+			 * found page has pending operations, sleep and restart
+			 * the scan.
+			 */
+			m = vm_page_iter_lookup(&pages, blks.index + i);
+			if (m != NULL && (m->oflags & VPO_SWAPINPROG) != 0) {
+				m->oflags |= VPO_SWAPSLEEP;
+				VM_OBJECT_SLEEP(object, &object->handle, PSWP,
+				    "swpoff", 0);
+				break;
 			}
-			i = 0;
-		}
 
-		if (i == 0) {
-			sb = swblk_start(object, pi);
-			if (sb == NULL)
+			/*
+			 * If the found page is valid, mark it dirty and free
+			 * the swap block.
+			 */
+			if (m != NULL && vm_page_all_valid(m)) {
+				swp_pager_force_dirty(&range, m, &sb->d[i]);
+				continue;
+			}
+			/* Is there a page we can acquire or allocate? */
+			if (m != NULL) {
+				if (!vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL))
+					break;
+			} else if ((m = vm_page_alloc(object, blks.index + i,
+			    VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL)) == NULL)
 				break;
-			sb_empty = true;
-		}
 
-		/* Skip an invalid block. */
-		blk = sb->d[i];
-		if (blk == SWAPBLK_NONE || !swp_pager_isondev(blk, sp)) {
-			if (blk != SWAPBLK_NONE)
-				sb_empty = false;
-			i++;
-			continue;
+			/* Get the page from swap, and restart the scan. */
+			vm_object_pip_add(object, 1);
+			rahead = SWAP_META_PAGES;
+			rv = swap_pager_getpages_locked(object, &m, 1,
+			    NULL, &rahead);
+			if (rv != VM_PAGER_OK)
+				panic("%s: read from swap failed: %d",
+				    __func__, rv);
+			VM_OBJECT_WLOCK(object);
+			vm_object_pip_wakeupn(object, 1);
+			KASSERT(vm_page_all_valid(m),
+			    ("%s: Page %p not all valid", __func__, m));
+			vm_page_xunbusy(m);
+			break;
 		}
-
-		/*
-		 * Look for a page corresponding to this block, If the found
-		 * page has pending operations, sleep and restart the scan.
-		 */
-		m = vm_page_iter_lookup(&pages, sb->p + i);
-		if (m != NULL && (m->oflags & VPO_SWAPINPROG) != 0) {
-			m->oflags |= VPO_SWAPSLEEP;
-			VM_OBJECT_SLEEP(object, &object->handle, PSWP, "swpoff",
-			    0);
-			i = 0;	/* Restart scan after object lock dropped. */
+		if (i < SWAP_META_PAGES) {
+			/*
+			 * With the object lock released and regained, the
+			 * swapblk could have been freed, so reset the pages
+			 * iterator and search again for the first swblk at or
+			 * after blks.index.
+			 */
+			pctrie_iter_reset(&pages);
+			sb = swblk_iter(&blks, object, blks.index);
 			continue;
 		}
+		if (sb_empty) {
+			swblk_iter_remove(&blks);
+			uma_zfree(swblk_zone, sb);
+		}
 
 		/*
-		 * If the found page is valid, mark it dirty and free the swap
-		 * block.
+		 * It is safe to advance to the next block.  No allocations
+		 * before blk.index have happened, even with the lock released,
+		 * because allocations on this device are blocked.
 		 */
-		if (m != NULL && vm_page_all_valid(m)) {
-			swp_pager_force_dirty(&range, m, &sb->d[i]);
-			i++;
-			continue;
-		}
-
-		/* Is there a page we can acquire or allocate? */
-		if (m == NULL) {
-			m = vm_page_alloc(object, sb->p + i,
-			    VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL);
-		} else if (!vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL))
-			m = NULL;
-
-		/* If no page available, repeat this iteration. */
-		if (m == NULL)
-			continue;
-
-		/* Get the page from swap, mark it dirty, restart the scan. */
-		vm_object_pip_add(object, 1);
-		rahead = SWAP_META_PAGES;
-		rv = swap_pager_getpages_locked(object, &m, 1, NULL, &rahead);
-		if (rv != VM_PAGER_OK)
-			panic("%s: read from swap failed: %d", __func__, rv);
-		VM_OBJECT_WLOCK(object);
-		vm_object_pip_wakeupn(object, 1);
-		KASSERT(vm_page_all_valid(m),
-		    ("%s: Page %p not all valid", __func__, m));
-		vm_page_xunbusy(m);
-		i = 0;	/* Restart scan after object lock dropped. */
+		sb = swblk_iter_next(&blks);
 	}
 	swp_pager_freeswapspace(&range);
 }
@@ -2195,10 +2213,11 @@ static void
 swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject,
     vm_pindex_t pindex, vm_pindex_t count)
 {
+	struct pctrie_iter blks;
 	struct page_range range;
 	struct swblk *sb;
 	daddr_t blk, d[SWAP_META_PAGES];
-	vm_pindex_t offset, last;
+	vm_pindex_t last;
 	int d_mask, i, limit, start;
 	_Static_assert(8 * sizeof(d_mask) >= SWAP_META_PAGES,
 	    "d_mask not big enough");
@@ -2211,20 +2230,16 @@ swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject,
 
 	swp_pager_init_freerange(&range);
 	d_mask = 0;
-	offset = pindex;
 	last = pindex + count;
-	sb = swblk_start_limit(srcobject, pindex, last);
-	start = (sb != NULL && sb->p < pindex) ? pindex - sb->p : 0;
-	for (; sb != NULL;
-	    sb = swblk_start_limit(srcobject, pindex, last), start = 0) {
-		pindex = sb->p;
-		MPASS(d_mask == 0);
-		limit = MIN(last - pindex, SWAP_META_PAGES);
+	for (sb = swblk_iter_limit(&blks, srcobject, pindex, last),
+	    start = swblk_start(sb, pindex);
+	    sb != NULL; sb = swblk_iter_next(&blks), start = 0) {
+		limit = MIN(last - blks.index, SWAP_META_PAGES);
 		for (i = start; i < limit; i++) {
 			if (sb->d[i] == SWAPBLK_NONE)
 				continue;
 			blk = swp_pager_meta_build(dstobject,
-			    pindex + i - offset, sb->d[i], true);
+			    blks.index + i - pindex, sb->d[i], true);
 			if (blk == sb->d[i]) {
 				/*
 				 * Failed memory allocation stopped transfer;
@@ -2241,7 +2256,7 @@ swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject,
 		}
 		if (swp_pager_swblk_empty(sb, 0, start) &&
 		    swp_pager_swblk_empty(sb, limit, SWAP_META_PAGES)) {
-			swblk_lookup_remove(srcobject, sb);
+			swblk_iter_remove(&blks);
 			uma_zfree(swblk_zone, sb);
 		}
 		if (d_mask != 0) {
@@ -2250,12 +2265,17 @@ swp_pager_meta_transfer(vm_object_t srcobject, vm_object_t dstobject,
 			do {
 				i = ffs(d_mask) - 1;
 				swp_pager_meta_build(dstobject,
-				    pindex + i - offset, d[i], false);
+				    blks.index + i - pindex, d[i], false);
 				d_mask &= ~(1 << i);
 			} while (d_mask != 0);
 			VM_OBJECT_WLOCK(srcobject);
+
+			/*
+			 * While the lock was not held, the iterator path could
+			 * have become stale, so discard it.
+			 */
+			pctrie_iter_reset(&blks);
 		}
-		pindex += SWAP_META_PAGES;
 	}
 	swp_pager_freeswapspace(&range);
 }
@@ -2271,7 +2291,7 @@ static void
 swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count,
     vm_size_t *freed)
 {
-	struct pctrie_iter pages;
+	struct pctrie_iter blks, pages;
 	struct page_range range;
 	struct swblk *sb;
 	vm_page_t m;
@@ -2288,26 +2308,24 @@ swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count,
 	swp_pager_init_freerange(&range);
 	vm_page_iter_init(&pages, object);
 	last = pindex + count;
-	sb = swblk_start_limit(object, pindex, last);
-	start = (sb != NULL && sb->p < pindex) ? pindex - sb->p : 0;
-	for (; sb != NULL;
-	    sb = swblk_start_limit(object, pindex, last), start = 0) {
-		limit = MIN(last - sb->p, SWAP_META_PAGES);
+	for (sb = swblk_iter_limit(&blks, object, pindex, last),
+	    start = swblk_start(sb, pindex);
+	    sb != NULL; sb = swblk_iter_next(&blks), start = 0) {
+		limit = MIN(last - blks.index, SWAP_META_PAGES);
 		for (i = start; i < limit; i++) {
 			if (sb->d[i] == SWAPBLK_NONE)
 				continue;
 			swp_pager_update_freerange(&range, sb->d[i]);
 			if (freed != NULL) {
-				m = vm_page_iter_lookup(&pages, sb->p + i);
+				m = vm_page_iter_lookup(&pages, blks.index + i);
 				if (m == NULL || vm_page_none_valid(m))
 					fc++;
 			}
 			sb->d[i] = SWAPBLK_NONE;
 		}
-		pindex = sb->p + SWAP_META_PAGES;
 		if (swp_pager_swblk_empty(sb, 0, start) &&
 		    swp_pager_swblk_empty(sb, limit, SWAP_META_PAGES)) {
-			swblk_lookup_remove(object, sb);
+			swblk_iter_remove(&blks);
 			uma_zfree(swblk_zone, sb);
 		}
 	}
@@ -2387,22 +2405,23 @@ swp_pager_meta_lookup(vm_object_t object, vm_pindex_t pindex)
 vm_pindex_t
 swap_pager_find_least(vm_object_t object, vm_pindex_t pindex)
 {
+	struct pctrie_iter blks;
 	struct swblk *sb;
 	int i;
 
-	if ((sb = swblk_start(object, pindex)) == NULL)
+	if ((sb = swblk_iter(&blks, object, pindex)) == NULL)
 		return (OBJ_MAX_SIZE);
-	if (sb->p < pindex) {
+	if (blks.index < pindex) {
 		for (i = pindex % SWAP_META_PAGES; i < SWAP_META_PAGES; i++) {
 			if (sb->d[i] != SWAPBLK_NONE)
-				return (sb->p + i);
+				return (blks.index + i);
 		}
-		if ((sb = swblk_next(object, sb)) == NULL)
+		if ((sb = swblk_iter_next(&blks)) == NULL)
 			return (OBJ_MAX_SIZE);
 	}
 	for (i = 0; i < SWAP_META_PAGES; i++) {
 		if (sb->d[i] != SWAPBLK_NONE)
-			return (sb->p + i);
+			return (blks.index + i);
 	}
 
 	/*
@@ -2840,13 +2859,14 @@ SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD | CTLFLAG_MPSAFE,
 long
 vmspace_swap_count(struct vmspace *vmspace)
 {
+	struct pctrie_iter blks;
 	vm_map_t map;
 	vm_map_entry_t cur;
 	vm_object_t object;
 	struct swblk *sb;
 	vm_pindex_t e, pi;
 	long count;
-	int i;
+	int i, limit, start;
 
 	map = &vmspace->vm_map;
 	count = 0;
@@ -2862,11 +2882,12 @@ vmspace_swap_count(struct vmspace *vmspace)
 			goto unlock;
 		pi = OFF_TO_IDX(cur->offset);
 		e = pi + OFF_TO_IDX(cur->end - cur->start);
-		for (sb = swblk_start_limit(object, pi, e);
-		    sb != NULL; sb = swblk_next_limit(object, sb, e)) {
-			for (i = 0; i < SWAP_META_PAGES; i++) {
-				if (sb->p + i < e &&
-				    sb->d[i] != SWAPBLK_NONE)
+		for (sb = swblk_iter_limit(&blks, object, pi, e),
+		    start = swblk_start(sb, pi);
+		    sb != NULL; sb = swblk_iter_next(&blks), start = 0) {
+			limit = MIN(e - blks.index, SWAP_META_PAGES);
+			for (i = start; i < limit; i++) {
+				if (sb->d[i] != SWAPBLK_NONE)
 					count++;
 			}
 		}