git: 5b78ff830791 - main - vm_page: remove pages with iterators

From: Doug Moore <dougm_at_FreeBSD.org>
Date: Wed, 20 Nov 2024 17:56:25 UTC
The branch main has been updated by dougm:

URL: https://cgit.FreeBSD.org/src/commit/?id=5b78ff830791633c02a3d906b2c8f5c9b3bb1a91

commit 5b78ff830791633c02a3d906b2c8f5c9b3bb1a91
Author:     Doug Moore <dougm@FreeBSD.org>
AuthorDate: 2024-11-20 17:54:20 +0000
Commit:     Doug Moore <dougm@FreeBSD.org>
CommitDate: 2024-11-20 17:54:20 +0000

    vm_page: remove pages with iterators
    
    Use pctrie iterators for removing some page sequences from radix
    trees, to avoid repeated searches from the tree root.
    
    Rename vm_page_object_remove to vm_page_remove_radixdone, and remove
    from it the responsibility for removing a page from its radix tree,
    and pass that responsibility on to its callers.
    
    For one of those callers, vm_page_rename, pass a pages pctrie_iter,
    rather than a page, and use the iterator to remove the page from its
    radix tree.
    
    Define functions vm_page_iter_remove() and vm_page_iter_free() that
    are like vm_page_remove() and vm_page_free(), respectively, except
    that they take an iterator as parameter rather than a page, and use
    the iterator to remove the page from the radix tree instead of
    searching the radix tree. Function vm_page_iter_free() assumes that
    the page is associated with an object, and calls
    vm_page_free_object_prep to do the part of vm_page_free_prep that is
    object-related.
    
    In functions vm_object_split and vm_object_collapse_scan, use a
    pctrie_iter to walk over the pages of the object, and use
    vm_page_rename and vm_radix_iter_remove to modify the radix tree without
    searching for pages.  In vm_object_page_remove and _kmem_unback, use a
    pctrie_iter and vm_page_iter_free to remove the page from the radix
    tree.
    
    Reviewed by:    markj (previous version)
    Tested by:      pho
    Differential Revision:  https://reviews.freebsd.org/D46724
---
 sys/vm/vm_kern.c   |  12 +++---
 sys/vm/vm_object.c |  72 +++++++++++++++----------------
 sys/vm/vm_page.c   | 124 ++++++++++++++++++++++++++++++++++++++++-------------
 sys/vm/vm_page.h   |   4 +-
 4 files changed, 141 insertions(+), 71 deletions(-)

diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c
index 22776e2196b0..6343fb66cfa3 100644
--- a/sys/vm/vm_kern.c
+++ b/sys/vm/vm_kern.c
@@ -634,8 +634,9 @@ kmem_back(vm_object_t object, vm_offset_t addr, vm_size_t size, int flags)
 static struct vmem *
 _kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size)
 {
+	struct pctrie_iter pages;
 	struct vmem *arena;
-	vm_page_t m, next;
+	vm_page_t m;
 	vm_offset_t end, offset;
 	int domain;
 
@@ -648,17 +649,18 @@ _kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size)
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
 	end = offset + size;
 	VM_OBJECT_WLOCK(object);
-	m = vm_page_lookup(object, atop(offset)); 
+	vm_page_iter_init(&pages, object);
+	m = vm_page_iter_lookup(&pages, atop(offset)); 
 	domain = vm_page_domain(m);
 	if (__predict_true((m->oflags & VPO_KMEM_EXEC) == 0))
 		arena = vm_dom[domain].vmd_kernel_arena;
 	else
 		arena = vm_dom[domain].vmd_kernel_rwx_arena;
-	for (; offset < end; offset += PAGE_SIZE, m = next) {
-		next = vm_page_next(m);
+	for (; offset < end; offset += PAGE_SIZE,
+	    m = vm_page_iter_lookup(&pages, atop(offset))) {
 		vm_page_xbusy_claim(m);
 		vm_page_unwire_noq(m);
-		vm_page_free(m);
+		vm_page_iter_free(&pages);
 	}
 	VM_OBJECT_WUNLOCK(object);
 
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index e6324647e29e..21773318cea0 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -1520,9 +1520,10 @@ vm_object_shadow(vm_object_t *object, vm_ooffset_t *offset, vm_size_t length,
 void
 vm_object_split(vm_map_entry_t entry)
 {
-	vm_page_t m, m_next;
+	struct pctrie_iter pages;
+	vm_page_t m;
 	vm_object_t orig_object, new_object, backing_object;
-	vm_pindex_t idx, offidxstart;
+	vm_pindex_t offidxstart;
 	vm_size_t size;
 
 	orig_object = entry->object.vm_object;
@@ -1573,17 +1574,11 @@ vm_object_split(vm_map_entry_t entry)
 	 * that the object is in transition.
 	 */
 	vm_object_set_flag(orig_object, OBJ_SPLIT);
-#ifdef INVARIANTS
-	idx = 0;
-#endif
+	vm_page_iter_limit_init(&pages, orig_object, offidxstart + size);
 retry:
-	m = vm_page_find_least(orig_object, offidxstart);
-	KASSERT(m == NULL || idx <= m->pindex - offidxstart,
-	    ("%s: object %p was repopulated", __func__, orig_object));
-	for (; m != NULL && (idx = m->pindex - offidxstart) < size;
-	    m = m_next) {
-		m_next = TAILQ_NEXT(m, listq);
-
+	pctrie_iter_reset(&pages);
+	for (m = vm_page_iter_lookup_ge(&pages, offidxstart); m != NULL;
+	    m = vm_radix_iter_step(&pages)) {
 		/*
 		 * We must wait for pending I/O to complete before we can
 		 * rename the page.
@@ -1604,13 +1599,13 @@ retry:
 		 * an incomplete fault.  Just remove and ignore.
 		 */
 		if (vm_page_none_valid(m)) {
-			if (vm_page_remove(m))
+			if (vm_page_iter_remove(&pages))
 				vm_page_free(m);
 			continue;
 		}
 
 		/* vm_page_rename() will dirty the page. */
-		if (vm_page_rename(m, new_object, idx)) {
+		if (vm_page_rename(&pages, new_object, m->pindex - offidxstart)) {
 			vm_page_xunbusy(m);
 			VM_OBJECT_WUNLOCK(new_object);
 			VM_OBJECT_WUNLOCK(orig_object);
@@ -1656,7 +1651,8 @@ retry:
 }
 
 static vm_page_t
-vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p)
+vm_object_collapse_scan_wait(struct pctrie_iter *pages, vm_object_t object,
+    vm_page_t p)
 {
 	vm_object_t backing_object;
 
@@ -1683,12 +1679,14 @@ vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p)
 		VM_OBJECT_WLOCK(object);
 	}
 	VM_OBJECT_WLOCK(backing_object);
-	return (TAILQ_FIRST(&backing_object->memq));
+	vm_page_iter_init(pages, backing_object);
+	return (vm_page_iter_lookup_ge(pages, 0));
 }
 
 static void
 vm_object_collapse_scan(vm_object_t object)
 {
+	struct pctrie_iter pages;
 	vm_object_t backing_object;
 	vm_page_t next, p, pp;
 	vm_pindex_t backing_offset_index, new_pindex;
@@ -1702,7 +1700,8 @@ vm_object_collapse_scan(vm_object_t object)
 	/*
 	 * Our scan
 	 */
-	for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) {
+	vm_page_iter_init(&pages, backing_object);
+	for (p = vm_page_iter_lookup_ge(&pages, 0); p != NULL; p = next) {
 		next = TAILQ_NEXT(p, listq);
 		new_pindex = p->pindex - backing_offset_index;
 
@@ -1710,7 +1709,7 @@ vm_object_collapse_scan(vm_object_t object)
 		 * Check for busy page
 		 */
 		if (vm_page_tryxbusy(p) == 0) {
-			next = vm_object_collapse_scan_wait(object, p);
+			next = vm_object_collapse_scan_wait(&pages, object, p);
 			continue;
 		}
 
@@ -1727,16 +1726,18 @@ vm_object_collapse_scan(vm_object_t object)
 
 			KASSERT(!pmap_page_is_mapped(p),
 			    ("freeing mapped page %p", p));
-			if (vm_page_remove(p))
+			if (vm_page_iter_remove(&pages))
 				vm_page_free(p);
+			next = vm_radix_iter_step(&pages);
 			continue;
 		}
 
 		if (!vm_page_all_valid(p)) {
 			KASSERT(!pmap_page_is_mapped(p),
 			    ("freeing mapped page %p", p));
-			if (vm_page_remove(p))
+			if (vm_page_iter_remove(&pages))
 				vm_page_free(p);
+			next = vm_radix_iter_step(&pages);
 			continue;
 		}
 
@@ -1749,7 +1750,7 @@ vm_object_collapse_scan(vm_object_t object)
 			 * busy bit owner, we can't tell whether it shadows the
 			 * original page.
 			 */
-			next = vm_object_collapse_scan_wait(object, pp);
+			next = vm_object_collapse_scan_wait(&pages, object, pp);
 			continue;
 		}
 
@@ -1775,10 +1776,11 @@ vm_object_collapse_scan(vm_object_t object)
 			vm_pager_freespace(backing_object, p->pindex, 1);
 			KASSERT(!pmap_page_is_mapped(p),
 			    ("freeing mapped page %p", p));
-			if (vm_page_remove(p))
-				vm_page_free(p);
 			if (pp != NULL)
 				vm_page_xunbusy(pp);
+			if (vm_page_iter_remove(&pages))
+				vm_page_free(p);
+			next = vm_radix_iter_step(&pages);
 			continue;
 		}
 
@@ -1789,9 +1791,10 @@ vm_object_collapse_scan(vm_object_t object)
 		 * If the page was mapped to a process, it can remain mapped
 		 * through the rename.  vm_page_rename() will dirty the page.
 		 */
-		if (vm_page_rename(p, object, new_pindex)) {
+		if (vm_page_rename(&pages, object, new_pindex)) {
 			vm_page_xunbusy(p);
-			next = vm_object_collapse_scan_wait(object, NULL);
+			next = vm_object_collapse_scan_wait(&pages, object,
+			    NULL);
 			continue;
 		}
 
@@ -1807,6 +1810,7 @@ vm_object_collapse_scan(vm_object_t object)
 		    backing_offset_index);
 #endif
 		vm_page_xunbusy(p);
+		next = vm_radix_iter_step(&pages);
 	}
 	return;
 }
@@ -1981,7 +1985,8 @@ void
 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
     int options)
 {
-	vm_page_t p, next;
+	struct pctrie_iter pages;
+	vm_page_t p;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((object->flags & OBJ_UNMANAGED) == 0 ||
@@ -1990,16 +1995,11 @@ vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
 	if (object->resident_page_count == 0)
 		return;
 	vm_object_pip_add(object, 1);
+	vm_page_iter_limit_init(&pages, object, end);
 again:
-	p = vm_page_find_least(object, start);
-
-	/*
-	 * Here, the variable "p" is either (1) the page with the least pindex
-	 * greater than or equal to the parameter "start" or (2) NULL. 
-	 */
-	for (; p != NULL && (p->pindex < end || end == 0); p = next) {
-		next = TAILQ_NEXT(p, listq);
-
+	pctrie_iter_reset(&pages);
+	for (p = vm_page_iter_lookup_ge(&pages, start); p != NULL;
+	     p = vm_radix_iter_step(&pages)) {
 		/*
 		 * Skip invalid pages if asked to do so.  Try to avoid acquiring
 		 * the busy lock, as some consumers rely on this to avoid
@@ -2060,7 +2060,7 @@ wired:
 		if ((options & OBJPR_NOTMAPPED) == 0 &&
 		    object->ref_count != 0 && !vm_page_try_remove_all(p))
 			goto wired;
-		vm_page_free(p);
+		vm_page_iter_free(&pages);
 	}
 	vm_object_pip_wakeup(object);
 
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 0b9b55337b52..7d093579e35d 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -170,8 +170,9 @@ static bool _vm_page_busy_sleep(vm_object_t obj, vm_page_t m,
     vm_pindex_t pindex, const char *wmesg, int allocflags, bool locked);
 static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
 static void vm_page_enqueue(vm_page_t m, uint8_t queue);
-static bool vm_page_free_prep(vm_page_t m);
+static bool vm_page_free_prep(vm_page_t m, bool do_remove);
 static void vm_page_free_toq(vm_page_t m);
+static void vm_page_free_toq_impl(vm_page_t m, bool do_remove);
 static void vm_page_init(void *dummy);
 static int vm_page_insert_after(vm_page_t m, vm_object_t object,
     vm_pindex_t pindex, vm_page_t mpred);
@@ -1386,6 +1387,22 @@ vm_page_free(vm_page_t m)
 	vm_page_free_toq(m);
 }
 
+/*
+ *	vm_page_iter_free:
+ *
+ *	Free the current page, as identified by iterator.
+ */
+void
+vm_page_iter_free(struct pctrie_iter *pages)
+{
+	vm_page_t m;
+
+	m = vm_radix_iter_page(pages);
+	vm_radix_iter_remove(pages);
+	m->flags &= ~PG_ZERO;
+	vm_page_free_toq_impl(m, false);
+}
+
 /*
  *	vm_page_free_zero:
  *
@@ -1639,14 +1656,18 @@ vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred)
 }
 
 /*
- * Do the work to remove a page from its object.  The caller is responsible for
- * updating the page's fields to reflect this removal.
+ *	vm_page_remove_radixdone
+ *
+ *	Complete page "m" removal from the specified object after the radix trie
+ *	unhooking.
+ *
+ *	The caller is responsible for updating the page's fields to reflect this
+ *	removal.
  */
 static void
-vm_page_object_remove(vm_page_t m)
+vm_page_remove_radixdone(vm_page_t m)
 {
 	vm_object_t object;
-	vm_page_t mrem __diagused;
 
 	vm_page_assert_xbusied(m);
 	object = m->object;
@@ -1659,10 +1680,7 @@ vm_page_object_remove(vm_page_t m)
 		vm_pager_page_unswapped(m);
 
 	vm_pager_page_removed(object, m);
-
 	m->object = NULL;
-	mrem = vm_radix_remove(&object->rtree, m->pindex);
-	KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m));
 
 	/*
 	 * Now remove from the object's list of backed pages.
@@ -1704,6 +1722,42 @@ vm_page_remove(vm_page_t m)
 	return (dropped);
 }
 
+/*
+ *	vm_page_iter_remove:
+ *
+ *	Remove the current page, as identified by iterator, and remove it from the
+ *	radix tree.
+ */
+bool
+vm_page_iter_remove(struct pctrie_iter *pages)
+{
+	vm_page_t m;
+	bool dropped;
+
+	m = vm_radix_iter_page(pages);
+	vm_radix_iter_remove(pages);
+	vm_page_remove_radixdone(m);
+	dropped = (vm_page_drop(m, VPRC_OBJREF) == VPRC_OBJREF);
+	vm_page_xunbusy(m);
+
+	return (dropped);
+}
+
+/*
+ *	vm_page_radix_remove
+ *
+ *	Removes the specified page from the radix tree.
+ */
+static void
+vm_page_radix_remove(vm_page_t m)
+{
+	vm_page_t mrem __diagused;
+
+	mrem = vm_radix_remove(&m->object->rtree, m->pindex);
+	KASSERT(mrem == m,
+	    ("removed page %p, expected page %p", mrem, m));
+}
+
 /*
  *	vm_page_remove_xbusy
  *
@@ -1714,7 +1768,8 @@ bool
 vm_page_remove_xbusy(vm_page_t m)
 {
 
-	vm_page_object_remove(m);
+	vm_page_radix_remove(m);
+	vm_page_remove_radixdone(m);
 	return (vm_page_drop(m, VPRC_OBJREF) == VPRC_OBJREF);
 }
 
@@ -1985,8 +2040,8 @@ vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex,
 /*
  *	vm_page_rename:
  *
- *	Move the given memory entry from its
- *	current object to the specified target object/offset.
+ *	Move the current page, as identified by iterator, from its current
+ *	object to the specified target object/offset.
  *
  *	Note: swap associated with the page must be invalidated by the move.  We
  *	      have to do this for several reasons:  (1) we aren't freeing the
@@ -2001,13 +2056,15 @@ vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex,
  *	The objects must be locked.
  */
 int
-vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
+vm_page_rename(struct pctrie_iter *pages,
+    vm_object_t new_object, vm_pindex_t new_pindex)
 {
-	vm_page_t mpred;
+	vm_page_t m, mpred;
 	vm_pindex_t opidx;
 
 	VM_OBJECT_ASSERT_WLOCKED(new_object);
 
+	m = vm_radix_iter_page(pages);
 	KASSERT(m->ref_count != 0, ("vm_page_rename: page %p has no refs", m));
 
 	/*
@@ -2027,7 +2084,8 @@ vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
 	 * the listq iterator is tainted.
 	 */
 	m->pindex = opidx;
-	vm_page_object_remove(m);
+	vm_radix_iter_remove(pages);
+	vm_page_remove_radixdone(m);
 
 	/* Return back to the new pindex to complete vm_page_insert(). */
 	m->pindex = new_pindex;
@@ -3122,7 +3180,7 @@ vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run,
 					vm_page_dequeue(m);
 					if (vm_page_replace_hold(m_new, object,
 					    m->pindex, m) &&
-					    vm_page_free_prep(m))
+					    vm_page_free_prep(m, true))
 						SLIST_INSERT_HEAD(&free, m,
 						    plinks.s.ss);
 
@@ -3134,7 +3192,7 @@ vm_page_reclaim_run(int req_class, int domain, u_long npages, vm_page_t m_run,
 				} else {
 					m->flags &= ~PG_ZERO;
 					vm_page_dequeue(m);
-					if (vm_page_free_prep(m))
+					if (vm_page_free_prep(m, true))
 						SLIST_INSERT_HEAD(&free, m,
 						    plinks.s.ss);
 					KASSERT(m->dirty == 0,
@@ -4073,7 +4131,7 @@ vm_page_enqueue(vm_page_t m, uint8_t queue)
  *	page must be unmapped.
  */
 static bool
-vm_page_free_prep(vm_page_t m)
+vm_page_free_prep(vm_page_t m, bool do_remove)
 {
 
 	/*
@@ -4120,7 +4178,9 @@ vm_page_free_prep(vm_page_t m)
 		    m->ref_count == VPRC_OBJREF,
 		    ("vm_page_free_prep: page %p has unexpected ref_count %u",
 		    m, m->ref_count));
-		vm_page_object_remove(m);
+		if (do_remove)
+			vm_page_radix_remove(m);
+		vm_page_remove_radixdone(m);
 		m->ref_count -= VPRC_OBJREF;
 	} else
 		vm_page_assert_unbusied(m);
@@ -4172,22 +4232,13 @@ vm_page_free_prep(vm_page_t m)
 	return (true);
 }
 
-/*
- *	vm_page_free_toq:
- *
- *	Returns the given page to the free list, disassociating it
- *	from any VM object.
- *
- *	The object must be locked.  The page must be exclusively busied if it
- *	belongs to an object.
- */
 static void
-vm_page_free_toq(vm_page_t m)
+vm_page_free_toq_impl(vm_page_t m, bool do_remove)
 {
 	struct vm_domain *vmd;
 	uma_zone_t zone;
 
-	if (!vm_page_free_prep(m))
+	if (!vm_page_free_prep(m, do_remove))
 		return;
 
 	vmd = vm_pagequeue_domain(m);
@@ -4202,6 +4253,21 @@ vm_page_free_toq(vm_page_t m)
 	vm_domain_freecnt_inc(vmd, 1);
 }
 
+/*
+ *	vm_page_free_toq:
+ *
+ *	Returns the given page to the free list, disassociating it
+ *	from any VM object.
+ *
+ *	The object must be locked.  The page must be exclusively busied if it
+ *	belongs to an object.
+ */
+static void
+vm_page_free_toq(vm_page_t m)
+{
+	vm_page_free_toq_impl(m, true);
+}
+
 /*
  *	vm_page_free_pages_toq:
  *
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 893608bcacf1..613896e77dd9 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -602,6 +602,7 @@ bool vm_page_busy_sleep(vm_page_t m, const char *msg, int allocflags);
 void vm_page_busy_sleep_unlocked(vm_object_t obj, vm_page_t m,
     vm_pindex_t pindex, const char *wmesg, int allocflags);
 void vm_page_free(vm_page_t m);
+void vm_page_iter_free(struct pctrie_iter *);
 void vm_page_free_zero(vm_page_t m);
 
 void vm_page_activate (vm_page_t);
@@ -679,8 +680,9 @@ void vm_page_release(vm_page_t m, int flags);
 void vm_page_release_locked(vm_page_t m, int flags);
 vm_page_t vm_page_relookup(vm_object_t, vm_pindex_t);
 bool vm_page_remove(vm_page_t);
+bool vm_page_iter_remove(struct pctrie_iter *);
 bool vm_page_remove_xbusy(vm_page_t);
-int vm_page_rename(vm_page_t, vm_object_t, vm_pindex_t);
+int vm_page_rename(struct pctrie_iter *, vm_object_t, vm_pindex_t);
 void vm_page_replace(vm_page_t mnew, vm_object_t object,
     vm_pindex_t pindex, vm_page_t mold);
 int vm_page_sbusied(vm_page_t m);