git: 8f58b693814e - main - vm_page_grab_pages_unlocked: read all the pages at once

From: Doug Moore <dougm@FreeBSD.org>
Date: Tue, 29 Apr 2025 06:33:46 UTC
The branch main has been updated by dougm:

URL: https://cgit.FreeBSD.org/src/commit/?id=8f58b693814e06afc51eade99a8c38e4abe9a804

commit 8f58b693814e06afc51eade99a8c38e4abe9a804
Author:     Doug Moore <dougm@FreeBSD.org>
AuthorDate: 2025-04-29 06:28:32 +0000
Commit:     Doug Moore <dougm@FreeBSD.org>
CommitDate: 2025-04-29 06:28:32 +0000

    vm_page_grab_pages_unlocked: read all the pages at once
    
    Define a function pctrie_lookup_range_unlocked, which looks up
    consecutive elements in the pctrie until a count limit is reached or
    an item is found missing, writes them into the leading elements of
    an array, and returns the count of items found.  It does not require
    an external lock.  It walks the trie strictly between smr_enter and
    smr_exit; during that window, concurrent removals may leave some of
    the nodes it visits with only one child, but SMR guarantees that
    none of them can be freed before exit.
    
    Define a vm_radix interface to it for reading page ranges.
    
    Change vm_page_grab_pages_unlocked to read all requested pages at
    once, without relying on the existence of a linked list of pages.
    
    Drop the smr arguments from pctrie_iter_stride, since no smr version
    of it is in use.  Drop _pctrie_iter_lookup, since it is no longer
    needed.
    
    Reviewed by:    alc, kib, markj
    Differential Revision:  https://reviews.freebsd.org/D47114
---
 sys/kern/subr_pctrie.c | 83 +++++++++++++++++++++++++++++++-------------------
 sys/sys/pctrie.h       | 26 ++++++++++++++--
 sys/vm/vm_page.c       | 24 ++++-----------
 sys/vm/vm_radix.h      | 12 ++++++++
 4 files changed, 93 insertions(+), 52 deletions(-)
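
For illustration, a minimal sketch of how a client trie defined with
PCTRIE_DEFINE_SMR might consume the new range lookup; the FOO trie,
struct foo, and foo_process() here are hypothetical:

    struct foo *batch[16];
    int i, nfound;

    /*
     * Read up to nitems(batch) consecutive entries starting at 'key'
     * without taking the tree lock.  batch[0..nfound-1] are non-NULL,
     * but the range is not an atomic snapshot of the trie.
     */
    nfound = FOO_PCTRIE_LOOKUP_RANGE_UNLOCKED(&tree, key, batch,
        nitems(batch));
    for (i = 0; i < nfound; i++)
        foo_process(batch[i]);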

diff --git a/sys/kern/subr_pctrie.c b/sys/kern/subr_pctrie.c
index 03cf3e1e5990..feb29cf82330 100644
--- a/sys/kern/subr_pctrie.c
+++ b/sys/kern/subr_pctrie.c
@@ -494,7 +494,7 @@ _pctrie_lookup_node(struct pctrie *ptree, struct pctrie_node *node,
 	 * search for a value matching 'index'.
 	 */
 	while (node != NULL) {
-		KASSERT(!powerof2(node->pn_popmap),
+		KASSERT(access == PCTRIE_SMR || !powerof2(node->pn_popmap),
 		    ("%s: freed node in iter path", __func__));
 		if (!pctrie_keybarr(node, index, &slot))
 			break;
@@ -520,29 +520,20 @@ _pctrie_lookup_node(struct pctrie *ptree, struct pctrie_node *node,
 }
 
 /*
- * Returns the value stored at a given index value, possibly NULL.
+ * Returns the value stored at a given index value, possibly NULL, assuming
+ * access is externally synchronized by a lock.
  */
-static __always_inline uint64_t *
-_pctrie_iter_lookup(struct pctrie_iter *it, uint64_t index, smr_t smr,
-    enum pctrie_access access)
+uint64_t *
+pctrie_iter_lookup(struct pctrie_iter *it, uint64_t index)
 {
 	struct pctrie_node *node;
 
 	it->index = index;
 	node = _pctrie_lookup_node(it->ptree, it->node, index, &it->node,
-	    smr, access);
+	    NULL, PCTRIE_LOCKED);
 	return (pctrie_match_value(node, index));
 }
 
-/*
- * Returns the value stored at a given index value, possibly NULL.
- */
-uint64_t *
-pctrie_iter_lookup(struct pctrie_iter *it, uint64_t index)
-{
-	return (_pctrie_iter_lookup(it, index, NULL, PCTRIE_LOCKED));
-}
-
 /*
  * Insert the val in the trie, starting search with iterator.  Return a pointer
  * to indicate where a new node must be allocated to complete insertion.
@@ -581,9 +572,8 @@ pctrie_iter_insert_lookup(struct pctrie_iter *it, uint64_t *val)
  * Returns the value stored at a fixed offset from the current index value,
  * possibly NULL.
  */
-static __always_inline uint64_t *
-_pctrie_iter_stride(struct pctrie_iter *it, int stride, smr_t smr,
-    enum pctrie_access access)
+uint64_t *
+pctrie_iter_stride(struct pctrie_iter *it, int stride)
 {
 	uint64_t index = it->index + stride;
 
@@ -594,17 +584,7 @@ _pctrie_iter_stride(struct pctrie_iter *it, int stride, smr_t smr,
 	if ((index < it->limit) != (it->index < it->limit))
 		return (NULL);
 
-	return (_pctrie_iter_lookup(it, index, smr, access));
-}
-
-/*
- * Returns the value stored at a fixed offset from the current index value,
- * possibly NULL.
- */
-uint64_t *
-pctrie_iter_stride(struct pctrie_iter *it, int stride)
-{
-	return (_pctrie_iter_stride(it, stride, NULL, PCTRIE_LOCKED));
+	return (pctrie_iter_lookup(it, index));
 }
 
 /*
@@ -614,7 +594,7 @@ pctrie_iter_stride(struct pctrie_iter *it, int stride)
 uint64_t *
 pctrie_iter_next(struct pctrie_iter *it)
 {
-	return (_pctrie_iter_stride(it, 1, NULL, PCTRIE_LOCKED));
+	return (pctrie_iter_stride(it, 1));
 }
 
 /*
@@ -624,7 +604,48 @@ pctrie_iter_next(struct pctrie_iter *it)
 uint64_t *
 pctrie_iter_prev(struct pctrie_iter *it)
 {
-	return (_pctrie_iter_stride(it, -1, NULL, PCTRIE_LOCKED));
+	return (pctrie_iter_stride(it, -1));
+}
+
+/*
+ * Returns the number of contiguous, non-NULL entries read into the value[]
+ * array, without requiring an external lock.  These entries *may* never have
+ * been in the pctrie all at one time, but for a series of times t0, t1, t2,
+ * ..., with ti <= t(i+1), value[i] was in the trie at time ti.
+ */
+int
+pctrie_lookup_range_unlocked(struct pctrie *ptree, uint64_t index,
+    uint64_t *value[], int count, smr_t smr)
+{
+	struct pctrie_node *parent, *node;
+	int base, end, i;
+
+	parent = NULL;
+	smr_enter(smr);
+	for (i = 0; i < count;) {
+		node = _pctrie_lookup_node(ptree, parent, index + i, &parent,
+		    smr, PCTRIE_SMR);
+		value[i] = pctrie_match_value(node, index + i);
+		if (value[i] == NULL)
+			break;
+		++i;
+		base = (index + i) % PCTRIE_COUNT;
+		if (base == 0 || parent == NULL || parent->pn_clev != 0)
+			continue;
+		end = MIN(count, i + PCTRIE_COUNT - base);
+		while (i < end) {
+			node = pctrie_node_load(&parent->pn_child[base++],
+			    smr, PCTRIE_SMR);
+			value[i] = pctrie_toval(node);
+			if (value[i] == NULL)
+				break;
+			++i;
+		}
+		if (i < end)
+			break;
+	}
+	smr_exit(smr);
+	return (i);
 }
 
 /*
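
The comment introducing pctrie_lookup_range_unlocked above states the
key contract: the returned range need not be an atomic snapshot.  A
hypothetical interleaving with a concurrent remover makes this
concrete:

    t0: reader copies value[0] for key k        (k present)
    t1: writer removes key k from the trie
    t2: reader copies value[1] for key k + 1    (k + 1 present)

The pair {value[0], value[1]} was never in the trie at any single
instant, yet each value[i] was present at its time ti, with
ti <= t(i+1), exactly as the comment promises.  SMR guarantees only
that the trie nodes visited are not freed while the reader is between
smr_enter and smr_exit.
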
diff --git a/sys/sys/pctrie.h b/sys/sys/pctrie.h
index dac18b58498b..01d9f34f11b9 100644
--- a/sys/sys/pctrie.h
+++ b/sys/sys/pctrie.h
@@ -83,6 +83,19 @@ name##_PCTRIE_LOOKUP_UNLOCKED(struct pctrie *ptree, uint64_t key)	\
 	return name##_PCTRIE_VAL2PTR(pctrie_lookup_unlocked(ptree,	\
 	    key, (smr)));						\
 }									\
+									\
+static __inline __unused int						\
+name##_PCTRIE_LOOKUP_RANGE_UNLOCKED(struct pctrie *ptree, uint64_t key,	\
+     struct type *value[], int count) 					\
+{									\
+	uint64_t **data = (uint64_t **)value;				\
+									\
+	count = pctrie_lookup_range_unlocked(ptree, key, data, count,	\
+	    (smr));							\
+	for (int i = 0; i < count; i++)					\
+		value[i] = name##_PCTRIE_NZVAL2PTR(data[i]);		\
+	return (count);							\
+}									\
 
 #define	PCTRIE_DEFINE(name, type, field, allocfn, freefn)		\
 									\
@@ -94,13 +107,18 @@ CTASSERT(sizeof(((struct type *)0)->field) == sizeof(uint64_t));	\
 CTASSERT((__offsetof(struct type, field) & (sizeof(uint32_t) - 1)) == 0); \
 									\
 static __inline struct type *						\
-name##_PCTRIE_VAL2PTR(uint64_t *val)					\
+name##_PCTRIE_NZVAL2PTR(uint64_t *val)					\
 {									\
+	return (struct type *)						\
+	    ((uintptr_t)val - __offsetof(struct type, field));		\
+}									\
 									\
+static __inline struct type *						\
+name##_PCTRIE_VAL2PTR(uint64_t *val)					\
+{									\
 	if (val == NULL)						\
 		return (NULL);						\
-	return (struct type *)						\
-	    ((uintptr_t)val - __offsetof(struct type, field));		\
+	return (name##_PCTRIE_NZVAL2PTR(val));				\
 }									\
 									\
 static __inline uint64_t *						\
@@ -365,6 +383,8 @@ void		pctrie_insert_node(uint64_t *val, struct pctrie_node *parent,
 uint64_t	*pctrie_lookup(struct pctrie *ptree, uint64_t key);
 uint64_t	*pctrie_lookup_unlocked(struct pctrie *ptree, uint64_t key,
 		    smr_t smr);
+int		pctrie_lookup_range_unlocked(struct pctrie *ptree,
+		    uint64_t index, uint64_t *value[], int count, smr_t smr);
 uint64_t	*pctrie_iter_lookup(struct pctrie_iter *it, uint64_t index);
 uint64_t	*pctrie_iter_stride(struct pctrie_iter *it, int stride);
 uint64_t	*pctrie_iter_next(struct pctrie_iter *it);
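
A note on the VAL2PTR split above: pctrie_lookup_range_unlocked stores
only non-NULL values, so the range wrapper can convert each result with
NZVAL2PTR and skip the per-element NULL check.  The conversion is the
usual container-of trick; a sketch with a hypothetical struct foo whose
trie key field is foo_key:

    struct foo *p;
    uint64_t *val = data[i];    /* non-NULL by contract */

    /* Step from the key field back to its enclosing structure. */
    p = (struct foo *)((uintptr_t)val - __offsetof(struct foo, foo_key));
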
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index e2f11eb57100..8dce6feaca09 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -5269,7 +5269,7 @@ vm_page_grab_pages_unlocked(vm_object_t object, vm_pindex_t pindex,
 {
 	vm_page_t m;
 	int flags;
-	int i;
+	int i, num_fetched;
 
 	KASSERT(count > 0,
 	    ("vm_page_grab_pages_unlocked: invalid page count %d", count));
@@ -5281,22 +5281,10 @@ vm_page_grab_pages_unlocked(vm_object_t object, vm_pindex_t pindex,
 	 */
 	flags = allocflags & ~VM_ALLOC_NOBUSY;
 	vm_page_grab_check(flags);
-	m = NULL;
-	for (i = 0; i < count; i++, pindex++) {
-		/*
-		 * We may see a false NULL here because the previous page has
-		 * been removed or just inserted and the list is loaded without
-		 * barriers.  Switch to radix to verify.
-		 */
-		if (m == NULL || QMD_IS_TRASHED(m) || m->pindex != pindex ||
-		    atomic_load_ptr(&m->object) != object) {
-			/*
-			 * This guarantees the result is instantaneously
-			 * correct.
-			 */
-			m = NULL;
-		}
-		m = vm_page_acquire_unlocked(object, pindex, m, flags);
+	num_fetched = vm_radix_lookup_range_unlocked(&object->rtree, pindex,
+	    ma, count);
+	for (i = 0; i < num_fetched; i++, pindex++) {
+		m = vm_page_acquire_unlocked(object, pindex, ma[i], flags);
 		if (m == PAGE_NOT_ACQUIRED)
 			return (i);
 		if (m == NULL)
@@ -5308,8 +5296,8 @@ vm_page_grab_pages_unlocked(vm_object_t object, vm_pindex_t pindex,
 		}
 		/* m will still be wired or busy according to flags. */
 		vm_page_grab_release(m, allocflags);
+		/* vm_page_acquire_unlocked() may not return ma[i]. */
 		ma[i] = m;
-		m = TAILQ_NEXT(m, listq);
 	}
 	if (i == count || (allocflags & VM_ALLOC_NOCREAT) != 0)
 		return (i);
diff --git a/sys/vm/vm_radix.h b/sys/vm/vm_radix.h
index df3639091d58..231075d32754 100644
--- a/sys/vm/vm_radix.h
+++ b/sys/vm/vm_radix.h
@@ -121,6 +121,18 @@ vm_radix_lookup_unlocked(struct vm_radix *rtree, vm_pindex_t index)
 	return (VM_RADIX_PCTRIE_LOOKUP_UNLOCKED(&rtree->rt_trie, index));
 }
 
+/*
+ * Returns the number of contiguous, non-NULL pages read into the ma[]
+ * array, without requiring an external lock.
+ */
+static __inline int
+vm_radix_lookup_range_unlocked(struct vm_radix *rtree, vm_pindex_t index,
+    vm_page_t ma[], int count)
+{
+	return (VM_RADIX_PCTRIE_LOOKUP_RANGE_UNLOCKED(&rtree->rt_trie, index,
+	    ma, count));
+}
+
 /*
  * Initialize an iterator for vm_radix.
  */
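
Taken together, the vm_page.c hunk reduces to this pattern (condensed
from the diff above; flags handling and the NOCREAT/allocation fallback
are elided):

    num_fetched = vm_radix_lookup_range_unlocked(&object->rtree, pindex,
        ma, count);
    for (i = 0; i < num_fetched; i++, pindex++) {
        /*
         * The range is not an atomic snapshot, so each prefetched
         * page is still validated against (object, pindex) during
         * acquisition, and the acquired page may differ from ma[i].
         */
        m = vm_page_acquire_unlocked(object, pindex, ma[i], flags);
        if (m == PAGE_NOT_ACQUIRED)
            return (i);
        if (m == NULL)
            break;      /* fall through to the locked path */
        ma[i] = m;
    }

This replaces the old TAILQ_NEXT() walk, which could see false NULLs
because the page list was loaded without barriers and so had to fall
back to a radix lookup to revalidate each step.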