git: fd1d66628968 - main - pctrie: create iterator

From: Doug Moore <dougm_at_FreeBSD.org>
Date: Fri, 13 Sep 2024 15:38:44 UTC
The branch main has been updated by dougm:

URL: https://cgit.FreeBSD.org/src/commit/?id=fd1d66628968df747b66ee68df409211f642dd22

commit fd1d66628968df747b66ee68df409211f642dd22
Author:     Doug Moore <dougm@FreeBSD.org>
AuthorDate: 2024-09-13 15:36:54 +0000
Commit:     Doug Moore <dougm@FreeBSD.org>
CommitDate: 2024-09-13 15:36:54 +0000

    pctrie: create iterator
    
    Define a pctrie iterator type. A pctrie iterator is a wrapper around a
    pctrie that remembers a position in the trie where the last search
    left off, and where a new search can resume. When the next search is
    for an item very near in the trie to where the last search left off,
    iter-based search is faster because instead of starting from the root,
    the search usually only has to back up one or two steps up the
    root-to-last-search path to find the branch that leads to the new
    search target.
    
    Every kind of lookup (plain, lookup_ge, lookup_le) that can begin with
    the trie root can begin with an iterator instead. An iterator can also
    do a relative search ("look for the item 4 greater than the last item
    I found") because it remembers where that last search ended. It can
    also search within limits ("look for the item bigger than this one,
    but it has to be less than 100"), which can save time when the next
    item beyond the limits and that is known before we actually know what
    that item it is. An iterator can also be used to remove an item that
    has already been found, without having to search for it again.
    
    Iterators are vulnerable to unsynchronized data changes. If the
    iterator is created with a lock held, and that lock is released and
    acquired again, there's no guarantee that the iterator path remains
    valid.
    
    Reviewed by:    markj
    Tested by:      pho
    Differential Revision:  https://reviews.freebsd.org/D45627
---
 sys/kern/subr_pctrie.c | 409 +++++++++++++++++++++++++++++++++++++++++++------
 sys/sys/pctrie.h       | 134 +++++++++++++++-
 2 files changed, 493 insertions(+), 50 deletions(-)

diff --git a/sys/kern/subr_pctrie.c b/sys/kern/subr_pctrie.c
index 6949e3de99bf..b461ffa3c5be 100644
--- a/sys/kern/subr_pctrie.c
+++ b/sys/kern/subr_pctrie.c
@@ -62,9 +62,6 @@
 #include <ddb/ddb.h>
 #endif
 
-#define	PCTRIE_MASK	(PCTRIE_COUNT - 1)
-#define	PCTRIE_LIMIT	(howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1)
-
 #if PCTRIE_WIDTH == 3
 typedef uint8_t pn_popmap_t;
 #elif PCTRIE_WIDTH == 4
@@ -87,18 +84,13 @@ struct pctrie_node {
 	smr_pctnode_t	pn_child[PCTRIE_COUNT];		/* Child nodes. */
 };
 
-enum pctrie_access { PCTRIE_SMR, PCTRIE_LOCKED, PCTRIE_UNSERIALIZED };
-
-static __inline void pctrie_node_store(smr_pctnode_t *p, void *val,
-    enum pctrie_access access);
-
 /*
  * Map index to an array position for the children of node,
  */
 static __inline int
 pctrie_slot(struct pctrie_node *node, uint64_t index)
 {
-	return ((index >> node->pn_clev) & PCTRIE_MASK);
+	return ((index >> node->pn_clev) & (PCTRIE_COUNT - 1));
 }
 
 /*
@@ -137,6 +129,8 @@ pctrie_node_put(struct pctrie_node *node)
 #endif
 }
 
+enum pctrie_access { PCTRIE_SMR, PCTRIE_LOCKED, PCTRIE_UNSERIALIZED };
+
 /*
  * Fetch a node pointer from a slot.
  */
@@ -476,6 +470,21 @@ pctrie_insert_node(void *parentp, struct pctrie_node *parent, uint64_t *val)
 	pctrie_node_store(parentp, parent, PCTRIE_LOCKED);
 }
 
+/*
+ * Return the value associated with the node, if the node is a leaf that matches
+ * the index; otherwise NULL.
+ */
+static __always_inline uint64_t *
+pctrie_match_value(struct pctrie_node *node, uint64_t index)
+{
+	uint64_t *m;
+
+	if (!pctrie_isleaf(node) || (m = pctrie_toval(node)) == NULL ||
+	    *m != index)
+		m = NULL;
+	return (m);
+}
+
 /*
  * Returns the value stored at the index.  If the index is not present,
  * NULL is returned.
@@ -485,21 +494,13 @@ _pctrie_lookup(struct pctrie *ptree, uint64_t index, smr_t smr,
     enum pctrie_access access)
 {
 	struct pctrie_node *node;
-	uint64_t *m;
 	int slot;
 
 	node = pctrie_root_load(ptree, smr, access);
-	for (;;) {
-		if (pctrie_isleaf(node)) {
-			if ((m = pctrie_toval(node)) != NULL && *m == index)
-				return (m);
-			break;
-		}
-		if (pctrie_keybarr(node, index, &slot))
-			break;
+	/* Seek a node that matches index. */
+	while (!pctrie_isleaf(node) && !pctrie_keybarr(node, index, &slot))
 		node = pctrie_node_load(&node->pn_child[slot], smr, access);
-	}
-	return (NULL);
+	return (pctrie_match_value(node, index));
 }
 
 /*
@@ -530,6 +531,118 @@ pctrie_lookup_unlocked(struct pctrie *ptree, uint64_t index, smr_t smr)
 	return (res);
 }
 
+/*
+ * Returns the last node examined in the search for the index, and updates the
+ * search path to that node.
+ */
+static __always_inline struct pctrie_node *
+_pctrie_iter_lookup_node(struct pctrie_iter *it, uint64_t index, smr_t smr,
+    enum pctrie_access access)
+{
+	struct pctrie_node *node;
+	int slot;
+
+	/*
+	 * Climb the search path to find the lowest node from which to start the
+	 * search for a value matching 'index'.
+	 */
+	while (it->top != 0) {
+		node = it->path[it->top - 1];
+		KASSERT(!powerof2(node->pn_popmap),
+		    ("%s: freed node in iter path", __func__));
+		if (!pctrie_keybarr(node, index, &slot)) {
+			node = pctrie_node_load(
+			    &node->pn_child[slot], smr, access);
+			break;
+		}
+		--it->top;
+	}
+	if (it->top == 0)
+		node = pctrie_root_load(it->ptree, smr, access);
+
+	/* Seek a node that matches index. */
+	while (!pctrie_isleaf(node) && !pctrie_keybarr(node, index, &slot)) {
+		KASSERT(it->top < nitems(it->path),
+		    ("%s: path overflow in trie %p", __func__, it->ptree));
+		it->path[it->top++] = node;
+		node = pctrie_node_load(&node->pn_child[slot], smr, access);
+	}
+	return (node);
+}
+
+/*
+ * Returns the value stored at a given index value, possibly NULL.
+ */
+static __always_inline uint64_t *
+_pctrie_iter_lookup(struct pctrie_iter *it, uint64_t index, smr_t smr,
+    enum pctrie_access access)
+{
+	struct pctrie_node *node;
+
+	it->index = index;
+	node = _pctrie_iter_lookup_node(it, index, smr, access);
+	return (pctrie_match_value(node, index));
+}
+
+/*
+ * Returns the value stored at a given index value, possibly NULL.
+ */
+uint64_t *
+pctrie_iter_lookup(struct pctrie_iter *it, uint64_t index)
+{
+	return (_pctrie_iter_lookup(it, index, NULL, PCTRIE_LOCKED));
+}
+
+/*
+ * Returns the value stored at a fixed offset from the current index value,
+ * possibly NULL.
+ */
+static __always_inline uint64_t *
+_pctrie_iter_stride(struct pctrie_iter *it, int stride, smr_t smr,
+    enum pctrie_access access)
+{
+	uint64_t index = it->index + stride;
+
+	/* Detect stride overflow. */
+	if ((stride > 0) != (index > it->index))
+		return (NULL);
+	/* Detect crossing limit */
+	if ((index < it->limit) != (it->index < it->limit))
+		return (NULL);
+
+	return (_pctrie_iter_lookup(it, index, smr, access));
+}
+
+/*
+ * Returns the value stored at a fixed offset from the current index value,
+ * possibly NULL.
+ */
+uint64_t *
+pctrie_iter_stride(struct pctrie_iter *it, int stride)
+{
+	return (_pctrie_iter_stride(it, stride, NULL, PCTRIE_LOCKED));
+}
+
+/*
+ * Returns the value stored at one more than the current index value, possibly
+ * NULL, assuming access is externally synchronized by a lock.
+ */
+uint64_t *
+pctrie_iter_next(struct pctrie_iter *it)
+{
+	return (_pctrie_iter_stride(it, 1, NULL, PCTRIE_LOCKED));
+}
+
+/*
+ * Returns the value stored at one less than the current index value, possibly
+ * NULL, assuming access is externally synchronized by a lock.
+ */
+uint64_t *
+pctrie_iter_prev(struct pctrie_iter *it)
+{
+	return (_pctrie_iter_stride(it, -1, NULL, PCTRIE_LOCKED));
+}
+
 /*
  * Returns the value with the least index that is greater than or equal to the
  * specified index, or NULL if there are no such values.
@@ -633,6 +746,78 @@ pctrie_subtree_lookup_gt(struct pctrie_node *node, uint64_t index)
 	return (pctrie_lookup_ge_node(node, index + 1));
 }
 
+/*
+ * Find first leaf >= index, and fill iter with the path to the parent of that
+ * leaf.  Return NULL if there is no such leaf less than limit.
+ */
+uint64_t *
+pctrie_iter_lookup_ge(struct pctrie_iter *it, uint64_t index)
+{
+	struct pctrie_node *node;
+	uint64_t *m;
+	int slot;
+
+	/* Seek a node that matches index. */
+	node = _pctrie_iter_lookup_node(it, index, NULL, PCTRIE_LOCKED);
+
+	/*
+	 * If no such node was found, and instead this path leads only to nodes
+	 * < index, back up to find a subtrie with the least value > index.
+	 */
+	if (pctrie_isleaf(node) ?
+	    (m = pctrie_toval(node)) == NULL || *m < index :
+	    node->pn_owner < index) {
+		/* Climb the path to find a node with a descendant > index. */
+		while (it->top != 0) {
+			node = it->path[it->top - 1];
+			slot = pctrie_slot(node, index) + 1;
+			if ((node->pn_popmap >> slot) != 0)
+				break;
+			--it->top;
+		}
+		if (it->top == 0)
+			return (NULL);
+
+		/* Step to the least child with a descendant > index. */
+		slot += ffs(node->pn_popmap >> slot) - 1;
+		node = pctrie_node_load(&node->pn_child[slot], NULL,
+		    PCTRIE_LOCKED);
+	}
+	/* Descend to the least leaf of the subtrie. */
+	while (!pctrie_isleaf(node)) {
+		if (it->limit != 0 && node->pn_owner >= it->limit)
+			return (NULL);
+		slot = ffs(node->pn_popmap) - 1;
+		KASSERT(it->top < nitems(it->path),
+		    ("%s: path overflow in trie %p", __func__, it->ptree));
+		it->path[it->top++] = node;
+		node = pctrie_node_load(&node->pn_child[slot], NULL,
+		    PCTRIE_LOCKED);
+	}
+	m = pctrie_toval(node);
+	if (it->limit != 0 && *m >= it->limit)
+		return (NULL);
+	it->index = *m;
+	return (m);
+}
+
+/*
+ * Find the first leaf with value at least 'jump' greater than the previous
+ * leaf.  Return NULL if that value is >= limit.
+ */
+uint64_t *
+pctrie_iter_jump_ge(struct pctrie_iter *it, int64_t jump)
+{
+	uint64_t index = it->index + jump;
+
+	/* Detect jump overflow. */
+	if ((jump > 0) != (index > it->index))
+		return (NULL);
+	if (it->limit != 0 && index >= it->limit)
+		return (NULL);
+	return (pctrie_iter_lookup_ge(it, index));
+}
+
 #ifdef INVARIANTS
 void
 pctrie_subtree_lookup_gt_assert(struct pctrie_node *node, uint64_t index,
@@ -720,6 +905,79 @@ pctrie_subtree_lookup_lt(struct pctrie_node *node, uint64_t index)
 	return (pctrie_lookup_le_node(node, index - 1));
 }
 
+/*
+ * Find first leaf <= index, and fill iter with the path to the parent of that
+ * leaf.  Return NULL if there is no such leaf greater than limit.
+ */
+uint64_t *
+pctrie_iter_lookup_le(struct pctrie_iter *it, uint64_t index)
+{
+	struct pctrie_node *node;
+	uint64_t *m;
+	int slot;
+
+	/* Seek a node that matches index. */
+	node = _pctrie_iter_lookup_node(it, index, NULL, PCTRIE_LOCKED);
+
+	/*
+	 * If no such node was found, and instead this path leads only to nodes
+	 * > index, back up to find a subtrie with the least value > index.
+	 */
+	if (pctrie_isleaf(node) ?
+	    (m = pctrie_toval(node)) == NULL || *m > index :
+	    node->pn_owner > index) {
+		/* Climb the path to find a node with a descendant < index. */
+		while (it->top != 0) {
+			node = it->path[it->top - 1];
+			slot = pctrie_slot(node, index);
+			if ((node->pn_popmap & ((1 << slot) - 1)) != 0)
+				break;
+			--it->top;
+		}
+		if (it->top == 0)
+			return (NULL);
+
+		/* Step to the greatest child with a descendant < index. */
+		slot = ilog2(node->pn_popmap & ((1 << slot) - 1));
+		node = pctrie_node_load(&node->pn_child[slot], NULL,
+		    PCTRIE_LOCKED);
+	}
+	/* Descend to the greatest leaf of the subtrie. */
+	while (!pctrie_isleaf(node)) {
+		if (it->limit != 0 && it->limit >=
+		    node->pn_owner + (PCTRIE_COUNT << node->pn_clev) - 1)
+			return (NULL);
+		slot = ilog2(node->pn_popmap);
+		KASSERT(it->top < nitems(it->path),
+		    ("%s: path overflow in trie %p", __func__, it->ptree));
+		it->path[it->top++] = node;
+		node = pctrie_node_load(&node->pn_child[slot], NULL,
+		    PCTRIE_LOCKED);
+	}
+	m = pctrie_toval(node);
+	if (it->limit != 0 && *m <= it->limit)
+		return (NULL);
+	it->index = *m;
+	return (m);
+}
+
+/*
+ * Find the first leaf with value at most 'jump' less than the previous
+ * leaf.  Return NULL if that value is <= limit.
+ */
+uint64_t *
+pctrie_iter_jump_le(struct pctrie_iter *it, int64_t jump)
+{
+	uint64_t index = it->index - jump;
+
+	/* Detect jump overflow. */
+	if ((jump > 0) != (index < it->index))
+		return (NULL);
+	if (it->limit != 0 && index <= it->limit)
+		return (NULL);
+	return (pctrie_iter_lookup_le(it, index));
+}
+
 #ifdef INVARIANTS
 void
 pctrie_subtree_lookup_lt_assert(struct pctrie_node *node, uint64_t index,
@@ -738,42 +996,25 @@ pctrie_subtree_lookup_lt_assert(struct pctrie_node *node, uint64_t index,
 }
 #endif
 
-/*
- * Remove the specified index from the tree, and return the value stored at
- * that index.  If the index is not present, return NULL.
- */
-uint64_t *
-pctrie_remove_lookup(struct pctrie *ptree, uint64_t index,
-    struct pctrie_node **freenode)
+static void
+pctrie_remove(struct pctrie *ptree, uint64_t index, struct pctrie_node *parent,
+    struct pctrie_node *node, struct pctrie_node **freenode)
 {
-	struct pctrie_node *child, *node, *parent;
-	uint64_t *m;
+	struct pctrie_node *child;
 	int slot;
 
-	*freenode = node = NULL;
-	child = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
-	for (;;) {
-		if (pctrie_isleaf(child))
-			break;
-		parent = node;
-		node = child;
-		slot = pctrie_slot(node, index);
-		child = pctrie_node_load(&node->pn_child[slot], NULL,
-		    PCTRIE_LOCKED);
-	}
-	if ((m = pctrie_toval(child)) == NULL || *m != index)
-		return (NULL);
 	if (node == NULL) {
 		pctrie_root_store(ptree, PCTRIE_NULL, PCTRIE_LOCKED);
-		return (m);
+		return;
 	}
+	slot = pctrie_slot(node, index);
 	KASSERT((node->pn_popmap & (1 << slot)) != 0,
 	    ("%s: bad popmap slot %d in node %p",
 	    __func__, slot, node));
 	node->pn_popmap ^= 1 << slot;
 	pctrie_node_store(&node->pn_child[slot], PCTRIE_NULL, PCTRIE_LOCKED);
 	if (!powerof2(node->pn_popmap))
-		return (m);
+		return;
 	KASSERT(node->pn_popmap != 0, ("%s: bad popmap all zeroes", __func__));
 	slot = ffs(node->pn_popmap) - 1;
 	child = pctrie_node_load(&node->pn_child[slot], NULL, PCTRIE_LOCKED);
@@ -795,9 +1036,89 @@ pctrie_remove_lookup(struct pctrie *ptree, uint64_t index,
 	 */
 	pctrie_node_put(node);
 	*freenode = node;
+}
+
+/*
+ * Remove the specified index from the tree, and return the value stored at
+ * that index.  If the index is not present, return NULL.
+ */
+uint64_t *
+pctrie_remove_lookup(struct pctrie *ptree, uint64_t index,
+    struct pctrie_node **freenode)
+{
+	struct pctrie_node *child, *node, *parent;
+	uint64_t *m;
+	int slot;
+
+	DEBUG_POISON_POINTER(parent);
+	*freenode = node = NULL;
+	child = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
+	while (!pctrie_isleaf(child)) {
+		parent = node;
+		node = child;
+		slot = pctrie_slot(node, index);
+		child = pctrie_node_load(&node->pn_child[slot], NULL,
+		    PCTRIE_LOCKED);
+	}
+	m = pctrie_match_value(child, index);
+	if (m != NULL)
+		pctrie_remove(ptree, index, parent, node, freenode);
 	return (m);
 }
 
+/*
+ * Remove from the trie the leaf last chosen by the iterator, and
+ * adjust the path if it's last member is to be freed.
+ */
+uint64_t *
+pctrie_iter_remove(struct pctrie_iter *it, struct pctrie_node **freenode)
+{
+	struct pctrie_node *child, *node, *parent;
+	uint64_t *m;
+	int slot;
+
+	DEBUG_POISON_POINTER(parent);
+	*freenode = NULL;
+	if (it->top >= 1) {
+		parent = (it->top >= 2) ? it->path[it->top - 2] : NULL;
+		node = it->path[it->top - 1];
+		slot = pctrie_slot(node, it->index);
+		child = pctrie_node_load(&node->pn_child[slot], NULL,
+		    PCTRIE_LOCKED);
+	} else {
+		node = NULL;
+		child = pctrie_root_load(it->ptree, NULL, PCTRIE_LOCKED);
+	}
+	m = pctrie_match_value(child, it->index);
+	if (m != NULL)
+		pctrie_remove(it->ptree, it->index, parent, node, freenode);
+	if (*freenode != NULL)
+		--it->top;
+	return (m);
+}
+
+/*
+ * Return the current leaf, assuming access is externally synchronized by a
+ * lock.
+ */
+uint64_t *
+pctrie_iter_value(struct pctrie_iter *it)
+{
+	struct pctrie_node *node;
+	int slot;
+
+	if (it->top == 0)
+		node = pctrie_root_load(it->ptree, NULL,
+		    PCTRIE_LOCKED);
+	else {
+		node = it->path[it->top - 1];
+		slot = pctrie_slot(node, it->index);
+		node = pctrie_node_load(&node->pn_child[slot], NULL,
+		    PCTRIE_LOCKED);
+	}
+	return (pctrie_toval(node));
+}
+
 /*
  * Walk the subtrie rooted at *pnode in order, invoking callback on leaves and
  * using the leftmost child pointer for path reversal, until an interior node
diff --git a/sys/sys/pctrie.h b/sys/sys/pctrie.h
index d06b533a54b7..4d5450c25079 100644
--- a/sys/sys/pctrie.h
+++ b/sys/sys/pctrie.h
@@ -240,6 +240,85 @@ name##_PCTRIE_RECLAIM_CALLBACK(struct pctrie *ptree,			\
 }									\
 									\
 static __inline __unused struct type *					\
+name##_PCTRIE_ITER_LOOKUP(struct pctrie_iter *it, uint64_t index)	\
+{									\
+	return name##_PCTRIE_VAL2PTR(pctrie_iter_lookup(it, index));	\
+}									\
+									\
+static __inline __unused struct type *					\
+name##_PCTRIE_ITER_STRIDE(struct pctrie_iter *it, int stride)		\
+{									\
+	return name##_PCTRIE_VAL2PTR(pctrie_iter_stride(it, stride));	\
+}									\
+									\
+static __inline __unused struct type *					\
+name##_PCTRIE_ITER_NEXT(struct pctrie_iter *it)				\
+{									\
+	return name##_PCTRIE_VAL2PTR(pctrie_iter_next(it));		\
+}									\
+									\
+static __inline __unused struct type *					\
+name##_PCTRIE_ITER_PREV(struct pctrie_iter *it)				\
+{									\
+	return name##_PCTRIE_VAL2PTR(pctrie_iter_prev(it));		\
+}									\
+									\
+static __inline __unused struct type *					\
+name##_PCTRIE_ITER_VALUE(struct pctrie_iter *it)			\
+{									\
+	return name##_PCTRIE_VAL2PTR(pctrie_iter_value(it));		\
+}									\
+									\
+static __inline __unused struct type *					\
+name##_PCTRIE_ITER_LOOKUP_GE(struct pctrie_iter *it, uint64_t index)	\
+{									\
+	return name##_PCTRIE_VAL2PTR(pctrie_iter_lookup_ge(it, index));	\
+}									\
+									\
+static __inline __unused struct type *					\
+name##_PCTRIE_ITER_JUMP_GE(struct pctrie_iter *it, int64_t jump)	\
+{									\
+	return name##_PCTRIE_VAL2PTR(pctrie_iter_jump_ge(it, jump));	\
+}									\
+									\
+static __inline __unused struct type *					\
+name##_PCTRIE_ITER_STEP_GE(struct pctrie_iter *it)			\
+{									\
+	return name##_PCTRIE_VAL2PTR(pctrie_iter_jump_ge(it, 1));	\
+}									\
+									\
+static __inline __unused struct type *					\
+name##_PCTRIE_ITER_LOOKUP_LE(struct pctrie_iter *it, uint64_t index)	\
+{									\
+	return name##_PCTRIE_VAL2PTR(pctrie_iter_lookup_le(it, index));	\
+}									\
+									\
+static __inline __unused struct type *					\
+name##_PCTRIE_ITER_JUMP_LE(struct pctrie_iter *it, int64_t jump)	\
+{									\
+	return name##_PCTRIE_VAL2PTR(pctrie_iter_jump_le(it, jump));	\
+}									\
+									\
+static __inline __unused struct type *					\
+name##_PCTRIE_ITER_STEP_LE(struct pctrie_iter *it)			\
+{									\
+	return name##_PCTRIE_VAL2PTR(pctrie_iter_jump_le(it, 1));	\
+}									\
+									\
+static __inline __unused void						\
+name##_PCTRIE_ITER_REMOVE(struct pctrie_iter *it)			\
+{									\
+	uint64_t *val;							\
+	struct pctrie_node *freenode;					\
+									\
+	val = pctrie_iter_remove(it, &freenode);			\
+	if (val == NULL)						\
+		panic("%s: key not found", __func__);			\
+	if (freenode != NULL)						\
+		freefn(it->ptree, freenode);				\
+}									\
+									\
+static __inline __unused struct type *					\
 name##_PCTRIE_REPLACE(struct pctrie *ptree, struct type *ptr)		\
 {									\
 									\
@@ -272,6 +351,7 @@ name##_PCTRIE_REMOVE_LOOKUP(struct pctrie *ptree, uint64_t key)		\
 	return name##_PCTRIE_VAL2PTR(val);				\
 }
 
+struct pctrie_iter;
 void		*pctrie_insert_lookup(struct pctrie *ptree, uint64_t *val,
 		    uint64_t **found_out);
 void		*pctrie_insert_lookup_gt(struct pctrie *ptree, uint64_t *val,
@@ -283,10 +363,22 @@ void		*pctrie_insert_lookup_strict(struct pctrie *ptree,
 void		pctrie_insert_node(void *parentp,
 		    struct pctrie_node *parent, uint64_t *val);
 uint64_t	*pctrie_lookup(struct pctrie *ptree, uint64_t key);
-uint64_t	*pctrie_lookup_ge(struct pctrie *ptree, uint64_t key);
-uint64_t	*pctrie_lookup_le(struct pctrie *ptree, uint64_t key);
 uint64_t	*pctrie_lookup_unlocked(struct pctrie *ptree, uint64_t key,
 		    smr_t smr);
+uint64_t	*pctrie_iter_lookup(struct pctrie_iter *it, uint64_t index);
+uint64_t	*pctrie_iter_stride(struct pctrie_iter *it, int stride);
+uint64_t	*pctrie_iter_next(struct pctrie_iter *it);
+uint64_t	*pctrie_iter_prev(struct pctrie_iter *it);
+uint64_t	*pctrie_lookup_ge(struct pctrie *ptree, uint64_t key);
+uint64_t	*pctrie_subtree_lookup_gt(struct pctrie_node *node,
+		    uint64_t key);
+uint64_t	*pctrie_iter_lookup_ge(struct pctrie_iter *it, uint64_t index);
+uint64_t	*pctrie_iter_jump_ge(struct pctrie_iter *it, int64_t jump);
+uint64_t	*pctrie_lookup_le(struct pctrie *ptree, uint64_t key);
+uint64_t	*pctrie_subtree_lookup_lt(struct pctrie_node *node,
+		    uint64_t key);
+uint64_t	*pctrie_iter_lookup_le(struct pctrie_iter *it, uint64_t index);
+uint64_t	*pctrie_iter_jump_le(struct pctrie_iter *it, int64_t jump);
 struct pctrie_node *pctrie_reclaim_begin(struct pctrie_node **pnode,
 		    struct pctrie *ptree);
 struct pctrie_node *pctrie_reclaim_resume(struct pctrie_node **pnode);
@@ -297,11 +389,10 @@ struct pctrie_node *pctrie_reclaim_resume_cb(struct pctrie_node **pnode,
 		    pctrie_cb_t callback, int keyoff, void *arg);
 uint64_t	*pctrie_remove_lookup(struct pctrie *ptree, uint64_t index,
 		    struct pctrie_node **killnode);
+uint64_t	*pctrie_iter_remove(struct pctrie_iter *it,
+		    struct pctrie_node **freenode);
+uint64_t	*pctrie_iter_value(struct pctrie_iter *it);
 uint64_t	*pctrie_replace(struct pctrie *ptree, uint64_t *newval);
-uint64_t	*pctrie_subtree_lookup_gt(struct pctrie_node *node,
-		    uint64_t key);
-uint64_t	*pctrie_subtree_lookup_lt(struct pctrie_node *node,
-		    uint64_t key);
 size_t		pctrie_node_size(void);
 int		pctrie_zone_init(void *mem, int size, int flags);
 
@@ -342,6 +433,37 @@ pctrie_is_empty(struct pctrie *ptree)
 #endif
 
 #define	PCTRIE_COUNT	(1 << PCTRIE_WIDTH)
+#define PCTRIE_LIMIT	howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH)
+
+struct pctrie_iter {
+	struct pctrie *ptree;
+	struct pctrie_node *path[PCTRIE_LIMIT];
+	uint64_t index;
+	uint64_t limit;
+	int top;
+};
+
+static __inline void
+pctrie_iter_reset(struct pctrie_iter *it)
+{
+	it->top = 0;
+}
+
+static __inline void
+pctrie_iter_init(struct pctrie_iter *it, struct pctrie *ptree)
+{
+	it->ptree = ptree;
+	it->top = 0;
+	it->limit = 0;
+}
+
+static __inline void
+pctrie_iter_limit_init(struct pctrie_iter *it, struct pctrie *ptree,
+    uint64_t limit)
+{
+	pctrie_iter_init(it, ptree);
+	it->limit = limit;
+}
 
 #endif /* _KERNEL */
 #endif /* !_SYS_PCTRIE_H_ */