git: 9f984fc683d7 - stable/14 - netmap: Make memory pools NUMA-aware

From: Mark Johnston <markj@FreeBSD.org>
Date: Mon, 28 Oct 2024 16:59:43 UTC
The branch stable/14 has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=9f984fc683d7a8df452411ba6f25b838d6a7ea81

commit 9f984fc683d7a8df452411ba6f25b838d6a7ea81
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2024-10-14 13:30:09 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2024-10-28 16:39:50 +0000

    netmap: Make memory pools NUMA-aware
    
    Each netmap adapter associated with a physical adapter is attached to a
    netmap memory pool.  contigmalloc() is used to allocate physically
    contiguous memory for the pool, but ideally we would ensure that all
    such memory is allocated from the NUMA domain local to the adapter.
    
    Augment netmap's memory pools with a NUMA domain ID, similar to how
    IOMMU groups are handled in the Linux port.  That is, when attaching to
    a physical adapter, ensure that the associated memory pools are local to
    the adapter's memory domain, creating new pools as needed.
    
    Some types of ifnets do not have any defined NUMA affinity; in this case
    the domain ID in question is the sentinel value -1.
    
    Add a sysctl, dev.netmap.port_numa_affinity, which can be used to enable
    the new behaviour.  Keep it disabled for now to avoid surprises in case
    netmap applications are relying on zero-copy optimizations to forward
    packets between ports belonging to different NUMA domains.
    
    Reviewed by:    vmaffione
    MFC after:      2 weeks
    Sponsored by:   Klara, Inc.
    Differential Revision:  https://reviews.freebsd.org/D46666
    
    (cherry picked from commit 1bae9dc584272dd75dc4e04cb5d73be0e9fb562a)
---
 share/man/man4/netmap.4      | 12 +++++++++-
 sys/dev/netmap/netmap.c      |  4 ++--
 sys/dev/netmap/netmap_kern.h | 27 ++++++++++++++++++---
 sys/dev/netmap/netmap_mem2.c | 56 +++++++++++++++++++++++++++++---------------
 sys/dev/netmap/netmap_mem2.h |  2 +-
 5 files changed, 75 insertions(+), 26 deletions(-)

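A usage note, not part of the commit itself: because the sysctl is declared
with CTLFLAG_RDTUN, it only takes effect when set as a boot-time tunable,
for example in /boot/loader.conf:

    # Prefer NUMA-local memory for netmap port memory pools.
    dev.netmap.port_numa_affinity="1"

Note also that DOMAINSET_PREF() expresses a preference rather than a strict
policy: if the adapter's local domain cannot satisfy a contiguous
allocation, memory may still come from a remote domain instead of the
allocation failing outright.
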
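On the zero-copy caveat: two ports can exchange buffers zero-copy only if
they are backed by the same memory region, which userspace can detect by
comparing the ports' memory IDs.  The sketch below is illustrative only
(it is not part of this commit) and assumes the libnetmap nmport API:

    /*
     * Report whether two netmap ports share a memory region, i.e. whether
     * zero-copy forwarding between them is possible.  Build with -lnetmap.
     */
    #include <stdio.h>
    #include <libnetmap.h>

    int
    main(int argc, char **argv)
    {
            struct nmport_d *p0, *p1;

            if (argc != 3) {
                    fprintf(stderr, "usage: %s netmap:if0 netmap:if1\n",
                        argv[0]);
                    return (1);
            }

            /* nmport_open() prints its own error messages by default. */
            p0 = nmport_open(argv[1]);
            p1 = nmport_open(argv[2]);
            if (p0 == NULL || p1 == NULL)
                    return (1);

            /* The kernel fills in nr_mem_id when a port is registered. */
            if (p0->reg.nr_mem_id == p1->reg.nr_mem_id)
                    printf("same memory region (id %u): zero-copy ok\n",
                        p0->reg.nr_mem_id);
            else
                    printf("different memory regions (%u vs %u): "
                        "forwarding needs copies\n",
                        p0->reg.nr_mem_id, p1->reg.nr_mem_id);

            nmport_close(p1);
            nmport_close(p0);
            return (0);
    }

With dev.netmap.port_numa_affinity=1, ports whose NICs sit in different
NUMA domains will report different memory IDs, so an application relying
on zero-copy forwarding between them should expect the second case.
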
diff --git a/share/man/man4/netmap.4 b/share/man/man4/netmap.4
index fd713f3c384b..e258b60e11f6 100644
--- a/share/man/man4/netmap.4
+++ b/share/man/man4/netmap.4
@@ -25,7 +25,7 @@
 .\" This document is derived in part from the enet man page (enet.4)
 .\" distributed with 4.3BSD Unix.
 .\"
-.Dd March 6, 2022
+.Dd October 10, 2024
 .Dt NETMAP 4
 .Os
 .Sh NAME
@@ -938,6 +938,16 @@ switches that can be created. This tunable can be specified
 at loader time.
 .It Va dev.netmap.ptnet_vnet_hdr: 1
 Allow ptnet devices to use virtio-net headers
+.It Va dev.netmap.port_numa_affinity: 0
+On
+.Xr numa 4
+systems, allocate memory for netmap ports from the local NUMA domain when
+possible.
+This can improve performance by reducing the number of remote memory accesses.
+However, when forwarding packets between ports attached to different NUMA
+domains, this will prevent zero-copy forwarding optimizations and thus may hurt
+performance.
+Note that this setting must be specified as a loader tunable at boot time.
 .El
 .Sh SYSTEM CALLS
 .Nm
diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c
index 832d0ecc0c6e..f531151fb656 100644
--- a/sys/dev/netmap/netmap.c
+++ b/sys/dev/netmap/netmap.c
@@ -4010,8 +4010,8 @@ netmap_attach_common(struct netmap_adapter *na)
 	na->active_fds = 0;
 
 	if (na->nm_mem == NULL) {
-		/* use iommu or global allocator */
-		na->nm_mem = netmap_mem_get_iommu(na);
+		/* select an allocator based on IOMMU and NUMA affinity */
+		na->nm_mem = netmap_mem_get_allocator(na);
 	}
 	if (na->nm_bdg_attach == NULL)
 		/* no special nm_bdg_attach callback. On VALE
diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h
index 8618aaf82299..22bd213d9c42 100644
--- a/sys/dev/netmap/netmap_kern.h
+++ b/sys/dev/netmap/netmap_kern.h
@@ -81,6 +81,7 @@
 
 #if defined(__FreeBSD__)
 #include <sys/selinfo.h>
+#include <vm/vm.h>
 
 #define likely(x)	__builtin_expect((long)!!(x), 1L)
 #define unlikely(x)	__builtin_expect((long)!!(x), 0L)
@@ -1726,10 +1727,30 @@ extern int netmap_generic_txqdisc;
 #define NM_IS_NATIVE(ifp)	(NM_NA_VALID(ifp) && NA(ifp)->nm_dtor == netmap_hw_dtor)
 
 #if defined(__FreeBSD__)
+extern int netmap_port_numa_affinity;
 
-/* Assigns the device IOMMU domain to an allocator.
- * Returns -ENOMEM in case the domain is different */
-#define nm_iommu_group_id(dev) (-1)
+static inline int
+nm_iommu_group_id(struct netmap_adapter *na)
+{
+	return (-1);
+}
+
+static inline int
+nm_numa_domain(struct netmap_adapter *na)
+{
+	int domain;
+
+	/*
+	 * If the system has only one NUMA domain, don't bother distinguishing
+	 * between IF_NODOM and domain 0.
+	 */
+	if (vm_ndomains == 1 || netmap_port_numa_affinity == 0)
+		return (-1);
+	domain = if_getnumadomain(na->ifp);
+	if (domain == IF_NODOM)
+		domain = -1;
+	return (domain);
+}
 
 /* Callback invoked by the dma machinery after a successful dmamap_load */
 static void netmap_dmamap_cb(__unused void *arg,
diff --git a/sys/dev/netmap/netmap_mem2.c b/sys/dev/netmap/netmap_mem2.c
index 1ba966e7666a..50a5d2bc50f2 100644
--- a/sys/dev/netmap/netmap_mem2.c
+++ b/sys/dev/netmap/netmap_mem2.c
@@ -37,8 +37,8 @@
 #endif /* __APPLE__ */
 
 #ifdef __FreeBSD__
-#include <sys/cdefs.h> /* prerequisite */
-#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/domainset.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>		/* MALLOC_DEFINE */
 #include <sys/proc.h>
@@ -174,7 +174,8 @@ struct netmap_mem_d {
 	struct netmap_obj_pool pools[NETMAP_POOLS_NR];
 
 	nm_memid_t nm_id;	/* allocator identifier */
-	int nm_grp;	/* iommu group id */
+	int nm_grp;		/* iommu group id */
+	int nm_numa_domain;	/* local NUMA domain */
 
 	/* list of all existing allocators, sorted by nm_id */
 	struct netmap_mem_d *prev, *next;
@@ -310,7 +311,7 @@ netmap_mem_rings_delete(struct netmap_adapter *na)
 
 static int netmap_mem_map(struct netmap_obj_pool *, struct netmap_adapter *);
 static int netmap_mem_unmap(struct netmap_obj_pool *, struct netmap_adapter *);
-static int nm_mem_check_group(struct netmap_mem_d *, bus_dma_tag_t);
+static int nm_mem_check_group(struct netmap_mem_d *, void *);
 static void nm_mem_release_id(struct netmap_mem_d *);
 
 nm_memid_t
@@ -576,6 +577,7 @@ struct netmap_mem_d nm_mem = {	/* Our memory allocator. */
 
 	.nm_id = 1,
 	.nm_grp = -1,
+	.nm_numa_domain = -1,
 
 	.prev = &nm_mem,
 	.next = &nm_mem,
@@ -615,6 +617,7 @@ static const struct netmap_mem_d nm_blueprint = {
 	},
 
 	.nm_grp = -1,
+	.nm_numa_domain = -1,
 
 	.flags = NETMAP_MEM_PRIVATE,
 
@@ -625,7 +628,6 @@ static const struct netmap_mem_d nm_blueprint = {
 
 #define STRINGIFY(x) #x
 
-
 #define DECLARE_SYSCTLS(id, name) \
 	SYSBEGIN(mem2_ ## name); \
 	SYSCTL_INT(_dev_netmap, OID_AUTO, name##_size, \
@@ -649,9 +651,14 @@ DECLARE_SYSCTLS(NETMAP_IF_POOL, if);
 DECLARE_SYSCTLS(NETMAP_RING_POOL, ring);
 DECLARE_SYSCTLS(NETMAP_BUF_POOL, buf);
 
+int netmap_port_numa_affinity = 0;
+SYSCTL_INT(_dev_netmap, OID_AUTO, port_numa_affinity,
+    CTLFLAG_RDTUN, &netmap_port_numa_affinity, 0,
+    "Use NUMA-local memory for memory pools when possible");
+
 /* call with nm_mem_list_lock held */
 static int
-nm_mem_assign_id_locked(struct netmap_mem_d *nmd, int grp_id)
+nm_mem_assign_id_locked(struct netmap_mem_d *nmd, int grp_id, int domain)
 {
 	nm_memid_t id;
 	struct netmap_mem_d *scan = netmap_last_mem_d;
@@ -666,6 +673,7 @@ nm_mem_assign_id_locked(struct netmap_mem_d *nmd, int grp_id)
 		if (id != scan->nm_id) {
 			nmd->nm_id = id;
 			nmd->nm_grp = grp_id;
+			nmd->nm_numa_domain = domain;
 			nmd->prev = scan->prev;
 			nmd->next = scan;
 			scan->prev->next = nmd;
@@ -688,7 +696,7 @@ nm_mem_assign_id(struct netmap_mem_d *nmd, int grp_id)
 	int ret;
 
 	NM_MTX_LOCK(nm_mem_list_lock);
-	ret = nm_mem_assign_id_locked(nmd, grp_id);
+	ret = nm_mem_assign_id_locked(nmd, grp_id, -1);
 	NM_MTX_UNLOCK(nm_mem_list_lock);
 
 	return ret;
@@ -728,7 +736,7 @@ netmap_mem_find(nm_memid_t id)
 }
 
 static int
-nm_mem_check_group(struct netmap_mem_d *nmd, bus_dma_tag_t dev)
+nm_mem_check_group(struct netmap_mem_d *nmd, void *dev)
 {
 	int err = 0, id;
 
@@ -1399,7 +1407,7 @@ netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int obj
 
 /* call with NMA_LOCK held */
 static int
-netmap_finalize_obj_allocator(struct netmap_obj_pool *p)
+netmap_finalize_obj_allocator(struct netmap_mem_d *nmd, struct netmap_obj_pool *p)
 {
 	int i; /* must be signed */
 	size_t n;
@@ -1442,8 +1450,16 @@ netmap_finalize_obj_allocator(struct netmap_obj_pool *p)
 		 * can live with standard malloc, because the hardware will not
 		 * access the pages directly.
 		 */
-		clust = contigmalloc(n, M_NETMAP, M_NOWAIT | M_ZERO,
-		    (size_t)0, -1UL, PAGE_SIZE, 0);
+		if (nmd->nm_numa_domain == -1) {
+			clust = contigmalloc(n, M_NETMAP,
+			    M_NOWAIT | M_ZERO, (size_t)0, -1UL, PAGE_SIZE, 0);
+		} else {
+			struct domainset *ds;
+
+			ds = DOMAINSET_PREF(nmd->nm_numa_domain);
+			clust = contigmalloc_domainset(n, M_NETMAP,
+			    ds, M_NOWAIT | M_ZERO, (size_t)0, -1UL, PAGE_SIZE, 0);
+		}
 		if (clust == NULL) {
 			/*
 			 * If we get here, there is a severe memory shortage,
@@ -1637,7 +1653,7 @@ netmap_mem_finalize_all(struct netmap_mem_d *nmd)
 	nmd->lasterr = 0;
 	nmd->nm_totalsize = 0;
 	for (i = 0; i < NETMAP_POOLS_NR; i++) {
-		nmd->lasterr = netmap_finalize_obj_allocator(&nmd->pools[i]);
+		nmd->lasterr = netmap_finalize_obj_allocator(nmd, &nmd->pools[i]);
 		if (nmd->lasterr)
 			goto error;
 		nmd->nm_totalsize += nmd->pools[i].memtotal;
@@ -1805,24 +1821,26 @@ netmap_mem_private_new(u_int txr, u_int txd, u_int rxr, u_int rxd,
 	return d;
 }
 
-/* Reference iommu allocator - find existing or create new,
- * for not hw addapeters fallback to global allocator.
+/* Reference IOMMU and NUMA local allocator - find existing or create new,
+ * for non-hw adapters, fall back to global allocator.
  */
 struct netmap_mem_d *
-netmap_mem_get_iommu(struct netmap_adapter *na)
+netmap_mem_get_allocator(struct netmap_adapter *na)
 {
-	int i, err, grp_id;
+	int i, domain, err, grp_id;
 	struct netmap_mem_d *nmd;
 
 	if (na == NULL || na->pdev == NULL)
 		return netmap_mem_get(&nm_mem);
 
-	grp_id = nm_iommu_group_id(na->pdev);
+	domain = nm_numa_domain(na);
+	grp_id = nm_iommu_group_id(na);
 
 	NM_MTX_LOCK(nm_mem_list_lock);
 	nmd = netmap_last_mem_d;
 	do {
-		if (!(nmd->flags & NETMAP_MEM_HIDDEN) && nmd->nm_grp == grp_id) {
+		if (!(nmd->flags & NETMAP_MEM_HIDDEN) &&
+		    nmd->nm_grp == grp_id && nmd->nm_numa_domain == domain) {
 			nmd->refcount++;
 			NM_DBG_REFC(nmd, __FUNCTION__, __LINE__);
 			NM_MTX_UNLOCK(nm_mem_list_lock);
@@ -1837,7 +1855,7 @@ netmap_mem_get_iommu(struct netmap_adapter *na)
 
 	*nmd = nm_blueprint;
 
-	err = nm_mem_assign_id_locked(nmd, grp_id);
+	err = nm_mem_assign_id_locked(nmd, grp_id, domain);
 	if (err)
 		goto error_free;
 
@@ -2881,7 +2899,7 @@ netmap_mem_pt_guest_create(nm_memid_t mem_id)
 	ptnmd->pt_ifs = NULL;
 
 	/* Assign new id in the guest (We have the lock) */
-	err = nm_mem_assign_id_locked(&ptnmd->up, -1);
+	err = nm_mem_assign_id_locked(&ptnmd->up, -1, -1);
 	if (err)
 		goto error;
 
diff --git a/sys/dev/netmap/netmap_mem2.h b/sys/dev/netmap/netmap_mem2.h
index 1681d5c7721f..0123b010e944 100644
--- a/sys/dev/netmap/netmap_mem2.h
+++ b/sys/dev/netmap/netmap_mem2.h
@@ -146,7 +146,7 @@ struct netmap_mem_d* netmap_mem_private_new( u_int txr, u_int txd, u_int rxr, u_
 #define netmap_mem_get(d) __netmap_mem_get(d, __FUNCTION__, __LINE__)
 #define netmap_mem_put(d) __netmap_mem_put(d, __FUNCTION__, __LINE__)
 struct netmap_mem_d* __netmap_mem_get(struct netmap_mem_d *, const char *, int);
-struct netmap_mem_d* netmap_mem_get_iommu(struct netmap_adapter *);
+struct netmap_mem_d* netmap_mem_get_allocator(struct netmap_adapter *);
 void __netmap_mem_put(struct netmap_mem_d *, const char *, int);
 struct netmap_mem_d* netmap_mem_find(nm_memid_t);
 unsigned netmap_mem_bufsize(struct netmap_mem_d *nmd);