git: ce12afaa6fff - main - netmap: Fix queue stalls with generic interfaces

From: Mark Johnston <markj@FreeBSD.org>
Date: Wed, 05 Apr 2023 16:19:46 UTC
The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=ce12afaa6fff46c9027eb6b2bd515a4e46ecefc9

commit ce12afaa6fff46c9027eb6b2bd515a4e46ecefc9
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2023-04-05 16:12:30 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2023-04-05 16:12:30 +0000

    netmap: Fix queue stalls with generic interfaces
    
    In emulated mode, the FreeBSD netmap port attempts to perform zero-copy
    transmission.  This works as follows: the kernel ring is populated with
    mbuf headers to which netmap buffers are attached.  When transmitting,
    the mbuf refcount is initialized to 2, and when the counter value has
    been decremented to 1, netmap infers that the driver has freed the mbuf
    and that transmission is therefore complete.
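    
    In outline, the handshake looks like this (a condensed sketch; locking
    and error handling are omitted, and if_transmit() stands in here for
    the interface's transmit routine):
    
        /* Transmit path: hand the mbuf to the driver holding two refs. */
        SET_MBUF_REFCNT(m, 2);
        error = if_transmit(ifp, m);  /* driver eventually calls m_freem() */
    
        /* Reclaim path: a refcount of 1 means the driver dropped its ref. */
        if (MBUF_REFCNT(m) != 1)
                break;                /* still in flight, stop cleaning */
        /* ...otherwise the slot and its mbuf can be recycled. */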
    
    This scheme does not generalize to the case where netmap attaches to a
    software interface that may transmit packets among multiple "queues",
    as is the case with bridge or lagg interfaces.  In
    that case, we would be relying on backing hardware drivers to free
    transmitted mbufs promptly, but this isn't guaranteed; a driver may
    reasonably defer freeing a small number of transmitted buffers
    indefinitely.  If such a buffer ends up at the tail of a netmap transmit
    ring, further transmits can be blocked indefinitely.
    
    Fix the problem by removing the zero-copy scheme (which is also not
    implemented in the Linux port of netmap).  Instead, the kernel ring is
    populated with regular mbuf clusters into which netmap buffers are
    copied by nm_os_generic_xmit_frame().  The refcounting scheme is
    preserved, and this lets us avoid allocating a fresh cluster per
    transmitted packet in the common case.  If the transmit ring is full, a
    callout is used to free the "stuck" mbuf, avoiding the queue deadlock
    described above.
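    
    Condensed, the unstick mechanism added below works as follows: the
    callout is armed when a transmit event is attached to a still-busy
    slot, and if the driver has not released the mbuf by the time it
    fires, the handler drops the event so that txsync can make progress:
    
        /* Arm when a transmit event is set on a busy slot. */
        callout_reset_sbt_curcpu(&kring->tx_event_callout, SBT_1MS, 0,
            generic_tx_callout, kring, 0);
    
        /* Handler: entered with tx_event_lock held (CALLOUT_RETURNUNLOCKED). */
        static void
        generic_tx_callout(void *arg)
        {
                struct netmap_kring *kring = arg;
    
                kring->tx_event = NULL;
                mtx_unlock_spin(&kring->tx_event_lock);
                netmap_generic_irq(kring->na, kring->ring_id, NULL);
        }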
    
    Furthermore, when recycling mbuf clusters, be sure to fully reinitialize
    the mbuf header instead of simply re-setting M_PKTHDR.  Some software
    interfaces, like if_vlan, may set fields in the header which should be
    reset before the mbuf is reused.
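    
    Condensed from nm_os_mbuf_reinit() in the change below: the recycled
    cluster is kept while the header is rebuilt from scratch:
    
        buf = m->m_ext.ext_buf;
        m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR);   /* reset the whole header */
        m_extadd(m, buf, MCLBYTES, nm_generic_mbuf_dtor, NULL, NULL, 0,
            EXT_NET_DRV);                         /* reattach the cluster */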
    
    Reviewed by:    vmaffione
    MFC after:      1 month
    Sponsored by:   Zenarmor
    Sponsored by:   OPNsense
    Sponsored by:   Klara, Inc.
    Differential Revision:  https://reviews.freebsd.org/D38065
---
 sys/dev/netmap/netmap_freebsd.c | 35 ++++++++++++++++---------
 sys/dev/netmap/netmap_generic.c | 58 +++++++++++++++++++++++++++++++++--------
 sys/dev/netmap/netmap_kern.h    | 53 ++++++++++++++++++++++++++-----------
 3 files changed, 107 insertions(+), 39 deletions(-)

diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c
index 4b3b1a6edacd..e67d26d6788f 100644
--- a/sys/dev/netmap/netmap_freebsd.c
+++ b/sys/dev/netmap/netmap_freebsd.c
@@ -387,15 +387,20 @@ nm_os_catch_tx(struct netmap_generic_adapter *gna, int intercept)
  * addr and len identify the netmap buffer, m is the (preallocated)
  * mbuf to use for transmissions.
  *
- * We should add a reference to the mbuf so the m_freem() at the end
- * of the transmission does not consume resources.
+ * Zero-copy transmission is possible if netmap is attached directly to a
+ * hardware interface: when cleaning we simply wait for the mbuf cluster
+ * refcount to decrement to 1, indicating that the driver has completed
+ * transmission and is done with the buffer.  However, this approach can
+ * lead to queue deadlocks when attaching to software interfaces (e.g.,
+ * if_bridge) since we cannot rely on member ports to promptly reclaim
+ * transmitted mbufs.  Since there is no easy way to distinguish these
+ * cases, we currently always copy the buffer.
  *
- * On FreeBSD, and on multiqueue cards, we can force the queue using
+ * On multiqueue cards, we can force the queue using
  *      if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
  *              i = m->m_pkthdr.flowid % adapter->num_queues;
  *      else
  *              i = curcpu % adapter->num_queues;
- *
  */
 int
 nm_os_generic_xmit_frame(struct nm_os_gen_arg *a)
@@ -405,16 +410,21 @@ nm_os_generic_xmit_frame(struct nm_os_gen_arg *a)
 	if_t ifp = a->ifp;
 	struct mbuf *m = a->m;
 
-	/* Link the external storage to
-	 * the netmap buffer, so that no copy is necessary. */
-	m->m_ext.ext_buf = m->m_data = a->addr;
-	m->m_ext.ext_size = len;
+	M_ASSERTPKTHDR(m);
+	KASSERT((m->m_flags & M_EXT) != 0,
+	    ("%s: mbuf %p has no cluster", __func__, m));
 
-	m->m_flags |= M_PKTHDR;
-	m->m_len = m->m_pkthdr.len = len;
+	if (MBUF_REFCNT(m) != 1) {
+		nm_prerr("invalid refcnt %d for %p", MBUF_REFCNT(m), m);
+		panic("in generic_xmit_frame");
+	}
+	if (unlikely(m->m_ext.ext_size < len)) {
+		nm_prlim(2, "size %d < len %d", m->m_ext.ext_size, len);
+		len = m->m_ext.ext_size;
+	}
 
-	/* mbuf refcnt is not contended, no need to use atomic
-	 * (a memory barrier is enough). */
+	m_copyback(m, 0, len, a->addr);
+	m->m_len = m->m_pkthdr.len = len;
 	SET_MBUF_REFCNT(m, 2);
 	M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
 	m->m_pkthdr.flowid = a->ring_nr;
@@ -425,7 +435,6 @@ nm_os_generic_xmit_frame(struct nm_os_gen_arg *a)
 	return ret ? -1 : 0;
 }
 
-
 struct netmap_adapter *
 netmap_getna(if_t ifp)
 {
diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c
index 77c56c74df65..f1c4fdd8b6ea 100644
--- a/sys/dev/netmap/netmap_generic.c
+++ b/sys/dev/netmap/netmap_generic.c
@@ -272,6 +272,7 @@ generic_netmap_unregister(struct netmap_adapter *na)
 		}
 
 		for_each_tx_kring(r, kring, na) {
+			callout_drain(&kring->tx_event_callout);
 			mtx_destroy(&kring->tx_event_lock);
 			if (kring->tx_pool == NULL) {
 				continue;
@@ -357,6 +358,9 @@ generic_netmap_register(struct netmap_adapter *na, int enable)
 			}
 			mtx_init(&kring->tx_event_lock, "tx_event_lock",
 				 NULL, MTX_SPIN);
+			callout_init_mtx(&kring->tx_event_callout,
+					 &kring->tx_event_lock,
+					 CALLOUT_RETURNUNLOCKED);
 		}
 	}
 
@@ -430,7 +434,7 @@ out:
  * the NIC notifies the driver that transmission is completed.
  */
 static void
-generic_mbuf_destructor(struct mbuf *m)
+generic_mbuf_dtor(struct mbuf *m)
 {
 	struct netmap_adapter *na = NA(GEN_TX_MBUF_IFP(m));
 	struct netmap_kring *kring;
@@ -473,7 +477,14 @@ generic_mbuf_destructor(struct mbuf *m)
 		if (++r == na->num_tx_rings) r = 0;
 
 		if (r == r_orig) {
+#ifndef __FreeBSD__
+			/*
+			 * On FreeBSD this situation can arise if the tx_event
+			 * callout handler cleared a stuck packet.
+			 */
 			nm_prlim(1, "Cannot match event %p", m);
+#endif
+			nm_generic_mbuf_dtor(m);
 			return;
 		}
 	}
@@ -481,9 +492,7 @@ generic_mbuf_destructor(struct mbuf *m)
 	/* Second, wake up clients. They will reclaim the event through
 	 * txsync. */
 	netmap_generic_irq(na, r, NULL);
-#ifdef __FreeBSD__
-	void_mbuf_dtor(m);
-#endif
+	nm_generic_mbuf_dtor(m);
 }
 
 /* Record completed transmissions and update hwtail.
@@ -537,7 +546,6 @@ generic_netmap_tx_clean(struct netmap_kring *kring, int txqdisc)
 				}
 				/* The event has been consumed, we can go
 				 * ahead. */
-
 			} else if (MBUF_REFCNT(m) != 1) {
 				/* This mbuf is still busy: its refcnt is 2. */
 				break;
@@ -577,6 +585,18 @@ ring_middle(u_int inf, u_int sup, u_int lim)
 	return e;
 }
 
+#ifdef __FreeBSD__
+static void
+generic_tx_callout(void *arg)
+{
+	struct netmap_kring *kring = arg;
+
+	kring->tx_event = NULL;
+	mtx_unlock_spin(&kring->tx_event_lock);
+	netmap_generic_irq(kring->na, kring->ring_id, NULL);
+}
+#endif
+
 static void
 generic_set_tx_event(struct netmap_kring *kring, u_int hwcur)
 {
@@ -620,8 +640,24 @@ generic_set_tx_event(struct netmap_kring *kring, u_int hwcur)
 		return;
 	}
 
-	SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor);
+	SET_MBUF_DESTRUCTOR(m, generic_mbuf_dtor);
 	kring->tx_event = m;
+#ifdef __FreeBSD__
+	/*
+	 * Handle the possibility that the transmitted buffer isn't reclaimed
+	 * within a bounded period of time.  This can arise when transmitting
+	 * out of multiple ports via a lagg or bridge interface, since the
+	 * member ports may legitimately only free transmitted buffers in
+	 * batches.
+	 *
+	 * The callout handler clears the stuck packet from the ring, allowing
+	 * transmission to proceed.  In the common case we let
+	 * generic_mbuf_dtor() unstick the ring, allowing mbufs to be
+	 * reused most of the time.
+	 */
+	callout_reset_sbt_curcpu(&kring->tx_event_callout, SBT_1MS, 0,
+	    generic_tx_callout, kring, 0);
+#endif
 	mtx_unlock_spin(&kring->tx_event_lock);
 
 	kring->tx_pool[e] = NULL;
@@ -631,10 +667,8 @@ generic_set_tx_event(struct netmap_kring *kring, u_int hwcur)
 	/* Decrement the refcount. This will free it if we lose the race
 	 * with the driver. */
 	m_freem(m);
-	smp_mb();
 }
 
-
 /*
  * generic_netmap_txsync() transforms netmap buffers into mbufs
  * and passes them to the standard device driver
@@ -712,6 +746,8 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags)
 					break;
 				}
 				IFRATE(rate_ctx.new.txrepl++);
+			} else {
+				nm_os_mbuf_reinit(m);
 			}
 
 			a.m = m;
@@ -783,9 +819,6 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags)
 #endif
 	}
 
-	/*
-	 * Second, reclaim completed buffers
-	 */
 	if (!gna->txqdisc && (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring))) {
 		/* No more available slots? Set a notification event
 		 * on a netmap slot that will be cleaned in the future.
@@ -795,6 +828,9 @@ generic_netmap_txsync(struct netmap_kring *kring, int flags)
 		generic_set_tx_event(kring, nm_i);
 	}
 
+	/*
+	 * Second, reclaim completed buffers
+	 */
 	generic_netmap_tx_clean(kring, gna->txqdisc);
 
 	return 0;
diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h
index 7c68c79c61ef..04b9c199e1dc 100644
--- a/sys/dev/netmap/netmap_kern.h
+++ b/sys/dev/netmap/netmap_kern.h
@@ -501,6 +501,9 @@ struct netmap_kring {
 	struct mbuf	**tx_pool;
 	struct mbuf	*tx_event;	/* TX event used as a notification */
 	NM_LOCK_T	tx_event_lock;	/* protects the tx_event mbuf */
+#ifdef __FreeBSD__
+	struct callout	tx_event_callout;
+#endif
 	struct mbq	rx_queue;       /* intercepted rx mbufs. */
 
 	uint32_t	users;		/* existing bindings for this ring */
@@ -2381,39 +2384,59 @@ ptnet_sync_tail(struct nm_csb_ktoa *ktoa, struct netmap_kring *kring)
  *
  * We allocate mbufs with m_gethdr(), since the mbuf header is needed
  * by the driver. We also attach a customly-provided external storage,
- * which in this case is a netmap buffer. When calling m_extadd(), however
- * we pass a NULL address, since the real address (and length) will be
- * filled in by nm_os_generic_xmit_frame() right before calling
- * if_transmit().
+ * which in this case is a netmap buffer.
  *
  * The dtor function does nothing, however we need it since mb_free_ext()
  * has a KASSERT(), checking that the mbuf dtor function is not NULL.
  */
 
-static void void_mbuf_dtor(struct mbuf *m) { }
+static inline void
+nm_generic_mbuf_dtor(struct mbuf *m)
+{
+	uma_zfree(zone_clust, m->m_ext.ext_buf);
+}
 
 #define SET_MBUF_DESTRUCTOR(m, fn)	do {		\
 	(m)->m_ext.ext_free = (fn != NULL) ?		\
-	    (void *)fn : (void *)void_mbuf_dtor;	\
+	    (void *)fn : (void *)nm_generic_mbuf_dtor;	\
 } while (0)
 
 static inline struct mbuf *
-nm_os_get_mbuf(if_t ifp, int len)
+nm_os_get_mbuf(if_t ifp __unused, int len)
 {
 	struct mbuf *m;
+	void *buf;
 
-	(void)ifp;
-	(void)len;
+	KASSERT(len <= MCLBYTES, ("%s: len %d", __func__, len));
 
 	m = m_gethdr(M_NOWAIT, MT_DATA);
-	if (m == NULL) {
-		return m;
+	if (__predict_false(m == NULL))
+		return (NULL);
+	buf = uma_zalloc(zone_clust, M_NOWAIT);
+	if (__predict_false(buf == NULL)) {
+		m_free(m);
+		return (NULL);
 	}
+	m_extadd(m, buf, MCLBYTES, nm_generic_mbuf_dtor, NULL, NULL, 0,
+	    EXT_NET_DRV);
+	return (m);
+}
 
-	m_extadd(m, NULL /* buf */, 0 /* size */, void_mbuf_dtor,
-		 NULL, NULL, 0, EXT_NET_DRV);
-
-	return m;
+static inline void
+nm_os_mbuf_reinit(struct mbuf *m)
+{
+	void *buf;
+
+	KASSERT((m->m_flags & M_EXT) != 0,
+	    ("%s: mbuf %p has no external storage", __func__, m));
+	KASSERT(m->m_ext.ext_size == MCLBYTES,
+	    ("%s: mbuf %p has wrong external storage size %u", __func__, m,
+	    m->m_ext.ext_size));
+
+	buf = m->m_ext.ext_buf;
+	m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR);
+	m_extadd(m, buf, MCLBYTES, nm_generic_mbuf_dtor, NULL, NULL, 0,
+	    EXT_NET_DRV);
 }
 
 #endif /* __FreeBSD__ */