git: 5e49311e0177 - releng/13.2 - epair: Simplify the transmit path and address lost wakeups

From: Mark Johnston <markj_at_FreeBSD.org>
Date: Wed, 15 Mar 2023 14:49:13 UTC
The branch releng/13.2 has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=5e49311e0177932e0f0323fd547eedb4e50bbb4d

commit 5e49311e0177932e0f0323fd547eedb4e50bbb4d
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2023-03-01 20:21:30 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2023-03-15 13:39:55 +0000

    epair: Simplify the transmit path and address lost wakeups
    
    epairs currently shuttle all transmitted packets through a single global
    taskqueue thread.  To hand packets over to the taskqueue thread, each
    epair maintains a pair of ring buffers and a lockless scheme for
    notifying the thread of pending work.  The implementation can lead to
    lost wakeups, causing to-be-transmitted packets to end up stuck in the
    queue.
    
    Rather than extending the existing scheme, simply replace it with a
    linked list protected by a mutex, and use the mutex to synchronize
    wakeups of the taskqueue thread.  This appears to give equivalent or
    better throughput with >= 16 producer threads and eliminates the lost
    wakeups.
    
    Approved by:    re (cperciva)
    Reviewed by:    kp
    MFC after:      1 week
    Sponsored by:   Klara, Inc.
    Sponsored by:   Modirum MDPay
    Differential Revision:  https://reviews.freebsd.org/D38843
    
    (cherry picked from commit df7bbd8c354a907d2c2f85a6e18f356f76458f57)
    (cherry picked from commit 762ad964ee346cffdbf3eaa6ff87fa5b32d30738)
---
 sys/net/if_epair.c | 162 ++++++++++++++++++++++++++---------------------------
 1 file changed, 79 insertions(+), 83 deletions(-)

diff --git a/sys/net/if_epair.c b/sys/net/if_epair.c
index e2c61f83d940..aebcaca0cd2f 100644
--- a/sys/net/if_epair.c
+++ b/sys/net/if_epair.c
@@ -104,15 +104,16 @@ static unsigned int next_index = 0;
 #define	EPAIR_LOCK()			mtx_lock(&epair_n_index_mtx)
 #define	EPAIR_UNLOCK()			mtx_unlock(&epair_n_index_mtx)
 
-#define BIT_QUEUE_TASK		0
-#define BIT_MBUF_QUEUED		1
-
 struct epair_softc;
 struct epair_queue {
+	struct mtx		 mtx;
+	struct mbufq		 q;
 	int			 id;
-	struct buf_ring		*rxring[2];
-	volatile int		 ridx;		/* 0 || 1 */
-	volatile long		 state;		/* taskqueue coordination */
+	enum {
+		EPAIR_QUEUE_IDLE,
+		EPAIR_QUEUE_WAKING,
+		EPAIR_QUEUE_RUNNING,
+	}			 state;
 	struct task		 tx_task;
 	struct epair_softc	*sc;
 };
@@ -148,44 +149,49 @@ epair_clear_mbuf(struct mbuf *m)
 }
 
 static void
-epair_if_input(struct epair_softc *sc, struct epair_queue *q, int ridx)
+epair_tx_start_deferred(void *arg, int pending)
 {
-	struct ifnet *ifp;
-	struct mbuf *m;
+	struct epair_queue *q = (struct epair_queue *)arg;
+	if_t ifp;
+	struct mbuf *m, *n;
+	bool resched;
+
+	ifp = q->sc->ifp;
 
-	ifp = sc->ifp;
+	if_ref(ifp);
 	CURVNET_SET(ifp->if_vnet);
-	while (! buf_ring_empty(q->rxring[ridx])) {
-		m = buf_ring_dequeue_mc(q->rxring[ridx]);
-		if (m == NULL)
-			continue;
 
-		MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
-		(*ifp->if_input)(ifp, m);
+	mtx_lock(&q->mtx);
+	m = mbufq_flush(&q->q);
+	q->state = EPAIR_QUEUE_RUNNING;
+	mtx_unlock(&q->mtx);
 
+	while (m != NULL) {
+		n = STAILQ_NEXT(m, m_stailqpkt);
+		m->m_nextpkt = NULL;
+		if_input(ifp, m);
+		m = n;
 	}
-	CURVNET_RESTORE();
-}
 
-static void
-epair_tx_start_deferred(void *arg, int pending)
-{
-	struct epair_queue *q = (struct epair_queue *)arg;
-	struct epair_softc *sc = q->sc;
-	int ridx, nidx;
-
-	if_ref(sc->ifp);
-	ridx = atomic_load_int(&q->ridx);
-	do {
-		nidx = (ridx == 0) ? 1 : 0;
-	} while (!atomic_fcmpset_int(&q->ridx, &ridx, nidx));
-	epair_if_input(sc, q, ridx);
-
-	atomic_clear_long(&q->state, (1 << BIT_QUEUE_TASK));
-	if (atomic_testandclear_long(&q->state, BIT_MBUF_QUEUED))
+	/*
+	 * Avoid flushing the queue more than once per task.  We can otherwise
+	 * end up starving ourselves in a multi-epair routing configuration.
+	 */
+	mtx_lock(&q->mtx);
+	if (mbufq_len(&q->q) > 0) {
+		resched = true;
+		q->state = EPAIR_QUEUE_WAKING;
+	} else {
+		resched = false;
+		q->state = EPAIR_QUEUE_IDLE;
+	}
+	mtx_unlock(&q->mtx);
+
+	if (resched)
 		taskqueue_enqueue(epair_tasks.tq[q->id], &q->tx_task);
 
-	if_rele(sc->ifp);
+	CURVNET_RESTORE();
+	if_rele(ifp);
 }
 
 static struct epair_queue *
@@ -239,9 +245,9 @@ epair_prepare_mbuf(struct mbuf *m, struct ifnet *src_ifp)
 static void
 epair_menq(struct mbuf *m, struct epair_softc *osc)
 {
+	struct epair_queue *q;
 	struct ifnet *ifp, *oifp;
-	int len, ret;
-	int ridx;
+	int error, len;
 	bool mcast;
 
 	/*
@@ -257,32 +263,26 @@ epair_menq(struct mbuf *m, struct epair_softc *osc)
 	len = m->m_pkthdr.len;
 	mcast = (m->m_flags & (M_BCAST | M_MCAST)) != 0;
 
-	struct epair_queue *q = epair_select_queue(osc, m);
+	q = epair_select_queue(osc, m);
 
-	atomic_set_long(&q->state, (1 << BIT_MBUF_QUEUED));
-	ridx = atomic_load_int(&q->ridx);
-	ret = buf_ring_enqueue(q->rxring[ridx], m);
-	if (ret != 0) {
-		/* Ring is full. */
-		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
-		m_freem(m);
-		return;
+	mtx_lock(&q->mtx);
+	if (q->state == EPAIR_QUEUE_IDLE) {
+		q->state = EPAIR_QUEUE_WAKING;
+		taskqueue_enqueue(epair_tasks.tq[q->id], &q->tx_task);
 	}
+	error = mbufq_enqueue(&q->q, m);
+	mtx_unlock(&q->mtx);
 
-	if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
-	/*
-	 * IFQ_HANDOFF_ADJ/ip_handoff() update statistics,
-	 * but as we bypass all this we have to duplicate
-	 * the logic another time.
-	 */
-	if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
-	if (mcast)
-		if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
-	/* Someone else received the packet. */
-	if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1);
-
-	if (!atomic_testandset_long(&q->state, BIT_QUEUE_TASK))
-		taskqueue_enqueue(epair_tasks.tq[q->id], &q->tx_task);
+	if (error != 0) {
+		m_freem(m);
+		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
+	} else {
+		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
+		if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
+		if (mcast)
+			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
+		if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1);
+	}
 }
 
 static void
@@ -561,10 +561,9 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
 	for (int i = 0; i < sca->num_queues; i++) {
 		struct epair_queue *q = &sca->queues[i];
 		q->id = i;
-		q->rxring[0] = buf_ring_alloc(RXRSIZE, M_EPAIR, M_WAITOK, NULL);
-		q->rxring[1] = buf_ring_alloc(RXRSIZE, M_EPAIR, M_WAITOK, NULL);
-		q->ridx = 0;
-		q->state = 0;
+		q->state = EPAIR_QUEUE_IDLE;
+		mtx_init(&q->mtx, "epaiq", NULL, MTX_DEF | MTX_NEW);
+		mbufq_init(&q->q, RXRSIZE);
 		q->sc = sca;
 		NET_TASK_INIT(&q->tx_task, 0, epair_tx_start_deferred, q);
 	}
@@ -584,10 +583,9 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
 	for (int i = 0; i < scb->num_queues; i++) {
 		struct epair_queue *q = &scb->queues[i];
 		q->id = i;
-		q->rxring[0] = buf_ring_alloc(RXRSIZE, M_EPAIR, M_WAITOK, NULL);
-		q->rxring[1] = buf_ring_alloc(RXRSIZE, M_EPAIR, M_WAITOK, NULL);
-		q->ridx = 0;
-		q->state = 0;
+		q->state = EPAIR_QUEUE_IDLE;
+		mtx_init(&q->mtx, "epaiq", NULL, MTX_DEF | MTX_NEW);
+		mbufq_init(&q->q, RXRSIZE);
 		q->sc = scb;
 		NET_TASK_INIT(&q->tx_task, 0, epair_tx_start_deferred, q);
 	}
@@ -707,18 +705,18 @@ epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
 static void
 epair_drain_rings(struct epair_softc *sc)
 {
-	int ridx;
-	struct mbuf *m;
+	for (int i = 0; i < sc->num_queues; i++) {
+		struct epair_queue *q;
+		struct mbuf *m, *n;
 
-	for (ridx = 0; ridx < 2; ridx++) {
-		for (int i = 0; i < sc->num_queues; i++) {
-			struct epair_queue *q = &sc->queues[i];
-			do {
-				m = buf_ring_dequeue_sc(q->rxring[ridx]);
-				if (m == NULL)
-					break;
-				m_freem(m);
-			} while (1);
+		q = &sc->queues[i];
+		mtx_lock(&q->mtx);
+		m = mbufq_flush(&q->q);
+		mtx_unlock(&q->mtx);
+
+		for (; m != NULL; m = n) {
+			n = m->m_nextpkt;
+			m_freem(m);
 		}
 	}
 }
@@ -764,8 +762,7 @@ epair_clone_destroy(struct if_clone *ifc, struct ifnet *ifp)
 	ifmedia_removeall(&scb->media);
 	for (int i = 0; i < scb->num_queues; i++) {
 		struct epair_queue *q = &scb->queues[i];
-		buf_ring_free(q->rxring[0], M_EPAIR);
-		buf_ring_free(q->rxring[1], M_EPAIR);
+		mtx_destroy(&q->mtx);
 	}
 	free(scb->queues, M_EPAIR);
 	free(scb, M_EPAIR);
@@ -776,8 +773,7 @@ epair_clone_destroy(struct if_clone *ifc, struct ifnet *ifp)
 	ifmedia_removeall(&sca->media);
 	for (int i = 0; i < sca->num_queues; i++) {
 		struct epair_queue *q = &sca->queues[i];
-		buf_ring_free(q->rxring[0], M_EPAIR);
-		buf_ring_free(q->rxring[1], M_EPAIR);
+		mtx_destroy(&q->mtx);
 	}
 	free(sca->queues, M_EPAIR);
 	free(sca, M_EPAIR);