git: dabb3db7a817 - main - xen/netfront: deal with mbuf data crossing a page boundary

From: Roger Pau Monné <royger_at_FreeBSD.org>
Date: Thu, 03 Nov 2022 12:37:34 UTC
The branch main has been updated by royger:

URL: https://cgit.FreeBSD.org/src/commit/?id=dabb3db7a817f003af3f89c965ba369c67fc4910

commit dabb3db7a817f003af3f89c965ba369c67fc4910
Author:     Roger Pau Monné <royger@FreeBSD.org>
AuthorDate: 2022-11-03 12:29:22 +0000
Commit:     Roger Pau Monné <royger@FreeBSD.org>
CommitDate: 2022-11-03 12:32:21 +0000

    xen/netfront: deal with mbuf data crossing a page boundary
    
    There's been a report recently of mbufs with data that crosses a page
    boundary. It seems those mbufs are generated by the iSCSI target
    system:
    
    https://lists.xenproject.org/archives/html/xen-devel/2021-12/msg01581.html
    
    In order to handle those mbufs correctly on netfront use the bus_dma
    interface and explicitly request that segments must not cross a page
    boundary. No other requirements are necessary, so it's expected that
    bus_dma won't need to bounce the data and hence it shouldn't
    introduce a too big performance penalty.
    
    Using bus_dma requires some changes to netfront, mainly in order to
    accommodate for the fact that now ring slots no longer have a 1:1
    match with mbufs, as a single mbuf can use two ring slots if the data
    buffer crosses a page boundary. Store the first packet of the mbuf
    chain in every ring slot that's used, and use a mbuf tag in order to
    store the bus_dma related structures and a refcount to keep track of
    the pending slots before the mbuf chain can be freed.
    
    Reported by: G.R.
    Tested by: G.R.
    MFC: 1 week
    Differential revision: https://reviews.freebsd.org/D33876
---
 sys/dev/xen/netfront/netfront.c | 194 ++++++++++++++++++++++++++++------------
 1 file changed, 138 insertions(+), 56 deletions(-)

diff --git a/sys/dev/xen/netfront/netfront.c b/sys/dev/xen/netfront/netfront.c
index fc151453a49a..cfb6172dbe19 100644
--- a/sys/dev/xen/netfront/netfront.c
+++ b/sys/dev/xen/netfront/netfront.c
@@ -71,6 +71,8 @@ __FBSDID("$FreeBSD$");
 #include <contrib/xen/io/netif.h>
 #include <xen/xenbus/xenbusvar.h>
 
+#include <machine/bus.h>
+
 #include "xenbus_if.h"
 
 /* Features supported by all backends.  TSO and LRO can be negotiated */
@@ -126,7 +128,6 @@ static void xn_release_tx_bufs(struct netfront_txq *);
 static void xn_rxq_intr(struct netfront_rxq *);
 static void xn_txq_intr(struct netfront_txq *);
 static void xn_intr(void *);
-static inline int xn_count_frags(struct mbuf *m);
 static int xn_assemble_tx_request(struct netfront_txq *, struct mbuf *);
 static int xn_ioctl(struct ifnet *, u_long, caddr_t);
 static void xn_ifinit_locked(struct netfront_info *);
@@ -199,6 +200,17 @@ struct netfront_txq {
 	struct taskqueue 	*tq;
 	struct task       	defrtask;
 
+	bus_dma_segment_t	segs[MAX_TX_REQ_FRAGS];
+	struct mbuf_xennet {
+		struct m_tag 	tag;
+		bus_dma_tag_t	dma_tag;
+		bus_dmamap_t	dma_map;
+		struct netfront_txq *txq;
+		SLIST_ENTRY(mbuf_xennet) next;
+		u_int 		count;
+	}			xennet_tag[NET_TX_RING_SIZE + 1];
+	SLIST_HEAD(, mbuf_xennet) tags;
+
 	bool			full;
 };
 
@@ -221,6 +233,8 @@ struct netfront_info {
 
 	struct ifmedia		sc_media;
 
+	bus_dma_tag_t		dma_tag;
+
 	bool			xn_reset;
 };
 
@@ -301,6 +315,42 @@ xn_get_rx_ref(struct netfront_rxq *rxq, RING_IDX ri)
 	return (ref);
 }
 
+#define MTAG_COOKIE 1218492000
+#define MTAG_XENNET 0
+
+static void mbuf_grab(struct mbuf *m)
+{
+	struct mbuf_xennet *ref;
+
+	ref = (struct mbuf_xennet *)m_tag_locate(m, MTAG_COOKIE,
+	    MTAG_XENNET, NULL);
+	KASSERT(ref != NULL, ("Cannot find refcount"));
+	ref->count++;
+}
+
+static void mbuf_release(struct mbuf *m)
+{
+	struct mbuf_xennet *ref;
+
+	ref = (struct mbuf_xennet *)m_tag_locate(m, MTAG_COOKIE,
+	    MTAG_XENNET, NULL);
+	KASSERT(ref != NULL, ("Cannot find refcount"));
+	KASSERT(ref->count > 0, ("Invalid reference count"));
+
+	if (--ref->count == 0)
+		m_freem(m);
+}
+
+static void tag_free(struct m_tag *t)
+{
+	struct mbuf_xennet *ref = (struct mbuf_xennet *)t;
+
+	KASSERT(ref->count == 0, ("Free mbuf tag with pending refcnt"));
+	bus_dmamap_sync(ref->dma_tag, ref->dma_map, BUS_DMASYNC_POSTWRITE);
+	bus_dmamap_destroy(ref->dma_tag, ref->dma_map);
+	SLIST_INSERT_HEAD(&ref->txq->tags, ref, next);
+}
+
 #define IPRINTK(fmt, args...) \
     printf("[XEN] " fmt, ##args)
 #ifdef INVARIANTS
@@ -778,11 +828,18 @@ disconnect_txq(struct netfront_txq *txq)
 static void
 destroy_txq(struct netfront_txq *txq)
 {
+	unsigned int i;
 
 	free(txq->ring.sring, M_DEVBUF);
 	buf_ring_free(txq->br, M_DEVBUF);
 	taskqueue_drain_all(txq->tq);
 	taskqueue_free(txq->tq);
+
+	for (i = 0; i <= NET_TX_RING_SIZE; i++) {
+		bus_dmamap_destroy(txq->info->dma_tag,
+		    txq->xennet_tag[i].dma_map);
+		txq->xennet_tag[i].dma_map = NULL;
+	}
 }
 
 static void
@@ -822,10 +879,27 @@ setup_txqs(device_t dev, struct netfront_info *info,
 
 		mtx_init(&txq->lock, txq->name, "netfront transmit lock",
 		    MTX_DEF);
+		SLIST_INIT(&txq->tags);
 
 		for (i = 0; i <= NET_TX_RING_SIZE; i++) {
 			txq->mbufs[i] = (void *) ((u_long) i+1);
 			txq->grant_ref[i] = GRANT_REF_INVALID;
+			txq->xennet_tag[i].txq = txq;
+			txq->xennet_tag[i].dma_tag = info->dma_tag;
+			error = bus_dmamap_create(info->dma_tag, 0,
+			    &txq->xennet_tag[i].dma_map);
+			if (error != 0) {
+				device_printf(dev,
+				    "failed to allocate dma map\n");
+				goto fail;
+			}
+			m_tag_setup(&txq->xennet_tag[i].tag,
+			    MTAG_COOKIE, MTAG_XENNET,
+			    sizeof(txq->xennet_tag[i]) -
+			    sizeof(txq->xennet_tag[i].tag));
+			txq->xennet_tag[i].tag.m_tag_free = &tag_free;
+			SLIST_INSERT_HEAD(&txq->tags, &txq->xennet_tag[i],
+			    next);
 		}
 		txq->mbufs[NET_TX_RING_SIZE] = (void *)0;
 
@@ -1041,7 +1115,7 @@ xn_release_tx_bufs(struct netfront_txq *txq)
 		if (txq->mbufs_cnt < 0) {
 			panic("%s: tx_chain_cnt must be >= 0", __func__);
 		}
-		m_free(m);
+		mbuf_release(m);
 	}
 }
 
@@ -1311,7 +1385,7 @@ xn_txeof(struct netfront_txq *txq)
 			txq->mbufs[id] = NULL;
 			add_id_to_freelist(txq->mbufs, id);
 			txq->mbufs_cnt--;
-			m_free(m);
+			mbuf_release(m);
 			/* Only mark the txq active if we've freed up at least one slot to try */
 			ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 		}
@@ -1507,22 +1581,6 @@ next_skip_queue:
 	return (err);
 }
 
-/**
- * \brief Count the number of fragments in an mbuf chain.
- *
- * Surprisingly, there isn't an M* macro for this.
- */
-static inline int
-xn_count_frags(struct mbuf *m)
-{
-	int nfrags;
-
-	for (nfrags = 0; m != NULL; m = m->m_next)
-		nfrags++;
-
-	return (nfrags);
-}
-
 /**
  * Given an mbuf chain, make sure we have enough room and then push
  * it onto the transmit ring.
@@ -1530,46 +1588,50 @@ xn_count_frags(struct mbuf *m)
 static int
 xn_assemble_tx_request(struct netfront_txq *txq, struct mbuf *m_head)
 {
-	struct mbuf *m;
 	struct netfront_info *np = txq->info;
 	struct ifnet *ifp = np->xn_ifp;
-	u_int nfrags;
-	int otherend_id;
+	int otherend_id, error, nfrags;
+	bus_dma_segment_t *segs = txq->segs;
+	struct mbuf_xennet *tag;
+	bus_dmamap_t map;
+	unsigned int i;
 
-	/**
-	 * Defragment the mbuf if necessary.
-	 */
-	nfrags = xn_count_frags(m_head);
+	KASSERT(!SLIST_EMPTY(&txq->tags), ("no tags available"));
+	tag = SLIST_FIRST(&txq->tags);
+	SLIST_REMOVE_HEAD(&txq->tags, next);
+	KASSERT(tag->count == 0, ("tag already in-use"));
+	map = tag->dma_map;
+	error = bus_dmamap_load_mbuf_sg(np->dma_tag, map, m_head, segs,
+	    &nfrags, 0);
+	if (error == EFBIG || nfrags > np->maxfrags) {
+		struct mbuf *m;
 
-	/*
-	 * Check to see whether this request is longer than netback
-	 * can handle, and try to defrag it.
-	 */
-	/**
-	 * It is a bit lame, but the netback driver in Linux can't
-	 * deal with nfrags > MAX_TX_REQ_FRAGS, which is a quirk of
-	 * the Linux network stack.
-	 */
-	if (nfrags > np->maxfrags) {
+		bus_dmamap_unload(np->dma_tag, map);
 		m = m_defrag(m_head, M_NOWAIT);
 		if (!m) {
 			/*
 			 * Defrag failed, so free the mbuf and
 			 * therefore drop the packet.
 			 */
+			SLIST_INSERT_HEAD(&txq->tags, tag, next);
 			m_freem(m_head);
 			return (EMSGSIZE);
 		}
 		m_head = m;
+		error = bus_dmamap_load_mbuf_sg(np->dma_tag, map, m_head, segs,
+		    &nfrags, 0);
+		if (error != 0 || nfrags > np->maxfrags) {
+			bus_dmamap_unload(np->dma_tag, map);
+			SLIST_INSERT_HEAD(&txq->tags, tag, next);
+			m_freem(m_head);
+			return (error ?: EFBIG);
+		}
+	} else if (error != 0) {
+		SLIST_INSERT_HEAD(&txq->tags, tag, next);
+		m_freem(m_head);
+		return (error);
 	}
 
-	/* Determine how many fragments now exist */
-	nfrags = xn_count_frags(m_head);
-
-	/*
-	 * Check to see whether the defragmented packet has too many
-	 * segments for the Linux netback driver.
-	 */
 	/**
 	 * The FreeBSD TCP stack, with TSO enabled, can produce a chain
 	 * of mbufs longer than Linux can handle.  Make sure we don't
@@ -1583,6 +1645,8 @@ xn_assemble_tx_request(struct netfront_txq *txq, struct mbuf *m_head)
 		       "won't be able to handle it, dropping\n",
 		       __func__, nfrags, MAX_TX_REQ_FRAGS);
 #endif
+		SLIST_INSERT_HEAD(&txq->tags, tag, next);
+		bus_dmamap_unload(np->dma_tag, map);
 		m_freem(m_head);
 		return (EMSGSIZE);
 	}
@@ -1604,9 +1668,9 @@ xn_assemble_tx_request(struct netfront_txq *txq, struct mbuf *m_head)
 	 * the fragment pointers. Stop when we run out
 	 * of fragments or hit the end of the mbuf chain.
 	 */
-	m = m_head;
 	otherend_id = xenbus_get_otherend_id(np->xbdev);
-	for (m = m_head; m; m = m->m_next) {
+	m_tag_prepend(m_head, &tag->tag);
+	for (i = 0; i < nfrags; i++) {
 		netif_tx_request_t *tx;
 		uintptr_t id;
 		grant_ref_t ref;
@@ -1621,17 +1685,20 @@ xn_assemble_tx_request(struct netfront_txq *txq, struct mbuf *m_head)
 		if (txq->mbufs_cnt > NET_TX_RING_SIZE)
 			panic("%s: tx_chain_cnt must be <= NET_TX_RING_SIZE\n",
 			    __func__);
-		txq->mbufs[id] = m;
+		mbuf_grab(m_head);
+		txq->mbufs[id] = m_head;
 		tx->id = id;
 		ref = gnttab_claim_grant_reference(&txq->gref_head);
 		KASSERT((short)ref >= 0, ("Negative ref"));
-		mfn = virt_to_mfn(mtod(m, vm_offset_t));
+		mfn = atop(segs[i].ds_addr);
 		gnttab_grant_foreign_access_ref(ref, otherend_id,
 		    mfn, GNTMAP_readonly);
 		tx->gref = txq->grant_ref[id] = ref;
-		tx->offset = mtod(m, vm_offset_t) & (PAGE_SIZE - 1);
+		tx->offset = segs[i].ds_addr & PAGE_MASK;
+		KASSERT(tx->offset + segs[i].ds_len <= PAGE_SIZE,
+		    ("mbuf segment crosses a page boundary"));
 		tx->flags = 0;
-		if (m == m_head) {
+		if (i == 0) {
 			/*
 			 * The first fragment has the entire packet
 			 * size, subsequent fragments have just the
@@ -1640,7 +1707,7 @@ xn_assemble_tx_request(struct netfront_txq *txq, struct mbuf *m_head)
 			 * subtracting the sizes of the other
 			 * fragments.
 			 */
-			tx->size = m->m_pkthdr.len;
+			tx->size = m_head->m_pkthdr.len;
 
 			/*
 			 * The first fragment contains the checksum flags
@@ -1654,12 +1721,12 @@ xn_assemble_tx_request(struct netfront_txq *txq, struct mbuf *m_head)
 			 * so we have to test for CSUM_TSO
 			 * explicitly.
 			 */
-			if (m->m_pkthdr.csum_flags
+			if (m_head->m_pkthdr.csum_flags
 			    & (CSUM_DELAY_DATA | CSUM_TSO)) {
 				tx->flags |= (NETTXF_csum_blank
 				    | NETTXF_data_validated);
 			}
-			if (m->m_pkthdr.csum_flags & CSUM_TSO) {
+			if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
 				struct netif_extra_info *gso =
 					(struct netif_extra_info *)
 					RING_GET_REQUEST(&txq->ring,
@@ -1667,7 +1734,7 @@ xn_assemble_tx_request(struct netfront_txq *txq, struct mbuf *m_head)
 
 				tx->flags |= NETTXF_extra_info;
 
-				gso->u.gso.size = m->m_pkthdr.tso_segsz;
+				gso->u.gso.size = m_head->m_pkthdr.tso_segsz;
 				gso->u.gso.type =
 					XEN_NETIF_GSO_TYPE_TCPV4;
 				gso->u.gso.pad = 0;
@@ -1677,13 +1744,14 @@ xn_assemble_tx_request(struct netfront_txq *txq, struct mbuf *m_head)
 				gso->flags = 0;
 			}
 		} else {
-			tx->size = m->m_len;
+			tx->size = segs[i].ds_len;
 		}
-		if (m->m_next)
+		if (i != nfrags - 1)
 			tx->flags |= NETTXF_more_data;
 
 		txq->ring.req_prod_pvt++;
 	}
+	bus_dmamap_sync(np->dma_tag, map, BUS_DMASYNC_PREWRITE);
 	BPF_MTAP(ifp, m_head);
 
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
@@ -2244,7 +2312,20 @@ create_netdev(device_t dev)
     	ether_ifattach(ifp, np->mac);
 	netfront_carrier_off(np);
 
-	return (0);
+	err = bus_dma_tag_create(
+	    bus_get_dma_tag(dev),		/* parent */
+	    1, PAGE_SIZE,			/* algnmnt, boundary */
+	    BUS_SPACE_MAXADDR,			/* lowaddr */
+	    BUS_SPACE_MAXADDR,			/* highaddr */
+	    NULL, NULL,				/* filter, filterarg */
+	    PAGE_SIZE * MAX_TX_REQ_FRAGS,	/* max request size */
+	    MAX_TX_REQ_FRAGS,			/* max segments */
+	    PAGE_SIZE,				/* maxsegsize */
+	    BUS_DMA_ALLOCNOW,			/* flags */
+	    NULL, NULL,				/* lockfunc, lockarg */
+	    &np->dma_tag);
+
+	return (err);
 
 error:
 	KASSERT(err != 0, ("Error path with no error code specified"));
@@ -2277,6 +2358,7 @@ netif_free(struct netfront_info *np)
 	if_free(np->xn_ifp);
 	np->xn_ifp = NULL;
 	ifmedia_removeall(&np->sc_media);
+	bus_dma_tag_destroy(np->dma_tag);
 }
 
 static void