git: 1d528f95e8ce - main - xen/blkback: remove bounce buffering mode

From: Roger Pau Monné <royger@FreeBSD.org>
Date: Tue, 07 Jun 2022 10:30:03 UTC
The branch main has been updated by royger:

URL: https://cgit.FreeBSD.org/src/commit/?id=1d528f95e8ce4f2b11f63a3e3aa4bca14df1eb51

commit 1d528f95e8ce4f2b11f63a3e3aa4bca14df1eb51
Author:     Roger Pau Monné <royger@FreeBSD.org>
AuthorDate: 2022-04-12 14:17:09 +0000
Commit:     Roger Pau Monné <royger@FreeBSD.org>
CommitDate: 2022-06-07 10:29:53 +0000

    xen/blkback: remove bounce buffering mode
    
    Remove bounce buffering code for blkback and only attach if Xen
    creates IOMMU entries for grant-mapped pages.
    
    Such bounce buffering consumed a non-trivial amount of memory and CPU
    resources to perform the memory copies, even though Xen has been
    creating IOMMU entries for grant maps for a long time now.
    
    Refuse to attach blkback if Xen doesn't advertise that IOMMU entries
    are created for grant maps.
    
    Sponsored by: Citrix Systems R&D
---
 sys/dev/xen/blkback/blkback.c | 200 +++++-------------------------------------
 1 file changed, 22 insertions(+), 178 deletions(-)
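
The gist of the new probe-time gate is a single CPUID check: read EAX of the
Xen HVM feature leaf (base + 4) and bail out unless
XEN_HVM_CPUID_IOMMU_MAPPINGS is set.  The sketch below shows the same
detection as a standalone userspace program, purely to illustrate the check
added to xbb_probe() further down.  It is a minimal sketch under a few
assumptions: the Xen CPUID leaves are discoverable in the 0x40000000 range,
the compiler's <cpuid.h> __cpuid()/__cpuid_count() helpers stand in for the
kernel's cpuid_count() and xen_cpuid_base, and find_xen_leaf_base() is a
hypothetical helper that is not part of the driver.

/*
 * Minimal sketch (not the driver): detect whether Xen advertises IOMMU
 * entries for grant-mapped pages, mirroring the check added to xbb_probe().
 */
#include <cpuid.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Flag in EAX of leaf (base + 4); see contrib/xen/arch-x86/cpuid.h. */
#define XEN_HVM_CPUID_IOMMU_MAPPINGS	(1u << 2)

/* Hypothetical helper: scan for the Xen hypervisor CPUID leaf base. */
static uint32_t
find_xen_leaf_base(void)
{
	uint32_t base, eax, ebx, ecx, edx;
	char sig[13];

	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
		__cpuid(base, eax, ebx, ecx, edx);
		memcpy(sig + 0, &ebx, 4);
		memcpy(sig + 4, &ecx, 4);
		memcpy(sig + 8, &edx, 4);
		sig[12] = '\0';
		if (strcmp(sig, "XenVMMXenVMM") == 0 && eax >= base + 4)
			return (base);
	}
	return (0);
}

int
main(void)
{
	uint32_t base, eax, ebx, ecx, edx;

	base = find_xen_leaf_base();
	if (base == 0) {
		printf("Xen CPUID leaves not found\n");
		return (1);
	}
	__cpuid_count(base + 4, 0, eax, ebx, ecx, edx);
	if (eax & XEN_HVM_CPUID_IOMMU_MAPPINGS)
		printf("grant maps get IOMMU entries: blkback would attach\n");
	else
		printf("no IOMMU entries for grant maps: blkback refuses to attach\n");
	return (0);
}

In the driver the same condition simply makes xbb_probe() return ENXIO
(after a one-time console warning), so newbus never attaches blkback on
hypervisors that do not advertise IOMMU entries for grant maps.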

diff --git a/sys/dev/xen/blkback/blkback.c b/sys/dev/xen/blkback/blkback.c
index fa0bb3ff85ed..b15dbb1e9155 100644
--- a/sys/dev/xen/blkback/blkback.c
+++ b/sys/dev/xen/blkback/blkback.c
@@ -80,6 +80,7 @@ __FBSDID("$FreeBSD$");
 #include <xen/gnttab.h>
 #include <xen/xen_intr.h>
 
+#include <contrib/xen/arch-x86/cpuid.h>
 #include <contrib/xen/event_channel.h>
 #include <contrib/xen/grant_table.h>
 
@@ -101,27 +102,6 @@ __FBSDID("$FreeBSD$");
 #define	XBB_MAX_REQUESTS 					\
 	__CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES)
 
-/**
- * \brief Define to force all I/O to be performed on memory owned by the
- *        backend device, with a copy-in/out to the remote domain's memory.
- *
- * \note  This option is currently required when this driver's domain is
- *        operating in HVM mode on a system using an IOMMU.
- *
- * This driver uses Xen's grant table API to gain access to the memory of
- * the remote domains it serves.  When our domain is operating in PV mode,
- * the grant table mechanism directly updates our domain's page table entries
- * to point to the physical pages of the remote domain.  This scheme guarantees
- * that blkback and the backing devices it uses can safely perform DMA
- * operations to satisfy requests.  In HVM mode, Xen may use a HW IOMMU to
- * insure that our domain cannot DMA to pages owned by another domain.  As
- * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
- * table API.  For this reason, in HVM mode, we must bounce all requests into
- * memory that is mapped into our domain at domain startup and thus has
- * valid IOMMU mappings.
- */
-#define XBB_USE_BOUNCE_BUFFERS
-
 /**
  * \brief Define to enable rudimentary request logging to the console.
  */
@@ -257,14 +237,6 @@ struct xbb_xen_reqlist {
 	 */
 	uint64_t	 	 gnt_base;
 
-#ifdef XBB_USE_BOUNCE_BUFFERS
-	/**
-	 * Pre-allocated domain local memory used to proxy remote
-	 * domain memory during I/O operations.
-	 */
-	uint8_t			*bounce;
-#endif
-
 	/**
 	 * Array of grant handles (one per page) used to map this request.
 	 */
@@ -500,30 +472,6 @@ struct xbb_file_data {
 	 * so we only need one of these.
 	 */
 	struct iovec	xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
-#ifdef XBB_USE_BOUNCE_BUFFERS
-
-	/**
-	 * \brief Array of io vectors used to handle bouncing of file reads.
-	 *
-	 * Vnode operations are free to modify uio data during their
-	 * exectuion.  In the case of a read with bounce buffering active,
-	 * we need some of the data from the original uio in order to
-	 * bounce-out the read data.  This array serves as the temporary
-	 * storage for this saved data.
-	 */
-	struct iovec	saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
-
-	/**
-	 * \brief Array of memoized bounce buffer kva offsets used
-	 *        in the file based backend.
-	 *
-	 * Due to the way that the mapping of the memory backing an
-	 * I/O transaction is handled by Xen, a second pass through
-	 * the request sg elements is unavoidable. We memoize the computed
-	 * bounce address here to reduce the cost of the second walk.
-	 */
-	void		*xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST];
-#endif /* XBB_USE_BOUNCE_BUFFERS */
 };
 
 /**
@@ -891,25 +839,6 @@ xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
 	return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9));
 }
 
-#ifdef XBB_USE_BOUNCE_BUFFERS
-/**
- * Given a page index and 512b sector offset within that page,
- * calculate an offset into a request's local bounce memory region.
- *
- * \param reqlist The request structure whose bounce region will be accessed.
- * \param pagenr  The page index used to compute the bounce offset.
- * \param sector  The 512b sector index used to compute the page relative
- *                bounce offset.
- *
- * \return  The computed global bounce buffer address.
- */
-static inline uint8_t *
-xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
-{
-	return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
-}
-#endif
-
 /**
  * Given a page number and 512b sector offset within that page,
  * calculate an offset into the request's memory region that the
@@ -929,11 +858,7 @@ xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
 static inline uint8_t *
 xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
 {
-#ifdef XBB_USE_BOUNCE_BUFFERS
-	return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector));
-#else
 	return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
-#endif
 }
 
 /**
@@ -1508,17 +1433,6 @@ xbb_bio_done(struct bio *bio)
 		}
 	}
 
-#ifdef XBB_USE_BOUNCE_BUFFERS
-	if (bio->bio_cmd == BIO_READ) {
-		vm_offset_t kva_offset;
-
-		kva_offset = (vm_offset_t)bio->bio_data
-			   - (vm_offset_t)reqlist->bounce;
-		memcpy((uint8_t *)reqlist->kva + kva_offset,
-		       bio->bio_data, bio->bio_bcount);
-	}
-#endif /* XBB_USE_BOUNCE_BUFFERS */
-
 	/*
 	 * Decrement the pending count for the request list.  When we're
 	 * done with the requests, send status back for all of them.
@@ -2180,17 +2094,6 @@ xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
 
 	for (bio_idx = 0; bio_idx < nbio; bio_idx++)
 	{
-#ifdef XBB_USE_BOUNCE_BUFFERS
-		vm_offset_t kva_offset;
-
-		kva_offset = (vm_offset_t)bios[bio_idx]->bio_data
-			   - (vm_offset_t)reqlist->bounce;
-		if (operation == BIO_WRITE) {
-			memcpy(bios[bio_idx]->bio_data,
-			       (uint8_t *)reqlist->kva + kva_offset,
-			       bios[bio_idx]->bio_bcount);
-		}
-#endif
 		if (operation == BIO_READ) {
 			SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read,
 				   device_get_unit(xbb->dev),
@@ -2241,10 +2144,6 @@ xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
 	struct uio            xuio;
 	struct xbb_sg        *xbb_sg;
 	struct iovec         *xiovec;
-#ifdef XBB_USE_BOUNCE_BUFFERS
-	void                **p_vaddr;
-	int                   saved_uio_iovcnt;
-#endif /* XBB_USE_BOUNCE_BUFFERS */
 	int                   error;
 
 	file_data = &xbb->backend.file;
@@ -2300,18 +2199,6 @@ xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
 			xiovec = &file_data->xiovecs[xuio.uio_iovcnt];
 			xiovec->iov_base = xbb_reqlist_ioaddr(reqlist,
 			    seg_idx, xbb_sg->first_sect);
-#ifdef XBB_USE_BOUNCE_BUFFERS
-			/*
-			 * Store the address of the incoming
-			 * buffer at this particular offset
-			 * as well, so we can do the copy
-			 * later without having to do more
-			 * work to recalculate this address.
-		 	 */
-			p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt];
-			*p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx,
-			    xbb_sg->first_sect);
-#endif /* XBB_USE_BOUNCE_BUFFERS */
 			xiovec->iov_len = 0;
 			xuio.uio_iovcnt++;
 		}
@@ -2331,28 +2218,6 @@ xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
 
 	xuio.uio_td = curthread;
 
-#ifdef XBB_USE_BOUNCE_BUFFERS
-	saved_uio_iovcnt = xuio.uio_iovcnt;
-
-	if (operation == BIO_WRITE) {
-		/* Copy the write data to the local buffer. */
-		for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
-		     xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt;
-		     seg_idx++, xiovec++, p_vaddr++) {
-			memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len);
-		}
-	} else {
-		/*
-		 * We only need to save off the iovecs in the case of a
-		 * read, because the copy for the read happens after the
-		 * VOP_READ().  (The uio will get modified in that call
-		 * sequence.)
-		 */
-		memcpy(file_data->saved_xiovecs, xuio.uio_iov,
-		       xuio.uio_iovcnt * sizeof(xuio.uio_iov[0]));
-	}
-#endif /* XBB_USE_BOUNCE_BUFFERS */
-
 	switch (operation) {
 	case BIO_READ:
 
@@ -2429,25 +2294,6 @@ xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
 		/* NOTREACHED */
 	}
 
-#ifdef XBB_USE_BOUNCE_BUFFERS
-	/* We only need to copy here for read operations */
-	if (operation == BIO_READ) {
-		for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
-		     xiovec = file_data->saved_xiovecs;
-		     seg_idx < saved_uio_iovcnt; seg_idx++,
-		     xiovec++, p_vaddr++) {
-			/*
-			 * Note that we have to use the copy of the 
-			 * io vector we made above.  uiomove() modifies
-			 * the uio and its referenced vector as uiomove
-			 * performs the copy, so we can't rely on any
-			 * state from the original uio.
-			 */
-			memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len);
-		}
-	}
-#endif /* XBB_USE_BOUNCE_BUFFERS */
-
 bailout_send_response:
 
 	if (error != 0)
@@ -2826,12 +2672,6 @@ xbb_disconnect(struct xbb_softc *xbb)
 		/* There is one request list for ever allocated request. */
 		for (i = 0, reqlist = xbb->request_lists;
 		     i < xbb->max_requests; i++, reqlist++){
-#ifdef XBB_USE_BOUNCE_BUFFERS
-			if (reqlist->bounce != NULL) {
-				free(reqlist->bounce, M_XENBLOCKBACK);
-				reqlist->bounce = NULL;
-			}
-#endif
 			if (reqlist->gnt_handles != NULL) {
 				free(reqlist->gnt_handles, M_XENBLOCKBACK);
 				reqlist->gnt_handles = NULL;
@@ -3210,17 +3050,6 @@ xbb_alloc_request_lists(struct xbb_softc *xbb)
 
 		reqlist->xbb = xbb;
 
-#ifdef XBB_USE_BOUNCE_BUFFERS
-		reqlist->bounce = malloc(xbb->max_reqlist_size,
-					 M_XENBLOCKBACK, M_NOWAIT);
-		if (reqlist->bounce == NULL) {
-			xenbus_dev_fatal(xbb->dev, ENOMEM, 
-					 "Unable to allocate request "
-					 "bounce buffers");
-			return (ENOMEM);
-		}
-#endif /* XBB_USE_BOUNCE_BUFFERS */
-
 		reqlist->gnt_handles = malloc(xbb->max_reqlist_segments *
 					      sizeof(*reqlist->gnt_handles),
 					      M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
@@ -3489,14 +3318,29 @@ xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
 static int
 xbb_probe(device_t dev)
 {
+	uint32_t regs[4];
+
+	if (strcmp(xenbus_get_type(dev), "vbd"))
+		return (ENXIO);
 
-        if (!strcmp(xenbus_get_type(dev), "vbd")) {
-                device_set_desc(dev, "Backend Virtual Block Device");
-                device_quiet(dev);
-                return (0);
-        }
+	KASSERT(xen_cpuid_base != 0, ("Invalid base Xen CPUID leaf"));
+	cpuid_count(xen_cpuid_base + 4, 0, regs);
 
-        return (ENXIO);
+	/* Only attach if Xen creates IOMMU entries for grant mapped pages. */
+	if (!(regs[0] & XEN_HVM_CPUID_IOMMU_MAPPINGS)) {
+		static bool warned;
+
+		if (!warned) {
+			warned = true;
+			printf(
+	"xen-blkback disabled due to grant maps lacking IOMMU entries\n");
+		}
+		return (ENXIO);
+	}
+
+	device_set_desc(dev, "Backend Virtual Block Device");
+	device_quiet(dev);
+	return (0);
 }
 
 /**