git: fda7cc79aaf6 - stable/14 - mpi3mr: Fix EINPROGRESS errors hanging the card

From: Alexander Motin <mav_at_FreeBSD.org>
Date: Fri, 19 Jan 2024 17:17:35 UTC
The branch stable/14 has been updated by mav:

URL: https://cgit.FreeBSD.org/src/commit/?id=fda7cc79aaf65a5a5d62627602d6456c70309ce1

commit fda7cc79aaf65a5a5d62627602d6456c70309ce1
Author:     Warner Losh <imp@FreeBSD.org>
AuthorDate: 2023-11-29 01:49:39 +0000
Commit:     Alexander Motin <mav@FreeBSD.org>
CommitDate: 2024-01-19 17:16:50 +0000

    mpi3mr: Fix EINPROGRESS errors hanging the card
    
    Move enqueueing of commands to bus_dmamap_load_ccb callback
    
    Fix fundamental difference between FreeBSD and Linux. On Linux, your dma
    load callback always happens before the load returns, so drivers are
    written to load the map, then submit to hardware. On FreeBSD, the
    callback may be deferred and the load may return EINPROGRESS. This means
    the callback is responsible for queueing the request to the hardware,
    which is done after the SGL list is created. Make a number of
    interrelated changes:
    
    At the end of mpi3mr_prepare_sgls, add a call to mpi3mr_enqueue_request.
    
    Split the hardware submission out from the end of mpi3mr_action_scsiio
    and move it into a new routine mpi3mr_enqueue_request.
    
    Move all error completion from the end of mpi3mr_action_scsiio to where
    the error is detected. We cannot pass errors back from the
    mpi3mr_enqueue_request to do this on a 'failed' mpi3mr in a centralized
    place (since it has to be fire and forget).
    
    Add comments about zero length SGLs never making it into
    mpi3mr_prepare_sgls. Keep the code there for the moment, but we only set
    cm->data to non-NULL when scsiio_req->DataLength is not zero. So the
    datalength can't be zero and we can't send the zero SGLs.
    
    Add comments about other "impossible" tests in mpi3mr_prepare_sgls that
    really should be simple asserts of some flavor.
    
    Eliminate cm->error_code, since we can't pass data back from the
    mpi3mr_prepare_sgl callback anymore.
    
    In mpi3mr_map_request, call mpi3mr_enqueue_request for the no data case.
    This seems to work even though we've not done the special zero length
    handling that was in mpi3mr_prepare_sgls, giving further evidence to it
    not actually being needed. This is needed for SCSI CDBs that have no
    data to pass to the drive like TEST UNIT READY.
    
    With this change, and the prior ones, we're now able to run with mpi3mr
    on 128GB systems and very heavy disk load (so many buffers land > 4GB:
    the driver instructs busdma to never use memory above 4GB, which may be
    too conservative, but an issue for another time).
    
    Sponsored by:           Netflix
    Reviewed by:            sumit.saxena_broadcom.com, mav, jhb
    Differential Revision:  https://reviews.freebsd.org/D42543
    
    (cherry picked from commit 3208a189c1e2c4ef35daa432fe45629a043d7047)
---
 sys/dev/mpi3mr/mpi3mr.h     |   1 -
 sys/dev/mpi3mr/mpi3mr_cam.c | 130 +++++++++++++++++++++++++-------------------
 2 files changed, 73 insertions(+), 58 deletions(-)

diff --git a/sys/dev/mpi3mr/mpi3mr.h b/sys/dev/mpi3mr/mpi3mr.h
index 2226c747d3cf..f1a2cbc0fd4c 100644
--- a/sys/dev/mpi3mr/mpi3mr.h
+++ b/sys/dev/mpi3mr/mpi3mr.h
@@ -467,7 +467,6 @@ struct mpi3mr_cmd {
 	U16				hosttag;
 	U8				req_qidx;
 	Mpi3SCSIIORequest_t		io_request;
-	int				error_code;
 };
 
 struct mpi3mr_chain {
diff --git a/sys/dev/mpi3mr/mpi3mr_cam.c b/sys/dev/mpi3mr/mpi3mr_cam.c
index b6e47eac058a..15ef2732ec56 100644
--- a/sys/dev/mpi3mr/mpi3mr_cam.c
+++ b/sys/dev/mpi3mr/mpi3mr_cam.c
@@ -86,7 +86,9 @@
 
 #define	smp_processor_id()  PCPU_GET(cpuid)
 
-static int
+static void
+mpi3mr_enqueue_request(struct mpi3mr_softc *sc, struct mpi3mr_cmd *cm);
+static void
 mpi3mr_map_request(struct mpi3mr_softc *sc, struct mpi3mr_cmd *cm);
 void
 mpi3mr_release_simq_reinit(struct mpi3mr_cam_softc *cam_sc);
@@ -118,18 +120,23 @@ static void mpi3mr_prepare_sgls(void *arg,
 	U8 last_chain_sgl_flags;
 	struct mpi3mr_chain *chain_req;
 	Mpi3SCSIIORequest_t *scsiio_req;
+	union ccb *ccb;
 	
 	cm = (struct mpi3mr_cmd *)arg;
 	sc = cm->sc;
 	scsiio_req = (Mpi3SCSIIORequest_t *) &cm->io_request;
+	ccb = cm->ccb;
 
 	if (error) {
-		cm->error_code = error;
 		device_printf(sc->mpi3mr_dev, "%s: error=%d\n",__func__, error);
 		if (error == EFBIG) {
-			cm->ccb->ccb_h.status = CAM_REQ_TOO_BIG;
-			return;
+			mpi3mr_set_ccbstatus(ccb, CAM_REQ_TOO_BIG);
+		} else {
+			mpi3mr_set_ccbstatus(ccb, CAM_REQ_CMP_ERR);
 		}
+		mpi3mr_release_command(cm);
+		xpt_done(ccb);
+		return;
 	}
 	
 	if (cm->data_dir == MPI3MR_READ)
@@ -138,10 +145,9 @@ static void mpi3mr_prepare_sgls(void *arg,
 	if (cm->data_dir == MPI3MR_WRITE)
 		bus_dmamap_sync(sc->buffer_dmat, cm->dmamap,
 		    BUS_DMASYNC_PREWRITE);
-	if (nsegs > MPI3MR_SG_DEPTH) {
-		device_printf(sc->mpi3mr_dev, "SGE count is too large or 0.\n");
-		return;
-	}
+
+	KASSERT(nsegs <= MPI3MR_SG_DEPTH && nsegs > 0,
+	    ("%s: bad SGE count: %d\n", device_get_nameunit(sc->mpi3mr_dev), nsegs));
 
 	simple_sgl_flags = MPI3_SGE_FLAGS_ELEMENT_TYPE_SIMPLE |
 	    MPI3_SGE_FLAGS_DLAS_SYSTEM;
@@ -152,24 +158,15 @@ static void mpi3mr_prepare_sgls(void *arg,
 
 	sg_local = (U8 *)&scsiio_req->SGL;
 
-	if (!scsiio_req->DataLength) {
+	if (scsiio_req->DataLength == 0) {
+		/* XXX we don't ever get here when DataLength == 0, right? cm->data is NULL */
+		/* This whole if can likely be removed -- we handle it in mpi3mr_request_map */
 		mpi3mr_build_zero_len_sge(sg_local);
-		return;
+		goto enqueue;
 	}
 	
 	sges_left = nsegs;
 
-	if (sges_left < 0) {
-		printf("scsi_dma_map failed: request for %d bytes!\n",
-			scsiio_req->DataLength);
-		return;
-	}
-	if (sges_left > MPI3MR_SG_DEPTH) {
-		printf("scsi_dma_map returned unsupported sge count %d!\n",
-			sges_left);
-		return;
-	}
-
 	sges_in_segment = (sc->facts.op_req_sz -
 	    offsetof(Mpi3SCSIIORequest_t, SGL))/sizeof(Mpi3SGESimple_t);
 
@@ -218,33 +215,51 @@ fill_in_last_segment:
 		i++;
 	}
 
+enqueue:
+	/*
+	 * Now that we've created the sgls, we send the request to the device.
+	 * Unlike in Linux, dmaload isn't guaranteed to load every time, but
+	 * this function is always called when the resources are available, so
+	 * we can send the request to hardware here always. mpi3mr_map_request
+	 * knows about this quirk and will only take evasive action when an
+	 * error other than EINPROGRESS is returned from dmaload.
+	 */
+	mpi3mr_enqueue_request(sc, cm);
+
 	return;
 }
 
-int 
+static void
 mpi3mr_map_request(struct mpi3mr_softc *sc, struct mpi3mr_cmd *cm)
 {
 	u_int32_t retcode = 0;
+	union ccb *ccb;
 
+	ccb = cm->ccb;
 	if (cm->data != NULL) {
 		mtx_lock(&sc->io_lock);
 		/* Map data buffer into bus space */
 		retcode = bus_dmamap_load_ccb(sc->buffer_dmat, cm->dmamap,
-		    cm->ccb, mpi3mr_prepare_sgls, cm, 0);
+		    ccb, mpi3mr_prepare_sgls, cm, 0);
 		mtx_unlock(&sc->io_lock);
-		if (retcode)
-			device_printf(sc->mpi3mr_dev, "bus_dmamap_load(): retcode = %d\n", retcode);
-		if (retcode == EINPROGRESS) {
-			device_printf(sc->mpi3mr_dev, "request load in progress\n");
-			xpt_freeze_simq(sc->cam_sc->sim, 1);
+		if (retcode != 0 && retcode != EINPROGRESS) {
+			device_printf(sc->mpi3mr_dev,
+			    "bus_dmamap_load(): retcode = %d\n", retcode);
+			/*
+			 * Any other error means prepare_sgls wasn't called, and
+			 * will never be called, so we have to mop up. This error
+			 * should never happen, though.
+			 */
+			mpi3mr_set_ccbstatus(ccb, CAM_REQ_CMP_ERR);
+			mpi3mr_release_command(cm);
+			xpt_done(ccb);
 		}
+	} else {
+		/*
+		 * No data, we enqueue it directly here.
+		 */
+		mpi3mr_enqueue_request(sc, cm);
 	}
-	if (cm->error_code)
-		return cm->error_code;
-	if (retcode)
-		mpi3mr_set_ccbstatus(cm->ccb, CAM_REQ_INVALID);
-
-	return (retcode);
 }
 
 void
@@ -912,12 +927,6 @@ mpi3mr_action_scsiio(struct mpi3mr_cam_softc *cam_sc, union ccb *ccb)
 	struct mpi3mr_cmd *cm;
 	uint8_t scsi_opcode, queue_idx;
 	uint32_t mpi_control;
-	struct mpi3mr_op_req_queue *opreqq = NULL;
-	U32 data_len_blks = 0;
-	U32 tracked_io_sz = 0;
-	U32 ioc_pend_data_len = 0, tg_pend_data_len = 0;
-	struct mpi3mr_throttle_group_info *tg = NULL;
-	static int ratelimit;
 
 	sc = cam_sc->sc;
 	mtx_assert(&sc->mpi3mr_mtx, MA_OWNED);
@@ -1104,15 +1113,15 @@ mpi3mr_action_scsiio(struct mpi3mr_cam_softc *cam_sc, union ccb *ccb)
 	case CAM_DATA_SG_PADDR:
 		device_printf(sc->mpi3mr_dev, "%s: physical addresses not supported\n",
 		    __func__);
-		mpi3mr_release_command(cm);
 		mpi3mr_set_ccbstatus(ccb, CAM_REQ_INVALID);
+		mpi3mr_release_command(cm);
 		xpt_done(ccb);
 		return;
 	case CAM_DATA_SG:
 		device_printf(sc->mpi3mr_dev, "%s: scatter gather is not supported\n",
 		    __func__);
-		mpi3mr_release_command(cm);
 		mpi3mr_set_ccbstatus(ccb, CAM_REQ_INVALID);
+		mpi3mr_release_command(cm);
 		xpt_done(ccb);
 		return;
 	case CAM_DATA_VADDR:
@@ -1129,27 +1138,35 @@ mpi3mr_action_scsiio(struct mpi3mr_cam_softc *cam_sc, union ccb *ccb)
 			cm->data = csio->data_ptr;
 		break;
 	default:
-		mpi3mr_release_command(cm);
 		mpi3mr_set_ccbstatus(ccb, CAM_REQ_INVALID);
-		xpt_done(ccb);
-		return;
-	}
-
-	/* Prepare SGEs */
-	if (mpi3mr_map_request(sc, cm)) {
 		mpi3mr_release_command(cm);
 		xpt_done(ccb);
-		printf("func: %s line: %d Build SGLs failed\n", __func__, __LINE__);
 		return;
 	}
-	
-	opreqq = &sc->op_req_q[queue_idx];
+
+	/* Prepare SGEs and queue to hardware */
+	mpi3mr_map_request(sc, cm);
+}
+
+static void
+mpi3mr_enqueue_request(struct mpi3mr_softc *sc, struct mpi3mr_cmd *cm)
+{
+	static int ratelimit;
+	struct mpi3mr_op_req_queue *opreqq = &sc->op_req_q[cm->req_qidx];
+	struct mpi3mr_throttle_group_info *tg = NULL;
+	uint32_t data_len_blks = 0;
+	uint32_t tracked_io_sz = 0;
+	uint32_t ioc_pend_data_len = 0, tg_pend_data_len = 0;
+	struct mpi3mr_target *targ = cm->targ;
+	union ccb *ccb = cm->ccb;
+	Mpi3SCSIIORequest_t *req = (Mpi3SCSIIORequest_t *)&cm->io_request;
 
 	if (sc->iot_enable) {
-		data_len_blks = csio->dxfer_len >> 9;
-		
+		data_len_blks = ccb->csio.dxfer_len >> 9;
+
 		if ((data_len_blks >= sc->io_throttle_data_length) &&
 		    targ->io_throttle_enabled) {
+
 			tracked_io_sz = data_len_blks;
 			tg = targ->throttle_group;
 			if (tg) {
@@ -1207,19 +1224,18 @@ mpi3mr_action_scsiio(struct mpi3mr_cam_softc *cam_sc, union ccb *ccb)
 
 		if (targ->io_divert) {
 			req->MsgFlags |= MPI3_SCSIIO_MSGFLAGS_DIVERT_TO_FIRMWARE;
-			mpi_control |= MPI3_SCSIIO_FLAGS_DIVERT_REASON_IO_THROTTLING;
+			req->Flags = htole32(le32toh(req->Flags) | MPI3_SCSIIO_FLAGS_DIVERT_REASON_IO_THROTTLING);
 		}
 	}
-	req->Flags = htole32(mpi_control);
 
 	if (mpi3mr_submit_io(sc, opreqq, (U8 *)&cm->io_request)) {
-		mpi3mr_release_command(cm);
 		if (tracked_io_sz) {
 			mpi3mr_atomic_sub(&sc->pend_large_data_sz, tracked_io_sz);
 			if (tg)
 				mpi3mr_atomic_sub(&tg->pend_large_data_sz, tracked_io_sz);
 		}
 		mpi3mr_set_ccbstatus(ccb, CAM_RESRC_UNAVAIL);
+		mpi3mr_release_command(cm);
 		xpt_done(ccb);
 	} else {
 		callout_reset_sbt(&cm->callout, mstosbt(ccb->ccb_h.timeout), 0,