git: aa41354349c1 - main - nvme: Optimize timeout code further

From: Warner Losh <imp_at_FreeBSD.org>
Date: Tue, 23 Jul 2024 23:03:40 UTC
The branch main has been updated by imp:

URL: https://cgit.FreeBSD.org/src/commit/?id=aa41354349c16ea7220893010df78b47d67d0f74

commit aa41354349c16ea7220893010df78b47d67d0f74
Author:     Warner Losh <imp@FreeBSD.org>
AuthorDate: 2024-07-23 23:02:03 +0000
Commit:     Warner Losh <imp@FreeBSD.org>
CommitDate: 2024-07-23 23:04:02 +0000

    nvme: Optimize timeout code further
    
    Optimize timeout code based on three observations.
    
    (1) The tr queues are sorted in order of submission, so the first one
        that could time out is the first "real" one on the list.
    (2) Timeouts for a given queue are all the same length (well, except
        at startup, where timeout doesn't matter, and when you change it
        at runtime, where timeouts will still happen eventually and the
        difference isn't worth optimizing for).
    (3) Calling the ISR races the real ISR and we should avoid that better.
    
    So now, after checking to see if the card is there and working, the
    timeout routine scans the pending tracker list until it finds a non-AER
    tracker. If the deadline hasn't passed, we return, doing nothing
    further.  Otherwise, we call poll completions and then process the list
    looking for timed out items.
    
    This should move the timeout routine to touching hardware only when it's
    really necessary. It thus avoids racing the normal ISR, while still
    timig out stuck transactions quickly enough.
    
    There was also some minor code motion to make all of the above flow more
    nicely for the reader.
    
    When interrupts aren't working at all, then this will increase latency
    somewhat. But when interrupts aren't working at all, there's bigger
    problems and we should poll quite often in that case. That will be
    handled in future commits.
    
    Sponsored by: Netflix
    Differential Revision:  https://reviews.freebsd.org/D46026
---
 sys/dev/nvme/nvme_qpair.c | 138 ++++++++++++++++++++++++++++++----------------
 1 file changed, 90 insertions(+), 48 deletions(-)

diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c
index e4286b61a3fc..c917b34dbe43 100644
--- a/sys/dev/nvme/nvme_qpair.c
+++ b/sys/dev/nvme/nvme_qpair.c
@@ -1045,8 +1045,8 @@ nvme_qpair_timeout(void *arg)
 	struct nvme_controller	*ctrlr = qpair->ctrlr;
 	struct nvme_tracker	*tr;
 	sbintime_t		now;
-	bool			idle = false;
-	bool			needs_reset;
+	bool			idle = true;
+	bool			fast;
 	uint32_t		csts;
 	uint8_t			cfs;
 
@@ -1092,23 +1092,69 @@ nvme_qpair_timeout(void *arg)
 		 */
 		csts = nvme_mmio_read_4(ctrlr, csts);
 		cfs = NVMEV(NVME_CSTS_REG_CFS, csts);
-		if (csts == NVME_GONE || cfs == 1)
-			goto do_reset;
+		if (csts == NVME_GONE || cfs == 1) {
+			/*
+			 * We've had a command timeout that we weren't able to
+			 * abort or we have aborts disabled and any command
+			 * timed out.
+			 *
+			 * If we get here due to a possible surprise hot-unplug
+			 * event, then we let nvme_ctrlr_reset confirm and fail
+			 * the controller.
+			 */
+do_reset:
+			nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
+			    (csts == 0xffffffff) ? " and possible hot unplug" :
+			    (cfs ? " and fatal error status" : ""));
+			qpair->recovery_state = RECOVERY_WAITING;
+			nvme_ctrlr_reset(ctrlr);
+			idle = false;
+			break;
+		}
+
 
 		/*
-		 * Process completions. We already have the recovery lock, so
-		 * call the locked version.
+		 * See if there's any recovery needed. First, do a fast check to
+		 * see if anything could have timed out. If not, then skip
+		 * everything else.
+		 */
+		fast = false;
+		mtx_lock(&qpair->lock);
+		now = getsbinuptime();
+		TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
+			/*
+			 * Skip async commands, they are posted to the card for
+			 * an indefinite amount of time and have no deadline.
+			 */
+			if (tr->deadline == SBT_MAX)
+				continue;
+
+			/*
+			 * If the first real transaction is not in timeout, then
+			 * we're done. Otherwise, we try recovery.
+			 */
+			idle = false;
+			if (now <= tr->deadline)
+				fast = true;
+			break;
+		}
+		mtx_unlock(&qpair->lock);
+		if (idle || fast)
+			break;
+
+		/*
+		 * There's a stale transaction at the start of the queue whose
+		 * deadline has passed. Poll the competions as a last-ditch
+		 * effort in case an interrupt has been missed.
 		 */
 		_nvme_qpair_process_completions(qpair);
 
 		/*
-		 * Check to see if we need to timeout any commands. If we do, then
-		 * we also enter a recovery phase.
+		 * Now that we've run the ISR, re-rheck to see if there's any
+		 * timed out commands and abort them or reset the card if so.
 		 */
-		now = getsbinuptime();
-		needs_reset = false;
-		idle = true;
 		mtx_lock(&qpair->lock);
+		idle = true;
 		TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
 			/*
 			 * Skip async commands, they are posted to the card for
@@ -1116,48 +1162,44 @@ nvme_qpair_timeout(void *arg)
 			 */
 			if (tr->deadline == SBT_MAX)
 				continue;
-			if (now > tr->deadline) {
-				if (tr->req->cb_fn != nvme_abort_complete &&
-				    ctrlr->enable_aborts) {
-					/*
-					 * This isn't an abort command, ask
-					 * for a hardware abort.
-					 */
-					nvme_ctrlr_cmd_abort(ctrlr, tr->cid,
-					    qpair->id, nvme_abort_complete, tr);
-				} else {
-					/*
-					 * Otherwise we have a live command in
-					 * the card (either one we couldn't
-					 * abort, or aborts weren't enabled).
-					 * The only safe way to proceed is to do
-					 * a reset.
-					 */
-					needs_reset = true;
-				}
+
+			/*
+			 * If we know this tracker hasn't timed out, we also
+			 * know all subsequent ones haven't timed out. The tr
+			 * queue is in submission order and all normal commands
+			 * in a queue have the same timeout (or the timeout was
+			 * changed by the user, but we eventually timeout then).
+			 */
+			idle = false;
+			if (now <= tr->deadline)
+				break;
+
+			/*
+			 * Timeout expired, abort it or reset controller.
+			 */
+			if (ctrlr->enable_aborts &&
+			    tr->req->cb_fn != nvme_abort_complete) {
+				/*
+				 * This isn't an abort command, ask for a
+				 * hardware abort. This goes to the admin
+				 * queue which will reset the card if it
+				 * times out.
+				 */
+				nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,
+				    nvme_abort_complete, tr);
 			} else {
-				idle = false;
+				/*
+				 * We have a live command in the card (either
+				 * one we couldn't abort, or aborts weren't
+				 * enabled).  We can only reset.
+				 */
+				mtx_unlock(&qpair->lock);
+				goto do_reset;
 			}
 		}
 		mtx_unlock(&qpair->lock);
-		if (!needs_reset)
-			break;
-
-		/*
-		 * We've had a command timeout that we weren't able to abort
-		 *
-		 * If we get here due to a possible surprise hot-unplug event,
-		 * then we let nvme_ctrlr_reset confirm and fail the
-		 * controller.
-		 */
-	do_reset:
-		nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
-		    (csts == 0xffffffff) ? " and possible hot unplug" :
-		    (cfs ? " and fatal error status" : ""));
-		qpair->recovery_state = RECOVERY_WAITING;
-		nvme_ctrlr_reset(ctrlr);
-		idle = false;			/* We want to keep polling */
 		break;
+
 	case RECOVERY_WAITING:
 		/*
 		 * These messages aren't interesting while we're suspended. We