git: 8052b01e7e41 - main - nvme: Add exclusion for ISR

From: Warner Losh <imp_at_FreeBSD.org>
Date: Fri, 25 Aug 2023 16:20:29 UTC
The branch main has been updated by imp:

URL: https://cgit.FreeBSD.org/src/commit/?id=8052b01e7e4113fa8296ce43c354116b0a1774b7

commit 8052b01e7e4113fa8296ce43c354116b0a1774b7
Author:     Warner Losh <imp@FreeBSD.org>
AuthorDate: 2023-08-25 16:10:16 +0000
Commit:     Warner Losh <imp@FreeBSD.org>
CommitDate: 2023-08-25 16:11:27 +0000

    nvme: Add exclusion for ISR
    
    Add a basically uncontended spinlock that we take out while the ISR is
    running. This has two effects: First, when we get a timeout, we can
    safely call the nvme_qpair_process_completions w/o racing any ISRs.
    Second, we can use it to ensure that we don't reset the card while
    the ISRs are active (right now we just sleep and hope for the best,
    which usually is fine, but not always).
    
    Sponsored by:           Netflix
    MFC After:              2 weeks
    Reviewed by:            chuck, gallatin
    Differential Revision:  https://reviews.freebsd.org/D41452
---
 sys/dev/nvme/nvme_ctrlr.c   |  51 ++++++++++++++++----
 sys/dev/nvme/nvme_private.h |  10 ++--
 sys/dev/nvme/nvme_qpair.c   | 115 ++++++++++++++++++++++++++++++--------------
 sys/dev/nvme/nvme_sysctl.c  |  24 +++++++++
 4 files changed, 148 insertions(+), 52 deletions(-)

diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
index 7a2ecffe81d4..c4a750901743 100644
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -416,6 +416,34 @@ nvme_ctrlr_disable_qpairs(struct nvme_controller *ctrlr)
 	}
 }
 
+static void
+nvme_pre_reset(struct nvme_controller *ctrlr)
+{
+	/*
+	 * Make sure that all the ISRs are done before proceeding with the reset
+	 * (and also keep any stray interrupts that happen during this process
+	 * from racing this process). For startup, this is a nop, since the
+	 * hardware is in a good state. But for recovery, where we randomly
+	 * reset the hardware, this ensure that we're not racing the ISRs.
+	 */
+	mtx_lock(&ctrlr->adminq.recovery);
+	for (int i = 0; i < ctrlr->num_io_queues; i++) {
+		mtx_lock(&ctrlr->ioq[i].recovery);
+	}
+}
+
+static void
+nvme_post_reset(struct nvme_controller *ctrlr)
+{
+	/*
+	 * Reset complete, unblock ISRs
+	 */
+	mtx_unlock(&ctrlr->adminq.recovery);
+	for (int i = 0; i < ctrlr->num_io_queues; i++) {
+		mtx_unlock(&ctrlr->ioq[i].recovery);
+	}
+}
+
 static int
 nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
 {
@@ -427,9 +455,11 @@ nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
 
 	err = nvme_ctrlr_disable(ctrlr);
 	if (err != 0)
-		return err;
+		goto out;
 
 	err = nvme_ctrlr_enable(ctrlr);
+out:
+
 	TSEXIT();
 	return (err);
 }
@@ -1157,6 +1187,11 @@ nvme_ctrlr_start_config_hook(void *arg)
 
 	TSENTER();
 
+	/*
+	 * Don't call pre/post reset here. We've not yet created the qpairs,
+	 * haven't setup the ISRs, so there's no need to 'drain' them or
+	 * 'exclude' them.
+	 */
 	if (nvme_ctrlr_hw_reset(ctrlr) != 0) {
 fail:
 		nvme_ctrlr_fail(ctrlr);
@@ -1201,16 +1236,9 @@ nvme_ctrlr_reset_task(void *arg, int pending)
 	int			status;
 
 	nvme_ctrlr_devctl_log(ctrlr, "RESET", "resetting controller");
+	nvme_pre_reset(ctrlr);
 	status = nvme_ctrlr_hw_reset(ctrlr);
-	/*
-	 * Use pause instead of DELAY, so that we yield to any nvme interrupt
-	 *  handlers on this CPU that were blocked on a qpair lock. We want
-	 *  all nvme interrupts completed before proceeding with restarting the
-	 *  controller.
-	 *
-	 * XXX - any way to guarantee the interrupt handlers have quiesced?
-	 */
-	pause("nvmereset", hz / 10);
+	nvme_post_reset(ctrlr);
 	if (status == 0)
 		nvme_ctrlr_start(ctrlr, true);
 	else
@@ -1697,6 +1725,7 @@ nvme_ctrlr_resume(struct nvme_controller *ctrlr)
 	if (ctrlr->is_failed)
 		return (0);
 
+	nvme_pre_reset(ctrlr);
 	if (nvme_ctrlr_hw_reset(ctrlr) != 0)
 		goto fail;
 #ifdef NVME_2X_RESET
@@ -1708,6 +1737,7 @@ nvme_ctrlr_resume(struct nvme_controller *ctrlr)
 	if (nvme_ctrlr_hw_reset(ctrlr) != 0)
 		goto fail;
 #endif
+	nvme_post_reset(ctrlr);
 
 	/*
 	 * Now that we've reset the hardware, we can restart the controller. Any
@@ -1724,6 +1754,7 @@ fail:
 	 * the controller. However, we have to return success for the resume
 	 * itself, due to questionable APIs.
 	 */
+	nvme_post_reset(ctrlr);
 	nvme_printf(ctrlr, "Failed to reset on resume, failing.\n");
 	nvme_ctrlr_fail(ctrlr);
 	(void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
index e4b319b9d8b7..a6239f30f3bf 100644
--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@@ -162,10 +162,9 @@ struct nvme_qpair {
 	struct resource		*res;
 	void 			*tag;
 
-	struct callout		timer;
-	sbintime_t		deadline;
-	bool			timer_armed;
-	enum nvme_recovery	recovery_state;
+	struct callout		timer;			/* recovery lock */
+	bool			timer_armed;		/* recovery lock */
+	enum nvme_recovery	recovery_state;		/* recovery lock */
 
 	uint32_t		num_entries;
 	uint32_t		num_trackers;
@@ -182,6 +181,7 @@ struct nvme_qpair {
 	int64_t			num_retries;
 	int64_t			num_failures;
 	int64_t			num_ignored;
+	int64_t			num_recovery_nolock;
 
 	struct nvme_command	*cmd;
 	struct nvme_completion	*cpl;
@@ -200,7 +200,7 @@ struct nvme_qpair {
 	struct nvme_tracker	**act_tr;
 
 	struct mtx_padalign	lock;
-
+	struct mtx_padalign	recovery;
 } __aligned(CACHE_LINE_SIZE);
 
 struct nvme_namespace {
diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c
index 6d34c7ddba2d..b256c4713c8d 100644
--- a/sys/dev/nvme/nvme_qpair.c
+++ b/sys/dev/nvme/nvme_qpair.c
@@ -528,14 +528,17 @@ nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
 	nvme_free_request(req);
 }
 
-bool
-nvme_qpair_process_completions(struct nvme_qpair *qpair)
+/* Locked version of completion processor */
+static bool
+_nvme_qpair_process_completions(struct nvme_qpair *qpair)
 {
 	struct nvme_tracker	*tr;
 	struct nvme_completion	cpl;
-	int done = 0;
+	bool done = false;
 	bool in_panic = dumping || SCHEDULER_STOPPED();
 
+	mtx_assert(&qpair->recovery, MA_OWNED);
+
 	/*
 	 * qpair is not enabled, likely because a controller reset is in
 	 * progress.  Ignore the interrupt - any I/O that was associated with
@@ -629,7 +632,7 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair)
 		else
 			tr = NULL;
 
-		done++;
+		done = true;
 		if (tr != NULL) {
 			nvme_qpair_complete_tracker(tr, &cpl, ERROR_PRINT_ALL);
 			qpair->sq_head = cpl.sqhd;
@@ -666,12 +669,35 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair)
 		}
 	}
 
-	if (done != 0) {
+	if (done) {
 		bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
 		    qpair->cq_hdbl_off, qpair->cq_head);
 	}
 
-	return (done != 0);
+	return (done);
+}
+
+bool
+nvme_qpair_process_completions(struct nvme_qpair *qpair)
+{
+	bool done;
+
+	/*
+	 * Interlock with reset / recovery code. This is an usually uncontended
+	 * to make sure that we drain out of the ISRs before we reset the card
+	 * and to prevent races with the recovery process called from a timeout
+	 * context.
+	 */
+	if (!mtx_trylock(&qpair->recovery)) {
+		qpair->num_recovery_nolock++;
+		return (false);
+	}
+
+	done = _nvme_qpair_process_completions(qpair);
+
+	mtx_unlock(&qpair->recovery);
+
+	return (done);
 }
 
 static void
@@ -699,6 +725,7 @@ nvme_qpair_construct(struct nvme_qpair *qpair,
 	qpair->ctrlr = ctrlr;
 
 	mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF);
+	mtx_init(&qpair->recovery, "nvme qpair recovery", NULL, MTX_DEF);
 
 	/* Note: NVMe PRP format is restricted to 4-byte alignment. */
 	err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
@@ -765,7 +792,7 @@ nvme_qpair_construct(struct nvme_qpair *qpair,
 	qpair->cpl_bus_addr = queuemem_phys + cmdsz;
 	prpmem_phys = queuemem_phys + cmdsz + cplsz;
 
-	callout_init(&qpair->timer, 1);
+	callout_init_mtx(&qpair->timer, &qpair->recovery, 0);
 	qpair->timer_armed = false;
 	qpair->recovery_state = RECOVERY_WAITING;
 
@@ -903,6 +930,8 @@ nvme_qpair_destroy(struct nvme_qpair *qpair)
 
 	if (mtx_initialized(&qpair->lock))
 		mtx_destroy(&qpair->lock);
+	if (mtx_initialized(&qpair->recovery))
+		mtx_destroy(&qpair->recovery);
 
 	if (qpair->res) {
 		bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ,
@@ -975,14 +1004,12 @@ nvme_qpair_timeout(void *arg)
 	struct nvme_controller	*ctrlr = qpair->ctrlr;
 	struct nvme_tracker	*tr;
 	sbintime_t		now;
-	bool			idle;
+	bool			idle = false;
 	bool			needs_reset;
 	uint32_t		csts;
 	uint8_t			cfs;
 
-
-	mtx_lock(&qpair->lock);
-	idle = TAILQ_EMPTY(&qpair->outstanding_tr);
+	mtx_assert(&qpair->recovery, MA_OWNED);
 
 	switch (qpair->recovery_state) {
 	case RECOVERY_NONE:
@@ -1003,25 +1030,10 @@ nvme_qpair_timeout(void *arg)
 			goto do_reset;
 
 		/*
-		 * Next, check to see if we have any completions. If we do,
-		 * we've likely missed an interrupt, but the card is otherwise
-		 * fine. This will also catch all the commands that are about
-		 * to timeout (but there's still a tiny race). Since the timeout
-		 * is long relative to the race between here and the check below,
-		 * this is still a win.
+		 * Process completions. We already have the recovery lock, so
+		 * call the locked version.
 		 */
-		mtx_unlock(&qpair->lock);
-		nvme_qpair_process_completions(qpair);
-		mtx_lock(&qpair->lock);
-		if (qpair->recovery_state != RECOVERY_NONE) {
-			/*
-			 * Somebody else adjusted recovery state while unlocked,
-			 * we should bail. Unlock the qpair and return without
-			 * doing anything else.
-			 */
-			mtx_unlock(&qpair->lock);
-			return;
-		}
+		_nvme_qpair_process_completions(qpair);
 
 		/*
 		 * Check to see if we need to timeout any commands. If we do, then
@@ -1029,7 +1041,13 @@ nvme_qpair_timeout(void *arg)
 		 */
 		now = getsbinuptime();
 		needs_reset = false;
+		idle = true;
+		mtx_lock(&qpair->lock);
 		TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
+			/*
+			 * Skip async commands, they are posted to the card for
+			 * an indefinite amount of time and have no deadline.
+			 */
 			if (tr->deadline == SBT_MAX)
 				continue;
 			if (now > tr->deadline) {
@@ -1055,6 +1073,7 @@ nvme_qpair_timeout(void *arg)
 				idle = false;
 			}
 		}
+		mtx_unlock(&qpair->lock);
 		if (!needs_reset)
 			break;
 
@@ -1076,6 +1095,7 @@ nvme_qpair_timeout(void *arg)
 		break;
 	case RECOVERY_WAITING:
 		nvme_printf(ctrlr, "waiting for reset to complete\n");
+		idle = false;			/* We want to keep polling */
 		break;
 	}
 
@@ -1087,7 +1107,6 @@ nvme_qpair_timeout(void *arg)
 	} else {
 		qpair->timer_armed = false;
 	}
-	mtx_unlock(&qpair->lock);
 }
 
 /*
@@ -1196,9 +1215,12 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 
 	if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
 		/*
-		 * No tracker is available, or the qpair is disabled due to
-		 *  an in-progress controller-level reset or controller
-		 *  failure.
+		 * No tracker is available, or the qpair is disabled due to an
+		 * in-progress controller-level reset or controller failure. If
+		 * we lose the race with recovery_state, then we may add an
+		 * extra request to the queue which will be resubmitted later.
+		 * We only set recovery_state to NONE with qpair->lock also
+		 * held.
 		 */
 
 		if (qpair->ctrlr->is_failed) {
@@ -1260,7 +1282,10 @@ nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 static void
 nvme_qpair_enable(struct nvme_qpair *qpair)
 {
-	mtx_assert(&qpair->lock, MA_OWNED);
+	if (mtx_initialized(&qpair->recovery))
+		mtx_assert(&qpair->recovery, MA_OWNED);
+	if (mtx_initialized(&qpair->lock))
+		mtx_assert(&qpair->lock, MA_OWNED);
 
 	qpair->recovery_state = RECOVERY_NONE;
 }
@@ -1311,9 +1336,11 @@ nvme_admin_qpair_enable(struct nvme_qpair *qpair)
 		nvme_printf(qpair->ctrlr,
 		    "done aborting outstanding admin\n");
 
+	mtx_lock(&qpair->recovery);
 	mtx_lock(&qpair->lock);
 	nvme_qpair_enable(qpair);
 	mtx_unlock(&qpair->lock);
+	mtx_unlock(&qpair->recovery);
 }
 
 void
@@ -1340,8 +1367,8 @@ nvme_io_qpair_enable(struct nvme_qpair *qpair)
 	if (report)
 		nvme_printf(qpair->ctrlr, "done aborting outstanding i/o\n");
 
+	mtx_lock(&qpair->recovery);
 	mtx_lock(&qpair->lock);
-
 	nvme_qpair_enable(qpair);
 
 	STAILQ_INIT(&temp);
@@ -1360,6 +1387,7 @@ nvme_io_qpair_enable(struct nvme_qpair *qpair)
 		nvme_printf(qpair->ctrlr, "done resubmitting i/o\n");
 
 	mtx_unlock(&qpair->lock);
+	mtx_unlock(&qpair->recovery);
 }
 
 static void
@@ -1367,27 +1395,40 @@ nvme_qpair_disable(struct nvme_qpair *qpair)
 {
 	struct nvme_tracker	*tr, *tr_temp;
 
-	mtx_lock(&qpair->lock);
+	if (mtx_initialized(&qpair->recovery))
+		mtx_assert(&qpair->recovery, MA_OWNED);
+	if (mtx_initialized(&qpair->lock))
+		mtx_assert(&qpair->lock, MA_OWNED);
+
 	qpair->recovery_state = RECOVERY_WAITING;
 	TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
 		tr->deadline = SBT_MAX;
 	}
-	mtx_unlock(&qpair->lock);
 }
 
 void
 nvme_admin_qpair_disable(struct nvme_qpair *qpair)
 {
+	mtx_lock(&qpair->recovery);
+	mtx_lock(&qpair->lock);
 
 	nvme_qpair_disable(qpair);
 	nvme_admin_qpair_abort_aers(qpair);
+
+	mtx_unlock(&qpair->lock);
+	mtx_unlock(&qpair->recovery);
 }
 
 void
 nvme_io_qpair_disable(struct nvme_qpair *qpair)
 {
+	mtx_lock(&qpair->recovery);
+	mtx_lock(&qpair->lock);
 
 	nvme_qpair_disable(qpair);
+
+	mtx_unlock(&qpair->lock);
+	mtx_unlock(&qpair->recovery);
 }
 
 void
diff --git a/sys/dev/nvme/nvme_sysctl.c b/sys/dev/nvme/nvme_sysctl.c
index a1a8a968eebe..ac0d507e2337 100644
--- a/sys/dev/nvme/nvme_sysctl.c
+++ b/sys/dev/nvme/nvme_sysctl.c
@@ -163,6 +163,7 @@ nvme_qpair_reset_stats(struct nvme_qpair *qpair)
 	qpair->num_retries = 0;
 	qpair->num_failures = 0;
 	qpair->num_ignored = 0;
+	qpair->num_recovery_nolock = 0;
 }
 
 static int
@@ -240,6 +241,21 @@ nvme_sysctl_num_ignored(SYSCTL_HANDLER_ARGS)
 	return (sysctl_handle_64(oidp, &num_ignored, 0, req));
 }
 
+static int
+nvme_sysctl_num_recovery_nolock(SYSCTL_HANDLER_ARGS)
+{
+	struct nvme_controller 	*ctrlr = arg1;
+	int64_t			num;
+	int			i;
+
+	num = ctrlr->adminq.num_recovery_nolock;
+
+	for (i = 0; i < ctrlr->num_io_queues; i++)
+		num += ctrlr->ioq[i].num_recovery_nolock;
+
+	return (sysctl_handle_64(oidp, &num, 0, req));
+}
+
 static int
 nvme_sysctl_reset_stats(SYSCTL_HANDLER_ARGS)
 {
@@ -298,6 +314,9 @@ nvme_sysctl_initialize_queue(struct nvme_qpair *qpair,
 	SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_ignored",
 	    CTLFLAG_RD, &qpair->num_ignored,
 	    "Number of interrupts posted, but were administratively ignored");
+	SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_recovery_nolock",
+	    CTLFLAG_RD, &qpair->num_recovery_nolock,
+	    "Number of times that we failed to lock recovery in the ISR");
 
 	SYSCTL_ADD_PROC(ctrlr_ctx, que_list, OID_AUTO,
 	    "dump_debug", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
@@ -366,6 +385,11 @@ nvme_sysctl_initialize_ctrlr(struct nvme_controller *ctrlr)
 	    ctrlr, 0, nvme_sysctl_num_ignored, "IU",
 	    "Number of interrupts ignored administratively");
 
+	SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
+	    "num_recovery_nolock", CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+	    ctrlr, 0, nvme_sysctl_num_recovery_nolock, "IU",
+	    "Number of times that we failed to lock recovery in the ISR");
+
 	SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
 	    "reset_stats", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, ctrlr,
 	    0, nvme_sysctl_reset_stats, "IU", "Reset statistics to zero");