git: c1c524852f62 - main - cxgbe/t4_tom: Implement uld_stop and uld_restart for ULD_TOM.

From: Navdeep Parhar <np_at_FreeBSD.org>
Date: Mon, 22 Jul 2024 05:39:05 UTC
The branch main has been updated by np:

URL: https://cgit.FreeBSD.org/src/commit/?id=c1c524852f625cf5f420653f7850d1fe3ff6b4ca

commit c1c524852f625cf5f420653f7850d1fe3ff6b4ca
Author:     Navdeep Parhar <np@FreeBSD.org>
AuthorDate: 2024-07-12 23:56:15 +0000
Commit:     Navdeep Parhar <np@FreeBSD.org>
CommitDate: 2024-07-22 05:20:21 +0000

    cxgbe/t4_tom: Implement uld_stop and uld_restart for ULD_TOM.
    
    This allows the adapter to be suspended or reset even when stateful TOE is
    active, in some limited configurations.
    
    The LLD has already stopped the adapter hardware and all its queues by the time
    these ULD routines get called.  The general approach in t4_tom is to purge the
    lookup tables immediately so that they are ready for operation by the time the
    adapter resumes, and park all the resources left hanging by the stopped hardware
    into separate "stranded" queues that can be dealt with at leisure.
    
    Outstanding active opens, live connections, and synq entries (for connections in
    the middle of the 3-way handshake) are all treated as if the hardware had
    reported an abrupt error for the tid.  The servers/listeners are a bit different
    in that no error is reported.  They're just noted as non-functional when the
    hardware stops and are recreated by the driver during restart.
    
    MFC after:      1 month
    Sponsored by:   Chelsio Communications
---
 sys/dev/cxgbe/offload.h        |   1 +
 sys/dev/cxgbe/tom/t4_connect.c |  11 +-
 sys/dev/cxgbe/tom/t4_listen.c  | 136 ++++++++++++++++--
 sys/dev/cxgbe/tom/t4_tom.c     | 314 ++++++++++++++++++++++++++++++++++-------
 sys/dev/cxgbe/tom/t4_tom.h     |  19 ++-
 5 files changed, 409 insertions(+), 72 deletions(-)

diff --git a/sys/dev/cxgbe/offload.h b/sys/dev/cxgbe/offload.h
index b17056dc0338..c13901ee1a59 100644
--- a/sys/dev/cxgbe/offload.h
+++ b/sys/dev/cxgbe/offload.h
@@ -146,6 +146,7 @@ struct tid_info {
 	u_int stids_in_use;
 	u_int nstids_free_head;	/* # of available stids at the beginning */
 	struct stid_head stids;
+	bool stid_tab_stopped;
 
 	struct mtx atid_lock __aligned(CACHE_LINE_SIZE);
 	union aopen_entry *atid_tab;
diff --git a/sys/dev/cxgbe/tom/t4_connect.c b/sys/dev/cxgbe/tom/t4_connect.c
index 4e81f23dc267..1c98e70a4df5 100644
--- a/sys/dev/cxgbe/tom/t4_connect.c
+++ b/sys/dev/cxgbe/tom/t4_connect.c
@@ -110,15 +110,16 @@ done:
 }
 
 void
-act_open_failure_cleanup(struct adapter *sc, u_int atid, u_int status)
+act_open_failure_cleanup(struct adapter *sc, struct toepcb *toep, u_int status)
 {
-	struct toepcb *toep = lookup_atid(sc, atid);
 	struct inpcb *inp = toep->inp;
 	struct toedev *tod = &toep->td->tod;
 	struct epoch_tracker et;
 
-	free_atid(sc, atid);
-	toep->tid = -1;
+	if (toep->tid >= 0) {
+		free_atid(sc, toep->tid);
+		toep->tid = -1;
+	}
 
 	CURVNET_SET(toep->vnet);
 	if (status != EAGAIN)
@@ -158,7 +159,7 @@ do_act_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
 		release_tid(sc, GET_TID(cpl), toep->ctrlq);
 
 	rc = act_open_rpl_status_to_errno(status);
-	act_open_failure_cleanup(sc, atid, rc);
+	act_open_failure_cleanup(sc, toep, rc);
 
 	return (0);
 }
diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c
index 6e8361734db7..897c5bcaab1e 100644
--- a/sys/dev/cxgbe/tom/t4_listen.c
+++ b/sys/dev/cxgbe/tom/t4_listen.c
@@ -88,6 +88,111 @@ static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);
 
 static void send_abort_rpl_synqe(struct toedev *, struct synq_entry *, int);
 
+static int create_server6(struct adapter *, struct listen_ctx *);
+static int create_server(struct adapter *, struct listen_ctx *);
+
+int
+alloc_stid_tab(struct adapter *sc)
+{
+	struct tid_info *t = &sc->tids;
+
+	MPASS(t->nstids > 0);
+	MPASS(t->stid_tab == NULL);
+
+	t->stid_tab = malloc(t->nstids * sizeof(*t->stid_tab), M_CXGBE,
+	    M_ZERO | M_NOWAIT);
+	if (t->stid_tab == NULL)
+		return (ENOMEM);
+	mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF);
+	t->stids_in_use = 0;
+	TAILQ_INIT(&t->stids);
+	t->nstids_free_head = t->nstids;
+
+	return (0);
+}
+
+void
+free_stid_tab(struct adapter *sc)
+{
+	struct tid_info *t = &sc->tids;
+
+	KASSERT(t->stids_in_use == 0,
+	    ("%s: %d tids still in use.", __func__, t->stids_in_use));
+
+	if (mtx_initialized(&t->stid_lock))
+		mtx_destroy(&t->stid_lock);
+	free(t->stid_tab, M_CXGBE);
+	t->stid_tab = NULL;
+}
+
+void
+stop_stid_tab(struct adapter *sc)
+{
+	struct tid_info *t = &sc->tids;
+	struct tom_data *td = sc->tom_softc;
+	struct listen_ctx *lctx;
+	struct synq_entry *synqe;
+	int i, ntids;
+
+	mtx_lock(&t->stid_lock);
+	t->stid_tab_stopped = true;
+	mtx_unlock(&t->stid_lock);
+
+	mtx_lock(&td->lctx_hash_lock);
+	for (i = 0; i <= td->listen_mask; i++) {
+		LIST_FOREACH(lctx, &td->listen_hash[i], link)
+			lctx->flags &= ~(LCTX_RPL_PENDING | LCTX_SETUP_IN_HW);
+	}
+	mtx_unlock(&td->lctx_hash_lock);
+
+	mtx_lock(&td->toep_list_lock);
+	TAILQ_FOREACH(synqe, &td->synqe_list, link) {
+		MPASS(sc->incarnation == synqe->incarnation);
+		MPASS(synqe->tid >= 0);
+		MPASS(synqe == lookup_tid(sc, synqe->tid));
+		/* Remove tid from the lookup table immediately. */
+		CTR(KTR_CXGBE, "%s: tid %d@%d STRANDED, removed from table",
+		    __func__, synqe->tid, synqe->incarnation);
+		ntids = synqe->lctx->inp->inp_vflag & INP_IPV6 ? 2 : 1;
+		remove_tid(sc, synqe->tid, ntids);
+#if 0
+		/* synqe->tid is stale now but left alone for debug. */
+		synqe->tid = -1;
+#endif
+	}
+	MPASS(TAILQ_EMPTY(&td->stranded_synqe));
+	TAILQ_CONCAT(&td->stranded_synqe, &td->synqe_list, link);
+	MPASS(TAILQ_EMPTY(&td->synqe_list));
+	mtx_unlock(&td->toep_list_lock);
+}
+
+void
+restart_stid_tab(struct adapter *sc)
+{
+	struct tid_info *t = &sc->tids;
+	struct tom_data *td = sc->tom_softc;
+	struct listen_ctx *lctx;
+	int i;
+
+	mtx_lock(&td->lctx_hash_lock);
+	for (i = 0; i <= td->listen_mask; i++) {
+		LIST_FOREACH(lctx, &td->listen_hash[i], link) {
+			MPASS((lctx->flags & (LCTX_RPL_PENDING | LCTX_SETUP_IN_HW)) == 0);
+			lctx->flags |= LCTX_RPL_PENDING;
+			if (lctx->inp->inp_vflag & INP_IPV6)
+				create_server6(sc, lctx);
+			else
+				create_server(sc, lctx);
+		}
+	}
+	mtx_unlock(&td->lctx_hash_lock);
+
+	mtx_lock(&t->stid_lock);
+	t->stid_tab_stopped = false;
+	mtx_unlock(&t->stid_lock);
+
+}
+
 static int
 alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6)
 {
@@ -107,7 +212,7 @@ alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6)
 	    __func__, t->stid_base, t->nstids, n));
 
 	mtx_lock(&t->stid_lock);
-	if (n > t->nstids - t->stids_in_use) {
+	if (n > t->nstids - t->stids_in_use || t->stid_tab_stopped) {
 		mtx_unlock(&t->stid_lock);
 		return (-1);
 	}
@@ -632,12 +737,13 @@ t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
 		return (EINPROGRESS);
 	}
 
-	destroy_server(sc, lctx);
+	if (lctx->flags & LCTX_SETUP_IN_HW)
+		destroy_server(sc, lctx);
 	return (0);
 }
 
 static inline struct synq_entry *
-alloc_synqe(struct adapter *sc __unused, struct listen_ctx *lctx, int flags)
+alloc_synqe(struct adapter *sc, struct listen_ctx *lctx, int flags)
 {
 	struct synq_entry *synqe;
 
@@ -647,6 +753,7 @@ alloc_synqe(struct adapter *sc __unused, struct listen_ctx *lctx, int flags)
 	synqe = malloc(sizeof(*synqe), M_CXGBE, flags);
 	if (__predict_true(synqe != NULL)) {
 		synqe->flags = TPF_SYNQE;
+		synqe->incarnation = sc->incarnation;
 		refcount_init(&synqe->refcnt, 1);
 		synqe->lctx = lctx;
 		hold_lctx(lctx);	/* Every synqe has a ref on its lctx. */
@@ -761,8 +868,9 @@ do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
 	    __func__, stid, status, lctx->flags);
 
 	lctx->flags &= ~LCTX_RPL_PENDING;
-
-	if (status != CPL_ERR_NONE)
+	if (status == CPL_ERR_NONE)
+		lctx->flags |= LCTX_SETUP_IN_HW;
+	else
 		log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);
 
 #ifdef INVARIANTS
@@ -856,13 +964,15 @@ done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
 	int ntids;
 
 	INP_WLOCK_ASSERT(inp);
-	ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1;
 
-	remove_tid(sc, synqe->tid, ntids);
-	mtx_lock(&td->toep_list_lock);
-	TAILQ_REMOVE(&td->synqe_list, synqe, link);
-	mtx_unlock(&td->toep_list_lock);
-	release_tid(sc, synqe->tid, lctx->ctrlq);
+	if (synqe->tid != -1) {
+		ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1;
+		remove_tid(sc, synqe->tid, ntids);
+		mtx_lock(&td->toep_list_lock);
+		TAILQ_REMOVE(&td->synqe_list, synqe, link);
+		mtx_unlock(&td->toep_list_lock);
+		release_tid(sc, synqe->tid, lctx->ctrlq);
+	}
 	t4_l2t_release(e);
 	inp = release_synqe(sc, synqe);
 	if (inp)
@@ -870,10 +980,8 @@ done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
 }
 
 void
-synack_failure_cleanup(struct adapter *sc, int tid)
+synack_failure_cleanup(struct adapter *sc, struct synq_entry *synqe)
 {
-	struct synq_entry *synqe = lookup_tid(sc, tid);
-
 	INP_WLOCK(synqe->lctx->inp);
 	done_with_synqe(sc, synqe);
 }
diff --git a/sys/dev/cxgbe/tom/t4_tom.c b/sys/dev/cxgbe/tom/t4_tom.c
index fb92c88aa358..58a77ff93c86 100644
--- a/sys/dev/cxgbe/tom/t4_tom.c
+++ b/sys/dev/cxgbe/tom/t4_tom.c
@@ -89,18 +89,23 @@ static int t4_tom_modevent(module_t, int, void *);
 /* ULD ops and helpers */
 static int t4_tom_activate(struct adapter *);
 static int t4_tom_deactivate(struct adapter *);
+static int t4_tom_stop(struct adapter *);
+static int t4_tom_restart(struct adapter *);
 
 static struct uld_info tom_uld_info = {
 	.uld_activate = t4_tom_activate,
 	.uld_deactivate = t4_tom_deactivate,
+	.uld_stop = t4_tom_stop,
+	.uld_restart = t4_tom_restart,
 };
 
 static void release_offload_resources(struct toepcb *);
 static void done_with_toepcb(struct toepcb *);
-static int alloc_tid_tabs(struct tid_info *);
-static void free_tid_tabs(struct tid_info *);
+static int alloc_tid_tabs(struct adapter *);
+static void free_tid_tabs(struct adapter *);
 static void free_tom_data(struct adapter *, struct tom_data *);
 static void reclaim_wr_resources(void *, int);
+static void cleanup_stranded_tids(void *, int);
 
 struct toepcb *
 alloc_toepcb(struct vi_info *vi, int flags)
@@ -135,6 +140,7 @@ alloc_toepcb(struct vi_info *vi, int flags)
 
 	refcount_init(&toep->refcount, 1);
 	toep->td = sc->tom_softc;
+	toep->incarnation = sc->incarnation;
 	toep->vi = vi;
 	toep->tid = -1;
 	toep->tx_total = tx_credits;
@@ -254,6 +260,7 @@ offload_socket(struct socket *so, struct toepcb *toep)
 	/* Add the TOE PCB to the active list */
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
+	toep->flags |= TPF_IN_TOEP_LIST;
 	mtx_unlock(&td->toep_list_lock);
 }
 
@@ -298,6 +305,7 @@ undo_offload_socket(struct socket *so)
 		panic("%s: inp freed.", __func__);
 
 	mtx_lock(&td->toep_list_lock);
+	toep->flags &= ~TPF_IN_TOEP_LIST;
 	TAILQ_REMOVE(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 }
@@ -364,7 +372,10 @@ done_with_toepcb(struct toepcb *toep)
 	MPASS(toep->ce == NULL);
 
 	mtx_lock(&td->toep_list_lock);
-	TAILQ_REMOVE(&td->toep_list, toep, link);
+	if (toep->flags & TPF_IN_TOEP_LIST) {
+		toep->flags &= ~TPF_IN_TOEP_LIST;
+		TAILQ_REMOVE(&td->toep_list, toep, link);
+	}
 	mtx_unlock(&td->toep_list_lock);
 
 	free_toepcb(toep);
@@ -1449,14 +1460,15 @@ negative_advice(int status)
 }
 
 static int
-alloc_tid_tab(struct tid_info *t, int flags)
+alloc_tid_tab(struct adapter *sc)
 {
+	struct tid_info *t = &sc->tids;
 
 	MPASS(t->ntids > 0);
 	MPASS(t->tid_tab == NULL);
 
 	t->tid_tab = malloc(t->ntids * sizeof(*t->tid_tab), M_CXGBE,
-	    M_ZERO | flags);
+	    M_ZERO | M_NOWAIT);
 	if (t->tid_tab == NULL)
 		return (ENOMEM);
 	atomic_store_rel_int(&t->tids_in_use, 0);
@@ -1465,8 +1477,9 @@ alloc_tid_tab(struct tid_info *t, int flags)
 }
 
 static void
-free_tid_tab(struct tid_info *t)
+free_tid_tab(struct adapter *sc)
 {
+	struct tid_info *t = &sc->tids;
 
 	KASSERT(t->tids_in_use == 0,
 	    ("%s: %d tids still in use.", __func__, t->tids_in_use));
@@ -1475,62 +1488,29 @@ free_tid_tab(struct tid_info *t)
 	t->tid_tab = NULL;
 }
 
-static int
-alloc_stid_tab(struct tid_info *t, int flags)
-{
-
-	MPASS(t->nstids > 0);
-	MPASS(t->stid_tab == NULL);
-
-	t->stid_tab = malloc(t->nstids * sizeof(*t->stid_tab), M_CXGBE,
-	    M_ZERO | flags);
-	if (t->stid_tab == NULL)
-		return (ENOMEM);
-	mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF);
-	t->stids_in_use = 0;
-	TAILQ_INIT(&t->stids);
-	t->nstids_free_head = t->nstids;
-
-	return (0);
-}
-
 static void
-free_stid_tab(struct tid_info *t)
+free_tid_tabs(struct adapter *sc)
 {
-
-	KASSERT(t->stids_in_use == 0,
-	    ("%s: %d tids still in use.", __func__, t->stids_in_use));
-
-	if (mtx_initialized(&t->stid_lock))
-		mtx_destroy(&t->stid_lock);
-	free(t->stid_tab, M_CXGBE);
-	t->stid_tab = NULL;
-}
-
-static void
-free_tid_tabs(struct tid_info *t)
-{
-
-	free_tid_tab(t);
-	free_stid_tab(t);
+	free_tid_tab(sc);
+	free_stid_tab(sc);
 }
 
 static int
-alloc_tid_tabs(struct tid_info *t)
+alloc_tid_tabs(struct adapter *sc)
 {
 	int rc;
 
-	rc = alloc_tid_tab(t, M_NOWAIT);
+	rc = alloc_tid_tab(sc);
 	if (rc != 0)
 		goto failed;
 
-	rc = alloc_stid_tab(t, M_NOWAIT);
+	rc = alloc_stid_tab(sc);
 	if (rc != 0)
 		goto failed;
 
 	return (0);
 failed:
-	free_tid_tabs(t);
+	free_tid_tabs(sc);
 	return (rc);
 }
 
@@ -1587,7 +1567,7 @@ free_tom_data(struct adapter *sc, struct tom_data *td)
 		mtx_destroy(&td->toep_list_lock);
 
 	free_tcb_history(sc, td);
-	free_tid_tabs(&sc->tids);
+	free_tid_tabs(sc);
 	free(td, M_CXGBE);
 }
 
@@ -1792,13 +1772,14 @@ reclaim_wr_resources(void *arg, int count)
 		case CPL_ACT_OPEN_REQ6:
 			atid = G_TID_TID(be32toh(OPCODE_TID(cpl)));
 			CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid);
-			act_open_failure_cleanup(sc, atid, EHOSTUNREACH);
+			act_open_failure_cleanup(sc, lookup_atid(sc, atid),
+						 EHOSTUNREACH);
 			free(wr, M_CXGBE);
 			break;
 		case CPL_PASS_ACCEPT_RPL:
 			tid = GET_TID(cpl);
 			CTR2(KTR_CXGBE, "%s: tid %u ", __func__, tid);
-			synack_failure_cleanup(sc, tid);
+			synack_failure_cleanup(sc, lookup_tid(sc, tid));
 			free(wr, M_CXGBE);
 			break;
 		default:
@@ -1809,6 +1790,83 @@ reclaim_wr_resources(void *arg, int count)
 	}
 }
 
+/*
+ * Based on do_abort_req.  We treat an abrupt hardware stop as a connection
+ * abort from the hardware.
+ */
+static void
+live_tid_failure_cleanup(struct adapter *sc, struct toepcb *toep, u_int status)
+{
+	struct inpcb *inp;
+	struct tcpcb *tp;
+	struct epoch_tracker et;
+
+	MPASS(!(toep->flags & TPF_SYNQE));
+
+	inp = toep->inp;
+	CURVNET_SET(toep->vnet);
+	NET_EPOCH_ENTER(et);	/* for tcp_close */
+	INP_WLOCK(inp);
+	tp = intotcpcb(inp);
+	toep->flags |= TPF_ABORT_SHUTDOWN;
+	if ((inp->inp_flags & INP_DROPPED) == 0) {
+		struct socket *so = inp->inp_socket;
+
+		if (so != NULL)
+			so_error_set(so, status);
+		tp = tcp_close(tp);
+		if (tp == NULL)
+			INP_WLOCK(inp);	/* re-acquire */
+	}
+	final_cpl_received(toep);
+	NET_EPOCH_EXIT(et);
+	CURVNET_RESTORE();
+}
+
+static void
+cleanup_stranded_tids(void *arg, int count)
+{
+	TAILQ_HEAD(, toepcb) tlist = TAILQ_HEAD_INITIALIZER(tlist);
+	TAILQ_HEAD(, synq_entry) slist = TAILQ_HEAD_INITIALIZER(slist);
+	struct tom_data *td = arg;
+	struct adapter *sc = td_adapter(td);
+	struct toepcb *toep;
+	struct synq_entry *synqe;
+
+	/* Clean up synq entries. */
+	mtx_lock(&td->toep_list_lock);
+	TAILQ_SWAP(&td->stranded_synqe, &slist, synq_entry, link);
+	mtx_unlock(&td->toep_list_lock);
+	while ((synqe = TAILQ_FIRST(&slist)) != NULL) {
+		TAILQ_REMOVE(&slist, synqe, link);
+		MPASS(synqe->tid >= 0);	/* stale, was kept around for debug */
+		synqe->tid = -1;
+		synack_failure_cleanup(sc, synqe);
+	}
+
+	/* Clean up in-flight active opens. */
+	mtx_lock(&td->toep_list_lock);
+	TAILQ_SWAP(&td->stranded_atids, &tlist, toepcb, link);
+	mtx_unlock(&td->toep_list_lock);
+	while ((toep = TAILQ_FIRST(&tlist)) != NULL) {
+		TAILQ_REMOVE(&tlist, toep, link);
+		MPASS(toep->tid >= 0);	/* stale, was kept around for debug */
+		toep->tid = -1;
+		act_open_failure_cleanup(sc, toep, EHOSTUNREACH);
+	}
+
+	/* Clean up live connections. */
+	mtx_lock(&td->toep_list_lock);
+	TAILQ_SWAP(&td->stranded_tids, &tlist, toepcb, link);
+	mtx_unlock(&td->toep_list_lock);
+	while ((toep = TAILQ_FIRST(&tlist)) != NULL) {
+		TAILQ_REMOVE(&tlist, toep, link);
+		MPASS(toep->tid >= 0);	/* stale, was kept around for debug */
+		toep->tid = -1;
+		live_tid_failure_cleanup(sc, toep, ECONNABORTED);
+	}
+}
+
 /*
  * Ground control to Major TOM
  * Commencing countdown, engines on
@@ -1832,6 +1890,9 @@ t4_tom_activate(struct adapter *sc)
 	mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF);
 	TAILQ_INIT(&td->toep_list);
 	TAILQ_INIT(&td->synqe_list);
+	TAILQ_INIT(&td->stranded_atids);
+	TAILQ_INIT(&td->stranded_tids);
+	TASK_INIT(&td->cleanup_stranded_tids, 0, cleanup_stranded_tids, td);
 
 	/* Listen context */
 	mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF);
@@ -1844,7 +1905,7 @@ t4_tom_activate(struct adapter *sc)
 	TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td);
 
 	/* TID tables */
-	rc = alloc_tid_tabs(&sc->tids);
+	rc = alloc_tid_tabs(sc);
 	if (rc != 0)
 		goto done;
 
@@ -1917,6 +1978,8 @@ t4_tom_deactivate(struct adapter *sc)
 	if (!TAILQ_EMPTY(&td->toep_list))
 		rc = EBUSY;
 	MPASS(TAILQ_EMPTY(&td->synqe_list));
+	MPASS(TAILQ_EMPTY(&td->stranded_tids));
+	mtx_unlock(&td->toep_list_lock);
 	mtx_unlock(&td->toep_list_lock);
 
 	mtx_lock(&td->lctx_hash_lock);
@@ -1925,6 +1988,7 @@ t4_tom_deactivate(struct adapter *sc)
 	mtx_unlock(&td->lctx_hash_lock);
 
 	taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources);
+	taskqueue_drain(taskqueue_thread, &td->cleanup_stranded_tids);
 	mtx_lock(&td->unsent_wr_lock);
 	if (!STAILQ_EMPTY(&td->unsent_wr_list))
 		rc = EBUSY;
@@ -1939,6 +2003,156 @@ t4_tom_deactivate(struct adapter *sc)
 	return (rc);
 }
 
+static void
+stop_atids(struct adapter *sc)
+{
+	struct tom_data *td = sc->tom_softc;
+	struct tid_info *t = &sc->tids;
+	struct toepcb *toep;
+	int atid;
+
+	/*
+	 * Hashfilters and T6-KTLS are the only other users of atids but they're
+	 * both mutually exclusive with TOE.  That means t4_tom owns all the
+	 * atids in the table.
+	 */
+	MPASS(!is_hashfilter(sc));
+	if (is_t6(sc))
+		MPASS(!(sc->flags & KERN_TLS_ON));
+
+	/* New atids are not being allocated. */
+#ifdef INVARIANTS
+	mtx_lock(&t->atid_lock);
+	MPASS(t->atid_alloc_stopped == true);
+	mtx_unlock(&t->atid_lock);
+#endif
+
+	/*
+	 * In-use atids fall in one of these two categories:
+	 * a) Those waiting for L2 resolution before being submitted to
+	 *    hardware.
+	 * b) Those that have been submitted to hardware and are awaiting
+	 *    replies that will never arrive because the LLD is stopped.
+	 */
+	for (atid = 0; atid < t->natids; atid++) {
+		toep = lookup_atid(sc, atid);
+		if ((uintptr_t)toep >= (uintptr_t)&t->atid_tab[0] &&
+		    (uintptr_t)toep < (uintptr_t)&t->atid_tab[t->natids])
+			continue;
+		MPASS(toep->tid == atid);
+		MPASS(toep->incarnation == sc->incarnation);
+		/*
+		 * Take the atid out of the lookup table.  toep->tid is stale
+		 * after this but useful for debug.
+		 */
+		CTR(KTR_CXGBE, "%s: atid %d@%d STRANDED, removed from table",
+		    __func__, atid, toep->incarnation);
+		free_atid(sc, toep->tid);
+#if 0
+		toep->tid = -1;
+#endif
+		mtx_lock(&td->toep_list_lock);
+		TAILQ_INSERT_TAIL(&td->stranded_atids, toep, link);
+		mtx_unlock(&td->toep_list_lock);
+	}
+	MPASS(atomic_load_int(&t->atids_in_use) == 0);
+}
+
+static void
+stop_tids(struct adapter *sc)
+{
+	struct tom_data *td = sc->tom_softc;
+	struct toepcb *toep;
+#ifdef INVARIANTS
+	struct tid_info *t = &sc->tids;
+#endif
+
+	/*
+	 * The LLD's offload queues are stopped so do_act_establish and
+	 * do_pass_accept_req cannot run and insert tids in parallel with this
+	 * thread.  stop_stid_tab has also run and removed the synq entries'
+	 * tids from the table.  The only tids in the table are for connections
+	 * at or beyond ESTABLISHED that are still waiting for the final CPL.
+	 */
+	mtx_lock(&td->toep_list_lock);
+	TAILQ_FOREACH(toep, &td->toep_list, link) {
+		MPASS(sc->incarnation == toep->incarnation);
+		MPASS(toep->tid >= 0);
+		MPASS(toep == lookup_tid(sc, toep->tid));
+		/* Remove tid from the lookup table immediately. */
+		CTR(KTR_CXGBE, "%s: tid %d@%d STRANDED, removed from table",
+		    __func__, toep->tid, toep->incarnation);
+		remove_tid(sc, toep->tid, toep->ce ? 2 : 1);
+#if 0
+		/* toep->tid is stale now but left alone for debug. */
+		toep->tid = -1;
+#endif
+		/* All toep in this list will get bulk moved to stranded_tids */
+		toep->flags &= ~TPF_IN_TOEP_LIST;
+	}
+	MPASS(TAILQ_EMPTY(&td->stranded_tids));
+	TAILQ_CONCAT(&td->stranded_tids, &td->toep_list, link);
+	MPASS(TAILQ_EMPTY(&td->toep_list));
+	mtx_unlock(&td->toep_list_lock);
+
+	MPASS(atomic_load_int(&t->tids_in_use) == 0);
+}
+
+/*
+ * L2T is stable because
+ * 1. stop_lld stopped all new allocations.
+ * 2. stop_lld also stopped the tx wrq so nothing is enqueueing new WRs to the
+ *    queue or to l2t_entry->wr_list.
+ * 3. t4_l2t_update is ignoring all L2 updates.
+ */
+static void
+stop_tom_l2t(struct adapter *sc)
+{
+	struct l2t_data *d = sc->l2t;
+	struct l2t_entry *e;
+	int i;
+
+	for (i = 0; i < d->l2t_size; i++) {
+		e = &d->l2tab[i];
+		mtx_lock(&e->lock);
+		if (e->state == L2T_STATE_VALID)
+			e->state = L2T_STATE_RESOLVING;
+		if (!STAILQ_EMPTY(&e->wr_list))
+			CXGBE_UNIMPLEMENTED("l2t e->wr_list");
+		mtx_unlock(&e->lock);
+	}
+}
+
+static int
+t4_tom_stop(struct adapter *sc)
+{
+	struct tid_info *t = &sc->tids;
+	struct tom_data *td = sc->tom_softc;
+
+	ASSERT_SYNCHRONIZED_OP(sc);
+
+	stop_tom_l2t(sc);
+	if (atomic_load_int(&t->atids_in_use) > 0)
+		stop_atids(sc);
+	if (atomic_load_int(&t->stids_in_use) > 0)
+		stop_stid_tab(sc);
+	if (atomic_load_int(&t->tids_in_use) > 0)
+		stop_tids(sc);
+	taskqueue_enqueue(taskqueue_thread, &td->cleanup_stranded_tids);
+
+	return (0);
+}
+
+static int
+t4_tom_restart(struct adapter *sc)
+{
+	ASSERT_SYNCHRONIZED_OP(sc);
+
+	restart_stid_tab(sc);
+
+	return (0);
+}
+
 static int
 t4_ctloutput_tom(struct socket *so, struct sockopt *sopt)
 {
diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h
index 0bc368fe3d56..8debab4940b1 100644
--- a/sys/dev/cxgbe/tom/t4_tom.h
+++ b/sys/dev/cxgbe/tom/t4_tom.h
@@ -76,6 +76,7 @@ enum {
 	TPF_TLS_RX_QUIESCING = (1 << 14), /* RX quiesced for TLS RX startup */
 	TPF_TLS_RX_QUIESCED = (1 << 15), /* RX quiesced for TLS RX startup */
 	TPF_WAITING_FOR_FINAL = (1<< 16), /* waiting for wakeup on final CPL */
+	TPF_IN_TOEP_LIST   = (1 << 17),	/* toep is in the main td->toep_list */
 };
 
 enum {
@@ -210,7 +211,7 @@ struct toepcb {
 	struct tom_data *td;
 	struct inpcb *inp;	/* backpointer to host stack's PCB */
 	u_int flags;		/* miscellaneous flags */
-	TAILQ_ENTRY(toepcb) link; /* toep_list */
+	TAILQ_ENTRY(toepcb) link; /* toep_list or stranded_toep_list */
 	int refcount;
 	struct vnet *vnet;
 	struct vi_info *vi;	/* virtual interface */
@@ -220,6 +221,7 @@ struct toepcb {
 	struct l2t_entry *l2te;	/* L2 table entry used by this connection */
 	struct clip_entry *ce;	/* CLIP table entry used by this tid */
 	int tid;		/* Connection identifier */
+	int incarnation;	/* sc->incarnation when toepcb was allocated */
 
 	/* tx credit handling */
 	u_int tx_total;		/* total tx WR credits (in 16B units) */
@@ -278,6 +280,7 @@ struct synq_entry {
 	uint32_t ts;
 	uint32_t rss_hash;
 	__be16 tcp_opt; /* from cpl_pass_establish */
+	int incarnation;
 	struct toepcb *toep;
 
 	struct conn_params params;
@@ -285,6 +288,7 @@ struct synq_entry {
 
 /* listen_ctx flags */
 #define LCTX_RPL_PENDING 1	/* waiting for a CPL_PASS_OPEN_RPL */
+#define LCTX_SETUP_IN_HW 2	/* stid entry is setup in hardware */
 
 struct listen_ctx {
 	LIST_ENTRY(listen_ctx) link;	/* listen hash linkage */
@@ -331,6 +335,11 @@ struct tom_data {
 	struct mtx toep_list_lock;
 	TAILQ_HEAD(, toepcb) toep_list;
 	TAILQ_HEAD(, synq_entry) synqe_list;
+	/* List of tids left stranded because hw stopped abruptly. */
+	TAILQ_HEAD(, toepcb) stranded_atids;
+	TAILQ_HEAD(, toepcb) stranded_tids;
+	TAILQ_HEAD(, synq_entry) stranded_synqe;
+	struct task cleanup_stranded_tids;
 
 	struct mtx lctx_hash_lock;
 	LIST_HEAD(, listen_ctx) *listen_hash;
@@ -477,7 +486,7 @@ void t4_init_connect_cpl_handlers(void);
 void t4_uninit_connect_cpl_handlers(void);
 int t4_connect(struct toedev *, struct socket *, struct nhop_object *,
     struct sockaddr *);
-void act_open_failure_cleanup(struct adapter *, u_int, u_int);
+void act_open_failure_cleanup(struct adapter *, struct toepcb *, u_int);
 
 /* t4_listen.c */
 void t4_init_listen_cpl_handlers(void);
@@ -492,7 +501,11 @@ int do_abort_req_synqe(struct sge_iq *, const struct rss_header *,
 int do_abort_rpl_synqe(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 void t4_offload_socket(struct toedev *, void *, struct socket *);
-void synack_failure_cleanup(struct adapter *, int);
+void synack_failure_cleanup(struct adapter *, struct synq_entry *);
+int alloc_stid_tab(struct adapter *);
+void free_stid_tab(struct adapter *);
+void stop_stid_tab(struct adapter *);
+void restart_stid_tab(struct adapter *);
 
 /* t4_cpl_io.c */
 void aiotx_init_toep(struct toepcb *);