git: 0d6f2ab194c7 - stable/13 - cxgbe(4): Add support for NIC suspend/resume and live reset.

From: Navdeep Parhar <np_at_FreeBSD.org>
Date: Wed, 20 Oct 2021 14:07:00 UTC
The branch stable/13 has been updated by np:

URL: https://cgit.FreeBSD.org/src/commit/?id=0d6f2ab194c79443b8870bc7a82fa4b2e6954f0e

commit 0d6f2ab194c79443b8870bc7a82fa4b2e6954f0e
Author:     Navdeep Parhar <np@FreeBSD.org>
AuthorDate: 2021-04-28 04:33:10 +0000
Commit:     Navdeep Parhar <np@FreeBSD.org>
CommitDate: 2021-10-20 13:59:41 +0000

    cxgbe(4): Add support for NIC suspend/resume and live reset.
    
    Add suspend/resume callbacks to the driver and a live reset built around
    them.  This commit covers the basic NIC and future commits will expand
    this functionality to other stateful parts of the chip.  Suspend and
    resume operate on the chip (the t?nex nexus device) and affect all its
    ports.  It is not possible to suspend/resume or reset individual ports.
    All these operations can be performed on a running NIC.  A reset will
    look like a link bounce to the networking stack.
    
    Here are some ways to exercise this functionality:
    
     /* Manual suspend and resume. */
     # devctl suspend t6nex0
     # devctl resume t6nex0
    
     /* Manual reset. */
     # devctl reset t6nex0
    
     /* Manual reset with driver sysctl. */
     # sysctl dev.t6nex.0.reset=1
    
     /* Automatic adapter reset on any fatal error. */
     # sysctl hw.cxgbe.reset_on_fatal_err=1
    
    Suspend disables the adapter (DMA, interrupts, and the port PHYs) and
    marks the hardware as unavailable to the driver.  All ifnets associated
    with the adapter are still visible to the kernel but operations that
    require hardware interaction will fail with ENXIO.  All ifnets report
    link-down while the adapter is suspended.
    
    Resume will reattach to the card, reconfigure it as before, and recreate
    the queues servicing the existing ifnets.  The ifnets are able to send
    and receive traffic as soon as the link comes back up.
    
    Reset is roughly the same as a suspend and a resume with at least one of
    these events in between: D0->D3Hot->D0, FLR, PCIe link retrain.
    
    (cherry picked from commit 83b5cda106a2dc0c8ace1718485c2ef05c5aa62b)
---
 sys/dev/cxgbe/adapter.h   |   52 +-
 sys/dev/cxgbe/t4_clip.c   |    2 +-
 sys/dev/cxgbe/t4_filter.c |   36 +-
 sys/dev/cxgbe/t4_main.c   | 1586 +++++++++++++++++++++++++++++++++++++--------
 sys/dev/cxgbe/t4_sched.c  |   13 +-
 sys/dev/cxgbe/t4_tracer.c |   10 +
 6 files changed, 1393 insertions(+), 306 deletions(-)

diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h
index 8809a10269a1..2f4619b1180f 100644
--- a/sys/dev/cxgbe/adapter.h
+++ b/sys/dev/cxgbe/adapter.h
@@ -155,6 +155,7 @@ enum {
 	IS_VF		= (1 << 7),
 	KERN_TLS_ON	= (1 << 8),	/* HW is configured for KERN_TLS */
 	CXGBE_BUSY	= (1 << 9),
+	HW_OFF_LIMITS	= (1 << 10),	/* off limits to all except reset_thread */
 
 	/* port flags */
 	HAS_TRACEQ	= (1 << 3),
@@ -945,13 +946,26 @@ struct adapter {
 	TAILQ_HEAD(, sge_fl) sfl;
 	struct callout sfl_callout;
 
-	struct mtx reg_lock;	/* for indirect register access */
+	/*
+	 * Driver code that can run when the adapter is suspended must use this
+	 * lock or a synchronized_op and check for HW_OFF_LIMITS before
+	 * accessing hardware.
+	 *
+	 * XXX: could be changed to rwlock.  wlock in suspend/resume and for
+	 * indirect register access, rlock everywhere else.
+	 */
+	struct mtx reg_lock;
 
 	struct memwin memwin[NUM_MEMWIN];	/* memory windows */
 
 	struct mtx tc_lock;
 	struct task tc_task;
 
+	struct task reset_task;
+	const void *reset_thread;
+	int num_resets;
+	int incarnation;
+
 	const char *last_op;
 	const void *last_op_thr;
 	int last_op_flags;
@@ -1041,24 +1055,34 @@ forwarding_intr_to_fwq(struct adapter *sc)
 	return (sc->intr_count == 1);
 }
 
+/* Is HW_OFF_LIMITS set?  Works reliably inside a sync_op or with reg_lock held. */
+static inline bool
+hw_off_limits(struct adapter *sc)
+{
+	return (__predict_false(sc->flags & HW_OFF_LIMITS));	/* "off limits" is hinted as the unlikely case */
+}
+
 static inline uint32_t
 t4_read_reg(struct adapter *sc, uint32_t reg)
 {
-
+	if (hw_off_limits(sc))
+		MPASS(curthread == sc->reset_thread);
 	return bus_space_read_4(sc->bt, sc->bh, reg);
 }
 
 static inline void
 t4_write_reg(struct adapter *sc, uint32_t reg, uint32_t val)
 {
-
+	if (hw_off_limits(sc))
+		MPASS(curthread == sc->reset_thread);
 	bus_space_write_4(sc->bt, sc->bh, reg, val);
 }
 
 static inline uint64_t
 t4_read_reg64(struct adapter *sc, uint32_t reg)
 {
-
+	if (hw_off_limits(sc))
+		MPASS(curthread == sc->reset_thread);
 #ifdef __LP64__
 	return bus_space_read_8(sc->bt, sc->bh, reg);
 #else
@@ -1071,7 +1095,8 @@ t4_read_reg64(struct adapter *sc, uint32_t reg)
 static inline void
 t4_write_reg64(struct adapter *sc, uint32_t reg, uint64_t val)
 {
-
+	if (hw_off_limits(sc))
+		MPASS(curthread == sc->reset_thread);
 #ifdef __LP64__
 	bus_space_write_8(sc->bt, sc->bh, reg, val);
 #else
@@ -1083,14 +1108,16 @@ t4_write_reg64(struct adapter *sc, uint32_t reg, uint64_t val)
 static inline void
 t4_os_pci_read_cfg1(struct adapter *sc, int reg, uint8_t *val)
 {
-
+	if (hw_off_limits(sc))
+		MPASS(curthread == sc->reset_thread);
 	*val = pci_read_config(sc->dev, reg, 1);
 }
 
 static inline void
 t4_os_pci_write_cfg1(struct adapter *sc, int reg, uint8_t val)
 {
-
+	if (hw_off_limits(sc))
+		MPASS(curthread == sc->reset_thread);
 	pci_write_config(sc->dev, reg, val, 1);
 }
 
@@ -1098,27 +1125,32 @@ static inline void
 t4_os_pci_read_cfg2(struct adapter *sc, int reg, uint16_t *val)
 {
 
+	if (hw_off_limits(sc))
+		MPASS(curthread == sc->reset_thread);
 	*val = pci_read_config(sc->dev, reg, 2);
 }
 
 static inline void
 t4_os_pci_write_cfg2(struct adapter *sc, int reg, uint16_t val)
 {
-
+	if (hw_off_limits(sc))
+		MPASS(curthread == sc->reset_thread);
 	pci_write_config(sc->dev, reg, val, 2);
 }
 
 static inline void
 t4_os_pci_read_cfg4(struct adapter *sc, int reg, uint32_t *val)
 {
-
+	if (hw_off_limits(sc))
+		MPASS(curthread == sc->reset_thread);
 	*val = pci_read_config(sc->dev, reg, 4);
 }
 
 static inline void
 t4_os_pci_write_cfg4(struct adapter *sc, int reg, uint32_t val)
 {
-
+	if (hw_off_limits(sc))
+		MPASS(curthread == sc->reset_thread);
 	pci_write_config(sc->dev, reg, val, 4);
 }
 
diff --git a/sys/dev/cxgbe/t4_clip.c b/sys/dev/cxgbe/t4_clip.c
index f737c17eaaae..ad26d212315e 100644
--- a/sys/dev/cxgbe/t4_clip.c
+++ b/sys/dev/cxgbe/t4_clip.c
@@ -171,7 +171,7 @@ update_clip(struct adapter *sc, void *arg __unused)
 	if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4clip"))
 		return;
 
-	if (mtx_initialized(&sc->clip_table_lock))
+	if (mtx_initialized(&sc->clip_table_lock) && !hw_off_limits(sc))
 		update_clip_table(sc);
 
 	end_synchronized_op(sc, LOCK_HELD);
diff --git a/sys/dev/cxgbe/t4_filter.c b/sys/dev/cxgbe/t4_filter.c
index cddd2c96a620..3972111b4897 100644
--- a/sys/dev/cxgbe/t4_filter.c
+++ b/sys/dev/cxgbe/t4_filter.c
@@ -522,6 +522,11 @@ set_filter_mode(struct adapter *sc, uint32_t mode)
 	if (rc)
 		return (rc);
 
+	if (hw_off_limits(sc)) {
+		rc = ENXIO;
+		goto done;
+	}
+
 	if (sc->tids.ftids_in_use > 0 ||	/* TCAM filters active */
 	    sc->tids.hpftids_in_use > 0 ||	/* hi-pri TCAM filters active */
 	    sc->tids.tids_in_use > 0) {		/* TOE or hashfilters active */
@@ -568,6 +573,11 @@ set_filter_mask(struct adapter *sc, uint32_t mode)
 	if (rc)
 		return (rc);
 
+	if (hw_off_limits(sc)) {
+		rc = ENXIO;
+		goto done;
+	}
+
 	if (sc->tids.tids_in_use > 0) {		/* TOE or hashfilters active */
 		rc = EBUSY;
 		goto done;
@@ -589,20 +599,27 @@ static inline uint64_t
 get_filter_hits(struct adapter *sc, uint32_t tid)
 {
 	uint32_t tcb_addr;
+	uint64_t hits;	/* single result variable so that every path leaves through the unlock below */
 
 	tcb_addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + tid * TCB_SIZE;
 
-	if (is_t4(sc)) {
-		uint64_t hits;
+	mtx_lock(&sc->reg_lock);	/* NOTE(review): the t4_read_reg() above has already run, before this lock and the off-limits check */
+	if (hw_off_limits(sc))
+		hits = 0;	/* hardware is unavailable: report zero hits instead of reading it */
+	else if (is_t4(sc)) {
+		uint64_t t;
 
-		read_via_memwin(sc, 0, tcb_addr + 16, (uint32_t *)&hits, 8);
-		return (be64toh(hits));
+		read_via_memwin(sc, 0, tcb_addr + 16, (uint32_t *)&t, 8);
+		hits = be64toh(t);	/* 8 bytes read at TCB offset 16, converted from big-endian */
 	} else {
-		uint32_t hits;
+		uint32_t t;
 
-		read_via_memwin(sc, 0, tcb_addr + 24, &hits, 4);
-		return (be32toh(hits));
+		read_via_memwin(sc, 0, tcb_addr + 24, &t, 4);
+		hits = be32toh(t);	/* 4 bytes read at TCB offset 24, converted from big-endian */
 	}
+	mtx_unlock(&sc->reg_lock);
+
+	return (hits);
 }
 
 int
@@ -961,6 +978,11 @@ set_filter(struct adapter *sc, struct t4_filter *t)
 	if (rc)
 		return (rc);
 
+	if (hw_off_limits(sc)) {
+		rc = ENXIO;
+		goto done;
+	}
+
 	if (!(sc->flags & FULL_INIT_DONE) && ((rc = adapter_init(sc)) != 0))
 		goto done;
 
diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c
index e5c11402d9ab..1c22b0c8f124 100644
--- a/sys/dev/cxgbe/t4_main.c
+++ b/sys/dev/cxgbe/t4_main.c
@@ -100,12 +100,20 @@ static int t4_detach(device_t);
 static int t4_child_location_str(device_t, device_t, char *, size_t);
 static int t4_ready(device_t);
 static int t4_read_port_device(device_t, int, device_t *);
+static int t4_suspend(device_t);
+static int t4_resume(device_t);
+static int t4_reset_prepare(device_t, device_t);
+static int t4_reset_post(device_t, device_t);
 static device_method_t t4_methods[] = {
 	DEVMETHOD(device_probe,		t4_probe),
 	DEVMETHOD(device_attach,	t4_attach),
 	DEVMETHOD(device_detach,	t4_detach),
+	DEVMETHOD(device_suspend,	t4_suspend),
+	DEVMETHOD(device_resume,	t4_resume),
 
 	DEVMETHOD(bus_child_location_str, t4_child_location_str),
+	DEVMETHOD(bus_reset_prepare, 	t4_reset_prepare),
+	DEVMETHOD(bus_reset_post, 	t4_reset_post),
 
 	DEVMETHOD(t4_is_main_ready,	t4_ready),
 	DEVMETHOD(t4_read_port_device,	t4_read_port_device),
@@ -165,8 +173,12 @@ static device_method_t t5_methods[] = {
 	DEVMETHOD(device_probe,		t5_probe),
 	DEVMETHOD(device_attach,	t4_attach),
 	DEVMETHOD(device_detach,	t4_detach),
+	DEVMETHOD(device_suspend,	t4_suspend),
+	DEVMETHOD(device_resume,	t4_resume),
 
 	DEVMETHOD(bus_child_location_str, t4_child_location_str),
+	DEVMETHOD(bus_reset_prepare, 	t4_reset_prepare),
+	DEVMETHOD(bus_reset_post, 	t4_reset_post),
 
 	DEVMETHOD(t4_is_main_ready,	t4_ready),
 	DEVMETHOD(t4_read_port_device,	t4_read_port_device),
@@ -200,8 +212,12 @@ static device_method_t t6_methods[] = {
 	DEVMETHOD(device_probe,		t6_probe),
 	DEVMETHOD(device_attach,	t4_attach),
 	DEVMETHOD(device_detach,	t4_detach),
+	DEVMETHOD(device_suspend,	t4_suspend),
+	DEVMETHOD(device_resume,	t4_resume),
 
 	DEVMETHOD(bus_child_location_str, t4_child_location_str),
+	DEVMETHOD(bus_reset_prepare, 	t4_reset_prepare),
+	DEVMETHOD(bus_reset_post, 	t4_reset_post),
 
 	DEVMETHOD(t4_is_main_ready,	t4_ready),
 	DEVMETHOD(t4_read_port_device,	t4_read_port_device),
@@ -596,6 +612,10 @@ static int t4_panic_on_fatal_err = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, panic_on_fatal_err, CTLFLAG_RWTUN,
     &t4_panic_on_fatal_err, 0, "panic on fatal errors");
 
+static int t4_reset_on_fatal_err = 0;
+SYSCTL_INT(_hw_cxgbe, OID_AUTO, reset_on_fatal_err, CTLFLAG_RWTUN,
+    &t4_reset_on_fatal_err, 0, "reset adapter on fatal errors");
+
 static int t4_tx_vm_wr = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, tx_vm_wr, CTLFLAG_RWTUN, &t4_tx_vm_wr, 0,
     "Use VM work requests to transmit packets.");
@@ -794,6 +814,7 @@ static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS);
 static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS);
 static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS);
 static int sysctl_cpus(SYSCTL_HANDLER_ARGS);
+static int sysctl_reset(SYSCTL_HANDLER_ARGS);
 #ifdef TCP_OFFLOAD
 static int sysctl_tls(SYSCTL_HANDLER_ARGS);
 static int sysctl_tls_rx_ports(SYSCTL_HANDLER_ARGS);
@@ -829,6 +850,7 @@ static int notify_siblings(device_t, int);
 static uint64_t vi_get_counter(struct ifnet *, ift_counter);
 static uint64_t cxgbe_get_counter(struct ifnet *, ift_counter);
 static void enable_vxlan_rx(struct adapter *);
+static void reset_adapter(void *, int);
 
 struct {
 	uint16_t device;
@@ -1137,6 +1159,8 @@ t4_attach(device_t dev)
 
 	refcount_init(&sc->vxlan_refcount, 0);
 
+	TASK_INIT(&sc->reset_task, 0, reset_adapter, sc);
+
 	sc->ctrlq_oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(sc->dev),
 	    SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)), OID_AUTO, "ctrlq",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "control queues");
@@ -1785,6 +1809,572 @@ t4_detach_common(device_t dev)
 	return (0);
 }
 
+static inline bool
+ok_to_reset(struct adapter *sc)
+{	/* true only if no VI has a capability in 'caps' enabled and the six tid counters checked below are all zero */
+	struct tid_info *t = &sc->tids;
+	struct port_info *pi;
+	struct vi_info *vi;
+	int i, j;
+	const int caps = IFCAP_TOE | IFCAP_TXTLS | IFCAP_NETMAP | IFCAP_TXRTLMT;
+
+	ASSERT_SYNCHRONIZED_OP(sc);	/* both callers in this change invoke it after begin_synchronized_op */
+	MPASS(!(sc->flags & IS_VF));	/* not expected to be called for a VF */
+
+	for_each_port(sc, i) {
+		pi = sc->port[i];
+		for_each_vi(pi, j, vi) {
+			if (vi->ifp->if_capenable & caps)	/* a single enabled capability on any VI is enough to refuse */
+				return (false);
+		}
+	}
+
+	if (atomic_load_int(&t->tids_in_use) > 0)	/* from here on: any tid of any kind still in use refuses the reset */
+		return (false);
+	if (atomic_load_int(&t->stids_in_use) > 0)
+		return (false);
+	if (atomic_load_int(&t->atids_in_use) > 0)
+		return (false);
+	if (atomic_load_int(&t->ftids_in_use) > 0)
+		return (false);
+	if (atomic_load_int(&t->hpftids_in_use) > 0)
+		return (false);
+	if (atomic_load_int(&t->etids_in_use) > 0)
+		return (false);
+
+	return (true);
+}
+
+static int
+t4_suspend(device_t dev)
+{	/* device_suspend method: shut the adapter down, quiesce its queues and mark it HW_OFF_LIMITS; returns 0, ENXIO or EBUSY */
+	struct adapter *sc = device_get_softc(dev);
+	struct port_info *pi;
+	struct vi_info *vi;
+	struct ifnet *ifp;
+	struct sge_rxq *rxq;
+	struct sge_txq *txq;
+	struct sge_wrq *wrq;
+#ifdef TCP_OFFLOAD
+	struct sge_ofld_rxq *ofld_rxq;
+#endif
+#if defined(TCP_OFFLOAD) || defined(RATELIMIT)
+	struct sge_ofld_txq *ofld_txq;
+#endif
+	int rc, i, j, k;
+
+	CH_ALERT(sc, "suspend requested\n");
+
+	rc = begin_synchronized_op(sc, NULL, SLEEP_OK, "t4sus");
+	if (rc != 0)
+		return (ENXIO);	/* the specific error from begin_synchronized_op is not propagated */
+
+	/* XXX: Can the kernel call suspend repeatedly without resume? */
+	MPASS(!hw_off_limits(sc));
+
+	if (!ok_to_reset(sc)) {
+		/* XXX: should list what resource is preventing suspend. */
+		CH_ERR(sc, "not safe to suspend.\n");
+		rc = EBUSY;
+		goto done;	/* nothing has been touched yet; the adapter stays fully operational */
+	}
+
+	/* No more DMA or interrupts. */
+	t4_shutdown_adapter(sc);
+
+	/* Quiesce all activity. */
+	for_each_port(sc, i) {
+		pi = sc->port[i];
+		pi->vxlan_tcam_entry = false;
+
+		PORT_LOCK(pi);
+		if (pi->up_vis > 0) {
+			/*
+			 * t4_shutdown_adapter has already shut down all the
+			 * PHYs but it also disables interrupts and DMA so there
+			 * won't be a link interrupt.  So we update the state
+			 * manually and inform the kernel.
+			 */
+			pi->link_cfg.link_ok = false;
+			t4_os_link_changed(pi);
+		}
+		PORT_UNLOCK(pi);
+
+		for_each_vi(pi, j, vi) {
+			vi->xact_addr_filt = -1;	/* NOTE(review): -1 presumably means "no exact-match filter allocated" -- confirm */
+			if (!(vi->flags & VI_INIT_DONE))
+				continue;	/* nothing further to quiesce for a VI that was never initialized */
+
+			ifp = vi->ifp;
+			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+				mtx_lock(&vi->tick_mtx);
+				vi->flags |= VI_SKIP_STATS;	/* cleared again in t4_resume when the tick is rescheduled */
+				callout_stop(&vi->tick);
+				mtx_unlock(&vi->tick_mtx);
+				callout_drain(&vi->tick);	/* drained only after tick_mtx has been dropped */
+			}
+
+			/*
+			 * Note that the HW is not available.
+			 */
+			for_each_txq(vi, k, txq) {
+				TXQ_LOCK(txq);
+				txq->eq.flags &= ~(EQ_ENABLED | EQ_HW_ALLOCATED);
+				TXQ_UNLOCK(txq);
+			}
+#if defined(TCP_OFFLOAD) || defined(RATELIMIT)
+			for_each_ofld_txq(vi, k, ofld_txq) {
+				ofld_txq->wrq.eq.flags &= ~EQ_HW_ALLOCATED;
+			}
+#endif
+			for_each_rxq(vi, k, rxq) {
+				rxq->iq.flags &= ~IQ_HW_ALLOCATED;
+			}
+#if defined(TCP_OFFLOAD)
+			for_each_ofld_rxq(vi, k, ofld_rxq) {
+				ofld_rxq->iq.flags &= ~IQ_HW_ALLOCATED;
+			}
+#endif
+
+			quiesce_vi(vi);
+		}
+
+		if (sc->flags & FULL_INIT_DONE) {
+			/* Control queue */
+			wrq = &sc->sge.ctrlq[i];	/* indexed by the port loop variable: one control queue per port */
+			wrq->eq.flags &= ~EQ_HW_ALLOCATED;
+			quiesce_wrq(wrq);
+		}
+	}
+	if (sc->flags & FULL_INIT_DONE) {
+		/* Firmware event queue */
+		sc->sge.fwq.flags &= ~IQ_HW_ALLOCATED;
+		quiesce_iq_fl(sc, &sc->sge.fwq, NULL);
+	}
+
+	/* Mark the adapter totally off limits. */
+	mtx_lock(&sc->reg_lock);
+	sc->flags |= HW_OFF_LIMITS;
+	sc->flags &= ~(FW_OK | MASTER_PF);	/* t4_resume asserts that both are clear on entry */
+	sc->reset_thread = NULL;	/* t4_resume asserts NULL and then claims it for its own thread */
+	mtx_unlock(&sc->reg_lock);
+
+	sc->num_resets++;	/* only reached on the success path, so this counts completed suspends */
+	CH_ALERT(sc, "suspend completed.\n");
+done:
+	end_synchronized_op(sc, 0);
+	return (rc);
+}
+
+struct adapter_pre_reset_state {	/* snapshot of adapter fields: filled by save_caps_and_params, checked by compare_caps_and_params */
+	u_int flags;	/* copy of sc->flags; saved but not compared */
+	uint16_t nbmcaps;	/* the nine *caps fields are checked one by one with COMPARE_CAPS */
+	uint16_t linkcaps;
+	uint16_t switchcaps;
+	uint16_t niccaps;
+	uint16_t toecaps;
+	uint16_t rdmacaps;
+	uint16_t cryptocaps;
+	uint16_t iscsicaps;
+	uint16_t fcoecaps;
+
+	u_int cfcsum;	/* compared as the firmware config file identity */
+	char cfg_file[32];	/* only used in the mismatch message; must be the same size as sc->cfg_file */
+
+	struct adapter_params params;	/* whole-structure copies; only selected members are compared */
+	struct t4_virt_res vres;
+	struct tid_info tids;
+	struct sge sge;
+
+	int rawf_base;
+	int nrawf;
+
+};
+
+static void
+save_caps_and_params(struct adapter *sc, struct adapter_pre_reset_state *o)
+{	/* copies the adapter's current capabilities and parameters from sc into *o */
+
+	ASSERT_SYNCHRONIZED_OP(sc);
+
+	o->flags = sc->flags;
+
+	o->nbmcaps =  sc->nbmcaps;
+	o->linkcaps = sc->linkcaps;
+	o->switchcaps = sc->switchcaps;
+	o->niccaps = sc->niccaps;
+	o->toecaps = sc->toecaps;
+	o->rdmacaps = sc->rdmacaps;
+	o->cryptocaps = sc->cryptocaps;
+	o->iscsicaps = sc->iscsicaps;
+	o->fcoecaps = sc->fcoecaps;
+
+	o->cfcsum = sc->cfcsum;
+	MPASS(sizeof(o->cfg_file) == sizeof(sc->cfg_file));	/* the memcpy below copies sizeof(o->cfg_file) bytes out of sc->cfg_file */
+	memcpy(o->cfg_file, sc->cfg_file, sizeof(o->cfg_file));
+
+	o->params = sc->params;	/* structure assignments: each whole structure is copied by value */
+	o->vres = sc->vres;
+	o->tids = sc->tids;
+	o->sge = sc->sge;
+
+	o->rawf_base = sc->rawf_base;
+	o->nrawf = sc->nrawf;
+}
+
+static int
+compare_caps_and_params(struct adapter *sc, struct adapter_pre_reset_state *o)
+{	/* compares the saved state in *o with sc's current values; logs every mismatch */
+	int rc = 0;	/* stays 0 unless at least one comparison below fails */
+
+	ASSERT_SYNCHRONIZED_OP(sc);
+
+	/* Capabilities */
+#define COMPARE_CAPS(c) do { \
+	if (o->c##caps != sc->c##caps) { \
+		CH_ERR(sc, "%scaps 0x%04x -> 0x%04x.\n", #c, o->c##caps, \
+		    sc->c##caps); \
+		rc = EINVAL; \
+	} \
+} while (0)
+	COMPARE_CAPS(nbm);	/* a mismatch sets rc but does not return, so all differences get reported */
+	COMPARE_CAPS(link);
+	COMPARE_CAPS(switch);
+	COMPARE_CAPS(nic);
+	COMPARE_CAPS(toe);
+	COMPARE_CAPS(rdma);
+	COMPARE_CAPS(crypto);
+	COMPARE_CAPS(iscsi);
+	COMPARE_CAPS(fcoe);
+#undef COMPARE_CAPS
+
+	/* Firmware config file */
+	if (o->cfcsum != sc->cfcsum) {	/* only the checksum decides; the file names appear in the message only */
+		CH_ERR(sc, "config file %s (0x%x) -> %s (0x%x)\n", o->cfg_file,
+		    o->cfcsum, sc->cfg_file, sc->cfcsum);
+		rc = EINVAL;
+	}
+
+#define COMPARE_PARAM(p, name) do { \
+	if (o->p != sc->p) { \
+		CH_ERR(sc, #name " %d -> %d\n", o->p, sc->p); \
+		rc = EINVAL; \
+	} \
+} while (0)
+	COMPARE_PARAM(sge.iq_start, iq_start);	/* first argument is the member path in both structures, second is the label that gets logged */
+	COMPARE_PARAM(sge.eq_start, eq_start);
+	COMPARE_PARAM(tids.ftid_base, ftid_base);
+	COMPARE_PARAM(tids.ftid_end, ftid_end);
+	COMPARE_PARAM(tids.nftids, nftids);
+	COMPARE_PARAM(vres.l2t.start, l2t_start);
+	COMPARE_PARAM(vres.l2t.size, l2t_size);
+	COMPARE_PARAM(sge.iqmap_sz, iqmap_sz);
+	COMPARE_PARAM(sge.eqmap_sz, eqmap_sz);
+	COMPARE_PARAM(tids.tid_base, tid_base);
+	COMPARE_PARAM(tids.hpftid_base, hpftid_base);
+	COMPARE_PARAM(tids.hpftid_end, hpftid_end);
+	COMPARE_PARAM(tids.nhpftids, nhpftids);
+	COMPARE_PARAM(rawf_base, rawf_base);
+	COMPARE_PARAM(nrawf, nrawf);
+	COMPARE_PARAM(params.mps_bg_map, mps_bg_map);
+	COMPARE_PARAM(params.filter2_wr_support, filter2_wr_support);
+	COMPARE_PARAM(params.ulptx_memwrite_dsgl, ulptx_memwrite_dsgl);
+	COMPARE_PARAM(params.fr_nsmr_tpte_wr_support, fr_nsmr_tpte_wr_support);
+	COMPARE_PARAM(params.max_pkts_per_eth_tx_pkts_wr, max_pkts_per_eth_tx_pkts_wr);
+	COMPARE_PARAM(tids.ntids, ntids);
+	COMPARE_PARAM(tids.etid_base, etid_base);
+	COMPARE_PARAM(tids.etid_end, etid_end);
+	COMPARE_PARAM(tids.netids, netids);
+	COMPARE_PARAM(params.eo_wr_cred, eo_wr_cred);
+	COMPARE_PARAM(params.ethoffload, ethoffload);
+	COMPARE_PARAM(tids.natids, natids);
+	COMPARE_PARAM(tids.stid_base, stid_base);
+	COMPARE_PARAM(vres.ddp.start, ddp_start);
+	COMPARE_PARAM(vres.ddp.size, ddp_size);
+	COMPARE_PARAM(params.ofldq_wr_cred, ofldq_wr_cred);
+	COMPARE_PARAM(vres.stag.start, stag_start);
+	COMPARE_PARAM(vres.stag.size, stag_size);
+	COMPARE_PARAM(vres.rq.start, rq_start);
+	COMPARE_PARAM(vres.rq.size, rq_size);
+	COMPARE_PARAM(vres.pbl.start, pbl_start);
+	COMPARE_PARAM(vres.pbl.size, pbl_size);
+	COMPARE_PARAM(vres.qp.start, qp_start);
+	COMPARE_PARAM(vres.qp.size, qp_size);
+	COMPARE_PARAM(vres.cq.start, cq_start);
+	COMPARE_PARAM(vres.cq.size, cq_size);
+	COMPARE_PARAM(vres.ocq.start, ocq_start);
+	COMPARE_PARAM(vres.ocq.size, ocq_size);
+	COMPARE_PARAM(vres.srq.start, srq_start);
+	COMPARE_PARAM(vres.srq.size, srq_size);
+	COMPARE_PARAM(params.max_ordird_qp, max_ordird_qp);
+	COMPARE_PARAM(params.max_ird_adapter, max_ird_adapter);
+	COMPARE_PARAM(vres.iscsi.start, iscsi_start);
+	COMPARE_PARAM(vres.iscsi.size, iscsi_size);
+	COMPARE_PARAM(vres.key.start, key_start);
+	COMPARE_PARAM(vres.key.size, key_size);
+#undef COMPARE_PARAM
+
+	return (rc);	/* 0 if everything matched, EINVAL if anything differed */
+}
+
+static int
+t4_resume(device_t dev)
+{	/* device_resume method: re-contact the firmware, check that the configuration is unchanged, then rebuild ports, VIs and queues */
+	struct adapter *sc = device_get_softc(dev);
+	struct adapter_pre_reset_state *old_state = NULL;	/* stays NULL on the early exits taken before the malloc below */
+	struct port_info *pi;
+	struct vi_info *vi;
+	struct ifnet *ifp;
+	struct sge_txq *txq;
+	int rc, i, j, k;
+
+	CH_ALERT(sc, "resume requested.\n");
+
+	rc = begin_synchronized_op(sc, NULL, SLEEP_OK, "t4res");
+	if (rc != 0)
+		return (ENXIO);	/* the specific error from begin_synchronized_op is not propagated */
+	MPASS(hw_off_limits(sc));	/* these four assertions match the state t4_suspend leaves behind */
+	MPASS((sc->flags & FW_OK) == 0);
+	MPASS((sc->flags & MASTER_PF) == 0);
+	MPASS(sc->reset_thread == NULL);
+	sc->reset_thread = curthread;	/* lets this thread pass the MPASS in the register accessors while HW_OFF_LIMITS is still set */
+
+	/* Register access is expected to work by the time we're here. */
+	if (t4_read_reg(sc, A_PL_WHOAMI) == 0xffffffff) {
+		CH_ERR(sc, "%s: can't read device registers\n", __func__);
+		rc = ENXIO;
+		goto done;
+	}
+
+	/* Restore memory window. */
+	setup_memwin(sc);
+
+	/* Go no further if recovery mode has been requested. */
+	if (TUNABLE_INT_FETCH("hw.cxgbe.sos", &i) && i != 0) {
+		CH_ALERT(sc, "recovery mode on resume.\n");
+		rc = 0;
+		mtx_lock(&sc->reg_lock);
+		sc->flags &= ~HW_OFF_LIMITS;
+		mtx_unlock(&sc->reg_lock);
+		goto done;	/* rc is 0, so this path is still reported as a completed resume below */
+	}
+
+	old_state = malloc(sizeof(*old_state), M_CXGBE, M_ZERO | M_WAITOK);
+	save_caps_and_params(sc, old_state);	/* snapshot taken before the calls below refresh sc's fields */
+
+	/* Reestablish contact with firmware and become the primary PF. */
+	rc = contact_firmware(sc);
+	if (rc != 0)
+		goto done; /* error message displayed already */
+	MPASS(sc->flags & FW_OK);
+
+	if (sc->flags & MASTER_PF) {
+		rc = partition_resources(sc);
+		if (rc != 0)
+			goto done; /* error message displayed already */
+		t4_intr_clear(sc);
+	}
+
+	rc = get_params__post_init(sc);
+	if (rc != 0)
+		goto done; /* error message displayed already */
+
+	rc = set_params__post_init(sc);
+	if (rc != 0)
+		goto done; /* error message displayed already */
+
+	rc = compare_caps_and_params(sc, old_state);
+	if (rc != 0)
+		goto done; /* error message displayed already */
+
+	for_each_port(sc, i) {
+		pi = sc->port[i];
+		MPASS(pi != NULL);
+		MPASS(pi->vi != NULL);
+		MPASS(pi->vi[0].dev == pi->dev);
+
+		rc = -t4_port_init(sc, sc->mbox, sc->pf, 0, i);	/* negated: presumably the callee returns a negative errno -- confirm */
+		if (rc != 0) {
+			CH_ERR(sc,
+			    "failed to re-initialize port %d: %d\n", i, rc);
+			goto done;
+		}
+		MPASS(sc->chan_map[pi->tx_chan] == i);
+
+		PORT_LOCK(pi);
+		fixup_link_config(pi);
+		build_medialist(pi);
+		PORT_UNLOCK(pi);
+		for_each_vi(pi, j, vi) {
+			if (IS_MAIN_VI(vi))
+				continue;	/* only the extra (non-main) VIs are re-allocated in this loop */
+			rc = alloc_extra_vi(sc, pi, vi);
+			if (rc != 0) {
+				CH_ERR(vi,
+				    "failed to re-allocate extra VI: %d\n", rc);
+				goto done;
+			}
+		}
+	}
+
+	/*
+	 * Interrupts and queues are about to be enabled and other threads will
+	 * want to access the hardware too.  It is safe to do so.  Note that
+	 * this thread is still in the middle of a synchronized_op.
+	 */
+	mtx_lock(&sc->reg_lock);
+	sc->flags &= ~HW_OFF_LIMITS;	/* NOTE(review): the flag is not set again if one of the later steps fails */
+	mtx_unlock(&sc->reg_lock);
+
+	if (sc->flags & FULL_INIT_DONE) {
+		rc = adapter_full_init(sc);
+		if (rc != 0) {
+			CH_ERR(sc, "failed to re-initialize adapter: %d\n", rc);
+			goto done;
+		}
+
+		if (sc->vxlan_refcount > 0)
+			enable_vxlan_rx(sc);
+
+		for_each_port(sc, i) {
+			pi = sc->port[i];
+			for_each_vi(pi, j, vi) {
+				if (!(vi->flags & VI_INIT_DONE))
+					continue;
+				rc = vi_full_init(vi);
+				if (rc != 0) {
+					CH_ERR(vi, "failed to re-initialize "
+					    "interface: %d\n", rc);
+					goto done;
+				}
+
+				ifp = vi->ifp;
+				if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
+					continue;	/* initialized but not running: MAC, VI enable and tick are left alone */
+				/*
+				 * Note that we do not setup multicast addresses
+				 * in the first pass.  This ensures that the
+				 * unicast DMACs for all VIs on all ports get an
+				 * MPS TCAM entry.
+				 */
+				rc = update_mac_settings(ifp, XGMAC_ALL &
+				    ~XGMAC_MCADDRS);
+				if (rc != 0) {
+					CH_ERR(vi, "failed to re-configure MAC: %d\n", rc);
+					goto done;
+				}
+				rc = -t4_enable_vi(sc, sc->mbox, vi->viid, true,
+				    true);
+				if (rc != 0) {
+					CH_ERR(vi, "failed to re-enable VI: %d\n", rc);
+					goto done;
+				}
+				for_each_txq(vi, k, txq) {
+					TXQ_LOCK(txq);
+					txq->eq.flags |= EQ_ENABLED;	/* undoes the EQ_ENABLED clear done by t4_suspend */
+					TXQ_UNLOCK(txq);
+				}
+				mtx_lock(&vi->tick_mtx);
+				vi->flags &= ~VI_SKIP_STATS;
+				callout_schedule(&vi->tick, hz);	/* restart the per-VI tick that t4_suspend stopped; first run in hz ticks */
+				mtx_unlock(&vi->tick_mtx);
+			}
+			PORT_LOCK(pi);
+			if (pi->up_vis > 0) {
+				t4_update_port_info(pi);
+				fixup_link_config(pi);
+				build_medialist(pi);
+				apply_link_config(pi);
+				if (pi->link_cfg.link_ok)
+					t4_os_link_changed(pi);
+			}
+			PORT_UNLOCK(pi);
+		}
+
+		/* Now reprogram the L2 multicast addresses. */
+		for_each_port(sc, i) {
+			pi = sc->port[i];
+			for_each_vi(pi, j, vi) {
+				if (!(vi->flags & VI_INIT_DONE))
+					continue;
+				ifp = vi->ifp;
+				if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
+					continue;
+				rc = update_mac_settings(ifp, XGMAC_MCADDRS);
+				if (rc != 0) {
+					CH_ERR(vi, "failed to re-configure MCAST MACs: %d\n", rc);
+					rc = 0;	/* carry on */
+				}
+			}
+		}
+	}
+done:
+	if (rc == 0) {
+		sc->incarnation++;	/* bumped only when the resume succeeded */
+		CH_ALERT(sc, "resume completed.\n");
+	}
+	end_synchronized_op(sc, 0);
+	free(old_state, M_CXGBE);	/* old_state may still be NULL here; free(9) takes no action for NULL */
+	return (rc);
+}
+
+static int
+t4_reset_prepare(device_t dev, device_t child)
+{	/* bus_reset_prepare method; 'child' is not used */
+	struct adapter *sc = device_get_softc(dev);
+
+	CH_ALERT(sc, "reset_prepare.\n");
+	return (0);	/* only logs; always reports success */
+}
+
+static int
+t4_reset_post(device_t dev, device_t child)
+{	/* bus_reset_post method; 'child' is not used */
+	struct adapter *sc = device_get_softc(dev);
+
+	CH_ALERT(sc, "reset_post.\n");
+	return (0);	/* only logs; always reports success */
+}
+
+static void
+reset_adapter(void *arg, int pending)
+{	/* handler for sc->reset_task (see TASK_INIT in t4_attach): validate, then ask the parent bus to reset this device; 'pending' is not used */
+	struct adapter *sc = arg;
+	int rc;
+
+	CH_ALERT(sc, "reset requested.\n");
+
+	rc = begin_synchronized_op(sc, NULL, SLEEP_OK, "t4rst1");
+	if (rc != 0)
+		return;	/* could not start a synchronized op; give up without logging */
+
+	if (hw_off_limits(sc)) {
+		CH_ERR(sc, "adapter is suspended, use resume (not reset).\n");
+		rc = ENXIO;
+		goto done;
+	}
+
+	if (!ok_to_reset(sc)) {
+		/* XXX: should list what resource is preventing reset. */
+		CH_ERR(sc, "not safe to reset.\n");
+		rc = EBUSY;
+		goto done;
+	}
+
+done:
+	end_synchronized_op(sc, 0);	/* the synchronized op covers only the checks above, not the reset itself */
+	if (rc != 0)
+		return;	/* Error logged already. */
+
+	mtx_lock(&Giant);	/* Giant is held across the BUS_RESET_CHILD call */
+	rc = BUS_RESET_CHILD(device_get_parent(sc->dev), sc->dev, 0);
+	mtx_unlock(&Giant);
+	if (rc != 0)
+		CH_ERR(sc, "bus_reset_child failed: %d.\n", rc);
+	else
+		CH_ALERT(sc, "bus_reset_child succeeded.\n");
+}
+
 static int
 cxgbe_probe(device_t dev)
 {
@@ -2072,7 +2662,8 @@ cxgbe_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data)
 		ifp->if_mtu = mtu;
 		if (vi->flags & VI_INIT_DONE) {
 			t4_update_fl_bufsize(ifp);
-			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
+			if (!hw_off_limits(sc) &&
+			    ifp->if_drv_flags & IFF_DRV_RUNNING)
 				rc = update_mac_settings(ifp, XGMAC_MTU);
 		}
 		end_synchronized_op(sc, 0);
@@ -2083,6 +2674,11 @@ cxgbe_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data)
 		if (rc)
 			return (rc);
 
+		if (hw_off_limits(sc)) {
+			rc = ENXIO;
+			goto fail;
+		}
+
 		if (ifp->if_flags & IFF_UP) {
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 				flags = vi->if_flags;
@@ -2106,7 +2702,7 @@ cxgbe_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data)
 		rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4multi");
 		if (rc)
 			return (rc);
-		if (ifp->if_drv_flags & IFF_DRV_RUNNING)
+		if (!hw_off_limits(sc) && ifp->if_drv_flags & IFF_DRV_RUNNING)
 			rc = update_mac_settings(ifp, XGMAC_MCADDRS);
 		end_synchronized_op(sc, 0);
 		break;
@@ -2281,8 +2877,11 @@ fail:
 		rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4i2c");
 		if (rc)
 			return (rc);
-		rc = -t4_i2c_rd(sc, sc->mbox, pi->port_id, i2c.dev_addr,
-		    i2c.offset, i2c.len, &i2c.data[0]);
+		if (hw_off_limits(sc))
*** 1771 LINES SKIPPED ***