git: a1eda74167b5 - main - nvmf: The in-kernel NVMe over Fabrics host

From: John Baldwin <jhb_at_FreeBSD.org>
Date: Fri, 03 May 2024 00:15:49 UTC
The branch main has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=a1eda74167b5edb99fd31d507d8a3f7d7e14ae2b

commit a1eda74167b5edb99fd31d507d8a3f7d7e14ae2b
Author:     John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2024-05-02 23:29:37 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2024-05-02 23:29:37 +0000

    nvmf: The in-kernel NVMe over Fabrics host
    
    This is the client (initiator in SCSI terms) for NVMe over Fabrics.
    Userland is responsible for creating a set of queue pairs and then
    handing them off via an ioctl to this driver, e.g. via the 'connect'
    command from nvmecontrol(8).  An nvmeX new-bus device is created
    at the top-level to represent the remote controller similar to PCI
    nvmeX devices for PCI-express controllers.
    
    As with nvme(4), namespace devices named /dev/nvmeXnsY are created and
    pass through commands can be submitted to either the namespace devices
    or the controller device.  For example, 'nvmecontrol identify nvmeX'
    works for a remote Fabrics controller the same as for a PCI-express
    controller.
    
    nvmf exports remote namespaces via nda(4) devices using the new NVMF
    CAM transport.  nvmf does not support nvd(4), only nda(4).
    
    Sponsored by:   Chelsio Communications
    Differential Revision:  https://reviews.freebsd.org/D44714
---
 share/man/man4/Makefile         |   1 +
 share/man/man4/nvmf.4           |  87 ++++
 sys/conf/NOTES                  |   4 +-
 sys/conf/files                  |   8 +
 sys/dev/nvmf/host/nvmf.c        | 939 ++++++++++++++++++++++++++++++++++++++++
 sys/dev/nvmf/host/nvmf_aer.c    | 290 +++++++++++++
 sys/dev/nvmf/host/nvmf_cmd.c    | 171 ++++++++
 sys/dev/nvmf/host/nvmf_ctldev.c | 159 +++++++
 sys/dev/nvmf/host/nvmf_ns.c     | 483 +++++++++++++++++++++
 sys/dev/nvmf/host/nvmf_qpair.c  | 386 +++++++++++++++++
 sys/dev/nvmf/host/nvmf_sim.c    | 332 ++++++++++++++
 sys/dev/nvmf/host/nvmf_var.h    | 208 +++++++++
 sys/modules/nvmf/Makefile       |   3 +-
 sys/modules/nvmf/nvmf/Makefile  |  13 +
 14 files changed, 3082 insertions(+), 2 deletions(-)

diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
index aab55d9b90b5..7b6f8849be59 100644
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -408,6 +408,7 @@ MAN=	aac.4 \
 	nvd.4 \
 	${_nvdimm.4} \
 	nvme.4 \
+	nvmf.4 \
 	nvmf_tcp.4 \
 	${_nvram.4} \
 	oce.4 \
diff --git a/share/man/man4/nvmf.4 b/share/man/man4/nvmf.4
new file mode 100644
index 000000000000..8afbb4d9daaf
--- /dev/null
+++ b/share/man/man4/nvmf.4
@@ -0,0 +1,87 @@
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2024 Chelsio Communications, Inc.
+.\"
+.Dd May 2, 2024
+.Dt NVMF 4
+.Os
+.Sh NAME
+.Nm nvmf
+.Nd "NVM Express over Fabrics host driver"
+.Sh SYNOPSIS
+To compile the driver into the kernel,
+place the following line in the
+kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device nvmf"
+.Ed
+.Pp
+Alternatively, to load the driver as a
+module at boot time, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+nvmf_load="YES"
+.Ed
+.Sh DESCRIPTION
+The
+.Nm
+driver provides the kernel component of an NVM Express over Fabrics
+host.
+The NVMeoF host is the client which provides local access to
+namespaces exported by a remote controller.
+.Pp
+Associations between the local host and remote controllers are managed
+using
+.Xr nvmecontrol 8 .
+New associations are created via the
+.Cm connect
+command and destroyed via the
+.Cm disconnect
+command.
+If an association's connection is interrupted,
+the
+.Cm reconnect
+command creates a new association to replace the interrupted association.
+.Pp
+Similar to
+.Xr nvme 4 ,
+.Nm
+creates controller device nodes using the format
+.Pa /dev/nvmeX
+and namespace device nodes using the format
+.Pa /dev/nvmeXnsY .
+.Nm
+also exports remote namespaces via the CAM
+.Xr nda 4
+peripheral driver.
+Unlike
+.Xr nvme 4 ,
+.Nm
+does not support the
+.Xr nvd 4
+disk driver.
+.Pp
+Associations require a supported transport such as
+.Xr nvmf_tcp 4
+for associations using TCP/IP.
+.Sh SEE ALSO
+.Xr nda 4 ,
+.Xr nvme 4 ,
+.Xr nvmf_tcp 4 ,
+.Xr nvmft 4 ,
+.Xr nvmecontrol 8
+.Sh HISTORY
+The
+.Nm
+module first appeared in
+.Fx 15.0 .
+.Sh AUTHORS
+The
+.Nm
+driver was developed by
+.An John Baldwin Aq Mt jhb@FreeBSD.org
+under sponsorship from Chelsio Communications, Inc.
+.Sh BUGS
+.Nm
+only supports a single I/O queue pair per association.
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index 1f52af4c99d8..ffb4b43f4efc 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -1676,12 +1676,14 @@ device		mrsas		# LSI/Avago MegaRAID SAS/SATA, 6Gb/s and 12Gb/s
 # NVM Express
 #
 # nvme:	PCI-express NVM Express host controllers
+# nvmf:	NVM Express over Fabrics host
 # nvmf_tcp: TCP transport for NVM Express over Fabrics
 # nda:	CAM NVMe disk driver
 # nvd:	non-CAM NVMe disk driver
 
-device		nvme		# base NVMe driver
+device		nvme		# PCI-express NVMe host driver
 options 	NVME_USE_NVD=1	# Use nvd(4) instead of the CAM nda(4) driver
+device		nvmf		# NVMeoF host driver
 device		nvmf_tcp	# NVMeoF TCP transport
 device		nda		# NVMe direct access devices (aka disks)
 device		nvd		# expose NVMe namespaces as disks, depends on nvme
diff --git a/sys/conf/files b/sys/conf/files
index 143814301c20..4a631d979c78 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -2533,7 +2533,15 @@ dev/nvme/nvme_test.c		optional nvme
 dev/nvme/nvme_util.c		optional nvme
 dev/nvmem/nvmem.c		optional nvmem fdt
 dev/nvmem/nvmem_if.m		optional nvmem
+dev/nvmf/host/nvmf.c		optional nvmf
+dev/nvmf/host/nvmf_aer.c	optional nvmf
+dev/nvmf/host/nvmf_cmd.c	optional nvmf
+dev/nvmf/host/nvmf_ctldev.c	optional nvmf
+dev/nvmf/host/nvmf_ns.c		optional nvmf
+dev/nvmf/host/nvmf_qpair.c	optional nvmf
+dev/nvmf/host/nvmf_sim.c	optional nvmf
 dev/nvmf/nvmf_tcp.c		optional nvmf_tcp
+dev/nvmf/nvmf_transport.c	optional nvmf
 dev/oce/oce_hw.c		optional oce pci
 dev/oce/oce_if.c		optional oce pci
 dev/oce/oce_mbox.c		optional oce pci
diff --git a/sys/dev/nvmf/host/nvmf.c b/sys/dev/nvmf/host/nvmf.c
new file mode 100644
index 000000000000..0902bc78a7b5
--- /dev/null
+++ b/sys/dev/nvmf/host/nvmf.c
@@ -0,0 +1,939 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/memdesc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/host/nvmf_var.h>
+
+/* Character device switch used by nvmf_attach() for the controller node. */
+static struct cdevsw nvmf_cdevsw;
+
+/* Malloc type used for allocations made by this file. */
+MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");
+
+/* Handler for sc->disconnect_task; queued by nvmf_disconnect(). */
+static void	nvmf_disconnect_task(void *arg, int pending);
+
+/*
+ * Command completion callback for synchronous requests.  Saves the
+ * completion queue entry in the caller's status block, marks the
+ * command as done, and wakes any thread sleeping on the status block
+ * (see nvmf_wait_for_reply).
+ */
+void
+nvmf_complete(void *arg, const struct nvme_completion *cqe)
+{
+	struct nvmf_completion_status *status;
+	struct mtx *lock;
+
+	status = arg;
+	lock = mtx_pool_find(mtxpool_sleep, status);
+
+	/* Store the CQE before publishing 'done' under the pool mutex. */
+	status->cqe = *cqe;
+
+	mtx_lock(lock);
+	status->done = true;
+	mtx_unlock(lock);
+
+	wakeup(status);
+}
+
+/*
+ * Data transfer completion callback for synchronous requests.  Saves
+ * the transfer error in the caller's status block, marks the transfer
+ * as done, and wakes any thread sleeping on the status block.  The
+ * transferred byte count is not recorded.
+ */
+void
+nvmf_io_complete(void *arg, size_t xfered, int error)
+{
+	struct nvmf_completion_status *status;
+	struct mtx *lock;
+
+	status = arg;
+	lock = mtx_pool_find(mtxpool_sleep, status);
+
+	/* Store the error before publishing 'io_done' under the mutex. */
+	status->io_error = error;
+
+	mtx_lock(lock);
+	status->io_done = true;
+	mtx_unlock(lock);
+
+	wakeup(status);
+}
+
+/*
+ * Sleep until both the command completion ('done') and the data
+ * transfer completion ('io_done') have been flagged in 'status'.
+ */
+void
+nvmf_wait_for_reply(struct nvmf_completion_status *status)
+{
+	struct mtx *lock = mtx_pool_find(mtxpool_sleep, status);
+
+	mtx_lock(lock);
+	for (;;) {
+		if (status->done && status->io_done)
+			break;
+		mtx_sleep(status, lock, 0, "nvmfcmd", 0);
+	}
+	mtx_unlock(lock);
+}
+
+/*
+ * Read a controller property with a PROPERTY_GET command and wait for
+ * the reply.  A 'size' of 8 returns the full 64-bit value; any other
+ * size returns the low 32 bits.
+ *
+ * Returns 0 on success, ECONNABORTED if nvmf_cmd_get_property() fails,
+ * or EIO if the completion carries a non-zero status.
+ */
+static int
+nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
+    uint64_t *value)
+{
+	struct nvmf_completion_status status;
+	const struct nvmf_fabric_prop_get_rsp *rsp;
+	bool sent;
+
+	nvmf_status_init(&status);
+	sent = nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
+	    M_WAITOK);
+	if (!sent)
+		return (ECONNABORTED);
+	nvmf_wait_for_reply(&status);
+
+	if (status.cqe.status != 0) {
+		device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
+		    le16toh(status.cqe.status));
+		return (EIO);
+	}
+
+	/* The property value is returned in the completion entry itself. */
+	rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
+	*value = (size == 8) ? le64toh(rsp->value.u64) :
+	    le32toh(rsp->value.u32.low);
+	return (0);
+}
+
+/*
+ * Write a controller property with a PROPERTY_SET command and wait
+ * for the reply.
+ *
+ * Returns 0 on success, ECONNABORTED if nvmf_cmd_set_property() fails,
+ * or EIO if the completion carries a non-zero status.
+ */
+static int
+nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
+    uint64_t value)
+{
+	struct nvmf_completion_status status;
+	bool sent;
+
+	nvmf_status_init(&status);
+	sent = nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete,
+	    &status, M_WAITOK);
+	if (!sent)
+		return (ECONNABORTED);
+	nvmf_wait_for_reply(&status);
+
+	if (status.cqe.status == 0)
+		return (0);
+
+	device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
+	    le16toh(status.cqe.status));
+	return (EIO);
+}
+
+/*
+ * Request a normal shutdown of the remote controller by setting the
+ * SHN field in the CC property.  Failures are logged and otherwise
+ * ignored.
+ */
+static void
+nvmf_shutdown_controller(struct nvmf_softc *sc)
+{
+	uint64_t cc;
+
+	if (nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc) != 0) {
+		device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
+		return;
+	}
+
+	cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);
+
+	if (nvmf_write_property(sc, NVMF_PROP_CC, 4, cc) != 0)
+		device_printf(sc->dev,
+		    "Failed to set CC to trigger shutdown\n");
+}
+
+/*
+ * Receive-side KeepAlive timer.  If ka_active_rx_traffic was set
+ * since the previous check, clear it and rearm the timer; otherwise
+ * treat the association as dead and request a disconnect.
+ */
+static void
+nvmf_check_keep_alive(void *arg)
+{
+	struct nvmf_softc *sc = arg;
+
+	if (atomic_readandclear_int(&sc->ka_active_rx_traffic) != 0) {
+		callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
+		    C_HARDCLOCK);
+		return;
+	}
+
+	device_printf(sc->dev, "disconnecting due to KeepAlive timeout\n");
+	nvmf_disconnect(sc);
+}
+
+/*
+ * Completion callback for KeepAlive commands.  Any reply counts as
+ * received traffic; a non-zero status is only logged.
+ */
+static void
+nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
+{
+	struct nvmf_softc *sc = arg;
+
+	atomic_store_int(&sc->ka_active_rx_traffic, 1);
+	if (cqe->status == 0)
+		return;
+
+	device_printf(sc->dev, "KeepAlive response reported status %#x\n",
+	    le16toh(cqe->status));
+}
+
+/*
+ * Transmit-side KeepAlive timer.  Sends a KeepAlive command unless
+ * other traffic was transmitted during the interval, then rearms the
+ * timer.
+ */
+static void
+nvmf_send_keep_alive(void *arg)
+{
+	struct nvmf_softc *sc = arg;
+
+	/*
+	 * Don't bother sending a KeepAlive command if TBKAS is active
+	 * and another command has been sent during the interval.
+	 */
+	if (atomic_load_int(&sc->ka_active_tx_traffic) == 0) {
+		if (!nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete, sc,
+		    M_NOWAIT))
+			device_printf(sc->dev,
+			    "Failed to allocate KeepAlive command\n");
+	}
+
+	/* Clear ka_active_tx_traffic after sending the keep alive command. */
+	atomic_store_int(&sc->ka_active_tx_traffic, 0);
+
+	callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
+}
+
+/*
+ * Validate a handoff request from userland and copy in the controller
+ * data and the I/O queue parameters it points to.
+ *
+ * On success the copies are owned by 'ivars' (release them with
+ * nvmf_free_ivars) and 'hh' is recorded in ivars->hh.  Returns 0 on
+ * success, EINVAL if the queue description is invalid, or the error
+ * from copyin().
+ */
+int
+nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh)
+{
+	size_t io_len;
+	u_int qi;
+	int error;
+
+	memset(ivars, 0, sizeof(*ivars));
+
+	/* An admin queue and at least one I/O queue are required. */
+	if (!hh->admin.admin || hh->num_io_queues < 1)
+		return (EINVAL);
+
+	ivars->cdata = malloc(sizeof(*ivars->cdata), M_NVMF, M_WAITOK);
+	error = copyin(hh->cdata, ivars->cdata, sizeof(*ivars->cdata));
+	if (error != 0)
+		goto out;
+	nvme_controller_data_swapbytes(ivars->cdata);
+
+	io_len = hh->num_io_queues * sizeof(*ivars->io_params);
+	ivars->io_params = malloc(io_len, M_NVMF, M_WAITOK);
+	error = copyin(hh->io, ivars->io_params, io_len);
+	if (error != 0)
+		goto out;
+
+	/*
+	 * No I/O queue may be flagged as an admin queue, and all I/O
+	 * queues are required to be the same size.
+	 */
+	for (qi = 0; qi < hh->num_io_queues; qi++) {
+		if (ivars->io_params[qi].admin ||
+		    ivars->io_params[qi].qsize != ivars->io_params[0].qsize) {
+			error = EINVAL;
+			goto out;
+		}
+	}
+
+	ivars->hh = hh;
+	return (0);
+
+out:
+	/* io_params is still NULL here if the first copyin failed. */
+	free(ivars->io_params, M_NVMF);
+	free(ivars->cdata, M_NVMF);
+	return (error);
+}
+
+/*
+ * Release the buffers allocated by nvmf_init_ivars().  Either pointer
+ * may be NULL, e.g. cdata after nvmf_attach() has claimed it.
+ */
+void
+nvmf_free_ivars(struct nvmf_ivars *ivars)
+{
+	free(ivars->cdata, M_NVMF);
+	free(ivars->io_params, M_NVMF);
+}
+
+/*
+ * Probe method.  Only devices created with ivars are claimed; the
+ * device description is derived from the subsystem NQN.
+ */
+static int
+nvmf_probe(device_t dev)
+{
+	struct nvmf_ivars *ivars;
+	char descr[260];
+
+	ivars = device_get_ivars(dev);
+	if (ivars == NULL)
+		return (ENXIO);
+
+	snprintf(descr, sizeof(descr), "Fabrics: %.256s",
+	    ivars->cdata->subnqn);
+	device_set_desc_copy(dev, descr);
+	return (BUS_PROBE_DEFAULT);
+}
+
+/*
+ * Create the admin and I/O queue pairs described by 'ivars' and, if a
+ * KeepAlive timeout was negotiated, start the KeepAlive timers.
+ *
+ * Returns 0 on success or ENXIO if a queue pair could not be created.
+ * On failure any queue pairs already created are left in the softc
+ * for the caller to clean up.
+ */
+static int
+nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars)
+{
+	struct nvmf_handoff_host *hh = ivars->hh;
+	char qname[16];
+	u_int qi;
+
+	/* Setup the admin queue. */
+	sc->admin = nvmf_init_qp(sc, hh->trtype, &hh->admin, "admin queue");
+	if (sc->admin == NULL) {
+		device_printf(sc->dev, "Failed to setup admin queue\n");
+		return (ENXIO);
+	}
+
+	/* Setup I/O queues. */
+	sc->io = malloc(hh->num_io_queues * sizeof(*sc->io), M_NVMF,
+	    M_WAITOK | M_ZERO);
+	sc->num_io_queues = hh->num_io_queues;
+	for (qi = 0; qi < sc->num_io_queues; qi++) {
+		snprintf(qname, sizeof(qname), "I/O queue %u", qi);
+		sc->io[qi] = nvmf_init_qp(sc, hh->trtype,
+		    &ivars->io_params[qi], qname);
+		if (sc->io[qi] != NULL)
+			continue;
+
+		device_printf(sc->dev, "Failed to setup I/O queue %u\n",
+		    qi + 1);
+		return (ENXIO);
+	}
+
+	/* Nothing more to do if KeepAlive is disabled. */
+	if (hh->kato == 0)
+		return (0);
+
+	/*
+	 * Start KeepAlive timers.  Commands are sent at half the
+	 * timeout interval used to check for received traffic.
+	 */
+	sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
+	    sc->cdata->ctratt) != 0;
+	sc->ka_rx_sbt = mstosbt(hh->kato);
+	sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
+	callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
+	    nvmf_check_keep_alive, sc, C_HARDCLOCK);
+	callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
+	    nvmf_send_keep_alive, sc, C_HARDCLOCK);
+
+	return (0);
+}
+
+/*
+ * Fetch one page of the active namespace list, starting after *nsidp,
+ * and create a namespace object for each entry with a non-zero size.
+ *
+ * 'nslist' and 'data' are scratch buffers supplied by the caller.  On
+ * success *nsidp is set to the NSID to pass for the next page, or to
+ * 0 once the end of the list has been reached.  Returns false if a
+ * command fails or the list contains an invalid entry.
+ */
+static bool
+nvmf_scan_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
+    struct nvme_namespace_data *data, uint32_t *nsidp)
+{
+	struct nvmf_completion_status status;
+	uint32_t nsid;
+
+	nvmf_status_init(&status);
+	nvmf_status_wait_io(&status);
+	if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
+	    nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
+		device_printf(sc->dev,
+		    "failed to send IDENTIFY active namespaces command\n");
+		return (false);
+	}
+	nvmf_wait_for_reply(&status);
+
+	if (status.cqe.status != 0) {
+		device_printf(sc->dev,
+		    "IDENTIFY active namespaces failed, status %#x\n",
+		    le16toh(status.cqe.status));
+		return (false);
+	}
+
+	if (status.io_error != 0) {
+		device_printf(sc->dev,
+		    "IDENTIFY active namespaces failed with I/O error %d\n",
+		    status.io_error);
+		return (false);
+	}
+
+	for (u_int i = 0; i < nitems(nslist->ns); i++) {
+		nsid = nslist->ns[i];
+
+		/* A zero entry terminates the list. */
+		if (nsid == 0) {
+			*nsidp = 0;
+			return (true);
+		}
+
+		/*
+		 * sc->ns[] only has cdata->nn entries, so reject an
+		 * NSID from the controller that would index past the
+		 * end of the array.
+		 */
+		if (nsid > sc->cdata->nn) {
+			device_printf(sc->dev,
+			    "invalid namespace %u in active namespace list\n",
+			    nsid);
+			return (false);
+		}
+
+		if (sc->ns[nsid - 1] != NULL) {
+			device_printf(sc->dev,
+			    "duplicate namespace %u in active namespace list\n",
+			    nsid);
+			return (false);
+		}
+
+		nvmf_status_init(&status);
+		nvmf_status_wait_io(&status);
+		if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
+		    &status, nvmf_io_complete, &status, M_WAITOK)) {
+			device_printf(sc->dev,
+			    "failed to send IDENTIFY namespace %u command\n",
+			    nsid);
+			return (false);
+		}
+		nvmf_wait_for_reply(&status);
+
+		if (status.cqe.status != 0) {
+			device_printf(sc->dev,
+			    "IDENTIFY namespace %u failed, status %#x\n", nsid,
+			    le16toh(status.cqe.status));
+			return (false);
+		}
+
+		if (status.io_error != 0) {
+			device_printf(sc->dev,
+			    "IDENTIFY namespace %u failed with I/O error %d\n",
+			    nsid, status.io_error);
+			return (false);
+		}
+
+		/*
+		 * As in nvme_ns_construct, a size of zero indicates an
+		 * invalid namespace.
+		 */
+		nvme_namespace_data_swapbytes(data);
+		if (data->nsze == 0) {
+			device_printf(sc->dev,
+			    "ignoring active namespace %u with zero size\n",
+			    nsid);
+			continue;
+		}
+
+		sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
+
+		nvmf_sim_rescan_ns(sc, nsid);
+	}
+
+	/* The page was full, so 'nsid' is its last (non-zero) entry. */
+	MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);
+
+	/*
+	 * The active namespace list returns NSIDs greater than the
+	 * NSID given in the command, so the next page must start from
+	 * the last NSID seen; starting from nsid + 1 would skip that
+	 * namespace.  0xfffffffe is the largest valid NSID, so no
+	 * further entries can follow it.
+	 */
+	if (nsid >= 0xfffffffe)
+		*nsidp = 0;
+	else
+		*nsidp = nsid;
+	return (true);
+}
+
+/*
+ * Allocate the namespace table (one slot per cdata->nn) and populate
+ * it by walking the controller's active namespace list one page at a
+ * time.  Returns false if any page could not be processed; namespaces
+ * already created remain in sc->ns for the caller to clean up.
+ */
+static bool
+nvmf_add_namespaces(struct nvmf_softc *sc)
+{
+	struct nvme_namespace_data *nsdata;
+	struct nvme_ns_list *page;
+	uint32_t next_nsid;
+	bool ok;
+
+	sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
+	    M_WAITOK | M_ZERO);
+	page = malloc(sizeof(*page), M_NVMF, M_WAITOK);
+	nsdata = malloc(sizeof(*nsdata), M_NVMF, M_WAITOK);
+
+	/* A next NSID of zero marks the end of the list. */
+	next_nsid = 0;
+	do {
+		ok = nvmf_scan_nslist(sc, page, nsdata, &next_nsid);
+	} while (ok && next_nsid != 0);
+
+	free(nsdata, M_NVMF);
+	free(page, M_NVMF);
+	return (ok);
+}
+
+/*
+ * Attach method.  Takes ownership of the controller data from the
+ * ivars, establishes the queue pairs handed off from userland, reads
+ * the CAP and VS properties, creates the CAM SIM, starts asynchronous
+ * event handling, enumerates namespaces and finally creates the
+ * controller device node.
+ *
+ * Returns 0 on success.  On failure everything set up so far is torn
+ * down and a non-zero errno value is returned.
+ */
+static int
+nvmf_attach(device_t dev)
+{
+	struct make_dev_args mda;
+	struct nvmf_softc *sc = device_get_softc(dev);
+	struct nvmf_ivars *ivars = device_get_ivars(dev);
+	uint64_t val;
+	u_int i, mdts_shift;
+	int error;
+
+	if (ivars == NULL)
+		return (ENXIO);
+
+	sc->dev = dev;
+	sc->trtype = ivars->hh->trtype;
+	callout_init(&sc->ka_rx_timer, 1);
+	callout_init(&sc->ka_tx_timer, 1);
+	sx_init(&sc->connection_lock, "nvmf connection");
+	TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);
+
+	/* Claim the cdata pointer from ivars. */
+	sc->cdata = ivars->cdata;
+	ivars->cdata = NULL;
+
+	nvmf_init_aer(sc);
+
+	/* TODO: Multiqueue support. */
+	sc->max_pending_io = ivars->io_params[0].qsize /* * sc->num_io_queues */;
+
+	error = nvmf_establish_connection(sc, ivars);
+	if (error != 0)
+		goto out;
+
+	error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
+	if (error != 0) {
+		device_printf(sc->dev, "Failed to fetch CAP\n");
+		error = ENXIO;
+		goto out;
+	}
+
+	error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
+	if (error != 0) {
+		device_printf(sc->dev, "Failed to fetch VS\n");
+		error = ENXIO;
+		goto out;
+	}
+	sc->vs = val;
+
+	/*
+	 * Honor MDTS if it is set.  The limit is computed with an
+	 * unsigned long shift, and only when the shift count is in
+	 * range; a larger count describes a limit that cannot be
+	 * smaller than maxphys anyway.
+	 */
+	sc->max_xfer_size = maxphys;
+	if (sc->cdata->mdts != 0) {
+		mdts_shift = sc->cdata->mdts + NVME_MPS_SHIFT +
+		    NVME_CAP_HI_MPSMIN(sc->cap >> 32);
+		if (mdts_shift < sizeof(u_long) * NBBY)
+			sc->max_xfer_size = ulmin(sc->max_xfer_size,
+			    1ul << mdts_shift);
+	}
+
+	error = nvmf_init_sim(sc);
+	if (error != 0)
+		goto out;
+
+	error = nvmf_start_aer(sc);
+	if (error != 0) {
+		nvmf_destroy_sim(sc);
+		goto out;
+	}
+
+	if (!nvmf_add_namespaces(sc)) {
+		/*
+		 * nvmf_add_namespaces() only reports success or
+		 * failure, so supply an errno value; otherwise attach
+		 * would report success after tearing everything down.
+		 */
+		error = ENXIO;
+		nvmf_destroy_sim(sc);
+		goto out;
+	}
+
+	make_dev_args_init(&mda);
+	mda.mda_devsw = &nvmf_cdevsw;
+	mda.mda_uid = UID_ROOT;
+	mda.mda_gid = GID_WHEEL;
+	mda.mda_mode = 0600;
+	mda.mda_si_drv1 = sc;
+	error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
+	if (error != 0) {
+		nvmf_destroy_sim(sc);
+		goto out;
+	}
+
+	return (0);
+out:
+	/* Destroy any namespaces created before the failure. */
+	if (sc->ns != NULL) {
+		for (i = 0; i < sc->cdata->nn; i++) {
+			if (sc->ns[i] != NULL)
+				nvmf_destroy_ns(sc->ns[i]);
+		}
+		free(sc->ns, M_NVMF);
+	}
+
+	callout_drain(&sc->ka_tx_timer);
+	callout_drain(&sc->ka_rx_timer);
+
+	/* Ask the controller to shut down while the admin queue exists. */
+	if (sc->admin != NULL)
+		nvmf_shutdown_controller(sc);
+
+	/* Entries may be NULL if queue setup failed part way through. */
+	for (i = 0; i < sc->num_io_queues; i++) {
+		if (sc->io[i] != NULL)
+			nvmf_destroy_qp(sc->io[i]);
+	}
+	free(sc->io, M_NVMF);
+	if (sc->admin != NULL)
+		nvmf_destroy_qp(sc->admin);
+
+	nvmf_destroy_aer(sc);
+
+	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
+	sx_destroy(&sc->connection_lock);
+	free(sc->cdata, M_NVMF);
+	return (error);
+}
+
+/*
+ * Request teardown of the current association.  The work is deferred
+ * to nvmf_disconnect_task() on taskqueue_thread rather than being
+ * done in the caller's context.
+ */
+void
+nvmf_disconnect(struct nvmf_softc *sc)
+{
+	taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
+}
+
+/*
+ * Task handler for nvmf_disconnect().  With the connection lock held,
+ * stops the KeepAlive timers, quiesces the SIM and namespaces and
+ * destroys all queue pairs, leaving the softc without an association.
+ * During attach or detach only the admin queue is shut down.
+ */
+static void
+nvmf_disconnect_task(void *arg, int pending __unused)
+{
+	struct nvmf_softc *sc = arg;
+	u_int idx;
+
+	sx_xlock(&sc->connection_lock);
+
+	/* Ignore transport errors if there is no active association. */
+	if (sc->admin == NULL)
+		goto done;
+
+	/*
+	 * If a transport error occurs during detach, or during attach
+	 * before the device node exists (nvmf_add_namespaces), just
+	 * shut down the admin queue.  This unsticks whichever of the
+	 * two is in progress.
+	 */
+	if (sc->detaching || sc->cdev == NULL) {
+		nvmf_shutdown_qp(sc->admin);
+		goto done;
+	}
+
+	callout_drain(&sc->ka_tx_timer);
+	callout_drain(&sc->ka_rx_timer);
+	sc->ka_traffic = false;
+
+	/* Quiesce namespace consumers. */
+	nvmf_disconnect_sim(sc);
+	for (idx = 0; idx < sc->cdata->nn; idx++) {
+		if (sc->ns[idx] == NULL)
+			continue;
+		nvmf_disconnect_ns(sc->ns[idx]);
+	}
+
+	/* Shutdown the existing qpairs, I/O queues first. */
+	for (idx = 0; idx < sc->num_io_queues; idx++)
+		nvmf_destroy_qp(sc->io[idx]);
+	free(sc->io, M_NVMF);
+	sc->io = NULL;
+	sc->num_io_queues = 0;
+
+	nvmf_destroy_qp(sc->admin);
+	sc->admin = NULL;
+
+done:
+	sx_xunlock(&sc->connection_lock);
+}
+
+/*
+ * Undo a partially established association after a failed reconnect.
+ * Stops the KeepAlive timers and destroys whichever queue pairs were
+ * created, returning the softc to the disconnected state.  Must be
+ * called with the connection lock held exclusively.
+ */
+static void
+nvmf_abort_reconnect(struct nvmf_softc *sc)
+{
+	u_int i;
+
+	sx_assert(&sc->connection_lock, SA_XLOCKED);
+
+	callout_drain(&sc->ka_tx_timer);
+	callout_drain(&sc->ka_rx_timer);
+	sc->ka_traffic = false;
+
+	/* Entries may be NULL if queue setup failed part way through. */
+	for (i = 0; i < sc->num_io_queues; i++) {
+		if (sc->io[i] != NULL)
+			nvmf_destroy_qp(sc->io[i]);
+	}
+	free(sc->io, M_NVMF);
+	sc->io = NULL;
+	sc->num_io_queues = 0;
+
+	if (sc->admin != NULL) {
+		nvmf_destroy_qp(sc->admin);
+		sc->admin = NULL;
+	}
+}
+
+/*
+ * Install a new association, described by 'hh', on a controller whose
+ * previous association has been torn down.
+ *
+ * Returns 0 on success; EINVAL if the transport type or subsystem NQN
+ * does not match the existing controller; EBUSY if an association is
+ * still active or the device is detaching; or the error from setting
+ * up the new association.  On failure the controller is left in the
+ * disconnected state so that a later reconnect can be attempted.
+ */
+static int
+nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh)
+{
+	struct nvmf_ivars ivars;
+	u_int i;
+	int error;
+
+	/* XXX: Should we permit changing the transport type? */
+	if (sc->trtype != hh->trtype) {
+		device_printf(sc->dev,
+		    "transport type mismatch on reconnect\n");
+		return (EINVAL);
+	}
+
+	error = nvmf_init_ivars(&ivars, hh);
+	if (error != 0)
+		return (error);
+
+	sx_xlock(&sc->connection_lock);
+	if (sc->admin != NULL || sc->detaching) {
+		error = EBUSY;
+		goto out;
+	}
+
+	/*
+	 * Ensure this is for the same controller.  Note that the
+	 * controller ID can vary across associations if the remote
+	 * system is using the dynamic controller model.  This merely
+	 * ensures the new association is connected to the same NVMe
+	 * subsystem.
+	 */
+	if (memcmp(sc->cdata->subnqn, ivars.cdata->subnqn,
+	    sizeof(ivars.cdata->subnqn)) != 0) {
+		device_printf(sc->dev,
+		    "controller subsystem NQN mismatch on reconnect\n");
+		error = EINVAL;
+		goto out;
+	}
+
+	/*
+	 * XXX: Require same number and size of I/O queues so that
+	 * max_pending_io is still correct?
+	 */
+
+	error = nvmf_establish_connection(sc, &ivars);
+	if (error == 0)
+		error = nvmf_start_aer(sc);
+	if (error != 0) {
+		/*
+		 * Don't leave a half-built association behind: a
+		 * non-NULL sc->admin would make every later reconnect
+		 * fail with EBUSY, and the disconnect task expects all
+		 * sc->io[] entries to be valid.
+		 */
+		nvmf_abort_reconnect(sc);
+		goto out;
+	}
+
+	device_printf(sc->dev,
+	    "established new association with %u I/O queues\n",
+	    sc->num_io_queues);
+
+	/* Restart namespace consumers. */
+	for (i = 0; i < sc->cdata->nn; i++) {
+		if (sc->ns[i] != NULL)
+			nvmf_reconnect_ns(sc->ns[i]);
+	}
+	nvmf_reconnect_sim(sc);
+out:
+	sx_xunlock(&sc->connection_lock);
+	nvmf_free_ivars(&ivars);
+	return (error);
+}
+
+/*
+ * Detach method.  Removes the controller device node, marks the
+ * softc as detaching, destroys the SIM and namespaces, requests a
+ * controller shutdown if an association is active, and then releases
+ * the queue pairs and remaining softc state.  Always returns 0.
+ */
+static int
+nvmf_detach(device_t dev)
+{
+	struct nvmf_softc *sc = device_get_softc(dev);
+	u_int i;
+
+	destroy_dev(sc->cdev);
+
+	/*
+	 * Once 'detaching' is set, nvmf_disconnect_task() only shuts
+	 * down the admin queue instead of tearing down the association.
+	 */
+	sx_xlock(&sc->connection_lock);
+	sc->detaching = true;
+	sx_xunlock(&sc->connection_lock);
+
+	nvmf_destroy_sim(sc);
+	for (i = 0; i < sc->cdata->nn; i++) {
+		if (sc->ns[i] != NULL)
+			nvmf_destroy_ns(sc->ns[i]);
+	}
+	free(sc->ns, M_NVMF);
+
+	callout_drain(&sc->ka_tx_timer);
+	callout_drain(&sc->ka_rx_timer);
+
+	/* sc->admin is NULL if the association was already torn down. */
+	if (sc->admin != NULL)
+		nvmf_shutdown_controller(sc);
+
+	for (i = 0; i < sc->num_io_queues; i++) {
+		nvmf_destroy_qp(sc->io[i]);
+	}
+	free(sc->io, M_NVMF);
+
+	/*
+	 * Wait for a pending disconnect task to finish before the
+	 * admin queue it may reference is destroyed.
+	 */
+	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
+
+	if (sc->admin != NULL)
+		nvmf_destroy_qp(sc->admin);
+
+	nvmf_destroy_aer(sc);
+
+	sx_destroy(&sc->connection_lock);
+	free(sc->cdata, M_NVMF);
+	return (0);
+}
+
+/*
+ * Re-read the IDENTIFY data for namespace 'nsid' and bring sc->ns[]
+ * up to date: create the namespace if it is new, update it if it
+ * already exists, or destroy it if it now has zero size or cannot be
+ * updated.  The SIM is asked to rescan the namespace afterwards.
+ * Nothing is changed if the IDENTIFY command fails.
+ */
+void
+nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
+{
+	struct nvmf_completion_status status;
+	struct nvme_namespace_data *nsdata;
+	struct nvmf_namespace *cur;
+
+	nsdata = malloc(sizeof(*nsdata), M_NVMF, M_WAITOK);
+
+	nvmf_status_init(&status);
+	nvmf_status_wait_io(&status);
+	if (!nvmf_cmd_identify_namespace(sc, nsid, nsdata, nvmf_complete,
+	    &status, nvmf_io_complete, &status, M_WAITOK)) {
+		device_printf(sc->dev,
+		    "failed to send IDENTIFY namespace %u command\n", nsid);
+		goto fail;
+	}
+	nvmf_wait_for_reply(&status);
+
+	if (status.cqe.status != 0) {
+		device_printf(sc->dev,
+		    "IDENTIFY namespace %u failed, status %#x\n", nsid,
+		    le16toh(status.cqe.status));
+		goto fail;
+	}
+
+	if (status.io_error != 0) {
+		device_printf(sc->dev,
+		    "IDENTIFY namespace %u failed with I/O error %d\n",
+		    nsid, status.io_error);
+		goto fail;
+	}
+
+	nvme_namespace_data_swapbytes(nsdata);
+
+	/* XXX: Needs locking around sc->ns[]. */
+	cur = sc->ns[nsid - 1];
+	if (nsdata->nsze != 0 && cur == NULL) {
+		/* A namespace that was not known before. */
+		sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, nsdata);
+	} else if (cur != NULL &&
+	    (nsdata->nsze == 0 || !nvmf_update_ns(cur, nsdata))) {
+		/* The namespace went away or could not be updated. */
+		nvmf_destroy_ns(cur);
+		sc->ns[nsid - 1] = NULL;
+	}
+
+	free(nsdata, M_NVMF);
+
+	nvmf_sim_rescan_ns(sc, nsid);
+	return;
+
+fail:
+	free(nsdata, M_NVMF);
+}
+
+int
+nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
+    bool admin)
+{
+	struct nvmf_completion_status status;
+	struct nvme_command cmd;
+	struct memdesc mem;
+	struct nvmf_host_qpair *qp;
+	struct nvmf_request *req;
+	void *buf;
*** 2252 LINES SKIPPED ***