git: a8089ea5aee5 - main - nvmfd: A simple userspace daemon for the NVMe over Fabrics controller

From: John Baldwin <jhb_at_FreeBSD.org>
Date: Fri, 03 May 2024 00:16:15 UTC
The branch main has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=a8089ea5aee578e08acab2438e82fc9a9ae50ed8

commit a8089ea5aee578e08acab2438e82fc9a9ae50ed8
Author:     John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2024-05-02 23:35:40 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2024-05-02 23:38:39 +0000

    nvmfd: A simple userspace daemon for the NVMe over Fabrics controller
    
    This daemon can operate as a purely userspace controller exporting one
    or more simulated RAM disks or local block devices as NVMe namespaces
    to a remote host.  In this case the daemon provides a discovery
    controller with a single entry for an I/O controller.
    
    nvmfd can also offload I/O controller queue pairs to the nvmft.ko
    in-kernel Fabrics controller when -K is passed.  In this mode, nvmfd
    still accepts connections and performs initial transport-specific
    negotiation in userland.  The daemon still provides a userspace-only
    discovery controller with a single entry for an I/O controller.
    However, queue pairs for the I/O controller are handed off to the CTL
    NVMF frontend.
    
    Eventually ctld(8) should be refactored to provide an abstraction
    for the frontend protocol and the discovery and the kernel mode of
    this daemon should be merged into ctld(8).  At that point this daemon
    can be moved to tools/tools/nvmf as a debugging tool (mostly as sample
    code for a userspace controller using libnvmf).
    
    Reviewed by:    imp
    Sponsored by:   Chelsio Communications
    Differential Revision:  https://reviews.freebsd.org/D44731
---
 usr.sbin/Makefile           |   1 +
 usr.sbin/nvmfd/Makefile     |  14 +
 usr.sbin/nvmfd/controller.c | 244 ++++++++++++++++
 usr.sbin/nvmfd/ctl.c        | 139 +++++++++
 usr.sbin/nvmfd/devices.c    | 386 +++++++++++++++++++++++++
 usr.sbin/nvmfd/discovery.c  | 343 ++++++++++++++++++++++
 usr.sbin/nvmfd/internal.h   |  65 +++++
 usr.sbin/nvmfd/io.c         | 677 ++++++++++++++++++++++++++++++++++++++++++++
 usr.sbin/nvmfd/nvmfd.8      | 126 +++++++++
 usr.sbin/nvmfd/nvmfd.c      | 260 +++++++++++++++++
 10 files changed, 2255 insertions(+)

diff --git a/usr.sbin/Makefile b/usr.sbin/Makefile
index c3a4cc42f721..0aac7062146d 100644
--- a/usr.sbin/Makefile
+++ b/usr.sbin/Makefile
@@ -56,6 +56,7 @@ SUBDIR=	adduser \
 	nfsuserd \
 	nmtree \
 	nologin \
+	nvmfd \
 	pciconf \
 	periodic \
 	pnfsdscopymr \
diff --git a/usr.sbin/nvmfd/Makefile b/usr.sbin/nvmfd/Makefile
new file mode 100644
index 000000000000..dc3dcc5e3a5c
--- /dev/null
+++ b/usr.sbin/nvmfd/Makefile
@@ -0,0 +1,14 @@
+.include <src.opts.mk>
+# gsb_crc32.c (listed in SRCS) is picked up from the kernel's libkern sources.
+.PATH:  ${SRCTOP}/sys/libkern
+
+PACKAGE=nvme-tools
+PROG=	nvmfd
+SRCS=	nvmfd.c controller.c ctl.c devices.c discovery.c gsb_crc32.c io.c
+# Add the libnvmf source directory to the header search path.
+CFLAGS+= -I${SRCTOP}/lib/libnvmf
+MAN=	nvmfd.8
+LIBADD+= nvmf pthread util nv
+
+.include <bsd.prog.mk>
+
+# ctl.c includes <cam/ctl/...> headers, which are found relative to sys/.
+CFLAGS.ctl.c=	-I${SRCTOP}/sys
+CWARNFLAGS.gsb_crc32.c=	-Wno-cast-align
diff --git a/usr.sbin/nvmfd/controller.c b/usr.sbin/nvmfd/controller.c
new file mode 100644
index 000000000000..09baaea74ab4
--- /dev/null
+++ b/usr.sbin/nvmfd/controller.c
@@ -0,0 +1,244 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <err.h>
+#include <errno.h>
+#include <libnvmf.h>
+#include <stdlib.h>
+
+#include "internal.h"
+
+/*
+ * State for one controller bound to a single queue pair.  The cap, vs,
+ * cc, and csts members hold the values returned by PROPERTY_GET and
+ * modified by update_cc().
+ */
+struct controller {
+	struct nvmf_qpair *qp;
+
+	uint64_t cap;
+	uint32_t vs;
+	uint32_t cc;
+	uint32_t csts;
+
+	/* Set by update_cc() on shutdown or reset; later CC writes fail. */
+	bool shutdown;
+
+	/* Copy of the data returned for IDENTIFY with CNS 1. */
+	struct nvme_controller_data cdata;
+};
+
+/*
+ * Apply a host write of the CC property.
+ *
+ * Returns false without changing any state if the controller has
+ * already been shut down or reset, or if nvmf_validate_cc() rejects
+ * the new value.  Otherwise stores new_cc, updates CSTS to reflect
+ * the change, and returns true.
+ */
+static bool
+update_cc(struct controller *c, uint32_t new_cc)
+{
+	uint32_t changes;
+
+	if (c->shutdown)
+		return (false);
+	if (!nvmf_validate_cc(c->qp, c->cap, c->cc, new_cc))
+		return (false);
+
+	/* Bits that differ between the old and new CC values. */
+	changes = c->cc ^ new_cc;
+	c->cc = new_cc;
+
+	/*
+	 * Handle shutdown requests.  A newly requested shutdown is
+	 * reported as complete right away.
+	 */
+	if (NVMEV(NVME_CC_REG_SHN, changes) != 0 &&
+	    NVMEV(NVME_CC_REG_SHN, new_cc) != 0) {
+		c->csts &= ~NVMEM(NVME_CSTS_REG_SHST);
+		c->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_COMPLETE);
+		c->shutdown = true;
+	}
+
+	if (NVMEV(NVME_CC_REG_EN, changes) != 0) {
+		if (NVMEV(NVME_CC_REG_EN, new_cc) == 0) {
+			/* Controller reset. */
+			c->csts = 0;
+			c->shutdown = true;
+		} else
+			/* Enabling the controller marks it ready at once. */
+			c->csts |= NVMEF(NVME_CSTS_REG_RDY, 1);
+	}
+	return (true);
+}
+
+/*
+ * Answer a PROPERTY_GET command for CAP, VS, CC, or CSTS.
+ *
+ * The requested size must match the property (8 bytes for CAP, 4
+ * bytes for the others).  A size mismatch or any other offset is
+ * answered with NVME_SC_INVALID_FIELD.
+ */
+static void
+handle_property_get(const struct controller *c, const struct nvmf_capsule *nc,
+    const struct nvmf_fabric_prop_get_cmd *pget)
+{
+	struct nvmf_fabric_prop_get_rsp rsp;
+
+	nvmf_init_cqe(&rsp, nc, 0);
+
+	switch (le32toh(pget->ofst)) {
+	case NVMF_PROP_CAP:
+		if (pget->attrib.size != NVMF_PROP_SIZE_8)
+			goto error;
+		rsp.value.u64 = htole64(c->cap);
+		break;
+	case NVMF_PROP_VS:
+		if (pget->attrib.size != NVMF_PROP_SIZE_4)
+			goto error;
+		rsp.value.u32.low = htole32(c->vs);
+		break;
+	case NVMF_PROP_CC:
+		if (pget->attrib.size != NVMF_PROP_SIZE_4)
+			goto error;
+		rsp.value.u32.low = htole32(c->cc);
+		break;
+	case NVMF_PROP_CSTS:
+		if (pget->attrib.size != NVMF_PROP_SIZE_4)
+			goto error;
+		rsp.value.u32.low = htole32(c->csts);
+		break;
+	default:
+		goto error;
+	}
+
+	nvmf_send_response(nc, &rsp);
+	return;
+error:
+	nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
+}
+
+/*
+ * Handle a PROPERTY_SET command.  CC is the only writable property;
+ * a wrong size, a value rejected by update_cc(), or any other offset
+ * is answered with NVME_SC_INVALID_FIELD.
+ */
+static void
+handle_property_set(struct controller *c, const struct nvmf_capsule *nc,
+    const struct nvmf_fabric_prop_set_cmd *pset)
+{
+	switch (le32toh(pset->ofst)) {
+	case NVMF_PROP_CC:
+		if (pset->attrib.size != NVMF_PROP_SIZE_4)
+			goto error;
+		if (!update_cc(c, le32toh(pset->value.u32.low)))
+			goto error;
+		break;
+	default:
+		goto error;
+	}
+
+	nvmf_send_success(nc);
+	return;
+error:
+	nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
+}
+
+/*
+ * Dispatch a Fabrics command by its fctype.  PROPERTY_GET and
+ * PROPERTY_SET are serviced; CONNECT, DISCONNECT, and unknown types
+ * are logged and rejected with an error status.
+ */
+static void
+handle_fabrics_command(struct controller *c,
+    const struct nvmf_capsule *nc, const struct nvmf_fabric_cmd *fc)
+{
+	switch (fc->fctype) {
+	case NVMF_FABRIC_COMMAND_PROPERTY_GET:
+		handle_property_get(c, nc,
+		    (const struct nvmf_fabric_prop_get_cmd *)fc);
+		break;
+	case NVMF_FABRIC_COMMAND_PROPERTY_SET:
+		handle_property_set(c, nc,
+		    (const struct nvmf_fabric_prop_set_cmd *)fc);
+		break;
+	case NVMF_FABRIC_COMMAND_CONNECT:
+		warnx("CONNECT command on connected queue");
+		nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR);
+		break;
+	case NVMF_FABRIC_COMMAND_DISCONNECT:
+		warnx("DISCONNECT command on admin queue");
+		nvmf_send_error(nc, NVME_SCT_COMMAND_SPECIFIC,
+		    NVMF_FABRIC_SC_INVALID_QUEUE_TYPE);
+		break;
+	default:
+		warnx("Unsupported fabrics command %#x", fc->fctype);
+		nvmf_send_generic_error(nc, NVME_SC_INVALID_OPCODE);
+		break;
+	}
+}
+
+/*
+ * Handle an IDENTIFY command.  Only CNS value 1 is accepted, for
+ * which the controller's stored cdata is returned; every other CNS
+ * is logged and answered with NVME_SC_INVALID_FIELD.
+ */
+static void
+handle_identify_command(const struct controller *c,
+    const struct nvmf_capsule *nc, const struct nvme_command *cmd)
+{
+	uint8_t cns;
+
+	/* CNS is taken from the low byte of CDW10. */
+	cns = le32toh(cmd->cdw10) & 0xFF;
+	switch (cns) {
+	case 1:
+		break;
+	default:
+		warnx("Unsupported CNS %#x for IDENTIFY", cns);
+		goto error;
+	}
+
+	nvmf_send_controller_data(nc, &c->cdata, sizeof(c->cdata));
+	return;
+error:
+	nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
+}
+
+/*
+ * Service admin queue commands until receiving a capsule fails.
+ *
+ * Each received command is first offered to the caller-supplied
+ * callback cb (invoked with cb_arg); if cb returns true the command
+ * is treated as handled.  Otherwise Fabrics and IDENTIFY commands are
+ * handled here and all other opcodes are rejected with
+ * NVME_SC_INVALID_OPCODE.  While CC.EN is clear only Fabrics commands
+ * are permitted.  A receive error other than ECONNRESET is logged
+ * before returning.
+ */
+void
+controller_handle_admin_commands(struct controller *c, handle_command *cb,
+    void *cb_arg)
+{
+	struct nvmf_qpair *qp = c->qp;
+	const struct nvme_command *cmd;
+	struct nvmf_capsule *nc;
+	int error;
+
+	for (;;) {
+		error = nvmf_controller_receive_capsule(qp, &nc);
+		if (error != 0) {
+			if (error != ECONNRESET)
+				warnc(error, "Failed to read command capsule");
+			break;
+		}
+
+		cmd = nvmf_capsule_sqe(nc);
+
+		/*
+		 * Only permit Fabrics commands while a controller is
+		 * disabled.
+		 */
+		if (NVMEV(NVME_CC_REG_EN, c->cc) == 0 &&
+		    cmd->opc != NVME_OPC_FABRICS_COMMANDS) {
+			/* warnx() appends its own newline. */
+			warnx("Unsupported admin opcode %#x while disabled",
+			    cmd->opc);
+			nvmf_send_generic_error(nc,
+			    NVME_SC_COMMAND_SEQUENCE_ERROR);
+			nvmf_free_capsule(nc);
+			continue;
+		}
+
+		/* Give the caller the first chance to handle the command. */
+		if (cb(nc, cmd, cb_arg)) {
+			nvmf_free_capsule(nc);
+			continue;
+		}
+
+		switch (cmd->opc) {
+		case NVME_OPC_FABRICS_COMMANDS:
+			handle_fabrics_command(c, nc,
+			    (const struct nvmf_fabric_cmd *)cmd);
+			break;
+		case NVME_OPC_IDENTIFY:
+			handle_identify_command(c, nc, cmd);
+			break;
+		default:
+			warnx("Unsupported admin opcode %#x", cmd->opc);
+			nvmf_send_generic_error(nc, NVME_SC_INVALID_OPCODE);
+			break;
+		}
+		nvmf_free_capsule(nc);
+	}
+}
+
+/*
+ * Allocate and initialize a controller for the queue pair qp.
+ *
+ * CAP is obtained from nvmf_controller_cap(), VS is taken from
+ * cdata->ver, and a private copy of *cdata is stored.  CC and CSTS
+ * start out as zero.  Exits via err(3) if memory cannot be allocated.
+ * The result is released with free_controller().
+ */
+struct controller *
+init_controller(struct nvmf_qpair *qp,
+    const struct nvme_controller_data *cdata)
+{
+	struct controller *c;
+
+	c = calloc(1, sizeof(*c));
+	if (c == NULL)
+		err(1, "calloc");
+	c->qp = qp;
+	c->cap = nvmf_controller_cap(c->qp);
+	c->vs = cdata->ver;
+	c->cdata = *cdata;
+
+	return (c);
+}
+
+/*
+ * Release a controller allocated by init_controller().  The queue
+ * pair it references is not touched here.
+ */
+void
+free_controller(struct controller *c)
+{
+	free(c);
+}
diff --git a/usr.sbin/nvmfd/ctl.c b/usr.sbin/nvmfd/ctl.c
new file mode 100644
index 000000000000..5f01ec8e5bc8
--- /dev/null
+++ b/usr.sbin/nvmfd/ctl.c
@@ -0,0 +1,139 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/linker.h>
+#include <sys/nv.h>
+#include <sys/time.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libnvmf.h>
+#include <string.h>
+
+#include <cam/ctl/ctl.h>
+#include <cam/ctl/ctl_io.h>
+#include <cam/ctl/ctl_ioctl.h>
+
+#include "internal.h"
+
+/* Descriptor for CTL_DEFAULT_DEV; -1 until open_ctl() succeeds. */
+static int ctl_fd = -1;
+/* "port_id" returned by the CTL_REQ_CREATE request in init_ctl_port(). */
+static int ctl_port;
+
+/*
+ * Open CTL_DEFAULT_DEV once and cache the descriptor in ctl_fd.
+ *
+ * If the device node does not exist, the "ctl" kernel module is
+ * loaded and the open is retried.  Exits via err(3) on failure.
+ */
+static void
+open_ctl(void)
+{
+	/* Descriptor 0 is valid, so only -1 means "not yet opened". */
+	if (ctl_fd >= 0)
+		return;
+
+	ctl_fd = open(CTL_DEFAULT_DEV, O_RDWR);
+	if (ctl_fd == -1 && errno == ENOENT) {
+		if (kldload("ctl") == -1)
+			err(1, "Failed to load ctl.ko");
+		ctl_fd = open(CTL_DEFAULT_DEV, O_RDWR);
+	}
+	if (ctl_fd == -1)
+		err(1, "Failed to open %s", CTL_DEFAULT_DEV);
+}
+
+/*
+ * Create and enable a CTL port for the "nvmf" driver.
+ *
+ * The creation request passes "subnqn", "portid", and "max_io_qsize"
+ * as a packed nvlist.  The "port_id" number from the reply is saved
+ * in ctl_port and then used to enable the port.  Any failure
+ * terminates the process via err(3)/errx(3).
+ */
+void
+init_ctl_port(const char *subnqn, const struct nvmf_association_params *params)
+{
+	char result_buf[256];
+	struct ctl_port_entry entry;
+	struct ctl_req req;
+	nvlist_t *nvl;
+
+	open_ctl();
+
+	nvl = nvlist_create(0);
+
+	nvlist_add_string(nvl, "subnqn", subnqn);
+
+	/* XXX: Hardcoded in discovery.c */
+	nvlist_add_stringf(nvl, "portid", "%u", 1);
+
+	nvlist_add_stringf(nvl, "max_io_qsize", "%u", params->max_io_qsize);
+
+	memset(&req, 0, sizeof(req));
+	strlcpy(req.driver, "nvmf", sizeof(req.driver));
+	req.reqtype = CTL_REQ_CREATE;
+	req.args = nvlist_pack(nvl, &req.args_len);
+	if (req.args == NULL)
+		errx(1, "Failed to pack nvlist for CTL_PORT/CTL_REQ_CREATE");
+	req.result = result_buf;
+	req.result_len = sizeof(result_buf);
+	if (ioctl(ctl_fd, CTL_PORT_REQ, &req) != 0)
+		err(1, "ioctl(CTL_PORT/CTL_REQ_CREATE)");
+	if (req.status == CTL_LUN_ERROR)
+		errx(1, "Failed to create CTL port: %s", req.error_str);
+	if (req.status != CTL_LUN_OK)
+		errx(1, "Failed to create CTL port: %d", req.status);
+
+	/* Reuse nvl for the reply, which carries the new port's id. */
+	nvlist_destroy(nvl);
+	nvl = nvlist_unpack(result_buf, req.result_len, 0);
+	if (nvl == NULL)
+		errx(1, "Failed to unpack nvlist from CTL_PORT/CTL_REQ_CREATE");
+
+	ctl_port = nvlist_get_number(nvl, "port_id");
+	nvlist_destroy(nvl);
+
+	memset(&entry, 0, sizeof(entry));
+	entry.targ_port = ctl_port;
+	/* Use err() so the errno from the failed ioctl is reported. */
+	if (ioctl(ctl_fd, CTL_ENABLE_PORT, &entry) != 0)
+		err(1, "ioctl(CTL_ENABLE_PORT)");
+}
+
+/*
+ * Ask the "nvmf" CTL driver to remove the port identified by subnqn.
+ * Any failure terminates the process via err(3)/errx(3).
+ */
+void
+shutdown_ctl_port(const char *subnqn)
+{
+	struct ctl_req req;
+	nvlist_t *nvl;
+
+	open_ctl();
+
+	nvl = nvlist_create(0);
+
+	nvlist_add_string(nvl, "subnqn", subnqn);
+
+	memset(&req, 0, sizeof(req));
+	strlcpy(req.driver, "nvmf", sizeof(req.driver));
+	req.reqtype = CTL_REQ_REMOVE;
+	req.args = nvlist_pack(nvl, &req.args_len);
+	if (req.args == NULL)
+		errx(1, "Failed to pack nvlist for CTL_PORT/CTL_REQ_REMOVE");
+	if (ioctl(ctl_fd, CTL_PORT_REQ, &req) != 0)
+		err(1, "ioctl(CTL_PORT/CTL_REQ_REMOVE)");
+	if (req.status == CTL_LUN_ERROR)
+		errx(1, "Failed to remove CTL port: %s", req.error_str);
+	if (req.status != CTL_LUN_OK)
+		errx(1, "Failed to remove CTL port: %d", req.status);
+
+	nvlist_destroy(nvl);
+}
+
+/*
+ * Hand a connected queue pair to the kernel through the CTL_NVMF
+ * ioctl, passing along the CONNECT command and data that established
+ * it.  Failures are logged with warn(3) and are not fatal.
+ *
+ * NOTE(review): ctl_fd is used without calling open_ctl() here;
+ * presumably init_ctl_port() has always run first -- confirm.
+ */
+void
+ctl_handoff_qpair(struct nvmf_qpair *qp,
+    const struct nvmf_fabric_connect_cmd *cmd,
+    const struct nvmf_fabric_connect_data *data)
+{
+	struct ctl_nvmf req;
+	int error;
+
+	memset(&req, 0, sizeof(req));
+	req.type = CTL_NVMF_HANDOFF;
+	error = nvmf_handoff_controller_qpair(qp, &req.data.handoff);
+	if (error != 0) {
+		warnc(error, "Failed to prepare qpair for handoff");
+		return;
+	}
+
+	req.data.handoff.cmd = cmd;
+	req.data.handoff.data = data;
+	if (ioctl(ctl_fd, CTL_NVMF, &req) != 0)
+		warn("ioctl(CTL_NVMF/CTL_NVMF_HANDOFF)");
+}
diff --git a/usr.sbin/nvmfd/devices.c b/usr.sbin/nvmfd/devices.c
new file mode 100644
index 000000000000..fafc1077f207
--- /dev/null
+++ b/usr.sbin/nvmfd/devices.c
@@ -0,0 +1,386 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/disk.h>
+#include <sys/gsb_crc32.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <net/ieee_oui.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libnvmf.h>
+#include <libutil.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "internal.h"
+
+/* Device specifications starting with this prefix describe a RAM disk. */
+#define	RAMDISK_PREFIX	"ramdisk:"
+
+/*
+ * One backing store exported as a namespace.  Namespace IDs are
+ * 1-based indices into the devices array (see lookup_device()).
+ */
+struct backing_device {
+	enum { RAMDISK, FILE, CDEV } type;
+	union {
+		int	fd;	/* FILE, CDEV */
+		void	*mem;	/* RAMDISK */
+	};
+	u_int	sector_size;	/* bytes per logical block */
+	uint64_t nlbas;		/* size in logical blocks */
+	uint64_t eui64;		/* identifier from generate_eui64() */
+};
+
+/* Table of registered devices, filled in by register_devices(). */
+static struct backing_device *devices;
+static u_int ndevices;
+
+/*
+ * Build a 64-bit identifier by shifting OUI_FREEBSD_NVME_LOW left by
+ * 16 bits and OR-ing in the caller-supplied 32-bit value.
+ */
+static uint64_t
+generate_eui64(uint32_t low)
+{
+	return (OUI_FREEBSD_NVME_LOW << 16 | low);
+}
+
+/*
+ * Checksum a buffer with calculate_crc32c(), using an all-ones seed
+ * and inverting the result.
+ */
+static uint32_t
+crc32(const void *buf, size_t len)
+{
+	return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff);
+}
+
+/*
+ * Initialize dev as a zero-filled RAM disk with 512-byte sectors.
+ *
+ * config is the size specification that follows RAMDISK_PREFIX and
+ * is parsed with expand_number(3); the size must be a multiple of the
+ * sector size.  Each RAM disk gets a distinct EUI64 derived from a
+ * running index.  Exits via err(3)/errx(3) on a bad specification or
+ * if the memory cannot be allocated.
+ */
+static void
+init_ramdisk(const char *config, struct backing_device *dev)
+{
+	static uint32_t ramdisk_idx = 1;
+	uint64_t num;
+
+	dev->type = RAMDISK;
+	dev->sector_size = 512;
+	if (expand_number(config, &num))
+		errx(1, "Invalid ramdisk specification: %s", config);
+	if ((num % dev->sector_size) != 0)
+		errx(1, "Invalid ramdisk size %ju", (uintmax_t)num);
+	dev->mem = calloc(num, 1);
+	/* calloc(0, ...) may legitimately return NULL. */
+	if (dev->mem == NULL && num != 0)
+		err(1, "Failed to allocate ramdisk of %ju bytes",
+		    (uintmax_t)num);
+	dev->nlbas = num / dev->sector_size;
+	dev->eui64 = generate_eui64('M' << 24 | ramdisk_idx++);
+}
+
+/*
+ * Initialize dev as a regular-file backing store with 512-byte
+ * sectors.  The file size from sb must be a multiple of the sector
+ * size.  The low 24 bits of the EUI64 come from a CRC of the path
+ * string in config.
+ */
+static void
+init_filedevice(const char *config, int fd, struct stat *sb,
+    struct backing_device *dev)
+{
+	dev->type = FILE;
+	dev->fd = fd;
+	dev->sector_size = 512;
+	if ((sb->st_size % dev->sector_size) != 0)
+		errx(1, "File size is not a multiple of 512: %s", config);
+	dev->nlbas = sb->st_size / dev->sector_size;
+	dev->eui64 = generate_eui64('F' << 24 |
+	    (crc32(config, strlen(config)) & 0xffffff));
+}
+
+/*
+ * Initialize dev as a character-device backing store.
+ *
+ * The sector size and media size are queried with the DIOCGSECTORSIZE
+ * and DIOCGMEDIASIZE ioctls.  The low 24 bits of the EUI64 come from
+ * a CRC of the path string in config.  Exits via err(3) if either
+ * ioctl fails.
+ */
+static void
+init_chardevice(const char *config, int fd, struct backing_device *dev)
+{
+	off_t len;
+
+	dev->type = CDEV;
+	dev->fd = fd;
+	if (ioctl(fd, DIOCGSECTORSIZE, &dev->sector_size) != 0)
+		err(1, "Failed to fetch sector size for %s", config);
+	if (ioctl(fd, DIOCGMEDIASIZE, &len) != 0)
+		err(1, "Failed to fetch media size for %s", config);
+	dev->nlbas = len / dev->sector_size;
+	dev->eui64 = generate_eui64('C' << 24 |
+	    (crc32(config, strlen(config)) & 0xffffff));
+}
+
+/*
+ * Initialize dev from one command-line device specification.
+ *
+ * A specification beginning with RAMDISK_PREFIX creates a RAM disk;
+ * anything else is opened read/write as a path and must be either a
+ * character device or a regular file.  Exits via err(3)/errx(3) on
+ * failure.
+ */
+static void
+init_device(const char *config, struct backing_device *dev)
+{
+	struct stat sb;
+	int fd;
+
+	/* Check for a RAM disk. */
+	if (strncmp(RAMDISK_PREFIX, config, strlen(RAMDISK_PREFIX)) == 0) {
+		init_ramdisk(config + strlen(RAMDISK_PREFIX), dev);
+		return;
+	}
+
+	fd = open(config, O_RDWR);
+	if (fd == -1)
+		err(1, "Failed to open %s", config);
+	if (fstat(fd, &sb) == -1)
+		err(1, "fstat");
+	switch (sb.st_mode & S_IFMT) {
+	case S_IFCHR:
+		init_chardevice(config, fd, dev);
+		break;
+	case S_IFREG:
+		init_filedevice(config, fd, &sb, dev);
+		break;
+	default:
+		errx(1, "Invalid file type for %s", config);
+	}
+}
+
+/*
+ * Build the device table from ac device specifications in av.  The
+ * device at av[i] becomes namespace ID i + 1.  Exits via err(3) if
+ * the table cannot be allocated or a device cannot be initialized.
+ */
+void
+register_devices(int ac, char **av)
+{
+	ndevices = ac;
+	devices = calloc(ndevices, sizeof(*devices));
+	/* calloc(0, ...) may legitimately return NULL. */
+	if (devices == NULL && ndevices != 0)
+		err(1, "calloc");
+
+	for (int i = 0; i < ac; i++)
+		init_device(av[i], &devices[i]);
+}
+
+/* Return the number of devices set up by register_devices(). */
+u_int
+device_count(void)
+{
+	return (ndevices);
+}
+
+/*
+ * Map a 1-based namespace ID to its backing device.  Returns NULL for
+ * nsid 0 or any nsid beyond the number of registered devices.
+ */
+static struct backing_device *
+lookup_device(uint32_t nsid)
+{
+	if (nsid == 0 || nsid > ndevices)
+		return (NULL);
+	return (&devices[nsid - 1]);
+}
+
+/*
+ * Fill nslist with the namespace IDs greater than nsid, in ascending
+ * order and stored little-endian, stopping when the list is full.
+ * Unused entries are left as zero.
+ *
+ * NOTE(review): nsid is incremented before use, so a value of
+ * 0xffffffff would wrap to 0; presumably callers reject such values
+ * first -- confirm.
+ */
+void
+device_active_nslist(uint32_t nsid, struct nvme_ns_list *nslist)
+{
+	u_int count;
+
+	memset(nslist, 0, sizeof(*nslist));
+	count = 0;
+	nsid++;
+	while (nsid <= ndevices) {
+		nslist->ns[count] = htole32(nsid);
+		count++;
+		if (count == nitems(nslist->ns))
+			break;
+		nsid++;
+	}
+}
+
+/*
+ * Write the identification descriptor list for namespace nsid into
+ * buf.  buf must be at least 4096 bytes, since that many bytes are
+ * zeroed here.  The list holds a single descriptor carrying the
+ * device's EUI64.  Returns false if nsid does not name a device.
+ */
+bool
+device_identification_descriptor(uint32_t nsid, void *buf)
+{
+	struct backing_device *dev;
+	char *p;
+
+	dev = lookup_device(nsid);
+	if (dev == NULL)
+		return (false);
+
+	memset(buf, 0, 4096);
+
+	p = buf;
+
+	/* EUI64 */
+	*p++ = 1;	/* descriptor type */
+	*p++ = 8;	/* length of the identifier in bytes */
+	p += 2;		/* skip two bytes, left as zero */
+	be64enc(p, dev->eui64);
+	return (true);
+}
+
+/*
+ * Fill in the namespace data for namespace nsid.  Size, capacity, and
+ * utilization are all set to the device's block count, and a single
+ * LBA format describing the device's sector size is reported.
+ * Returns false if nsid does not name a device.
+ */
+bool
+device_namespace_data(uint32_t nsid, struct nvme_namespace_data *nsdata)
+{
+	struct backing_device *dev;
+
+	dev = lookup_device(nsid);
+	if (dev == NULL)
+		return (false);
+
+	memset(nsdata, 0, sizeof(*nsdata));
+	nsdata->nsze = htole64(dev->nlbas);
+	nsdata->ncap = nsdata->nsze;
+	nsdata->nuse = nsdata->ncap;
+	/* One LBA format; the field is written as (count - 1). */
+	nsdata->nlbaf = 1 - 1;
+	nsdata->flbas = NVMEF(NVME_NS_DATA_FLBAS_FORMAT, 0);
+	/* ffs() - 1 is log2 of the sector size when it is a power of two. */
+	nsdata->lbaf[0] = NVMEF(NVME_NS_DATA_LBAF_LBADS,
+	    ffs(dev->sector_size) - 1);
+
+	be64enc(nsdata->eui64, dev->eui64);
+	return (true);
+}
+
+/*
+ * Read exactly len bytes from fd at offset into buf, retrying after
+ * EINTR and continuing after short reads.  Returns false on any
+ * other error or if end-of-file is reached first.
+ */
+static bool
+read_buffer(int fd, void *buf, size_t len, off_t offset)
+{
+	ssize_t nread;
+	char *dst;
+
+	dst = buf;
+	while (len > 0) {
+		nread = pread(fd, dst, len, offset);
+		if (nread == -1 && errno == EINTR)
+			continue;
+		/* 0 means EOF before the request was satisfied. */
+		if (nread <= 0)
+			return (false);
+		dst += nread;
+		len -= nread;
+		offset += nread;
+	}
+	return (true);
+}
+
+/*
+ * Service a read of nlb blocks starting at lba from namespace nsid,
+ * sending either the data or an error status in response to nc.
+ *
+ * Errors reported to the host:
+ *  - NVME_SC_INVALID_NAMESPACE_OR_FORMAT if nsid names no device
+ *  - NVME_SC_LBA_OUT_OF_RANGE if the range wraps or exceeds the device
+ *  - NVME_SC_INVALID_FIELD if the capsule's data length does not
+ *    match the request
+ *  - NVME_SC_INTERNAL_DEVICE_ERROR if a bounce buffer cannot be
+ *    allocated or the backing store cannot be read
+ */
+void
+device_read(uint32_t nsid, uint64_t lba, u_int nlb,
+    const struct nvmf_capsule *nc)
+{
+	struct backing_device *dev;
+	char *p, *src;
+	off_t off;
+	size_t len;
+
+	dev = lookup_device(nsid);
+	if (dev == NULL) {
+		nvmf_send_generic_error(nc,
+		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
+		return;
+	}
+
+	/* The first test catches wrap-around of lba + nlb. */
+	if (lba + nlb < lba || lba + nlb > dev->nlbas) {
+		nvmf_send_generic_error(nc, NVME_SC_LBA_OUT_OF_RANGE);
+		return;
+	}
+
+	off = lba * dev->sector_size;
+	/* Widen before multiplying so the product cannot wrap at 32 bits. */
+	len = (size_t)nlb * dev->sector_size;
+	if (nvmf_capsule_data_len(nc) != len) {
+		nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
+		return;
+	}
+
+	if (dev->type == RAMDISK) {
+		/* RAM disks are sent straight from their backing memory. */
+		p = NULL;
+		src = (char *)dev->mem + off;
+	} else {
+		p = malloc(len);
+		if (p == NULL || !read_buffer(dev->fd, p, len, off)) {
+			free(p);
+			nvmf_send_generic_error(nc,
+			    NVME_SC_INTERNAL_DEVICE_ERROR);
+			return;
+		}
+		src = p;
+	}
+
+	nvmf_send_controller_data(nc, src, len);
+	free(p);
+}
+
+/*
+ * Write exactly len bytes from buf to fd at offset, retrying after
+ * EINTR and continuing after short writes.  Returns false on any
+ * other error or if pwrite() makes no progress.
+ */
+static bool
+write_buffer(int fd, const void *buf, size_t len, off_t offset)
+{
+	ssize_t nwritten;
+	const char *src;
+
+	src = buf;
+	while (len > 0) {
+		nwritten = pwrite(fd, src, len, offset);
+		if (nwritten == -1 && errno == EINTR)
+			continue;
+		if (nwritten <= 0)
+			return (false);
+		src += nwritten;
+		len -= nwritten;
+		offset += nwritten;
+	}
+	return (true);
+}
+
+/*
+ * Service a write of nlb blocks starting at lba to namespace nsid,
+ * sending a success or error status in response to nc.
+ *
+ * Errors reported to the host:
+ *  - NVME_SC_INVALID_NAMESPACE_OR_FORMAT if nsid names no device
+ *  - NVME_SC_LBA_OUT_OF_RANGE if the range wraps or exceeds the device
+ *  - NVME_SC_INVALID_FIELD if the capsule's data length does not
+ *    match the request
+ *  - NVME_SC_TRANSIENT_TRANSPORT_ERROR if the data cannot be received
+ *  - NVME_SC_INTERNAL_DEVICE_ERROR if a bounce buffer cannot be
+ *    allocated or the backing store cannot be written
+ */
+void
+device_write(uint32_t nsid, uint64_t lba, u_int nlb,
+    const struct nvmf_capsule *nc)
+{
+	struct backing_device *dev;
+	char *p, *dst;
+	off_t off;
+	size_t len;
+	int error;
+
+	dev = lookup_device(nsid);
+	if (dev == NULL) {
+		nvmf_send_generic_error(nc,
+		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
+		return;
+	}
+
+	/* The first test catches wrap-around of lba + nlb. */
+	if (lba + nlb < lba || lba + nlb > dev->nlbas) {
+		nvmf_send_generic_error(nc, NVME_SC_LBA_OUT_OF_RANGE);
+		return;
+	}
+
+	off = lba * dev->sector_size;
+	/* Widen before multiplying so the product cannot wrap at 32 bits. */
+	len = (size_t)nlb * dev->sector_size;
+	if (nvmf_capsule_data_len(nc) != len) {
+		nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
+		return;
+	}
+
+	if (dev->type == RAMDISK) {
+		/* RAM disks receive directly into their backing memory. */
+		p = NULL;
+		dst = (char *)dev->mem + off;
+	} else {
+		p = malloc(len);
+		if (p == NULL) {
+			nvmf_send_generic_error(nc,
+			    NVME_SC_INTERNAL_DEVICE_ERROR);
+			return;
+		}
+		dst = p;
+	}
+
+	error = nvmf_receive_controller_data(nc, 0, dst, len);
+	if (error != 0) {
+		nvmf_send_generic_error(nc, NVME_SC_TRANSIENT_TRANSPORT_ERROR);
+		free(p);
+		return;
+	}
+
+	if (dev->type != RAMDISK) {
+		if (!write_buffer(dev->fd, p, len, off)) {
+			free(p);
+			nvmf_send_generic_error(nc,
+			    NVME_SC_INTERNAL_DEVICE_ERROR);
+			return;
+		}
+	}
+	free(p);
+	nvmf_send_success(nc);
+}
+
+/*
+ * Service a flush of namespace nsid.  RAM disks need no work, files
+ * are synced with fdatasync(), and character devices are flushed
+ * with the DIOCGFLUSH ioctl.  A failed flush is reported to the host
+ * as a media error with NVME_SC_WRITE_FAULTS.
+ */
+void
+device_flush(uint32_t nsid, const struct nvmf_capsule *nc)
+{
+	struct backing_device *dev;
+
+	dev = lookup_device(nsid);
+	if (dev == NULL) {
+		nvmf_send_generic_error(nc,
+		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
+		return;
+	}
+
+	switch (dev->type) {
+	case RAMDISK:
+		break;
+	case FILE:
+		if (fdatasync(dev->fd) == -1) {
+			nvmf_send_error(nc, NVME_SCT_MEDIA_ERROR,
+			    NVME_SC_WRITE_FAULTS);
+			return;
+		}
+		break;
+	case CDEV:
+		if (ioctl(dev->fd, DIOCGFLUSH) == -1) {
+			nvmf_send_error(nc, NVME_SCT_MEDIA_ERROR,
+			    NVME_SC_WRITE_FAULTS);
+			return;
+		}
+	}
+
+	nvmf_send_success(nc);
+}
diff --git a/usr.sbin/nvmfd/discovery.c b/usr.sbin/nvmfd/discovery.c
new file mode 100644
index 000000000000..985c77620a62
--- /dev/null
+++ b/usr.sbin/nvmfd/discovery.c
@@ -0,0 +1,343 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2023-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <assert.h>
+#include <err.h>
+#include <libnvmf.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "internal.h"
+
+/*
+ * A discovery log entry describing an I/O controller.
+ * NOTE(review): wildcard presumably records the result of
+ * init_discovery_log_entry() (listening on a wildcard address) --
+ * confirm against the code that fills it in.
+ */
+struct io_controller_data {
+	struct nvme_discovery_log_entry entry;
+	bool wildcard;
+};
+
+/* State for one discovery controller: its log page and socket. */
+struct discovery_controller {
+	struct nvme_discovery_log *discovery_log;
+	size_t discovery_log_len;
+	int s;
+};
+
+/* Arguments bundled for a discovery thread. */
+struct discovery_thread_arg {
+	struct controller *c;
+	struct nvmf_qpair *qp;
+	int s;
+};
+
+static struct io_controller_data *io_controllers;
+static struct nvmf_association *discovery_na;
+static u_int num_io_controllers;
+
+/*
+ * Fill in a TCP discovery log entry for subsystem subnqn, taking the
+ * transport address and service ID from the local address bound to
+ * socket s.
+ *
+ * Returns true if the socket is bound to the IPv4 or IPv6 wildcard
+ * address and false otherwise.  Exits via err(3)/errx(3) if the
+ * address cannot be obtained or formatted, or if the address family
+ * is neither AF_INET nor AF_INET6.
+ */
+static bool
+init_discovery_log_entry(struct nvme_discovery_log_entry *entry, int s,
+    const char *subnqn)
+{
+	struct sockaddr_storage ss;
+	socklen_t len;
+	bool wildcard;
+
+	len = sizeof(ss);
+	if (getsockname(s, (struct sockaddr *)&ss, &len) == -1)
+		err(1, "getsockname");
+
+	memset(entry, 0, sizeof(*entry));
+	entry->trtype = NVMF_TRTYPE_TCP;
+	switch (ss.ss_family) {
+	case AF_INET:
+	{
+		struct sockaddr_in *sin;
+
+		sin = (struct sockaddr_in *)&ss;
+		entry->adrfam = NVMF_ADRFAM_IPV4;
+		/* The port is stored in network byte order. */
+		snprintf(entry->trsvcid, sizeof(entry->trsvcid), "%u",
+		    ntohs(sin->sin_port));
+		if (inet_ntop(AF_INET, &sin->sin_addr, entry->traddr,
+		    sizeof(entry->traddr)) == NULL)
+			err(1, "inet_ntop");
+		wildcard = (sin->sin_addr.s_addr == htonl(INADDR_ANY));
+		break;
+	}
+	case AF_INET6:
+	{
+		struct sockaddr_in6 *sin6;
+
+		sin6 = (struct sockaddr_in6 *)&ss;
+		entry->adrfam = NVMF_ADRFAM_IPV6;
+		snprintf(entry->trsvcid, sizeof(entry->trsvcid), "%u",
+		    ntohs(sin6->sin6_port));
+		if (inet_ntop(AF_INET6, &sin6->sin6_addr, entry->traddr,
+		    sizeof(entry->traddr)) == NULL)
+			err(1, "inet_ntop");
+		wildcard = (memcmp(&sin6->sin6_addr, &in6addr_any,
+		    sizeof(in6addr_any)) == 0);
+		break;
+	}
+	default:
+		errx(1, "Unsupported address family %u", ss.ss_family);
+	}
+	entry->subtype = NVMF_SUBTYPE_NVME;
+	/* Bit 2 of treq is set when SQ flow control is disabled. */
+	if (flow_control_disable)
+		entry->treq |= (1 << 2);
+	/* Multi-byte fields are stored little-endian. */
+	entry->portid = htole16(1);
+	entry->cntlid = htole16(NVMF_CNTLID_DYNAMIC);
+	entry->aqsz = htole16(NVME_MAX_ADMIN_ENTRIES);
+	strlcpy(entry->subnqn, subnqn, sizeof(entry->subnqn));
+	return (wildcard);
+}
+
+void
+init_discovery(void)
+{
+	struct nvmf_association_params aparams;
+
+	memset(&aparams, 0, sizeof(aparams));
+	aparams.sq_flow_control = false;
+	aparams.dynamic_controller_model = true;
+	aparams.max_admin_qsize = NVME_MAX_ADMIN_ENTRIES;
+	aparams.tcp.pda = 0;
+	aparams.tcp.header_digests = header_digests;
+	aparams.tcp.data_digests = data_digests;
+	aparams.tcp.maxr2t = 1;
+	aparams.tcp.maxh2cdata = 256 * 1024;
+	discovery_na = nvmf_allocate_association(NVMF_TRTYPE_TCP, true,
*** 1381 LINES SKIPPED ***