git: aa1207ea4f03 - main - nvmf: Add infrastructure kernel module for NVMe over Fabrics

From: John Baldwin <jhb_at_FreeBSD.org>
Date: Fri, 03 May 2024 00:15:46 UTC
The branch main has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=aa1207ea4f030c50a91bca6a3df950ca25113d5a

commit aa1207ea4f030c50a91bca6a3df950ca25113d5a
Author:     John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2024-05-02 23:28:32 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2024-05-02 23:28:32 +0000

    nvmf: Add infrastructure kernel module for NVMe over Fabrics
    
    nvmf_transport.ko provides routines for managing NVMeoF queue pairs
    and capsules.  It provides a glue layer between transports (such as
    TCP or RDMA) and an NVMeoF host (initiator) and controller (target).
    
    Unlike the synchronous API exposed to the host and controller by
    libnvmf, the kernel's transport layer uses an asynchronous API built
    on callbacks.  Upper layers provide callbacks on queue pairs that are
    invoked for transport errors (error_cb) or anytime a capsule is
    received (receive_cb).
    
    Data transfers for a command are usually associated with a callback
    that is invoked once a transfer has finished either due to an error
    or successful completion.
    
    For an upper layer that is a host, command capsules are allocated and
    populated with an NVMe SQE by calling nvmf_allocate_command.  A data
    buffer (described by a struct memdesc) can be associated with a
    command capsule before it is transmitted via nvmf_capsule_append_data.
    This function accepts a direction (send vs receive) as well as the
    data transfer callback.  The host then transmits the command via
    nvmf_transmit_capsule.  The host must ensure that the data buffer
    described by the 'struct memdesc' remains valid until the data
    transfer callback is called.  The queue pair's receive_cb callback
    should match received response capsules up with previously transmitted
    commands.
    
    For the controller, incoming commands are received via the queue
    pair's receive_cb callback.  nvmf_receive_controller_data is used to
    retrieve any data from a command (e.g. the data for a WRITE command).
    It can be called multiple times to split the data transfer into
    smaller sizes.  This function accepts an I/O completion callback that
    is invoked once the data transfer has completed.
    nvmf_send_controller_data is used to send data to a remote host in
    response to a command.  In this case a callback function is not used
    but the status is returned synchronously.  Finally, the controller can
    allocate a response capsule via nvmf_allocate_response populated with
    a supplied CQE and send the response via nvmf_transmit_capsule.
    
    Reviewed by:    imp
    Sponsored by:   Chelsio Communications
    Differential Revision:  https://reviews.freebsd.org/D44711
---
 sys/dev/nvmf/nvmf_transport.c            | 344 +++++++++++++++++++++++++++++++
 sys/dev/nvmf/nvmf_transport.h            | 140 +++++++++++++
 sys/dev/nvmf/nvmf_transport_internal.h   | 128 ++++++++++++
 sys/modules/Makefile                     |   1 +
 sys/modules/nvmf/Makefile                |   3 +
 sys/modules/nvmf/nvmf_transport/Makefile |   9 +
 6 files changed, 625 insertions(+)

diff --git a/sys/dev/nvmf/nvmf_transport.c b/sys/dev/nvmf/nvmf_transport.c
new file mode 100644
index 000000000000..14d526192270
--- /dev/null
+++ b/sys/dev/nvmf/nvmf_transport.c
@@ -0,0 +1,344 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/refcount.h>
+#include <sys/sysctl.h>
+#include <sys/sx.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/nvmf_transport_internal.h>
+
+/* Transport-independent support for fabrics queue pairs and commands. */
+
+struct nvmf_transport {
+	struct nvmf_transport_ops *nt_ops;	/* ops passed at MOD_LOAD */
+
+	volatile u_int nt_active_qpairs;	/* allocated, not yet freed */
+	SLIST_ENTRY(nvmf_transport) nt_link;	/* nvmf_transports[] linkage */
+};
+
+/*
+ * nvmf_transports[nvmf_trtype] is sorted by priority, highest first.
+ * nvmf_transports_lock is held shared while a list is traversed and
+ * exclusively while an entry is inserted or removed.
+ */
+static SLIST_HEAD(, nvmf_transport) nvmf_transports[NVMF_TRTYPE_TCP + 1];
+static struct sx nvmf_transports_lock;
+
+static MALLOC_DEFINE(M_NVMF_TRANSPORT, "nvmf_xport",
+    "NVMe over Fabrics transport");
+
+SYSCTL_NODE(_kern, OID_AUTO, nvmf, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+    "NVMe over Fabrics");
+
+/* Returns true if 'trtype' is a valid index into nvmf_transports[]. */
+static bool
+nvmf_supported_trtype(enum nvmf_trtype trtype)
+{
+	const size_t ntypes = nitems(nvmf_transports);
+
+	return (trtype < ntypes);
+}
+
+/*
+ * Allocates a queue pair from the first registered transport of type
+ * 'trtype' (in priority order) whose allocate_qpair method accepts
+ * 'params'.
+ *
+ * 'controller' is passed through to the transport and recorded in the
+ * qpair.  'error_cb' and 'receive_cb', along with their opaque
+ * arguments, are stored in the qpair for use by nvmf_qpair_error()
+ * and nvmf_capsule_received().
+ *
+ * Returns NULL if 'trtype' is unsupported, if no transport of that
+ * type is registered, or if every registered transport declined the
+ * request.  On success the owning transport's active qpair count has
+ * been incremented; nvmf_free_qpair() drops it again.
+ */
+struct nvmf_qpair *
+nvmf_allocate_qpair(enum nvmf_trtype trtype, bool controller,
+    const struct nvmf_handoff_qpair_params *params,
+    nvmf_qpair_error_t *error_cb, void *error_cb_arg,
+    nvmf_capsule_receive_t *receive_cb, void *receive_cb_arg)
+{
+	struct nvmf_transport *nt;
+	struct nvmf_qpair *qp;
+
+	if (!nvmf_supported_trtype(trtype))
+		return (NULL);
+
+	/*
+	 * Start from NULL so that an empty transport list is reported
+	 * as a failure instead of testing an uninitialized pointer.
+	 */
+	qp = NULL;
+	sx_slock(&nvmf_transports_lock);
+	SLIST_FOREACH(nt, &nvmf_transports[trtype], nt_link) {
+		qp = nt->nt_ops->allocate_qpair(controller, params);
+		if (qp != NULL) {
+			/* Taken under the lock so unload sees the qpair. */
+			refcount_acquire(&nt->nt_active_qpairs);
+			break;
+		}
+	}
+	sx_sunlock(&nvmf_transports_lock);
+	if (qp == NULL)
+		return (NULL);
+
+	qp->nq_transport = nt;
+	qp->nq_ops = nt->nt_ops;
+	qp->nq_controller = controller;
+	qp->nq_error = error_cb;
+	qp->nq_error_arg = error_cb_arg;
+	qp->nq_receive = receive_cb;
+	qp->nq_receive_arg = receive_cb_arg;
+	qp->nq_admin = params->admin;
+	return (qp);
+}
+
+/*
+ * Hands 'qp' back to its transport and drops the reference on the
+ * transport's active qpair count.  Anyone sleeping on the transport
+ * (the MOD_UNLOAD path does) is woken when the last reference is
+ * released.
+ */
+void
+nvmf_free_qpair(struct nvmf_qpair *qp)
+{
+	struct nvmf_transport *transport = qp->nq_transport;
+	struct nvmf_transport_ops *ops = qp->nq_ops;
+
+	/* Both pointers are loaded before qp is given to the transport. */
+	ops->free_qpair(qp);
+	if (!refcount_release(&transport->nt_active_qpairs))
+		return;
+	wakeup(transport);
+}
+
+/*
+ * Allocates a command capsule on 'qp' and copies the submission queue
+ * entry at 'sqe' into it, forcing the PSDT field to SGL.  'how' must
+ * be M_WAITOK or M_NOWAIT.  Returns NULL if the transport fails to
+ * allocate a capsule.
+ */
+struct nvmf_capsule *
+nvmf_allocate_command(struct nvmf_qpair *qp, const void *sqe, int how)
+{
+	struct nvmf_capsule *capsule;
+
+	KASSERT(how == M_WAITOK || how == M_NOWAIT,
+	    ("%s: invalid how", __func__));
+	capsule = qp->nq_ops->allocate_capsule(qp, how);
+	if (capsule != NULL) {
+		capsule->nc_qpair = qp;
+		capsule->nc_qe_len = sizeof(struct nvme_command);
+		memcpy(&capsule->nc_sqe, sqe, sizeof(struct nvme_command));
+
+		/* 4.2 of NVMe base spec: Fabrics always uses SGL. */
+		capsule->nc_sqe.fuse &= ~NVMEM(NVME_CMD_PSDT);
+		capsule->nc_sqe.fuse |= NVMEF(NVME_CMD_PSDT, NVME_PSDT_SGL);
+	}
+	return (capsule);
+}
+
+/*
+ * Allocates a response capsule on 'qp' and copies the completion queue
+ * entry at 'cqe' into it.  'how' must be M_WAITOK or M_NOWAIT.
+ * Returns NULL if the transport fails to allocate a capsule.
+ */
+struct nvmf_capsule *
+nvmf_allocate_response(struct nvmf_qpair *qp, const void *cqe, int how)
+{
+	struct nvmf_capsule *capsule;
+
+	KASSERT(how == M_WAITOK || how == M_NOWAIT,
+	    ("%s: invalid how", __func__));
+	capsule = qp->nq_ops->allocate_capsule(qp, how);
+	if (capsule != NULL) {
+		capsule->nc_qpair = qp;
+		capsule->nc_qe_len = sizeof(struct nvme_completion);
+		memcpy(&capsule->nc_cqe, cqe, sizeof(struct nvme_completion));
+	}
+	return (capsule);
+}
+
+/*
+ * Attaches the data buffer described by 'mem' ('len' bytes) to 'nc'
+ * and records the transfer direction ('send') and the completion
+ * callback.  Fails with EBUSY if the capsule already has a buffer of
+ * non-zero length attached; otherwise returns 0.
+ */
+int
+nvmf_capsule_append_data(struct nvmf_capsule *nc, struct memdesc *mem,
+    size_t len, bool send, nvmf_io_complete_t *complete_cb,
+    void *cb_arg)
+{
+	struct nvmf_io_request *io = &nc->nc_data;
+
+	if (io->io_len != 0)
+		return (EBUSY);
+
+	nc->nc_send_data = send;
+	io->io_mem = *mem;
+	io->io_len = len;
+	io->io_complete = complete_cb;
+	io->io_complete_arg = cb_arg;
+	return (0);
+}
+
+/* Releases 'nc' through the free_capsule method of its qpair's transport. */
+void
+nvmf_free_capsule(struct nvmf_capsule *nc)
+{
+	struct nvmf_transport_ops *ops = nc->nc_qpair->nq_ops;
+
+	ops->free_capsule(nc);
+}
+
+/*
+ * Passes 'nc' to the transmit_capsule method of its qpair's transport
+ * and returns that method's result.
+ */
+int
+nvmf_transmit_capsule(struct nvmf_capsule *nc)
+{
+	struct nvmf_transport_ops *ops = nc->nc_qpair->nq_ops;
+
+	return (ops->transmit_capsule(nc));
+}
+
+/*
+ * If 'nc' has a data buffer attached, invokes its completion callback
+ * reporting zero bytes transferred and 'error'.  Does nothing when no
+ * buffer (io_len == 0) is attached.
+ */
+void
+nvmf_abort_capsule_data(struct nvmf_capsule *nc, int error)
+{
+	if (nc->nc_data.io_len == 0)
+		return;
+	nvmf_complete_io_request(&nc->nc_data, 0, error);
+}
+
+/*
+ * Returns a pointer to the SQE stored in 'nc'.  Asserts that 'nc' is
+ * a command capsule.
+ */
+void *
+nvmf_capsule_sqe(struct nvmf_capsule *nc)
+{
+	struct nvme_command *sqe = &nc->nc_sqe;
+
+	KASSERT(nc->nc_qe_len == sizeof(struct nvme_command),
+	    ("%s: capsule %p is not a command capsule", __func__, nc));
+	return (sqe);
+}
+
+/*
+ * Returns a pointer to the CQE stored in 'nc'.  Asserts that 'nc' is
+ * a response capsule.
+ */
+void *
+nvmf_capsule_cqe(struct nvmf_capsule *nc)
+{
+	struct nvme_completion *cqe = &nc->nc_cqe;
+
+	KASSERT(nc->nc_qe_len == sizeof(struct nvme_completion),
+	    ("%s: capsule %p is not a response capsule", __func__, nc));
+	return (cqe);
+}
+
+/*
+ * Checks a command capsule for invalid fields.  A capsule whose PSDT
+ * field is not SGL is rejected here with NVME_SC_INVALID_FIELD;
+ * otherwise the verdict of the transport's validate_command_capsule
+ * method is returned.
+ */
+uint8_t
+nvmf_validate_command_capsule(struct nvmf_capsule *nc)
+{
+	bool uses_sgl;
+
+	KASSERT(nc->nc_qe_len == sizeof(struct nvme_command),
+	    ("%s: capsule %p is not a command capsule", __func__, nc));
+
+	uses_sgl = (NVMEV(NVME_CMD_PSDT, nc->nc_sqe.fuse) == NVME_PSDT_SGL);
+	if (!uses_sgl)
+		return (NVME_SC_INVALID_FIELD);
+
+	return (nc->nc_qpair->nq_ops->validate_command_capsule(nc));
+}
+
+/*
+ * Returns the data length reported for 'nc' by the capsule_data_len
+ * method of its qpair's transport.
+ */
+size_t
+nvmf_capsule_data_len(const struct nvmf_capsule *nc)
+{
+	const struct nvmf_transport_ops *ops = nc->nc_qpair->nq_ops;
+
+	return (ops->capsule_data_len(nc));
+}
+
+/*
+ * Builds an I/O request from 'mem', 'len' and the completion callback
+ * and passes it, together with 'data_offset', to the transport's
+ * receive_controller_data method.  Returns that method's result.
+ */
+int
+nvmf_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
+    struct memdesc *mem, size_t len, nvmf_io_complete_t *complete_cb,
+    void *cb_arg)
+{
+	struct nvmf_io_request io = {
+		.io_mem = *mem,
+		.io_len = len,
+		.io_complete = complete_cb,
+		.io_complete_arg = cb_arg,
+	};
+	struct nvmf_transport_ops *ops = nc->nc_qpair->nq_ops;
+
+	return (ops->receive_controller_data(nc, data_offset, &io));
+}
+
+/*
+ * Passes the mbuf chain 'm' to the transport's send_controller_data
+ * method and returns its status.  'len' must equal the total length
+ * of the chain (checked with MPASS).
+ */
+u_int
+nvmf_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
+    struct mbuf *m, size_t len)
+{
+	struct nvmf_transport_ops *ops;
+
+	MPASS(m_length(m, NULL) == len);
+	ops = nc->nc_qpair->nq_ops;
+	return (ops->send_controller_data(nc, data_offset, m, len));
+}
+
+/*
+ * Links 'nt' into the list for its transport type, keeping the list
+ * sorted by descending priority.  A new entry is placed after existing
+ * entries of equal priority.  The caller must hold
+ * nvmf_transports_lock exclusively.
+ */
+static void
+nvmf_transport_insert(struct nvmf_transport *nt)
+{
+	struct nvmf_transport *nt2, *prev;
+	enum nvmf_trtype trtype;
+
+	sx_assert(&nvmf_transports_lock, SA_XLOCKED);
+
+	trtype = nt->nt_ops->trtype;
+	prev = NULL;
+	SLIST_FOREACH(nt2, &nvmf_transports[trtype], nt_link) {
+		if (nt->nt_ops->priority > nt2->nt_ops->priority)
+			break;
+		prev = nt2;
+	}
+	if (prev == NULL)
+		SLIST_INSERT_HEAD(&nvmf_transports[trtype], nt, nt_link);
+	else
+		SLIST_INSERT_AFTER(prev, nt, nt_link);
+}
+
+/*
+ * Module event handler shared by all transport modules (see the
+ * NVMF_TRANSPORT() macro).  'arg' is the module's
+ * struct nvmf_transport_ops.
+ *
+ * MOD_LOAD registers the ops in priority order; EINVAL is returned for
+ * an unsupported transport type.
+ * MOD_QUIESCE returns EBUSY while the transport has active qpairs.
+ * MOD_UNLOAD unregisters the transport and sleeps until all of its
+ * qpairs have been freed.  If the sleep is interrupted, the transport
+ * is registered again and the error from sx_sleep() is returned.
+ * Any other event returns EOPNOTSUPP.
+ */
+int
+nvmf_transport_module_handler(struct module *mod, int what, void *arg)
+{
+	struct nvmf_transport_ops *ops = arg;
+	struct nvmf_transport *nt, *prev;
+	int error;
+
+	switch (what) {
+	case MOD_LOAD:
+		if (!nvmf_supported_trtype(ops->trtype)) {
+			printf("NVMF: Unsupported transport %u\n",
+			    ops->trtype);
+			return (EINVAL);
+		}
+
+		nt = malloc(sizeof(*nt), M_NVMF_TRANSPORT, M_WAITOK | M_ZERO);
+		nt->nt_ops = ops;
+
+		sx_xlock(&nvmf_transports_lock);
+		nvmf_transport_insert(nt);
+		sx_xunlock(&nvmf_transports_lock);
+		return (0);
+
+	case MOD_QUIESCE:
+		if (!nvmf_supported_trtype(ops->trtype))
+			return (0);
+
+		sx_slock(&nvmf_transports_lock);
+		SLIST_FOREACH(nt, &nvmf_transports[ops->trtype], nt_link) {
+			if (nt->nt_ops == ops)
+				break;
+		}
+		/* A transport that is not registered has nothing to quiesce. */
+		if (nt != NULL && nt->nt_active_qpairs != 0)
+			error = EBUSY;
+		else
+			error = 0;
+		sx_sunlock(&nvmf_transports_lock);
+		return (error);
+
+	case MOD_UNLOAD:
+		if (!nvmf_supported_trtype(ops->trtype))
+			return (0);
+
+		sx_xlock(&nvmf_transports_lock);
+		prev = NULL;
+		SLIST_FOREACH(nt, &nvmf_transports[ops->trtype], nt_link) {
+			if (nt->nt_ops == ops)
+				break;
+			prev = nt;
+		}
+		if (nt == NULL) {
+			/*
+			 * Not registered, so there is no transport
+			 * state to inspect or tear down.
+			 */
+			sx_xunlock(&nvmf_transports_lock);
+			return (0);
+		}
+
+		/* Unlink first so that no new qpairs can be allocated. */
+		if (prev == NULL)
+			SLIST_REMOVE_HEAD(&nvmf_transports[ops->trtype],
+			    nt_link);
+		else
+			SLIST_REMOVE_AFTER(prev, nt_link);
+
+		error = 0;
+		while (nt->nt_active_qpairs != 0 && error == 0)
+			error = sx_sleep(nt, &nvmf_transports_lock, PCATCH,
+			    "nftunld", 0);
+		if (error != 0) {
+			/*
+			 * The unload is being abandoned while qpairs
+			 * still reference 'nt'.  Register it again so
+			 * the module stays usable and a later unload
+			 * can find it and wait for those qpairs.
+			 */
+			nvmf_transport_insert(nt);
+			sx_xunlock(&nvmf_transports_lock);
+			return (error);
+		}
+		sx_xunlock(&nvmf_transports_lock);
+		free(nt, M_NVMF_TRANSPORT);
+		return (0);
+
+	default:
+		return (EOPNOTSUPP);
+	}
+}
+
+/*
+ * Event handler for the nvmf_transport module itself.  MOD_LOAD
+ * initializes the per-type transport lists and their lock; every other
+ * event is rejected with EOPNOTSUPP.
+ */
+static int
+nvmf_transport_modevent(module_t mod __unused, int what, void *arg __unused)
+{
+	u_int trtype;
+
+	if (what != MOD_LOAD)
+		return (EOPNOTSUPP);
+
+	for (trtype = 0; trtype < nitems(nvmf_transports); trtype++)
+		SLIST_INIT(&nvmf_transports[trtype]);
+	sx_init(&nvmf_transports_lock, "nvmf transports");
+	return (0);
+}
+
+static moduledata_t nvmf_transport_mod = {
+	"nvmf_transport",		/* module name */
+	nvmf_transport_modevent,	/* event handler */
+	0				/* handler argument (unused) */
+};
+
+DECLARE_MODULE(nvmf_transport, nvmf_transport_mod, SI_SUB_DRIVERS,
+    SI_ORDER_FIRST);
+MODULE_VERSION(nvmf_transport, 1);
diff --git a/sys/dev/nvmf/nvmf_transport.h b/sys/dev/nvmf/nvmf_transport.h
new file mode 100644
index 000000000000..549170b25940
--- /dev/null
+++ b/sys/dev/nvmf/nvmf_transport.h
@@ -0,0 +1,140 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#ifndef __NVMF_TRANSPORT_H__
+#define	__NVMF_TRANSPORT_H__
+
+/*
+ * Interface used by the Fabrics host (initiator) and controller
+ * (target) to send and receive capsules and associated data.
+ */
+
+#include <sys/sysctl.h>
+#include <dev/nvmf/nvmf_proto.h>
+
+struct mbuf;
+struct memdesc;
+struct nvmf_capsule;
+struct nvmf_connection;
+struct nvmf_qpair;
+struct nvmf_handoff_qpair_params;
+
+SYSCTL_DECL(_kern_nvmf);
+
+/*
+ * Callback to invoke when an error occurs on a qpair.  The last
+ * parameter is an error value.  If the error value is zero, the qpair
+ * has been closed at the transport level rather than a transport
+ * error occurring.
+ */
+typedef void nvmf_qpair_error_t(void *, int);
+
+/* Callback to invoke when a capsule is received. */
+typedef void nvmf_capsule_receive_t(void *, struct nvmf_capsule *);
+
+/*
+ * Callback to invoke when an I/O request has completed.  The second
+ * parameter is the amount of data transferred.  The last parameter is
+ * an error value which is non-zero if the request did not complete
+ * successfully.  A request with an error may complete partially.
+ */
+typedef void nvmf_io_complete_t(void *, size_t, int);
+
+/*
+ * A queue pair represents either an Admin or I/O
+ * submission/completion queue pair.  The params contains negotiated
+ * values passed in from userland.
+ *
+ * Unlike libnvmf in userland, the kernel transport interface does not
+ * have any notion of an association.  Instead, qpairs are
+ * independent.
+ */
+struct nvmf_qpair *nvmf_allocate_qpair(enum nvmf_trtype trtype,
+    bool controller, const struct nvmf_handoff_qpair_params *params,
+    nvmf_qpair_error_t *error_cb, void *error_cb_arg,
+    nvmf_capsule_receive_t *receive_cb, void *receive_cb_arg);
+void	nvmf_free_qpair(struct nvmf_qpair *qp);
+
+/*
+ * Capsules are either commands (host -> controller) or responses
+ * (controller -> host).  A data buffer may be associated with a
+ * command capsule.  Transmitted data is not copied by this API but
+ * instead must be preserved until the completion callback is invoked
+ * to indicate capsule transmission has completed.
+ */
+struct nvmf_capsule *nvmf_allocate_command(struct nvmf_qpair *qp,
+    const void *sqe, int how);
+struct nvmf_capsule *nvmf_allocate_response(struct nvmf_qpair *qp,
+    const void *cqe, int how);
+void	nvmf_free_capsule(struct nvmf_capsule *nc);
+int	nvmf_capsule_append_data(struct nvmf_capsule *nc,
+    struct memdesc *mem, size_t len, bool send,
+    nvmf_io_complete_t *complete_cb, void *cb_arg);
+int	nvmf_transmit_capsule(struct nvmf_capsule *nc);
+void	nvmf_abort_capsule_data(struct nvmf_capsule *nc, int error);
+void *nvmf_capsule_sqe(struct nvmf_capsule *nc);
+void *nvmf_capsule_cqe(struct nvmf_capsule *nc);
+
+/* Controller-specific APIs. */
+
+/*
+ * A controller calls this function to check for any
+ * transport-specific errors (invalid fields) in a received command
+ * capsule.  The callback returns a generic command status value:
+ * NVME_SC_SUCCESS if no error is found.
+ */
+uint8_t	nvmf_validate_command_capsule(struct nvmf_capsule *nc);
+
+/*
+ * A controller calls this function to query the amount of data
+ * associated with a command capsule.  The command capsule is passed
+ * in 'nc'.
+ */
+size_t	nvmf_capsule_data_len(const struct nvmf_capsule *nc);
+
+/*
+ * A controller calls this function to receive data associated with a
+ * command capsule (e.g. the data for a WRITE command).  This can
+ * either return in-capsule data or fetch data from the host
+ * (e.g. using a R2T PDU over TCP).  The received command capsule
+ * should be passed in 'nc'.  The received data is stored in 'mem'.
+ * If this function returns success, then the callback will be invoked
+ * once the operation has completed.  Note that the callback might be
+ * invoked before this function returns.
+ */
+int	nvmf_receive_controller_data(struct nvmf_capsule *nc,
+    uint32_t data_offset, struct memdesc *mem, size_t len,
+    nvmf_io_complete_t *complete_cb, void *cb_arg);
+
+/*
+ * A controller calls this function to send data in response to a
+ * command prior to sending a response capsule.  If an error occurs,
+ * the function returns a generic status completion code to be sent in
+ * the following CQE.  Note that the transfer might send a subset of
+ * the data requested by nc.  If the transfer succeeds, this function
+ * can return one of the following values:
+ *
+ * - NVME_SC_SUCCESS: The transfer has completed successfully and the
+ *   caller should send a success CQE in a response capsule.
+ *
+ * - NVMF_SUCCESS_SENT: The transfer has completed successfully and
+ *   the transport layer has sent an implicit success CQE to the
+ *   remote host (e.g. the SUCCESS flag for TCP).  The caller should
+ *   not send a response capsule.
+ *
+ * - NVMF_MORE: The transfer has completed successfully, but the
+ *   transfer did not complete the data buffer.
+ *
+ * The mbuf chain in 'm' is consumed by this function even if an error
+ * is returned.
+ */
+u_int	nvmf_send_controller_data(struct nvmf_capsule *nc,
+    uint32_t data_offset, struct mbuf *m, size_t len);
+
+#define	NVMF_SUCCESS_SENT	0x100
+#define	NVMF_MORE		0x101
+
+#endif /* !__NVMF_TRANSPORT_H__ */
diff --git a/sys/dev/nvmf/nvmf_transport_internal.h b/sys/dev/nvmf/nvmf_transport_internal.h
new file mode 100644
index 000000000000..0be427ee0690
--- /dev/null
+++ b/sys/dev/nvmf/nvmf_transport_internal.h
@@ -0,0 +1,128 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#ifndef __NVMF_TRANSPORT_INTERNAL_H__
+#define	__NVMF_TRANSPORT_INTERNAL_H__
+
+#include <sys/memdesc.h>
+
+/*
+ * Interface between the transport-independent APIs in
+ * nvmf_transport.c and individual transports.
+ */
+
+struct module;
+struct nvmf_io_request;
+
+struct nvmf_transport_ops {
+	/*
+	 * Queue pair management.  nvmf_allocate_qpair() treats a NULL
+	 * return from allocate_qpair as "declined" and moves on to the
+	 * next registered transport of the same type.
+	 */
+	struct nvmf_qpair *(*allocate_qpair)(bool controller,
+	    const struct nvmf_handoff_qpair_params *params);
+	void (*free_qpair)(struct nvmf_qpair *qp);
+
+	/*
+	 * Capsule operations.  'how' is the M_WAITOK or M_NOWAIT value
+	 * given to nvmf_allocate_command()/nvmf_allocate_response().
+	 */
+	struct nvmf_capsule *(*allocate_capsule)(struct nvmf_qpair *qp,
+	    int how);
+	void (*free_capsule)(struct nvmf_capsule *nc);
+	int (*transmit_capsule)(struct nvmf_capsule *nc);
+	uint8_t (*validate_command_capsule)(struct nvmf_capsule *nc);
+
+	/* Transferring controller data. */
+	size_t (*capsule_data_len)(const struct nvmf_capsule *nc);
+	int (*receive_controller_data)(struct nvmf_capsule *nc,
+	    uint32_t data_offset, struct nvmf_io_request *io);
+	u_int (*send_controller_data)(struct nvmf_capsule *nc,
+	    uint32_t data_offset, struct mbuf *m, size_t len);
+
+	enum nvmf_trtype trtype;	/* selects the registration list */
+	int priority;			/* higher values are tried first */
+};
+
+/*
+ * Either an Admin or I/O Submission/Completion Queue pair.  All of the
+ * fields below are filled in by nvmf_allocate_qpair() once the
+ * transport's allocate_qpair method has returned the qpair.
+ */
+struct nvmf_qpair {
+	struct nvmf_transport *nq_transport;
+	struct nvmf_transport_ops *nq_ops;
+	bool nq_controller;
+
+	/* Callback to invoke for a received capsule. */
+	nvmf_capsule_receive_t *nq_receive;
+	void *nq_receive_arg;
+
+	/* Callback to invoke for an error. */
+	nvmf_qpair_error_t *nq_error;
+	void *nq_error_arg;
+
+	bool nq_admin;		/* copied from params->admin */
+};
+
+struct nvmf_io_request {
+	/*
+	 * Data buffer contains io_len bytes in the backing store
+	 * described by io_mem.
+	 */
+	struct memdesc io_mem;
+	size_t	io_len;
+	nvmf_io_complete_t *io_complete;	/* completion callback */
+	void	*io_complete_arg;		/* first callback argument */
+};
+
+/*
+ * Fabrics Command and Response Capsules.  The Fabrics host
+ * (initiator) and controller (target) drivers work with capsules that
+ * are transmitted and received by a specific transport.
+ */
+struct nvmf_capsule {
+	struct nvmf_qpair *nc_qpair;
+
+	/*
+	 * Either a SQE or CQE.  nc_qe_len holds the size of the member
+	 * in use and is what the accessors assert on to tell command
+	 * capsules from response capsules.
+	 */
+	union {
+		struct nvme_command nc_sqe;
+		struct nvme_completion nc_cqe;
+	};
+	int	nc_qe_len;
+
+	/*
+	 * Is SQHD in received capsule valid?  False for locally-
+	 * synthesized responses.
+	 */
+	bool	nc_sqhd_valid;
+
+	/*
+	 * Data buffer attached via nvmf_capsule_append_data() and the
+	 * 'send' direction given there.  nc_data.io_len == 0 means no
+	 * buffer is attached.
+	 */
+	bool	nc_send_data;
+	struct nvmf_io_request nc_data;
+};
+
+static void __inline
+nvmf_qpair_error(struct nvmf_qpair *nq, int error)
+{
+	nq->nq_error(nq->nq_error_arg, error);
+}
+
+/* Invokes the receive callback registered on 'nq' for capsule 'nc'. */
+static void __inline
+nvmf_capsule_received(struct nvmf_qpair *nq, struct nvmf_capsule *nc)
+{
+	nvmf_capsule_receive_t *cb = nq->nq_receive;
+	void *cb_arg = nq->nq_receive_arg;
+
+	cb(cb_arg, nc);
+}
+
+/*
+ * Invokes the completion callback of 'io', reporting 'xfered' bytes
+ * transferred and 'error'.
+ */
+static void __inline
+nvmf_complete_io_request(struct nvmf_io_request *io, size_t xfered, int error)
+{
+	nvmf_io_complete_t *cb = io->io_complete;
+	void *cb_arg = io->io_complete_arg;
+
+	cb(cb_arg, xfered, error);
+}
+
+int	nvmf_transport_module_handler(struct module *, int, void *);
+
+/*
+ * Declares the kernel module "nvmf/<name>" for a transport.  Its
+ * events are handled by nvmf_transport_module_handler() with the
+ * address of 'ops' (a struct nvmf_transport_ops object) as the handler
+ * argument, and the module depends on version 1 of nvmf_transport.
+ */
+#define	NVMF_TRANSPORT(name, ops)					\
+static moduledata_t nvmf_transport_##name##_mod = {			\
+	"nvmf/" #name,							\
+	nvmf_transport_module_handler,					\
+	&(ops)								\
+};									\
+DECLARE_MODULE(nvmf_transport_##name, nvmf_transport_##name##_mod,	\
+    SI_SUB_DRIVERS, SI_ORDER_ANY);					\
+MODULE_DEPEND(nvmf_transport_##name, nvmf_transport, 1, 1, 1)
+
+#endif /* !__NVMF_TRANSPORT_INTERNAL_H__ */
diff --git a/sys/modules/Makefile b/sys/modules/Makefile
index d65eb3706ef1..37db75349cf1 100644
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -296,6 +296,7 @@ SUBDIR=	\
 	nvd \
 	${_nvdimm} \
 	nvme \
+	nvmf \
 	${_nvram} \
 	oce \
 	${_ocs_fc} \
diff --git a/sys/modules/nvmf/Makefile b/sys/modules/nvmf/Makefile
new file mode 100644
index 000000000000..b1be042f4385
--- /dev/null
+++ b/sys/modules/nvmf/Makefile
@@ -0,0 +1,3 @@
+SUBDIR=	nvmf_transport
+
+.include <bsd.subdir.mk>
diff --git a/sys/modules/nvmf/nvmf_transport/Makefile b/sys/modules/nvmf/nvmf_transport/Makefile
new file mode 100644
index 000000000000..f0edfac5ac35
--- /dev/null
+++ b/sys/modules/nvmf/nvmf_transport/Makefile
@@ -0,0 +1,9 @@
+.PATH: ${SRCTOP}/sys/dev/nvmf
+
+KMOD=	nvmf_transport
+
+SRCS=	nvmf_transport.c
+
+EXPORT_SYMS=	YES
+
+.include <bsd.kmod.mk>