git: 2da066ef6d85 - main - libnvmf: Add internal library to support NVMe over Fabrics

From: John Baldwin <jhb_at_FreeBSD.org>
Date: Fri, 03 May 2024 00:15:45 UTC
The branch main has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=2da066ef6d85d3f7cd8aaec14369d66254836536

commit 2da066ef6d85d3f7cd8aaec14369d66254836536
Author:     John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2024-05-02 23:28:16 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2024-05-02 23:28:16 +0000

    libnvmf: Add internal library to support NVMe over Fabrics
    
    libnvmf provides APIs for transmitting and receiving Command and
    Response capsules along with data associated with NVMe commands.
    Capsules are represented by 'struct nvmf_capsule' objects.
    
    Capsules are transmitted and received on queue pairs represented by
    'struct nvmf_qpair' objects.
    
    Queue pairs belong to an association represented by a 'struct
    nvmf_association' object.
    
    libnvmf provides additional helper APIs to assist with constructing
    command capsules for a host, response capsules for a controller,
    connecting queue pairs to a remote controller and optionally
    offloading connected queues to an in-kernel host, accepting queue pair
    connections from remote hosts and optionally offloading connected
    queues to an in-kernel controller, constructing controller data
    structures for local controllers, etc.
    
    libnvmf also includes an internal transport abstraction as well as an
    implementation of a userspace TCP transport.
    
    libnvmf is primarily intended for ease of use and low-traffic use cases
    such as establishing connections that are handed off to the kernel.
    As such, it uses a simple API built on blocking I/O.
    
    For a host, a consumer first populates a 'struct
    nvmf_association_params' with a set of parameters shared by all queue
    pairs for a single association such as whether or not to use SQ flow
    control and header and data digests and creates a 'struct
    nvmf_association' object.  The consumer is responsible for
    establishing a TCP socket for each queue pair.  This socket is
    included in the 'struct nvmf_qpair_params' passed to 'nvmf_connect' to
    complete transport-specific negotiation, send a Fabrics Connect
    command, and wait for the Connect reply. Upon success, a new 'struct
    nvmf_qpair' object is returned.  This queue pair can then be used to
    send and receive capsules.  A command capsule is allocated, populated
    with an SQE and optional data buffer, and transmitted via
    nvmf_host_transmit_command.  The consumer can then wait for a reply
    via nvmf_host_wait_for_response.  The library also provides some
    wrapper functions such as nvmf_read_property and nvmf_write_property
    which send a command and wait for a response synchronously.
    
    For a controller, a consumer uses a single association for a set of
    incoming connections.  A consumer can choose to use multiple
    associations (e.g. a separate association for connections to a
    discovery controller listening on a different port than I/O
    controllers).  The consumer is responsible for accepting TCP sockets
    directly, but once a socket has been accepted it is passed to
    nvmf_accept to perform transport-specific negotiation and wait for the
    Connect command.  Similar to nvmf_connect, nvmf_accept returns a newly
    constructed nvmf_qpair.  However, in contrast to nvmf_connect,
    nvmf_accept does not complete the Fabrics negotiation.  The consumer
    must explicitly send a response capsule before waiting for additional
    command capsules to arrive.  In particular, in the kernel offload
    case, the Connect command and data are provided to the kernel
    controller and the Connect response capsule is sent by the kernel once
    it is ready to handle the new queue pair.
    
    For userspace controller command handling, the consumer uses
    nvmf_controller_receive_capsule to wait for a command capsule.
    nvmf_receive_controller_data is used to retrieve any data from a
    command (e.g. the data for a WRITE command).  It can be called
    multiple times to split the data transfer into smaller sizes.
    nvmf_send_controller_data is used to send data to a remote host in
    response to a command.  It also sends a response capsule indicating
    success, or an error if an internal error occurs.  nvmf_send_response
    is used to send a response without associated data.  There are also
    several convenience wrappers such as nvmf_send_success and
    nvmf_send_generic_error.
    
    Reviewed by:    imp
    Sponsored by:   Chelsio Communications
    Differential Revision:  https://reviews.freebsd.org/D44710
---
 lib/Makefile                  |    1 +
 lib/libnvmf/Makefile          |   22 +
 lib/libnvmf/internal.h        |  116 ++++
 lib/libnvmf/libnvmf.h         |  363 ++++++++++
 lib/libnvmf/nvmf_controller.c |  463 +++++++++++++
 lib/libnvmf/nvmf_host.c       |  911 +++++++++++++++++++++++++
 lib/libnvmf/nvmf_tcp.c        | 1474 +++++++++++++++++++++++++++++++++++++++++
 lib/libnvmf/nvmf_transport.c  |  269 ++++++++
 share/mk/src.libnames.mk      |    4 +
 9 files changed, 3623 insertions(+)

diff --git a/lib/Makefile b/lib/Makefile
index 6135cff10c15..5696fa4aa593 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -78,6 +78,7 @@ SUBDIR=	${SUBDIR_BOOTSTRAP} \
 	libnetbsd \
 	libnetmap \
 	libnv \
+	libnvmf \
 	libopenbsd \
 	libpam \
 	libpathconv \
diff --git a/lib/libnvmf/Makefile b/lib/libnvmf/Makefile
new file mode 100644
index 000000000000..dbba6b476510
--- /dev/null
+++ b/lib/libnvmf/Makefile
@@ -0,0 +1,22 @@
+.PATH:  ${SRCTOP}/sys/dev/nvmf/controller
+.PATH:  ${SRCTOP}/sys/libkern
+
+LIB=		nvmf
+INTERNALLIB=
+PACKAGE=	nvmf
+
+INCS=		libnvmf.h
+
+SRCS=		gsb_crc32.c \
+		nvmf_controller.c \
+		nvmf_host.c \
+		nvmf_tcp.c \
+		nvmf_transport.c \
+		nvmft_subr.c
+
+CFLAGS+=	-I${SRCTOP}/sys/dev/nvmf/controller
+CFLAGS+=	-I${SRCTOP}/sys/dev/nvmf
+
+.include <bsd.lib.mk>
+
+CWARNFLAGS.gsb_crc32.c=	-Wno-cast-align
diff --git a/lib/libnvmf/internal.h b/lib/libnvmf/internal.h
new file mode 100644
index 000000000000..cf45c15ba2f0
--- /dev/null
+++ b/lib/libnvmf/internal.h
@@ -0,0 +1,116 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#ifndef __LIBNVMF_INTERNAL_H__
+#define __LIBNVMF_INTERNAL_H__
+
+#include <sys/queue.h>
+
+/*
+ * Table of operations supplied by a transport implementation.  An
+ * association refers to one of these tables through its na_ops
+ * member; tcp_ops (declared later in this header) is one such table.
+ */
+struct nvmf_transport_ops {
+	/* Association management. */
+	struct nvmf_association *(*allocate_association)(bool controller,
+	    const struct nvmf_association_params *params);
+	void (*update_association)(struct nvmf_association *na,
+	    const struct nvme_controller_data *cdata);
+	void (*free_association)(struct nvmf_association *na);
+
+	/* Queue pair management. */
+	struct nvmf_qpair *(*allocate_qpair)(struct nvmf_association *na,
+	    const struct nvmf_qpair_params *params);
+	void (*free_qpair)(struct nvmf_qpair *qp);
+
+	/* Create params for kernel handoff. */
+	int (*kernel_handoff_params)(struct nvmf_qpair *qp,
+	    struct nvmf_handoff_qpair_params *qparams);
+
+	/* Capsule operations. */
+	struct nvmf_capsule *(*allocate_capsule)(struct nvmf_qpair *qp);
+	void (*free_capsule)(struct nvmf_capsule *nc);
+	int (*transmit_capsule)(struct nvmf_capsule *nc);
+	int (*receive_capsule)(struct nvmf_qpair *qp,
+	    struct nvmf_capsule **ncp);
+	/*
+	 * NOTE(review): presumably backs nvmf_validate_command_capsule()
+	 * and returns an NVMe generic status code -- confirm against
+	 * the transport implementation.
+	 */
+	uint8_t (*validate_command_capsule)(const struct nvmf_capsule *nc);
+
+	/* Transferring controller data. */
+	size_t (*capsule_data_len)(const struct nvmf_capsule *nc);
+	int (*receive_controller_data)(const struct nvmf_capsule *nc,
+	    uint32_t data_offset, void *buf, size_t len);
+	int (*send_controller_data)(const struct nvmf_capsule *nc,
+	    const void *buf, size_t len);
+};
+
+/*
+ * Transport-independent state of an association.  Queue pairs point
+ * back at their association through nq_association.
+ */
+struct nvmf_association {
+	/* Method table of the transport used by this association. */
+	struct nvmf_transport_ops *na_ops;
+	enum nvmf_trtype na_trtype;
+	/* True for a controller association, false for a host. */
+	bool na_controller;
+
+	/*
+	 * NOTE(review): presumably a copy of the parameters passed to
+	 * nvmf_allocate_association() -- confirm in nvmf_transport.c.
+	 */
+	struct nvmf_association_params na_params;
+
+	/* Each qpair holds a reference on an association. */
+	u_int na_refs;
+
+	/*
+	 * NOTE(review): presumably the message maintained by
+	 * na_error()/na_clear_error() -- confirm ownership and lifetime.
+	 */
+	char *na_last_error;
+};
+
+/*
+ * Transport-independent state of a single queue pair.
+ */
+struct nvmf_qpair {
+	struct nvmf_association *nq_association;
+	/* True for an admin queue (CONNECT with qid 0 on a controller). */
+	bool nq_admin;
+
+	uint16_t nq_cid;	/* host only */
+
+	/*
+	 * Queue sizes.  This assumes the same size for both the
+	 * completion and submission queues within a pair.
+	 */
+	u_int	nq_qsize;
+
+	/* Flow control management for submission queues. */
+	bool nq_flow_control;
+	uint16_t nq_sqhd;
+	uint16_t nq_sqtail;	/* host only */
+
+	/* Value in response to/from CONNECT. */
+	uint16_t nq_cntlid;
+
+	uint32_t nq_kato;	/* valid on admin queue only */
+
+	/*
+	 * Capsules linked through nc_link.  NOTE(review): presumably
+	 * received capsules not yet consumed -- confirm against users.
+	 */
+	TAILQ_HEAD(, nvmf_capsule) nq_rx_capsules;
+};
+
+/*
+ * Transport-independent state of a command or response capsule
+ * belonging to a queue pair.
+ */
+struct nvmf_capsule {
+	struct nvmf_qpair *nc_qpair;
+
+	/* Either a SQE or CQE. */
+	union {
+		struct nvme_command nc_sqe;
+		struct nvme_completion nc_cqe;
+	};
+	/*
+	 * NOTE(review): presumably the size in bytes of whichever
+	 * union member is in use -- confirm where it is assigned.
+	 */
+	int	nc_qe_len;
+
+	/*
+	 * Is SQHD in received capsule valid?  False for locally-
+	 * synthesized responses.
+	 */
+	bool	nc_sqhd_valid;
+
+	/* Data buffer. */
+	bool	nc_send_data;
+	void	*nc_data;
+	size_t	nc_data_len;
+
+	/* Linkage for a queue pair's nq_rx_capsules list. */
+	TAILQ_ENTRY(nvmf_capsule) nc_link;
+};
+
+extern struct nvmf_transport_ops tcp_ops;
+
+void	na_clear_error(struct nvmf_association *na);
+void	na_error(struct nvmf_association *na, const char *fmt, ...);
+
+int	nvmf_kernel_handoff_params(struct nvmf_qpair *qp,
+    struct nvmf_handoff_qpair_params *qparams);
+
+#endif /* !__LIBNVMF_INTERNAL_H__ */
diff --git a/lib/libnvmf/libnvmf.h b/lib/libnvmf/libnvmf.h
new file mode 100644
index 000000000000..f15277a02621
--- /dev/null
+++ b/lib/libnvmf/libnvmf.h
@@ -0,0 +1,363 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#ifndef __LIBNVMF_H__
+#define	__LIBNVMF_H__
+
+#include <sys/uio.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_proto.h>
+
+struct nvmf_capsule;
+struct nvmf_association;
+struct nvmf_qpair;
+
+/*
+ * Parameters shared by all queue-pairs of an association.  Note that
+ * this contains the requested values used to initiate transport
+ * negotiation.
+ */
+struct nvmf_association_params {
+	bool sq_flow_control;		/* SQ flow control required. */
+	bool dynamic_controller_model;	/* Controller only */
+	uint16_t max_admin_qsize;	/* Controller only */
+	uint32_t max_io_qsize;		/* Controller only, 0 for discovery */
+	/* Transport-specific parameters; TCP is the only member so far. */
+	union {
+		struct {
+			uint8_t pda;	/* Tx-side PDA. */
+			/* Whether to use header and data digests. */
+			bool header_digests;
+			bool data_digests;
+			uint32_t maxr2t;	/* Host only */
+			uint32_t maxh2cdata;	/* Controller only */
+		} tcp;
+	};
+};
+
+/* Parameters specific to a single queue pair of an association. */
+struct nvmf_qpair_params {
+	bool admin;			/* Host only */
+	/* Transport-specific parameters; TCP is the only member so far. */
+	union {
+		struct {
+			/*
+			 * Socket for this queue pair.  The consumer
+			 * establishes (host) or accepts (controller)
+			 * the TCP connection itself.
+			 */
+			int fd;
+		} tcp;
+	};
+};
+
+/* Transport-independent APIs. */
+
+/*
+ * A host should allocate a new association for each association with
+ * a controller.  After the admin queue has been allocated and the
+ * controller's data has been fetched, it should be passed to
+ * nvmf_update_association to update internal transport-specific
+ * parameters before allocating I/O queues.
+ *
+ * A controller uses a single association to manage all incoming
+ * queues since it is not known until after parsing the CONNECT
+ * command which transport queues are admin vs I/O and which
+ * controller they are created against.
+ */
+struct nvmf_association *nvmf_allocate_association(enum nvmf_trtype trtype,
+    bool controller, const struct nvmf_association_params *params);
+void	nvmf_update_assocation(struct nvmf_association *na,
+    const struct nvme_controller_data *cdata);
+void	nvmf_free_association(struct nvmf_association *na);
+
+/* The most recent association-wide error message. */
+const char *nvmf_association_error(const struct nvmf_association *na);
+
+/*
+ * A queue pair represents either an Admin or I/O
+ * submission/completion queue pair.
+ *
+ * Each open qpair holds a reference on its association.  Once queue
+ * pairs are allocated, callers can safely free the association to
+ * ease bookkeeping.
+ *
+ * If nvmf_allocate_qpair fails, a detailed error message can be obtained
+ * from nvmf_association_error.
+ */
+struct nvmf_qpair *nvmf_allocate_qpair(struct nvmf_association *na,
+    const struct nvmf_qpair_params *params);
+void	nvmf_free_qpair(struct nvmf_qpair *qp);
+
+/*
+ * Capsules are either commands (host -> controller) or responses
+ * (controller -> host).  A single data buffer segment may be
+ * associated with a command capsule.  Transmitted data is not copied
+ * by this API but instead must be preserved until the capsule is
+ * transmitted and freed.
+ */
+struct nvmf_capsule *nvmf_allocate_command(struct nvmf_qpair *qp,
+    const void *sqe);
+struct nvmf_capsule *nvmf_allocate_response(struct nvmf_qpair *qp,
+    const void *cqe);
+void	nvmf_free_capsule(struct nvmf_capsule *nc);
+int	nvmf_capsule_append_data(struct nvmf_capsule *nc,
+    void *buf, size_t len, bool send);
+int	nvmf_transmit_capsule(struct nvmf_capsule *nc);
+int	nvmf_receive_capsule(struct nvmf_qpair *qp, struct nvmf_capsule **ncp);
+const void *nvmf_capsule_sqe(const struct nvmf_capsule *nc);
+const void *nvmf_capsule_cqe(const struct nvmf_capsule *nc);
+
+/* Return a string name for a transport type. */
+const char *nvmf_transport_type(uint8_t trtype);
+
+/* Validate a NVMe Qualified Name. */
+bool	nvmf_nqn_valid(const char *nqn);
+
+/* Controller-specific APIs. */
+
+/*
+ * A controller calls this function to check for any
+ * transport-specific errors (invalid fields) in a received command
+ * capsule.  The callback returns a generic command status value:
+ * NVME_SC_SUCCESS if no error is found.
+ */
+uint8_t	nvmf_validate_command_capsule(const struct nvmf_capsule *nc);
+
+/*
+ * A controller calls this function to query the amount of data
+ * associated with a command capsule.
+ */
+size_t	nvmf_capsule_data_len(const struct nvmf_capsule *cc);
+
+/*
+ * A controller calls this function to receive data associated with a
+ * command capsule (e.g. the data for a WRITE command).  This can
+ * either return in-capsule data or fetch data from the host
+ * (e.g. using a R2T PDU over TCP).  The received command capsule
+ * should be passed in 'nc'.  The received data is stored in '*buf'.
+ */
+int	nvmf_receive_controller_data(const struct nvmf_capsule *nc,
+    uint32_t data_offset, void *buf, size_t len);
+
+/*
+ * A controller calls this function to send data in response to a
+ * command along with a response capsule.  If the data transfer
+ * succeeds, a success response is sent.  If the data transfer fails,
+ * an appropriate error status capsule is sent.  Regardless, a
+ * response capsule is always sent.
+ */
+int	nvmf_send_controller_data(const struct nvmf_capsule *nc,
+    const void *buf, size_t len);
+
+/*
+ * Construct a CQE for a reply to a command capsule in 'nc' with the
+ * completion status 'status'.  This is useful when additional CQE
+ * info is required beyond the completion status.
+ */
+void	nvmf_init_cqe(void *cqe, const struct nvmf_capsule *nc,
+    uint16_t status);
+
+/*
+ * Construct and send a response capsule to a command capsule with
+ * the supplied CQE.
+ */
+int	nvmf_send_response(const struct nvmf_capsule *nc, const void *cqe);
+
+/*
+ * Wait for a single command capsule and return it in *ncp.  This can
+ * fail if an invalid capsule is received or an I/O error occurs.
+ */
+int	nvmf_controller_receive_capsule(struct nvmf_qpair *qp,
+    struct nvmf_capsule **ncp);
+
+/* Send a response capsule from a controller. */
+int	nvmf_controller_transmit_response(struct nvmf_capsule *nc);
+
+/* Construct and send an error response capsule. */
+int	nvmf_send_error(const struct nvmf_capsule *cc, uint8_t sc_type,
+    uint8_t sc_status);
+
+/*
+ * Construct and send an error response capsule using a generic status
+ * code.
+ */
+int	nvmf_send_generic_error(const struct nvmf_capsule *nc,
+    uint8_t sc_status);
+
+/* Construct and send a simple success response capsule. */
+int	nvmf_send_success(const struct nvmf_capsule *nc);
+
+/*
+ * Allocate a new queue pair and wait for the CONNECT command capsule.
+ * If this fails, a detailed error message can be obtained from
+ * nvmf_association_error.  On success, the command capsule is saved
+ * in '*ccp' and the connect data is saved in 'data'.  The caller
+ * must send an explicit response and free the command capsule.
+ */
+struct nvmf_qpair *nvmf_accept(struct nvmf_association *na,
+    const struct nvmf_qpair_params *params, struct nvmf_capsule **ccp,
+    struct nvmf_fabric_connect_data *data);
+
+/*
+ * Construct and send a response capsule with the Fabrics CONNECT
+ * invalid parameters error status.  If data is true the offset is
+ * relative to the CONNECT data structure, otherwise the offset is
+ * relative to the SQE.
+ */
+void	nvmf_connect_invalid_parameters(const struct nvmf_capsule *cc,
+    bool data, uint16_t offset);
+
+/* Construct and send a response capsule for a successful CONNECT. */
+int	nvmf_finish_accept(const struct nvmf_capsule *cc, uint16_t cntlid);
+
+/* Compute the initial state of CAP for a controller. */
+uint64_t nvmf_controller_cap(struct nvmf_qpair *qp);
+
+/* Generate a serial number string from a host ID. */
+void	nvmf_controller_serial(char *buf, size_t len, u_long hostid);
+
+/*
+ * Populate an Identify Controller data structure for a Discovery
+ * controller.
+ */
+void	nvmf_init_discovery_controller_data(struct nvmf_qpair *qp,
+    struct nvme_controller_data *cdata);
+
+/*
+ * Populate an Identify Controller data structure for an I/O
+ * controller.
+ */
+void	nvmf_init_io_controller_data(struct nvmf_qpair *qp, const char *serial,
+    const char *subnqn, int nn, uint32_t ioccsz,
+    struct nvme_controller_data *cdata);
+
+/*
+ * Validate if a new value for CC is legal given the existing values of
+ * CAP and CC.
+ */
+bool	nvmf_validate_cc(struct nvmf_qpair *qp, uint64_t cap, uint32_t old_cc,
+    uint32_t new_cc);
+
+/* Return the log page id (LID) of a GET_LOG_PAGE command. */
+uint8_t	nvmf_get_log_page_id(const struct nvme_command *cmd);
+
+/* Return the requested data length of a GET_LOG_PAGE command. */
+uint64_t nvmf_get_log_page_length(const struct nvme_command *cmd);
+
+/* Return the requested data offset of a GET_LOG_PAGE command. */
+uint64_t nvmf_get_log_page_offset(const struct nvme_command *cmd);
+
+/* Prepare to handoff a controller qpair. */
+int	nvmf_handoff_controller_qpair(struct nvmf_qpair *qp,
+    struct nvmf_handoff_controller_qpair *h);
+
+/* Host-specific APIs. */
+
+/*
+ * Connect to an admin or I/O queue.  If this fails, a detailed error
+ * message can be obtained from nvmf_association_error.
+ */
+struct nvmf_qpair *nvmf_connect(struct nvmf_association *na,
+    const struct nvmf_qpair_params *params, uint16_t qid, u_int queue_size,
+    const uint8_t hostid[16], uint16_t cntlid, const char *subnqn,
+    const char *hostnqn, uint32_t kato);
+
+/* Return the CNTLID for a queue returned from CONNECT. */
+uint16_t nvmf_cntlid(struct nvmf_qpair *qp);
+
+/*
+ * Send a command to the controller.  This can fail with EBUSY if the
+ * submission queue is full.
+ */
+int	nvmf_host_transmit_command(struct nvmf_capsule *nc);
+
+/*
+ * Wait for a response to a command.  If there are no outstanding
+ * commands in the SQ, fails with EWOULDBLOCK.
+ */
+int	nvmf_host_receive_response(struct nvmf_qpair *qp,
+    struct nvmf_capsule **rcp);
+
+/*
+ * Wait for a response to a specific command.  The command must have been
+ * successfully sent previously.
+ */
+int	nvmf_host_wait_for_response(struct nvmf_capsule *cc,
+    struct nvmf_capsule **rcp);
+
+/* Build a KeepAlive command. */
+struct nvmf_capsule *nvmf_keepalive(struct nvmf_qpair *qp);
+
+/* Read a controller property. */
+int	nvmf_read_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size,
+    uint64_t *value);
+
+/* Write a controller property. */
+int	nvmf_write_property(struct nvmf_qpair *qp, uint32_t offset,
+    uint8_t size, uint64_t value);
+
+/* Construct a 16-byte HostId from kern.hostuuid. */
+int	nvmf_hostid_from_hostuuid(uint8_t hostid[16]);
+
+/* Construct a NQN from kern.hostuuid. */
+int	nvmf_nqn_from_hostuuid(char nqn[NVMF_NQN_MAX_LEN]);
+
+/* Fetch controller data via IDENTIFY. */
+int	nvmf_host_identify_controller(struct nvmf_qpair *qp,
+    struct nvme_controller_data *data);
+
+/* Fetch namespace data via IDENTIFY. */
+int	nvmf_host_identify_namespace(struct nvmf_qpair *qp, uint32_t nsid,
+    struct nvme_namespace_data *nsdata);
+
+/*
+ * Fetch discovery log page.  The memory for the log page is allocated
+ * by malloc() and returned in *logp.  The caller must free the
+ * memory.
+ */
+int	nvmf_host_fetch_discovery_log_page(struct nvmf_qpair *qp,
+    struct nvme_discovery_log **logp);
+
+/*
+ * Request a desired number of I/O queues via SET_FEATURES.  The
+ * number of actual I/O queues available is returned in *actual on
+ * success.
+ */
+int	nvmf_host_request_queues(struct nvmf_qpair *qp, u_int requested,
+    u_int *actual);
+
+/*
+ * Handoff active host association to the kernel.  This frees the
+ * qpairs (even on error).
+ */
+int	nvmf_handoff_host(struct nvmf_qpair *admin_qp, u_int num_queues,
+    struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata);
+
+/*
+ * Disconnect an active host association previously handed off to the
+ * kernel.  'host' is either the name of the device (nvmeX) for this
+ * association or the remote subsystem NQN.
+ */
+int	nvmf_disconnect_host(const char *host);
+
+/*
+ * Disconnect all active host associations previously handed off to
+ * the kernel.
+ */
+int	nvmf_disconnect_all(void);
+
+/*
+ * Fetch reconnect parameters from an existing kernel host to use for
+ * establishing a new association.
+ */
+int	nvmf_reconnect_params(int fd, struct nvmf_reconnect_params *rparams);
+
+/*
+ * Handoff active host association to an existing host in the kernel.
+ * This frees the qpairs (even on error).
+ */
+int	nvmf_reconnect_host(int fd, struct nvmf_qpair *admin_qp,
+    u_int num_queues, struct nvmf_qpair **io_queues,
+    const struct nvme_controller_data *cdata);
+
+#endif /* !__LIBNVMF_H__ */
diff --git a/lib/libnvmf/nvmf_controller.c b/lib/libnvmf/nvmf_controller.c
new file mode 100644
index 000000000000..554e5e769ded
--- /dev/null
+++ b/lib/libnvmf/nvmf_controller.c
@@ -0,0 +1,463 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/utsname.h>
+#include <assert.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "libnvmf.h"
+#include "internal.h"
+#include "nvmft_subr.h"
+
+/*
+ * Prepare the completion queue entry 'cqe' as a reply to the command
+ * held in capsule 'nc'.  The entry is zeroed, the command's CID is
+ * copied into it, and 'status' is stored in little-endian order.
+ */
+void
+nvmf_init_cqe(void *cqe, const struct nvmf_capsule *nc, uint16_t status)
+{
+	const struct nvme_command *sqe;
+	struct nvme_completion *entry;
+
+	sqe = nvmf_capsule_sqe(nc);
+	entry = cqe;
+
+	memset(entry, 0, sizeof(*entry));
+	entry->status = htole16(status);
+	entry->cid = sqe->cid;
+}
+
+/*
+ * Allocate a response capsule for command capsule 'nc' whose CQE
+ * carries only a status built from the status code type 'sc_type' and
+ * status code 'sc_status'.  Returns whatever nvmf_allocate_response
+ * returns.
+ */
+static struct nvmf_capsule *
+nvmf_simple_response(const struct nvmf_capsule *nc, uint8_t sc_type,
+    uint8_t sc_status)
+{
+	struct nvme_completion cqe;
+
+	nvmf_init_cqe(&cqe, nc, NVMEF(NVME_STATUS_SCT, sc_type) |
+	    NVMEF(NVME_STATUS_SC, sc_status));
+	return (nvmf_allocate_response(nc->nc_qpair, &cqe));
+}
+
+/*
+ * Receive one command capsule on 'qp' and validate it.  On success the
+ * capsule is returned in '*ncp' and 0 is returned.  If receiving fails
+ * the error is passed through.  If validation fails, an error response
+ * with the reported generic status is sent, the capsule is freed and
+ * EPROTO is returned.  '*ncp' is NULL on every failure.
+ */
+int
+nvmf_controller_receive_capsule(struct nvmf_qpair *qp,
+    struct nvmf_capsule **ncp)
+{
+	struct nvmf_capsule *cmd;
+	uint8_t sc;
+	int rv;
+
+	*ncp = NULL;
+
+	rv = nvmf_receive_capsule(qp, &cmd);
+	if (rv != 0)
+		return (rv);
+
+	sc = nvmf_validate_command_capsule(cmd);
+	if (sc == NVME_SC_SUCCESS) {
+		*ncp = cmd;
+		return (0);
+	}
+
+	/* The result of sending the error response is not examined. */
+	nvmf_send_generic_error(cmd, sc);
+	nvmf_free_capsule(cmd);
+	return (EPROTO);
+}
+
+/*
+ * Fill in the SQHD field of response capsule 'nc' and transmit it.
+ * With flow control enabled the queue pair's SQ head is advanced by
+ * one (wrapping at the queue size) and reported; otherwise SQHD is
+ * sent as zero.
+ */
+int
+nvmf_controller_transmit_response(struct nvmf_capsule *nc)
+{
+	struct nvmf_qpair *qpair;
+	uint16_t sqhd;
+
+	qpair = nc->nc_qpair;
+	sqhd = 0;
+	if (qpair->nq_flow_control) {
+		qpair->nq_sqhd = (qpair->nq_sqhd + 1) % qpair->nq_qsize;
+		sqhd = htole16(qpair->nq_sqhd);
+	}
+	nc->nc_cqe.sqhd = sqhd;
+
+	return (nvmf_transmit_capsule(nc));
+}
+
+/*
+ * Allocate a response capsule on the queue pair of command capsule
+ * 'cc' using the caller's CQE, transmit it and free it.  Returns
+ * ENOMEM if the capsule cannot be allocated, otherwise the result of
+ * the transmit.
+ */
+int
+nvmf_send_response(const struct nvmf_capsule *cc, const void *cqe)
+{
+	struct nvmf_capsule *resp;
+	int rv;
+
+	resp = nvmf_allocate_response(cc->nc_qpair, cqe);
+	if (resp != NULL) {
+		rv = nvmf_controller_transmit_response(resp);
+		nvmf_free_capsule(resp);
+	} else
+		rv = ENOMEM;
+	return (rv);
+}
+
+/*
+ * Construct and send an error response to command capsule 'cc' with
+ * status code type 'sc_type' and status code 'sc_status'.
+ *
+ * Returns ENOMEM if the response capsule cannot be allocated (matching
+ * nvmf_send_response), otherwise the result of transmitting the
+ * response.
+ */
+int
+nvmf_send_error(const struct nvmf_capsule *cc, uint8_t sc_type,
+    uint8_t sc_status)
+{
+	struct nvmf_capsule *rc;
+	int error;
+
+	rc = nvmf_simple_response(cc, sc_type, sc_status);
+	/* The transmit path dereferences the capsule, so bail out early. */
+	if (rc == NULL)
+		return (ENOMEM);
+	error = nvmf_controller_transmit_response(rc);
+	nvmf_free_capsule(rc);
+	return (error);
+}
+
+/*
+ * Send an error response to 'nc' whose status code type is
+ * NVME_SCT_GENERIC and whose status code is 'sc_status'.
+ */
+int
+nvmf_send_generic_error(const struct nvmf_capsule *nc, uint8_t sc_status)
+{
+	const uint8_t sc_type = NVME_SCT_GENERIC;
+
+	return (nvmf_send_error(nc, sc_type, sc_status));
+}
+
+/*
+ * Send a response to 'nc' reporting generic status NVME_SC_SUCCESS.
+ */
+int
+nvmf_send_success(const struct nvmf_capsule *nc)
+{
+	return (nvmf_send_error(nc, NVME_SCT_GENERIC, NVME_SC_SUCCESS));
+}
+
+/*
+ * Send a CONNECT response to 'cc' reporting the command-specific
+ * "invalid parameter" status.  'offset' is the byte offset of the
+ * offending field; 'data' selects whether that offset is relative to
+ * the CONNECT data (true) or to the SQE (false).
+ *
+ * This is best effort: nothing is sent if the response capsule cannot
+ * be allocated, and the result of the transmit is not reported.
+ */
+void
+nvmf_connect_invalid_parameters(const struct nvmf_capsule *cc, bool data,
+    uint16_t offset)
+{
+	struct nvmf_fabric_connect_rsp rsp;
+	struct nvmf_capsule *rc;
+
+	nvmf_init_cqe(&rsp, cc,
+	    NVMEF(NVME_STATUS_SCT, NVME_SCT_COMMAND_SPECIFIC) |
+	    NVMEF(NVME_STATUS_SC, NVMF_FABRIC_SC_INVALID_PARAM));
+	rsp.status_code_specific.invalid.ipo = htole16(offset);
+	rsp.status_code_specific.invalid.iattr = data ? 1 : 0;
+	rc = nvmf_allocate_response(cc->nc_qpair, &rsp);
+	/* Do not hand a NULL capsule to the transmit/free paths. */
+	if (rc == NULL)
+		return;
+	nvmf_transmit_capsule(rc);
+	nvmf_free_capsule(rc);
+}
+
+/*
+ * Allocate a queue pair on controller association 'na' and process the
+ * first capsule received on it, which must be a Fabrics CONNECT
+ * command.
+ *
+ * The command and its data are validated: opcode and Fabrics command
+ * type, record format, queue size limits from the association
+ * parameters, KATO on I/O queues, data payload length, HostID,
+ * controller ID and both NQNs.  The CONNECT data is copied into
+ * 'data'.
+ *
+ * On success the queue pair is returned with its admin flag, queue
+ * size, flow control setting, SQHD and KATO initialized, and the
+ * CONNECT capsule is stored in '*ccp'.  No response is sent in that
+ * case.
+ *
+ * On failure a message is recorded with na_error, an error response is
+ * sent to the host when one could be constructed, the capsule and
+ * queue pair are freed, '*ccp' is left NULL and NULL is returned.
+ */
+struct nvmf_qpair *
+nvmf_accept(struct nvmf_association *na, const struct nvmf_qpair_params *params,
+    struct nvmf_capsule **ccp, struct nvmf_fabric_connect_data *data)
+{
+	/* Static storage, hence all zeroes; compared against the HostID. */
+	static const char hostid_zero[sizeof(data->hostid)];
+	const struct nvmf_fabric_connect_cmd *cmd;
+	struct nvmf_qpair *qp;
+	struct nvmf_capsule *cc, *rc;
+	u_int qsize;
+	int error;
+	uint16_t cntlid;
+	uint8_t sc_status;
+
+	/* The error label cleans up whichever of these are non-NULL. */
+	qp = NULL;
+	cc = NULL;
+	rc = NULL;
+	*ccp = NULL;
+	na_clear_error(na);
+	if (!na->na_controller) {
+		na_error(na, "Cannot accept on a host");
+		goto error;
+	}
+
+	qp = nvmf_allocate_qpair(na, params);
+	if (qp == NULL)
+		goto error;
+
+	/* Read the CONNECT capsule. */
+	error = nvmf_receive_capsule(qp, &cc);
+	if (error != 0) {
+		na_error(na, "Failed to receive CONNECT: %s", strerror(error));
+		goto error;
+	}
+
+	sc_status = nvmf_validate_command_capsule(cc);
+	if (sc_status != 0) {
+		na_error(na, "CONNECT command failed to validate: %u",
+		    sc_status);
+		rc = nvmf_simple_response(cc, NVME_SCT_GENERIC, sc_status);
+		goto error;
+	}
+
+	cmd = nvmf_capsule_sqe(cc);
+	if (cmd->opcode != NVME_OPC_FABRICS_COMMANDS ||
+	    cmd->fctype != NVMF_FABRIC_COMMAND_CONNECT) {
+		na_error(na, "Invalid opcode in CONNECT (%u,%u)", cmd->opcode,
+		    cmd->fctype);
+		rc = nvmf_simple_response(cc, NVME_SCT_GENERIC,
+		    NVME_SC_INVALID_OPCODE);
+		goto error;
+	}
+
+	if (cmd->recfmt != htole16(0)) {
+		na_error(na, "Unsupported CONNECT record format %u",
+		    le16toh(cmd->recfmt));
+		rc = nvmf_simple_response(cc, NVME_SCT_COMMAND_SPECIFIC,
+		    NVMF_FABRIC_SC_INCOMPATIBLE_FORMAT);
+		goto error;
+	}
+
+	/* The entry count is the SQSIZE field of the command plus one. */
+	qsize = le16toh(cmd->sqsize) + 1;
+	if (cmd->qid == 0) {
+		/* Admin queue limits. */
+		if (qsize < NVME_MIN_ADMIN_ENTRIES ||
+		    qsize > NVME_MAX_ADMIN_ENTRIES ||
+		    qsize > na->na_params.max_admin_qsize) {
+			na_error(na, "Invalid queue size %u", qsize);
+			nvmf_connect_invalid_parameters(cc, false,
+			    offsetof(struct nvmf_fabric_connect_cmd, sqsize));
+			goto error;
+		}
+		qp->nq_admin = true;
+	} else {
+		/* I/O queues not allowed for discovery. */
+		if (na->na_params.max_io_qsize == 0) {
+			na_error(na, "I/O queue on discovery controller");
+			nvmf_connect_invalid_parameters(cc, false,
+			    offsetof(struct nvmf_fabric_connect_cmd, qid));
+			goto error;
+		}
+
+		/* I/O queue limits. */
+		if (qsize < NVME_MIN_IO_ENTRIES ||
+		    qsize > NVME_MAX_IO_ENTRIES ||
+		    qsize > na->na_params.max_io_qsize) {
+			na_error(na, "Invalid queue size %u", qsize);
+			nvmf_connect_invalid_parameters(cc, false,
+			    offsetof(struct nvmf_fabric_connect_cmd, sqsize));
+			goto error;
+		}
+
+		/* KATO is reserved for I/O queues. */
+		if (cmd->kato != 0) {
+			na_error(na,
+			    "KeepAlive timeout specified for I/O queue");
+			nvmf_connect_invalid_parameters(cc, false,
+			    offsetof(struct nvmf_fabric_connect_cmd, kato));
+			goto error;
+		}
+		qp->nq_admin = false;
+	}
+	qp->nq_qsize = qsize;
+
+	/* Fetch CONNECT data. */
+	if (nvmf_capsule_data_len(cc) != sizeof(*data)) {
+		na_error(na, "Invalid data payload length for CONNECT: %zu",
+		    nvmf_capsule_data_len(cc));
+		nvmf_connect_invalid_parameters(cc, false,
+		    offsetof(struct nvmf_fabric_connect_cmd, sgl1));
+		goto error;
+	}
+
+	error = nvmf_receive_controller_data(cc, 0, data, sizeof(*data));
+	if (error != 0) {
+		na_error(na, "Failed to read data for CONNECT: %s",
+		    strerror(error));
+		rc = nvmf_simple_response(cc, NVME_SCT_GENERIC,
+		    NVME_SC_DATA_TRANSFER_ERROR);
+		goto error;
+	}
+
+	/* The hostid must be non-zero. */
+	if (memcmp(data->hostid, hostid_zero, sizeof(hostid_zero)) == 0) {
+		na_error(na, "HostID in CONNECT data is zero");
+		nvmf_connect_invalid_parameters(cc, true,
+		    offsetof(struct nvmf_fabric_connect_data, hostid));
+		goto error;
+	}
+
+	/*
+	 * Admin queues accept the wildcard controller ID that matches
+	 * the association's controller model; I/O queues require a
+	 * specific controller ID.
+	 */
+	cntlid = le16toh(data->cntlid);
+	if (cmd->qid == 0) {
+		if (na->na_params.dynamic_controller_model) {
+			if (cntlid != NVMF_CNTLID_DYNAMIC) {
+				na_error(na, "Invalid controller ID %#x",
+				    cntlid);
+				nvmf_connect_invalid_parameters(cc, true,
+				    offsetof(struct nvmf_fabric_connect_data,
+					cntlid));
+				goto error;
+			}
+		} else {
+			if (cntlid > NVMF_CNTLID_STATIC_MAX &&
+			    cntlid != NVMF_CNTLID_STATIC_ANY) {
+				na_error(na, "Invalid controller ID %#x",
+				    cntlid);
+				nvmf_connect_invalid_parameters(cc, true,
+				    offsetof(struct nvmf_fabric_connect_data,
+					cntlid));
+				goto error;
+			}
+		}
+	} else {
+		/* Wildcard Controller IDs are only valid on an Admin queue. */
+		if (cntlid > NVMF_CNTLID_STATIC_MAX) {
+			na_error(na, "Invalid controller ID %#x", cntlid);
+			nvmf_connect_invalid_parameters(cc, true,
+			    offsetof(struct nvmf_fabric_connect_data, cntlid));
+			goto error;
+		}
+	}
+
+	/* Simple validation of each NQN. */
+	if (!nvmf_nqn_valid(data->subnqn)) {
+		na_error(na, "Invalid SubNQN %.*s", (int)sizeof(data->subnqn),
+		    data->subnqn);
+		nvmf_connect_invalid_parameters(cc, true,
+		    offsetof(struct nvmf_fabric_connect_data, subnqn));
+		goto error;
+	}
+	if (!nvmf_nqn_valid(data->hostnqn)) {
+		na_error(na, "Invalid HostNQN %.*s", (int)sizeof(data->hostnqn),
+		    data->hostnqn);
+		nvmf_connect_invalid_parameters(cc, true,
+		    offsetof(struct nvmf_fabric_connect_data, hostnqn));
+		goto error;
+	}
+
+	/*
+	 * Flow control is used if the association requires it or the
+	 * host did not ask for it to be disabled.
+	 */
+	if (na->na_params.sq_flow_control ||
+	    (cmd->cattr & NVMF_CONNECT_ATTR_DISABLE_SQ_FC) == 0)
+		qp->nq_flow_control = true;
+	else
+		qp->nq_flow_control = false;
+	qp->nq_sqhd = 0;
+	qp->nq_kato = le32toh(cmd->kato);
+	*ccp = cc;
+	return (qp);
+error:
+	/* Send any pending error response before tearing down. */
+	if (rc != NULL) {
+		nvmf_transmit_capsule(rc);
+		nvmf_free_capsule(rc);
+	}
+	if (cc != NULL)
+		nvmf_free_capsule(cc);
+	if (qp != NULL)
+		nvmf_free_qpair(qp);
+	return (NULL);
+}
+
+int
+nvmf_finish_accept(const struct nvmf_capsule *cc, uint16_t cntlid)
+{
+	struct nvmf_fabric_connect_rsp rsp;
+	struct nvmf_qpair *qp = cc->nc_qpair;
+	struct nvmf_capsule *rc;
+	int error;
+
+	nvmf_init_cqe(&rsp, cc, 0);
+	if (qp->nq_flow_control)
+		rsp.sqhd = htole16(qp->nq_sqhd);
+	else
+		rsp.sqhd = htole16(0xffff);
+	rsp.status_code_specific.success.cntlid = htole16(cntlid);
+	rc = nvmf_allocate_response(qp, &rsp);
+	if (rc == NULL)
+		return (ENOMEM);
+	error = nvmf_transmit_capsule(rc);
*** 2805 LINES SKIPPED ***