git: 59144db3fca1 - main - nvmf_tcp: Add a TCP transport for NVMe over Fabrics

From: John Baldwin <jhb@FreeBSD.org>
Date: Fri, 03 May 2024 00:15:47 UTC
The branch main has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=59144db3fca192c4637637dfe6b5a5d98632cd47

commit 59144db3fca192c4637637dfe6b5a5d98632cd47
Author:     John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2024-05-02 23:28:47 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2024-05-02 23:28:47 +0000

    nvmf_tcp: Add a TCP transport for NVMe over Fabrics
    
    Structurally this is very similar to the TCP transport for iSCSI
    (icl_soft.c).  One key difference is that NVMeoF transports use a more
    abstract interface working with NVMe commands rather than transport
    PDUs.  Thus, the data transfer for a given command is managed entirely
    in the transport backend.
    
    Similar to icl_soft.c, separate kthreads are used to handle transmit
    and receive for each queue pair.  On the transmit side, when a capsule
    is transmitted by an upper layer, it is placed on a queue for
    processing by the transmit thread.  The transmit thread converts
    command and response capsules into suitable TCP PDUs; each PDU is
    described by an mbuf chain that is then queued to the backing socket's
    send buffer.  Command capsules can embed data along with the NVMe
    command.
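
    As a rough illustration of the framing the transmit thread performs
    for a command capsule, the userspace sketch below builds a CapsuleCmd
    PDU in a flat buffer: the 8-byte common PDU header, the 64-byte SQE,
    and any in-capsule data, with hlen/pdo/plen filled in.  This is a
    simplified stand-in (no mbuf chain, no digests, no padding), not the
    code in nvmf_tcp.c, and the struct and constants are local
    illustrations rather than the driver's definitions.

    /* Userspace sketch only; not the driver code. */
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/endian.h>

    struct pdu_common_hdr {         /* 8-byte NVMe/TCP common PDU header */
            uint8_t  pdu_type;
            uint8_t  flags;
            uint8_t  hlen;          /* PDU header length */
            uint8_t  pdo;           /* PDU data offset, 0 if no data */
            uint32_t plen;          /* total PDU length, little-endian */
    };

    #define PDU_TYPE_CAPSULE_CMD    0x04    /* CapsuleCmd */
    #define SQE_SIZE                64      /* NVMe submission queue entry */

    /* Frame one command capsule: common header + SQE + optional in-capsule data. */
    static void *
    frame_capsule_cmd(const void *sqe, const void *icd, uint32_t icd_len,
        uint32_t *plenp)
    {
            struct pdu_common_hdr ch;
            uint32_t hlen = sizeof(ch) + SQE_SIZE;
            uint32_t plen = hlen + icd_len;
            uint8_t *pdu;

            memset(&ch, 0, sizeof(ch));
            ch.pdu_type = PDU_TYPE_CAPSULE_CMD;
            ch.hlen = hlen;
            ch.pdo = icd_len != 0 ? hlen : 0;
            ch.plen = htole32(plen);

            pdu = malloc(plen);
            if (pdu == NULL)
                    return (NULL);
            memcpy(pdu, &ch, sizeof(ch));
            memcpy(pdu + sizeof(ch), sqe, SQE_SIZE);
            if (icd_len != 0)
                    memcpy(pdu + hlen, icd, icd_len);
            *plenp = plen;
            return (pdu);
    }

    The real transmit path additionally appends a CRC32C header digest
    when header digests are enabled and, for data-bearing PDUs, pads the
    data to the receiver's PDA and appends a data digest; see
    nvmf_tcp_construct_pdu() in the diff below.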
    
    On the receive side, a socket upcall notifies the receive kthread when
    more data arrives.  Once enough data has arrived for a PDU, the PDU is
    handled synchronously in the kthread.  R2T and data-related PDUs are
    handled internally, with callbacks invoked if a data transfer
    encounters an error or once it has completed.
    Received capsule PDUs invoke the upper layer's capsule_received
    callback.
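
    As a rough illustration of the framing decision the receive path has
    to make (a flat-buffer userspace sketch, not the driver's mbuf-based
    logic): data read from the socket is accumulated until a complete
    PDU, whose total size is carried in the plen field of the common
    header, has arrived.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <sys/endian.h>

    #define PDU_COMMON_HDR_LEN      8       /* NVMe/TCP common PDU header */

    /*
     * Return true and set *plenp when 'avail' buffered bytes contain at
     * least one complete PDU; the caller then consumes *plenp bytes and
     * dispatches on the pdu_type byte at offset 0.
     */
    static bool
    pdu_complete(const uint8_t *buf, size_t avail, uint32_t *plenp)
    {
            uint32_t plen;

            if (avail < PDU_COMMON_HDR_LEN)
                    return (false);         /* need the full common header */

            /* plen: 32-bit little-endian total PDU length at offset 4. */
            plen = le32dec(buf + 4);
            if (avail < plen)
                    return (false);         /* wait for the next socket upcall */

            *plenp = plen;
            return (true);
    }

    Once a full PDU is present, the driver's nvmf_tcp_validate_pdu()
    checks the header fields and any digests before the PDU is dispatched
    by type.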
    
    struct nvmf_tcp_command_buffer manages a TCP command buffer for data
    transfers that do not use in-capsule data as described in the NVMeoF
    spec.  Data-related PDUs such as R2T, C2H_DATA, and H2C_DATA are
    associated with a command buffer, except in the case of the
    send_controller_data transport method, which simply constructs one or
    more C2H_DATA PDUs from the
    caller's mbuf chain.
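
    To make the host-to-controller flow concrete: the controller grants a
    range with an R2T, and the host answers with one or more H2C_DATA
    PDUs, each carrying at most MAXH2CDATA bytes, with the final PDU of
    the granted range flagged LAST_PDU.  A toy sketch of that split
    (hypothetical helper, not driver code; these are the invariants
    nvmf_tcp_handle_h2c_data() checks on receive):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Split one R2T-granted range into H2C_DATA PDUs of at most maxh2cdata bytes. */
    static void
    plan_h2c_data(uint32_t r2to, uint32_t r2tl, uint32_t maxh2cdata)
    {
            uint32_t off = r2to, resid = r2tl;

            while (resid != 0) {
                    uint32_t datal = resid < maxh2cdata ? resid : maxh2cdata;
                    bool last = (datal == resid);

                    /* One H2C_DATA PDU: DATAO=off, DATAL=datal. */
                    printf("H2C_DATA datao=%u datal=%u%s\n", off, datal,
                        last ? " LAST_PDU" : "");
                    off += datal;
                    resid -= datal;
            }
    }

    int
    main(void)
    {
            /* e.g. a 256 KiB grant with MAXH2CDATA of 64 KiB -> 4 PDUs */
            plan_h2c_data(0, 256 * 1024, 64 * 1024);
            return (0);
    }

    C2H_DATA transfers from the controller to the host follow the same
    offset/length bookkeeping but require no R2T.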
    
    Sponsored by:   Chelsio Communications
    Differential Revision:  https://reviews.freebsd.org/D44712
---
 share/man/man4/Makefile            |    1 +
 share/man/man4/nvmf_tcp.4          |   57 ++
 sys/conf/NOTES                     |    2 +
 sys/conf/files                     |    1 +
 sys/dev/nvmf/nvmf_tcp.c            | 1867 ++++++++++++++++++++++++++++++++++++
 sys/modules/nvmf/Makefile          |    3 +-
 sys/modules/nvmf/nvmf_tcp/Makefile |    7 +
 7 files changed, 1937 insertions(+), 1 deletion(-)

diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
index 020b009893d5..aab55d9b90b5 100644
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -408,6 +408,7 @@ MAN=	aac.4 \
 	nvd.4 \
 	${_nvdimm.4} \
 	nvme.4 \
+	nvmf_tcp.4 \
 	${_nvram.4} \
 	oce.4 \
 	ocs_fc.4\
diff --git a/share/man/man4/nvmf_tcp.4 b/share/man/man4/nvmf_tcp.4
new file mode 100644
index 000000000000..4d77997c19a2
--- /dev/null
+++ b/share/man/man4/nvmf_tcp.4
@@ -0,0 +1,57 @@
+.\"
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2024 Chelsio Communications, Inc.
+.\"
+.Dd May 2, 2024
+.Dt NVMF_TCP 4
+.Os
+.Sh NAME
+.Nm nvmf_tcp
+.Nd "TCP transport for NVM Express over Fabrics"
+.Sh SYNOPSIS
+To compile the module into the kernel,
+place the following line in the
+kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device nvmf_tcp"
+.Ed
+.Pp
+Alternatively, to load the
+module at boot time, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+nvmf_tcp_load="YES"
+.Ed
+.Sh DESCRIPTION
+The
+.Nm
+module implements the software TCP/IP transport for NVM Express over Fabrics.
+It can be used by either the in-kernel NVMeoF host driver or controller.
+.Sh SYSCTL VARIABLES
+The following variables are available as both
+.Xr sysctl 8
+variables and
+.Xr loader 8
+tunables:
+.Bl -tag -width indent
+.It Va kern.nvmf.tcp.max_c2hdata
+The maximum data payload size of a
+.Va C2H_DATA
+PDU sent by the controller to a remote host.
+The default size is 256 kilobytes.
+.El
+.Sh SEE ALSO
+.Xr nvmf 4 ,
+.Xr nvmft 4
+.Sh HISTORY
+The
+.Nm
+module first appeared in
+.Fx 15.0 .
+.Sh AUTHORS
+The
+.Nm
+module was developed by
+.An John Baldwin Aq Mt jhb@FreeBSD.org
+under sponsorship from Chelsio Communications, Inc.
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index 216a96c2073c..1f52af4c99d8 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -1676,11 +1676,13 @@ device		mrsas		# LSI/Avago MegaRAID SAS/SATA, 6Gb/s and 12Gb/s
 # NVM Express
 #
 # nvme:	PCI-express NVM Express host controllers
+# nvmf_tcp: TCP transport for NVM Express over Fabrics
 # nda:	CAM NVMe disk driver
 # nvd:	non-CAM NVMe disk driver
 
 device		nvme		# base NVMe driver
 options 	NVME_USE_NVD=1	# Use nvd(4) instead of the CAM nda(4) driver
+device		nvmf_tcp	# NVMeoF TCP transport
 device		nda		# NVMe direct access devices (aka disks)
 device		nvd		# expose NVMe namespaces as disks, depends on nvme
 
diff --git a/sys/conf/files b/sys/conf/files
index 326260ffb1dc..143814301c20 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -2533,6 +2533,7 @@ dev/nvme/nvme_test.c		optional nvme
 dev/nvme/nvme_util.c		optional nvme
 dev/nvmem/nvmem.c		optional nvmem fdt
 dev/nvmem/nvmem_if.m		optional nvmem
+dev/nvmf/nvmf_tcp.c		optional nvmf_tcp
 dev/oce/oce_hw.c		optional oce pci
 dev/oce/oce_if.c		optional oce pci
 dev/oce/oce_mbox.c		optional oce pci
diff --git a/sys/dev/nvmf/nvmf_tcp.c b/sys/dev/nvmf/nvmf_tcp.c
new file mode 100644
index 000000000000..57c81eceee02
--- /dev/null
+++ b/sys/dev/nvmf/nvmf_tcp.c
@@ -0,0 +1,1867 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ */
+
+#include <sys/param.h>
+#include <sys/capsicum.h>
+#include <sys/condvar.h>
+#include <sys/file.h>
+#include <sys/gsb_crc32.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/protosw.h>
+#include <sys/refcount.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+#include <netinet/in.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_proto.h>
+#include <dev/nvmf/nvmf_tcp.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/nvmf_transport_internal.h>
+
+struct nvmf_tcp_capsule;
+struct nvmf_tcp_qpair;
+
+struct nvmf_tcp_command_buffer {
+	struct nvmf_tcp_qpair *qp;
+
+	struct nvmf_io_request io;
+	size_t	data_len;
+	size_t	data_xfered;
+	uint32_t data_offset;
+
+	u_int	refs;
+	int	error;
+
+	uint16_t cid;
+	uint16_t ttag;
+
+	TAILQ_ENTRY(nvmf_tcp_command_buffer) link;
+
+	/* Controller only */
+	struct nvmf_tcp_capsule *tc;
+};
+
+struct nvmf_tcp_command_buffer_list {
+	TAILQ_HEAD(, nvmf_tcp_command_buffer) head;
+	struct mtx lock;
+};
+
+struct nvmf_tcp_qpair {
+	struct nvmf_qpair qp;
+
+	struct socket *so;
+
+	volatile u_int refs;	/* Every allocated capsule holds a reference */
+	uint8_t	txpda;
+	uint8_t rxpda;
+	bool header_digests;
+	bool data_digests;
+	uint32_t maxr2t;
+	uint32_t maxh2cdata;	/* Controller only */
+	uint32_t max_tx_data;
+	uint32_t max_icd;	/* Host only */
+	uint16_t next_ttag;	/* Controller only */
+	u_int num_ttags;	/* Controller only */
+	u_int active_ttags;	/* Controller only */
+	bool send_success;	/* Controller only */
+
+	/* Receive state. */
+	struct thread *rx_thread;
+	struct cv rx_cv;
+	bool	rx_shutdown;
+
+	/* Transmit state. */
+	struct thread *tx_thread;
+	struct cv tx_cv;
+	bool	tx_shutdown;
+	struct mbufq tx_pdus;
+	STAILQ_HEAD(, nvmf_tcp_capsule) tx_capsules;
+
+	struct nvmf_tcp_command_buffer_list tx_buffers;
+	struct nvmf_tcp_command_buffer_list rx_buffers;
+
+	/*
+	 * For the controller, an RX command buffer can be in one of
+	 * two locations, all protected by the rx_buffers.lock.  If a
+	 * receive request is waiting for either an R2T slot for its
+	 * command (due to exceeding MAXR2T), or a transfer tag it is
+	 * placed on the rx_buffers list.  When a request is allocated
+	 * an active transfer tag, it moves to the open_ttags[] array
+	 * (indexed by the tag) until it completes.
+	 */
+	struct nvmf_tcp_command_buffer **open_ttags;	/* Controller only */
+};
+
+struct nvmf_tcp_rxpdu {
+	struct mbuf *m;
+	const struct nvme_tcp_common_pdu_hdr *hdr;
+	uint32_t data_len;
+	bool data_digest_mismatch;
+};
+
+struct nvmf_tcp_capsule {
+	struct nvmf_capsule nc;
+
+	volatile u_int refs;
+
+	struct nvmf_tcp_rxpdu rx_pdu;
+
+	uint32_t active_r2ts;		/* Controller only */
+#ifdef INVARIANTS
+	uint32_t tx_data_offset;	/* Controller only */
+	u_int pending_r2ts;		/* Controller only */
+#endif
+
+	STAILQ_ENTRY(nvmf_tcp_capsule) link;
+};
+
+#define	TCAP(nc)	((struct nvmf_tcp_capsule *)(nc))
+#define	TQP(qp)		((struct nvmf_tcp_qpair *)(qp))
+
+static void	tcp_release_capsule(struct nvmf_tcp_capsule *tc);
+static void	tcp_free_qpair(struct nvmf_qpair *nq);
+
+SYSCTL_NODE(_kern_nvmf, OID_AUTO, tcp, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+    "TCP transport");
+static u_int tcp_max_transmit_data = 256 * 1024;
+SYSCTL_UINT(_kern_nvmf_tcp, OID_AUTO, max_c2hdata, CTLFLAG_RWTUN,
+    &tcp_max_transmit_data, 0,
+    "Maximum size of data payload in a transmitted PDU");
+
+static MALLOC_DEFINE(M_NVMF_TCP, "nvmf_tcp", "NVMe over TCP");
+
+static int
+mbuf_crc32c_helper(void *arg, void *data, u_int len)
+{
+	uint32_t *digestp = arg;
+
+	*digestp = calculate_crc32c(*digestp, data, len);
+	return (0);
+}
+
+static uint32_t
+mbuf_crc32c(struct mbuf *m, u_int offset, u_int len)
+{
+	uint32_t digest = 0xffffffff;
+
+	m_apply(m, offset, len, mbuf_crc32c_helper, &digest);
+	digest = digest ^ 0xffffffff;
+
+	return (digest);
+}
+
+static uint32_t
+compute_digest(const void *buf, size_t len)
+{
+	return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff);
+}
+
+static struct nvmf_tcp_command_buffer *
+tcp_alloc_command_buffer(struct nvmf_tcp_qpair *qp,
+    const struct nvmf_io_request *io, uint32_t data_offset, size_t data_len,
+    uint16_t cid)
+{
+	struct nvmf_tcp_command_buffer *cb;
+
+	cb = malloc(sizeof(*cb), M_NVMF_TCP, M_WAITOK);
+	cb->qp = qp;
+	cb->io = *io;
+	cb->data_offset = data_offset;
+	cb->data_len = data_len;
+	cb->data_xfered = 0;
+	refcount_init(&cb->refs, 1);
+	cb->error = 0;
+	cb->cid = cid;
+	cb->ttag = 0;
+	cb->tc = NULL;
+
+	return (cb);
+}
+
+static void
+tcp_hold_command_buffer(struct nvmf_tcp_command_buffer *cb)
+{
+	refcount_acquire(&cb->refs);
+}
+
+static void
+tcp_free_command_buffer(struct nvmf_tcp_command_buffer *cb)
+{
+	nvmf_complete_io_request(&cb->io, cb->data_xfered, cb->error);
+	if (cb->tc != NULL)
+		tcp_release_capsule(cb->tc);
+	free(cb, M_NVMF_TCP);
+}
+
+static void
+tcp_release_command_buffer(struct nvmf_tcp_command_buffer *cb)
+{
+	if (refcount_release(&cb->refs))
+		tcp_free_command_buffer(cb);
+}
+
+static void
+tcp_add_command_buffer(struct nvmf_tcp_command_buffer_list *list,
+    struct nvmf_tcp_command_buffer *cb)
+{
+	mtx_assert(&list->lock, MA_OWNED);
+	TAILQ_INSERT_HEAD(&list->head, cb, link);
+}
+
+static struct nvmf_tcp_command_buffer *
+tcp_find_command_buffer(struct nvmf_tcp_command_buffer_list *list,
+    uint16_t cid, uint16_t ttag)
+{
+	struct nvmf_tcp_command_buffer *cb;
+
+	mtx_assert(&list->lock, MA_OWNED);
+	TAILQ_FOREACH(cb, &list->head, link) {
+		if (cb->cid == cid && cb->ttag == ttag)
+			return (cb);
+	}
+	return (NULL);
+}
+
+static void
+tcp_remove_command_buffer(struct nvmf_tcp_command_buffer_list *list,
+    struct nvmf_tcp_command_buffer *cb)
+{
+	mtx_assert(&list->lock, MA_OWNED);
+	TAILQ_REMOVE(&list->head, cb, link);
+}
+
+static void
+tcp_purge_command_buffer(struct nvmf_tcp_command_buffer_list *list,
+    uint16_t cid, uint16_t ttag)
+{
+	struct nvmf_tcp_command_buffer *cb;
+
+	mtx_lock(&list->lock);
+	cb = tcp_find_command_buffer(list, cid, ttag);
+	if (cb != NULL) {
+		tcp_remove_command_buffer(list, cb);
+		mtx_unlock(&list->lock);
+		tcp_release_command_buffer(cb);
+	} else
+		mtx_unlock(&list->lock);
+}
+
+static void
+nvmf_tcp_write_pdu(struct nvmf_tcp_qpair *qp, struct mbuf *m)
+{
+	struct socket *so = qp->so;
+
+	SOCKBUF_LOCK(&so->so_snd);
+	mbufq_enqueue(&qp->tx_pdus, m);
+	/* XXX: Do we need to handle sb_hiwat being wrong? */
+	if (sowriteable(so))
+		cv_signal(&qp->tx_cv);
+	SOCKBUF_UNLOCK(&so->so_snd);
+}
+
+static void
+nvmf_tcp_report_error(struct nvmf_tcp_qpair *qp, uint16_t fes, uint32_t fei,
+    struct mbuf *rx_pdu, u_int hlen)
+{
+	struct nvme_tcp_term_req_hdr *hdr;
+	struct mbuf *m;
+
+	if (hlen != 0) {
+		hlen = min(hlen, NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE);
+		hlen = min(hlen, m_length(rx_pdu, NULL));
+	}
+
+	m = m_get2(sizeof(*hdr) + hlen, M_WAITOK, MT_DATA, 0);
+	m->m_len = sizeof(*hdr) + hlen;
+	hdr = mtod(m, void *);
+	memset(hdr, 0, sizeof(*hdr));
+	hdr->common.pdu_type = qp->qp.nq_controller ?
+	    NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
+	hdr->common.hlen = sizeof(*hdr);
+	hdr->common.plen = sizeof(*hdr) + hlen;
+	hdr->fes = htole16(fes);
+	le32enc(hdr->fei, fei);
+	if (hlen != 0)
+		m_copydata(rx_pdu, 0, hlen, (caddr_t)(hdr + 1));
+
+	nvmf_tcp_write_pdu(qp, m);
+}
+
+static int
+nvmf_tcp_validate_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
+{
+	const struct nvme_tcp_common_pdu_hdr *ch;
+	struct mbuf *m = pdu->m;
+	uint32_t data_len, fei, plen;
+	uint32_t digest, rx_digest;
+	u_int hlen;
+	int error;
+	uint16_t fes;
+
+	/* Determine how large of a PDU header to return for errors. */
+	ch = pdu->hdr;
+	hlen = ch->hlen;
+	plen = le32toh(ch->plen);
+	if (hlen < sizeof(*ch) || hlen > plen)
+		hlen = sizeof(*ch);
+
+	error = nvmf_tcp_validate_pdu_header(ch, qp->qp.nq_controller,
+	    qp->header_digests, qp->data_digests, qp->rxpda, &data_len, &fes,
+	    &fei);
+	if (error != 0) {
+		if (error != ECONNRESET)
+			nvmf_tcp_report_error(qp, fes, fei, m, hlen);
+		return (error);
+	}
+
+	/* Check header digest if present. */
+	if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) {
+		digest = mbuf_crc32c(m, 0, ch->hlen);
+		m_copydata(m, ch->hlen, sizeof(rx_digest), (caddr_t)&rx_digest);
+		if (digest != rx_digest) {
+			printf("NVMe/TCP: Header digest mismatch\n");
+			nvmf_tcp_report_error(qp,
+			    NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, m,
+			    hlen);
+			return (EBADMSG);
+		}
+	}
+
+	/* Check data digest if present. */
+	pdu->data_digest_mismatch = false;
+	if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
+		digest = mbuf_crc32c(m, ch->pdo, data_len);
+		m_copydata(m, plen - sizeof(rx_digest), sizeof(rx_digest),
+		    (caddr_t)&rx_digest);
+		if (digest != rx_digest) {
+			printf("NVMe/TCP: Data digest mismatch\n");
+			pdu->data_digest_mismatch = true;
+		}
+	}
+
+	pdu->data_len = data_len;
+	return (0);
+}
+
+static void
+nvmf_tcp_free_pdu(struct nvmf_tcp_rxpdu *pdu)
+{
+	m_freem(pdu->m);
+	pdu->m = NULL;
+	pdu->hdr = NULL;
+}
+
+static int
+nvmf_tcp_handle_term_req(struct nvmf_tcp_rxpdu *pdu)
+{
+	const struct nvme_tcp_term_req_hdr *hdr;
+
+	hdr = (const void *)pdu->hdr;
+
+	printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
+	    le16toh(hdr->fes), le32dec(hdr->fei));
+	nvmf_tcp_free_pdu(pdu);
+	return (ECONNRESET);
+}
+
+static int
+nvmf_tcp_save_command_capsule(struct nvmf_tcp_qpair *qp,
+    struct nvmf_tcp_rxpdu *pdu)
+{
+	const struct nvme_tcp_cmd *cmd;
+	struct nvmf_capsule *nc;
+	struct nvmf_tcp_capsule *tc;
+
+	cmd = (const void *)pdu->hdr;
+
+	nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe, M_WAITOK);
+
+	tc = TCAP(nc);
+	tc->rx_pdu = *pdu;
+
+	nvmf_capsule_received(&qp->qp, nc);
+	return (0);
+}
+
+static int
+nvmf_tcp_save_response_capsule(struct nvmf_tcp_qpair *qp,
+    struct nvmf_tcp_rxpdu *pdu)
+{
+	const struct nvme_tcp_rsp *rsp;
+	struct nvmf_capsule *nc;
+	struct nvmf_tcp_capsule *tc;
+
+	rsp = (const void *)pdu->hdr;
+
+	nc = nvmf_allocate_response(&qp->qp, &rsp->rccqe, M_WAITOK);
+
+	nc->nc_sqhd_valid = true;
+	tc = TCAP(nc);
+	tc->rx_pdu = *pdu;
+
+	/*
+	 * Once the CQE has been received, no further transfers to the
+	 * command buffer for the associated CID can occur.
+	 */
+	tcp_purge_command_buffer(&qp->rx_buffers, rsp->rccqe.cid, 0);
+	tcp_purge_command_buffer(&qp->tx_buffers, rsp->rccqe.cid, 0);
+
+	nvmf_capsule_received(&qp->qp, nc);
+	return (0);
+}
+
+/*
+ * Construct a PDU that contains an optional data payload.  This
+ * includes dealing with digests and the length fields in the common
+ * header.
+ */
+static struct mbuf *
+nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen,
+    struct mbuf *data, uint32_t data_len)
+{
+	struct nvme_tcp_common_pdu_hdr *ch;
+	struct mbuf *top;
+	uint32_t digest, pad, pdo, plen, mlen;
+
+	plen = hlen;
+	if (qp->header_digests)
+		plen += sizeof(digest);
+	if (data_len != 0) {
+		KASSERT(m_length(data, NULL) == data_len, ("length mismatch"));
+		pdo = roundup2(plen, qp->txpda);
+		pad = pdo - plen;
+		plen = pdo + data_len;
+		if (qp->data_digests)
+			plen += sizeof(digest);
+		mlen = pdo;
+	} else {
+		KASSERT(data == NULL, ("payload mbuf with zero length"));
+		pdo = 0;
+		pad = 0;
+		mlen = plen;
+	}
+
+	top = m_get2(mlen, M_WAITOK, MT_DATA, 0);
+	top->m_len = mlen;
+	ch = mtod(top, void *);
+	memcpy(ch, hdr, hlen);
+	ch->hlen = hlen;
+	if (qp->header_digests)
+		ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF;
+	if (qp->data_digests && data_len != 0)
+		ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF;
+	ch->pdo = pdo;
+	ch->plen = htole32(plen);
+
+	/* HDGST */
+	if (qp->header_digests) {
+		digest = compute_digest(ch, hlen);
+		memcpy((char *)ch + hlen, &digest, sizeof(digest));
+	}
+
+	if (pad != 0) {
+		/* PAD */
+		memset((char *)ch + pdo - pad, 0, pad);
+	}
+
+	if (data_len != 0) {
+		/* DATA */
+		top->m_next = data;
+
+		/* DDGST */
+		if (qp->data_digests) {
+			digest = mbuf_crc32c(data, 0, data_len);
+
+			/* XXX: Can't use m_append as it uses M_NOWAIT. */
+			while (data->m_next != NULL)
+				data = data->m_next;
+
+			data->m_next = m_get(M_WAITOK, MT_DATA);
+			data->m_next->m_len = sizeof(digest);
+			memcpy(mtod(data->m_next, void *), &digest,
+			    sizeof(digest));
+		}
+	}
+
+	return (top);
+}
+
+/* Find the next command buffer eligible to schedule for R2T. */
+static struct nvmf_tcp_command_buffer *
+nvmf_tcp_next_r2t(struct nvmf_tcp_qpair *qp)
+{
+	struct nvmf_tcp_command_buffer *cb;
+
+	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
+	MPASS(qp->active_ttags < qp->num_ttags);
+
+	TAILQ_FOREACH(cb, &qp->rx_buffers.head, link) {
+		/* NB: maxr2t is 0's based. */
+		if (cb->tc->active_r2ts > qp->maxr2t)
+			continue;
+#ifdef INVARIANTS
+		cb->tc->pending_r2ts--;
+#endif
+		TAILQ_REMOVE(&qp->rx_buffers.head, cb, link);
+		return (cb);
+	}
+	return (NULL);
+}
+
+/* Allocate the next free transfer tag and assign it to cb. */
+static void
+nvmf_tcp_allocate_ttag(struct nvmf_tcp_qpair *qp,
+    struct nvmf_tcp_command_buffer *cb)
+{
+	uint16_t ttag;
+
+	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
+
+	ttag = qp->next_ttag;
+	for (;;) {
+		if (qp->open_ttags[ttag] == NULL)
+			break;
+		if (ttag == qp->num_ttags - 1)
+			ttag = 0;
+		else
+			ttag++;
+		MPASS(ttag != qp->next_ttag);
+	}
+	if (ttag == qp->num_ttags - 1)
+		qp->next_ttag = 0;
+	else
+		qp->next_ttag = ttag + 1;
+
+	cb->tc->active_r2ts++;
+	qp->active_ttags++;
+	qp->open_ttags[ttag] = cb;
+
+	/*
+	 * Don't bother byte-swapping ttag as it is just a cookie
+	 * value returned by the other end as-is.
+	 */
+	cb->ttag = ttag;
+}
+
+/* NB: cid and ttag are both little-endian already. */
+static void
+tcp_send_r2t(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag,
+    uint32_t data_offset, uint32_t data_len)
+{
+	struct nvme_tcp_r2t_hdr r2t;
+	struct mbuf *m;
+
+	memset(&r2t, 0, sizeof(r2t));
+	r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T;
+	r2t.cccid = cid;
+	r2t.ttag = ttag;
+	r2t.r2to = htole32(data_offset);
+	r2t.r2tl = htole32(data_len);
+
+	m = nvmf_tcp_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0);
+	nvmf_tcp_write_pdu(qp, m);
+}
+
+/*
+ * Release a transfer tag and schedule another R2T.
+ *
+ * NB: This drops the rx_buffers.lock mutex.
+ */
+static void
+nvmf_tcp_send_next_r2t(struct nvmf_tcp_qpair *qp,
+    struct nvmf_tcp_command_buffer *cb)
+{
+	struct nvmf_tcp_command_buffer *ncb;
+
+	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
+	MPASS(qp->open_ttags[cb->ttag] == cb);
+
+	/* Release this transfer tag. */
+	qp->open_ttags[cb->ttag] = NULL;
+	qp->active_ttags--;
+	cb->tc->active_r2ts--;
+
+	/* Schedule another R2T. */
+	ncb = nvmf_tcp_next_r2t(qp);
+	if (ncb != NULL) {
+		nvmf_tcp_allocate_ttag(qp, ncb);
+		mtx_unlock(&qp->rx_buffers.lock);
+		tcp_send_r2t(qp, ncb->cid, ncb->ttag, ncb->data_offset,
+		    ncb->data_len);
+	} else
+		mtx_unlock(&qp->rx_buffers.lock);
+}
+
+/*
+ * Copy len bytes starting at offset skip from an mbuf chain into an
+ * I/O buffer at destination offset io_offset.
+ */
+static void
+mbuf_copyto_io(struct mbuf *m, u_int skip, u_int len,
+    struct nvmf_io_request *io, u_int io_offset)
+{
+	u_int todo;
+
+	while (m->m_len <= skip) {
+		skip -= m->m_len;
+		m = m->m_next;
+	}
+	while (len != 0) {
+		MPASS((m->m_flags & M_EXTPG) == 0);
+
+		todo = m->m_len - skip;
+		if (todo > len)
+			todo = len;
+
+		memdesc_copyback(&io->io_mem, io_offset, todo, mtodo(m, skip));
+		skip = 0;
+		io_offset += todo;
+		len -= todo;
+		m = m->m_next;
+	}
+}
+
+static int
+nvmf_tcp_handle_h2c_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
+{
+	const struct nvme_tcp_h2c_data_hdr *h2c;
+	struct nvmf_tcp_command_buffer *cb;
+	uint32_t data_len, data_offset;
+	uint16_t ttag;
+
+	h2c = (const void *)pdu->hdr;
+	if (le32toh(h2c->datal) > qp->maxh2cdata) {
+		nvmf_tcp_report_error(qp,
+		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0,
+		    pdu->m, pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
+	}
+
+	/*
+	 * NB: Don't bother byte-swapping ttag as we don't byte-swap
+	 * it when sending.
+	 */
+	ttag = h2c->ttag;
+	if (ttag >= qp->num_ttags) {
+		nvmf_tcp_report_error(qp,
+		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
+		    offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
+		    pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
+	}
+
+	mtx_lock(&qp->rx_buffers.lock);
+	cb = qp->open_ttags[ttag];
+	if (cb == NULL) {
+		mtx_unlock(&qp->rx_buffers.lock);
+		nvmf_tcp_report_error(qp,
+		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
+		    offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
+		    pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
+	}
+	MPASS(cb->ttag == ttag);
+
+	/* For a data digest mismatch, fail the I/O request. */
+	if (pdu->data_digest_mismatch) {
+		nvmf_tcp_send_next_r2t(qp, cb);
+		cb->error = EINTEGRITY;
+		tcp_release_command_buffer(cb);
+		nvmf_tcp_free_pdu(pdu);
+		return (0);
+	}
+
+	data_len = le32toh(h2c->datal);
+	if (data_len != pdu->data_len) {
+		mtx_unlock(&qp->rx_buffers.lock);
+		nvmf_tcp_report_error(qp,
+		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
+		    offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->m,
+		    pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
+	}
+
+	data_offset = le32toh(h2c->datao);
+	if (data_offset < cb->data_offset ||
+	    data_offset + data_len > cb->data_offset + cb->data_len) {
+		mtx_unlock(&qp->rx_buffers.lock);
+		nvmf_tcp_report_error(qp,
+		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m,
+		    pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
+	}
+
+	if (data_offset != cb->data_offset + cb->data_xfered) {
+		mtx_unlock(&qp->rx_buffers.lock);
+		nvmf_tcp_report_error(qp,
+		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
+		    pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
+	}
+
+	if ((cb->data_xfered + data_len == cb->data_len) !=
+	    ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) {
+		mtx_unlock(&qp->rx_buffers.lock);
+		nvmf_tcp_report_error(qp,
+		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
+		    pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
+	}
+
+	cb->data_xfered += data_len;
+	data_offset -= cb->data_offset;
+	if (cb->data_xfered == cb->data_len) {
+		nvmf_tcp_send_next_r2t(qp, cb);
+	} else {
+		tcp_hold_command_buffer(cb);
+		mtx_unlock(&qp->rx_buffers.lock);
+	}
+
+	mbuf_copyto_io(pdu->m, pdu->hdr->pdo, data_len, &cb->io, data_offset);
+
+	tcp_release_command_buffer(cb);
+	nvmf_tcp_free_pdu(pdu);
+	return (0);
+}
+
+static int
+nvmf_tcp_handle_c2h_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu)
+{
+	const struct nvme_tcp_c2h_data_hdr *c2h;
+	struct nvmf_tcp_command_buffer *cb;
+	uint32_t data_len, data_offset;
+
+	c2h = (const void *)pdu->hdr;
+
+	mtx_lock(&qp->rx_buffers.lock);
+	cb = tcp_find_command_buffer(&qp->rx_buffers, c2h->cccid, 0);
+	if (cb == NULL) {
+		mtx_unlock(&qp->rx_buffers.lock);
+		/*
+		 * XXX: Could be PDU sequence error if cccid is for a
+		 * command that doesn't use a command buffer.
+		 */
+		nvmf_tcp_report_error(qp,
+		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
+		    offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->m,
+		    pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
+	}
+
+	/* For a data digest mismatch, fail the I/O request. */
+	if (pdu->data_digest_mismatch) {
+		cb->error = EINTEGRITY;
+		tcp_remove_command_buffer(&qp->rx_buffers, cb);
+		mtx_unlock(&qp->rx_buffers.lock);
+		tcp_release_command_buffer(cb);
+		nvmf_tcp_free_pdu(pdu);
+		return (0);
+	}
+
+	data_len = le32toh(c2h->datal);
+	if (data_len != pdu->data_len) {
+		mtx_unlock(&qp->rx_buffers.lock);
+		nvmf_tcp_report_error(qp,
+		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
+		    offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->m,
+		    pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
+	}
+
+	data_offset = le32toh(c2h->datao);
+	if (data_offset < cb->data_offset ||
+	    data_offset + data_len > cb->data_offset + cb->data_len) {
+		mtx_unlock(&qp->rx_buffers.lock);
+		nvmf_tcp_report_error(qp,
+		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
+		    pdu->m, pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
+	}
+
+	if (data_offset != cb->data_offset + cb->data_xfered) {
+		mtx_unlock(&qp->rx_buffers.lock);
+		nvmf_tcp_report_error(qp,
+		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
+		    pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
+	}
+
+	if ((cb->data_xfered + data_len == cb->data_len) !=
+	    ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
+		mtx_unlock(&qp->rx_buffers.lock);
+		nvmf_tcp_report_error(qp,
+		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
+		    pdu->hdr->hlen);
+		nvmf_tcp_free_pdu(pdu);
+		return (EBADMSG);
*** 1069 LINES SKIPPED ***