git: 67360f7bb0bb - main - cxgbei: Support iSCSI offload on T6.

John Baldwin jhb at FreeBSD.org
Sat May 29 00:02:24 UTC 2021


The branch main has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=67360f7bb0bb575d823c21420abaf165ecf62066

commit 67360f7bb0bb575d823c21420abaf165ecf62066
Author:     John Baldwin <jhb at FreeBSD.org>
AuthorDate: 2021-05-28 23:45:29 +0000
Commit:     John Baldwin <jhb at FreeBSD.org>
CommitDate: 2021-05-28 23:45:29 +0000

    cxgbei: Support iSCSI offload on T6.
    
    T6 makes several changes relative to T5 for receive of iSCSI PDUs.
    
    First, earlier adapters issue either 2 or 3 messages to the host for
    each PDU received: CPL_ISCSI_HDR contains the BHS of the PDU,
    CPL_ISCSI_DATA (when DDP is not used for zero-copy receive) contains
    the PDU data as buffers on the freelist, and CPL_RX_ISCSI_DDP with
    status of the PDU such as result of CRC checks.  In T6, a new
    CPL_RX_ISCSI_CMP combines CPL_ISCSI_HDR and CPL_RX_ISCSI_DDP.  Data
    PDUs which are directly placed via DDP only report a single
    CPL_RX_ISCSI_CMP message.  Data PDUs received on the free lists are
    reported as CPL_ISCSI_DATA followed by CPL_RX_ISCSI_CMP.  Control PDUs
    such as R2T are still reported via CPL_ISCSI_HDR and CPL_RX_ISCSI_DDP.
    
    Supporting this requires changing the CPL_ISCSI_DATA handler to
    allocate a PDU structure if it is not preceded by a CPL_ISCSI_HDR as
    well as support for the new CPL_RX_ISCSI_CMP.
    
    Second, when using DDP for zero-copy receive, T6 will only issue a
    CPL_RX_ISCSI_CMP after a burst of PDUs have been received (indicated
    by the F flag in the BHS).  In this case, the CPL_RX_ISCSI_CMP can
    reflect the completion of multiple PDUs and the BHS and TCP sequence
    number included in the message are from the last PDU received in the
    burst.  Notably, the message does not include any information about
    earlier PDUs received as part of the burst.  Instead, the driver must
    track the amount of data already received for a given transfer and use
    this to compute the amount of data received in a burst.  In addition,
    the iSCSI layer currently has no way to receive a logical PDU that
    spans multiple PDUs.  Instead, the driver presents each burst as a
    single, "large" PDU to the iSCSI target and initiator layers.  This is
    done by rewriting the buffer offset and data length fields in the BHS
    of the final PDU as well as rewriting the DataSN so that the received
    PDUs appear to be in order.
    
    To track all this, cxgbei maintains a hash table of 'cxgbei_cmp'
    structures indexed by transfer tags for each offloaded iSCSI
    connection.  When a SCSI_DATA_IN message is received, the ITT from the
    received BHS is used to find the necessary state in the hash table,
    whereas SCSI_DATA_OUT replies use the TTT as the key.  The structure
    tracks the expected starting offset and DataSN of the next burst as
    well as the rewritten DataSN value used for the previously received
    PDU.
    
    Discussed with: np
    Sponsored by:   Chelsio Communications
    Differential Revision:  https://reviews.freebsd.org/D30458
---
 sys/dev/cxgbe/cxgbei/cxgbei.c     | 280 ++++++++++++++++++++++++++++++++++++--
 sys/dev/cxgbe/cxgbei/cxgbei.h     |  16 +++
 sys/dev/cxgbe/cxgbei/icl_cxgbei.c | 154 +++++++++++++++------
 3 files changed, 395 insertions(+), 55 deletions(-)

diff --git a/sys/dev/cxgbe/cxgbei/cxgbei.c b/sys/dev/cxgbe/cxgbei/cxgbei.c
index f95c9f60163f..c70bda7e0436 100644
--- a/sys/dev/cxgbe/cxgbei/cxgbei.c
+++ b/sys/dev/cxgbe/cxgbei/cxgbei.c
@@ -222,27 +222,47 @@ do_rx_iscsi_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m
 	u_int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
+	struct icl_pdu *ip;
 
 	M_ASSERTPKTHDR(m);
 	MPASS(m->m_pkthdr.len == be16toh(cpl->len) + sizeof(*cpl));
 
-	/* Must already have received the header (but not the data). */
-	MPASS(icp != NULL);
-	MPASS(icp->icp_flags == ICPF_RX_HDR);
-	MPASS(icp->ip.ip_data_mbuf == NULL);
-
+	if (icp == NULL) {
+		/*
+		 * T6 completion enabled, start of a new pdu. Header
+		 * will come in completion CPL.
+		 */
+	        ip = icl_cxgbei_new_pdu(M_NOWAIT);
+	        if (ip == NULL)
+			CXGBE_UNIMPLEMENTED("PDU allocation failure");
+		icp = ip_to_icp(ip);
+	} else {
+		/* T5 mode, header is already received. */
+		MPASS(icp->icp_flags == ICPF_RX_HDR);
+		MPASS(icp->ip.ip_data_mbuf == NULL);
+		MPASS(icp->ip.ip_data_len == m->m_pkthdr.len - sizeof(*cpl));
+	}
 
+	/* Trim the cpl header from mbuf. */
 	m_adj(m, sizeof(*cpl));
-	MPASS(icp->ip.ip_data_len == m->m_pkthdr.len);
 
 	icp->icp_flags |= ICPF_RX_FLBUF;
 	icp->ip.ip_data_mbuf = m;
 	toep->ofld_rxq->rx_iscsi_fl_pdus++;
 	toep->ofld_rxq->rx_iscsi_fl_octets += m->m_pkthdr.len;
 
+	/*
+	 * For T6, save the icp for further processing in the
+	 * completion handler.
+	 */
+	if (icp->icp_flags == ICPF_RX_FLBUF) {
+		MPASS(toep->ulpcb2 == NULL);
+		toep->ulpcb2 = icp;
+	}
+
 #if 0
-	CTR3(KTR_CXGBE, "%s: tid %u, cpl->len %u", __func__, tid,
-	    be16toh(cpl->len));
+	CTR4(KTR_CXGBE, "%s: tid %u, cpl->len %u, icp %p", __func__, tid,
+	    be16toh(cpl->len), icp);
 #endif
 
 	return (0);
@@ -304,15 +324,17 @@ do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 		    __func__, tid, pdu_len, inp->inp_flags);
 		INP_WUNLOCK(inp);
 		icl_cxgbei_conn_pdu_free(NULL, ip);
-#ifdef INVARIANTS
 		toep->ulpcb2 = NULL;
-#endif
 		return (0);
 	}
 
+	/*
+	 * T6+ does not report data PDUs received via DDP without F
+	 * set.  This can result in gaps in the TCP sequence space.
+	 */
 	tp = intotcpcb(inp);
-	MPASS(icp->icp_seq == tp->rcv_nxt);
-	tp->rcv_nxt += pdu_len;
+	MPASS(chip_id(sc) >= CHELSIO_T6 || icp->icp_seq == tp->rcv_nxt);
+	tp->rcv_nxt = icp->icp_seq + pdu_len;
 	tp->t_rcvtime = ticks;
 
 	/*
@@ -342,9 +364,7 @@ do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 		CURVNET_RESTORE();
 
 		icl_cxgbei_conn_pdu_free(NULL, ip);
-#ifdef INVARIANTS
 		toep->ulpcb2 = NULL;
-#endif
 		return (0);
 	}
 	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
@@ -399,10 +419,238 @@ do_rx_iscsi_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 	SOCKBUF_UNLOCK(sb);
 	INP_WUNLOCK(inp);
 
-#ifdef INVARIANTS
 	toep->ulpcb2 = NULL;
+
+	return (0);
+}
+
+static int
+do_rx_iscsi_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
+{
+	struct epoch_tracker et;
+	struct adapter *sc = iq->adapter;
+	struct cpl_rx_iscsi_cmp *cpl = mtod(m, struct cpl_rx_iscsi_cmp *);
+	u_int tid = GET_TID(cpl);
+	struct toepcb *toep = lookup_tid(sc, tid);
+	struct icl_cxgbei_pdu *icp = toep->ulpcb2;
+	struct icl_pdu *ip;
+	struct cxgbei_cmp *cmp;
+	struct inpcb *inp = toep->inp;
+#ifdef INVARIANTS
+	uint16_t len = be16toh(cpl->len);
+#endif
+	struct socket *so;
+	struct sockbuf *sb;
+	struct tcpcb *tp;
+	struct icl_cxgbei_conn *icc;
+	struct icl_conn *ic;
+	struct iscsi_bhs_data_out *bhsdo;
+	u_int val = be32toh(cpl->ddpvld);
+	u_int npdus, pdu_len, data_digest_len, hdr_digest_len;
+	uint32_t prev_seg_len;
+
+	M_ASSERTPKTHDR(m);
+	MPASS(m->m_pkthdr.len == len + sizeof(*cpl));
+
+	if ((val & F_DDP_PDU) == 0) {
+		MPASS(icp != NULL);
+		MPASS((icp->icp_flags & ICPF_RX_STATUS) == 0);
+		ip = &icp->ip;
+	}
+
+	if (icp == NULL) {
+		/* T6 completion enabled, start of a new PDU. */
+		ip = icl_cxgbei_new_pdu(M_NOWAIT);
+		if (ip == NULL)
+			CXGBE_UNIMPLEMENTED("PDU allocation failure");
+		icp = ip_to_icp(ip);
+	}
+	pdu_len = G_ISCSI_PDU_LEN(be16toh(cpl->pdu_len_ddp));
+
+#if 0
+	CTR5(KTR_CXGBE,
+	    "%s: tid %u, cpl->len %u, ddpvld 0x%08x, icp %p",
+	    __func__, tid, pdu_len, val, icp);
 #endif
 
+	/* Copy header */
+	m_copydata(m, sizeof(*cpl), ISCSI_BHS_SIZE, (caddr_t)ip->ip_bhs);
+	bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;
+	ip->ip_data_len = bhsdo->bhsdo_data_segment_len[0] << 16 |
+	    bhsdo->bhsdo_data_segment_len[1] << 8 |
+	    bhsdo->bhsdo_data_segment_len[2];
+	icp->icp_seq = ntohl(cpl->seq);
+	icp->icp_flags |= ICPF_RX_HDR;
+	icp->icp_flags |= ICPF_RX_STATUS;
+
+	if (val & F_DDP_PADDING_ERR)
+		icp->icp_flags |= ICPF_PAD_ERR;
+	if (val & F_DDP_HDRCRC_ERR)
+		icp->icp_flags |= ICPF_HCRC_ERR;
+	if (val & F_DDP_DATACRC_ERR)
+		icp->icp_flags |= ICPF_DCRC_ERR;
+
+	INP_WLOCK(inp);
+	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
+		CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
+		    __func__, tid, pdu_len, inp->inp_flags);
+		INP_WUNLOCK(inp);
+		icl_cxgbei_conn_pdu_free(NULL, ip);
+		toep->ulpcb2 = NULL;
+		m_freem(m);
+		return (0);
+	}
+
+	tp = intotcpcb(inp);
+
+	/*
+	 * If icc is NULL, the connection is being closed in
+	 * icl_cxgbei_conn_close(), just drop this data.
+	 */
+	icc = toep->ulpcb;
+	if (__predict_false(icc == NULL)) {
+		CTR4(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes), icc %p",
+		    __func__, tid, pdu_len, icc);
+
+		/*
+		 * Update rcv_nxt so the sequence number of the FIN
+		 * doesn't appear wrong.
+		 */
+		tp->rcv_nxt = icp->icp_seq + pdu_len;
+		tp->t_rcvtime = ticks;
+		INP_WUNLOCK(inp);
+
+		icl_cxgbei_conn_pdu_free(NULL, ip);
+		toep->ulpcb2 = NULL;
+		m_freem(m);
+		return (0);
+	}
+
+	data_digest_len = (icc->ulp_submode & ULP_CRC_DATA) ?
+	    ISCSI_DATA_DIGEST_SIZE : 0;
+	hdr_digest_len = (icc->ulp_submode & ULP_CRC_HEADER) ?
+	    ISCSI_HEADER_DIGEST_SIZE : 0;
+	MPASS(roundup2(ip->ip_data_len, 4) == pdu_len - len - data_digest_len);
+
+	if (val & F_DDP_PDU && ip->ip_data_mbuf == NULL) {
+		MPASS((icp->icp_flags & ICPF_RX_FLBUF) == 0);
+		MPASS(ip->ip_data_len > 0);
+		icp->icp_flags |= ICPF_RX_DDP;
+		bhsdo = (struct iscsi_bhs_data_out *)ip->ip_bhs;
+
+		switch (ip->ip_bhs->bhs_opcode & ~ISCSI_BHS_OPCODE_IMMEDIATE) {
+		case ISCSI_BHS_OPCODE_SCSI_DATA_IN:
+			cmp = cxgbei_find_cmp(icc,
+			    be32toh(bhsdo->bhsdo_initiator_task_tag));
+			break;
+		case ISCSI_BHS_OPCODE_SCSI_DATA_OUT:
+			cmp = cxgbei_find_cmp(icc,
+			    be32toh(bhsdo->bhsdo_target_transfer_tag));
+			break;
+		default:
+			__assert_unreachable();
+		}
+		MPASS(cmp != NULL);
+
+		/* Must be the final PDU. */
+		MPASS(bhsdo->bhsdo_flags & BHSDO_FLAGS_F);
+
+		/*
+		 * The difference between the end of the last burst
+		 * and the offset of the last PDU in this burst is
+		 * the additional data received via DDP.
+		 */
+		prev_seg_len = be32toh(bhsdo->bhsdo_buffer_offset) -
+		    cmp->next_buffer_offset;
+
+		if (prev_seg_len != 0) {
+			/*
+			 * Since cfiscsi doesn't know about previous
+			 * headers, pretend that the entire r2t data
+			 * length was received in this single segment.
+			 */
+			ip->ip_data_len += prev_seg_len;
+			bhsdo->bhsdo_data_segment_len[2] = ip->ip_data_len;
+			bhsdo->bhsdo_data_segment_len[1] = ip->ip_data_len >> 8;
+			bhsdo->bhsdo_data_segment_len[0] = ip->ip_data_len >> 16;
+			bhsdo->bhsdo_buffer_offset =
+			    htobe32(cmp->next_buffer_offset);
+
+			npdus = htobe32(bhsdo->bhsdo_datasn) - cmp->last_datasn;
+		} else {
+			MPASS(htobe32(bhsdo->bhsdo_datasn) ==
+			    cmp->last_datasn + 1);
+			npdus = 1;
+		}
+
+		cmp->next_buffer_offset += ip->ip_data_len;
+		cmp->last_datasn = htobe32(bhsdo->bhsdo_datasn);
+		bhsdo->bhsdo_datasn = htobe32(cmp->next_datasn);
+		cmp->next_datasn++;
+		toep->ofld_rxq->rx_iscsi_ddp_pdus += npdus;
+		toep->ofld_rxq->rx_iscsi_ddp_octets += ip->ip_data_len;
+	} else {
+		MPASS(icp->icp_flags & (ICPF_RX_FLBUF));
+		MPASS(ip->ip_data_len == ip->ip_data_mbuf->m_pkthdr.len);
+		MPASS(icp->icp_seq == tp->rcv_nxt);
+	}
+
+	tp->rcv_nxt = icp->icp_seq + pdu_len;
+	tp->t_rcvtime = ticks;
+
+	/*
+	 * Don't update the window size or return credits since RX
+	 * flow control is disabled.
+	 */
+
+	so = inp->inp_socket;
+	sb = &so->so_rcv;
+	SOCKBUF_LOCK(sb);
+	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
+		CTR5(KTR_CXGBE,
+		    "%s: tid %u, excess rx (%d bytes), icc %p, sb_state 0x%x",
+		    __func__, tid, pdu_len, icc, sb->sb_state);
+		SOCKBUF_UNLOCK(sb);
+		INP_WUNLOCK(inp);
+
+		CURVNET_SET(so->so_vnet);
+		NET_EPOCH_ENTER(et);
+		INP_WLOCK(inp);
+		tp = tcp_drop(tp, ECONNRESET);
+		if (tp != NULL)
+			INP_WUNLOCK(inp);
+		NET_EPOCH_EXIT(et);
+		CURVNET_RESTORE();
+
+		icl_cxgbei_conn_pdu_free(NULL, ip);
+		toep->ulpcb2 = NULL;
+		m_freem(m);
+		return (0);
+	}
+	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
+	ic = &icc->ic;
+	icl_cxgbei_new_pdu_set_conn(ip, ic);
+
+	/* Enqueue the PDU to the received pdus queue. */
+	STAILQ_INSERT_TAIL(&icc->rcvd_pdus, ip, ip_next);
+	if ((icc->rx_flags & RXF_ACTIVE) == 0) {
+		struct cxgbei_worker_thread_softc *cwt = &cwt_softc[icc->cwt];
+
+		mtx_lock(&cwt->cwt_lock);
+		icc->rx_flags |= RXF_ACTIVE;
+		TAILQ_INSERT_TAIL(&cwt->rx_head, icc, rx_link);
+		if (cwt->cwt_state == CWT_SLEEPING) {
+			cwt->cwt_state = CWT_RUNNING;
+			cv_signal(&cwt->cwt_cv);
+		}
+		mtx_unlock(&cwt->cwt_lock);
+	}
+	SOCKBUF_UNLOCK(sb);
+	INP_WUNLOCK(inp);
+
+	toep->ulpcb2 = NULL;
+	m_freem(m);
+
 	return (0);
 }
 
@@ -669,6 +917,7 @@ cxgbei_mod_load(void)
 	t4_register_cpl_handler(CPL_ISCSI_HDR, do_rx_iscsi_hdr);
 	t4_register_cpl_handler(CPL_ISCSI_DATA, do_rx_iscsi_data);
 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, do_rx_iscsi_ddp);
+	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, do_rx_iscsi_cmp);
 
 	rc = start_worker_threads();
 	if (rc != 0)
@@ -699,6 +948,7 @@ cxgbei_mod_unload(void)
 	t4_register_cpl_handler(CPL_ISCSI_HDR, NULL);
 	t4_register_cpl_handler(CPL_ISCSI_DATA, NULL);
 	t4_register_cpl_handler(CPL_RX_ISCSI_DDP, NULL);
+	t4_register_cpl_handler(CPL_RX_ISCSI_CMP, NULL);
 
 	return (0);
 }
diff --git a/sys/dev/cxgbe/cxgbei/cxgbei.h b/sys/dev/cxgbe/cxgbei/cxgbei.h
index 9941e817b9cb..45d3398d545c 100644
--- a/sys/dev/cxgbe/cxgbei/cxgbei.h
+++ b/sys/dev/cxgbe/cxgbei/cxgbei.h
@@ -53,6 +53,17 @@ enum {
 	RXF_ACTIVE	= 1 << 0,	/* In the worker thread's queue */
 };
 
+struct cxgbei_cmp {
+	LIST_ENTRY(cxgbei_cmp) link;
+
+	uint32_t tt;		/* Transfer tag. */
+
+	uint32_t next_datasn;
+	uint32_t next_buffer_offset;
+	uint32_t last_datasn;
+};
+LIST_HEAD(cxgbei_cmp_head, cxgbei_cmp);
+
 struct icl_cxgbei_conn {
 	struct icl_conn ic;
 
@@ -67,6 +78,10 @@ struct icl_cxgbei_conn {
 	u_int cwt;
 	STAILQ_HEAD(, icl_pdu) rcvd_pdus;	/* protected by so_rcv lock */
 	TAILQ_ENTRY(icl_cxgbei_conn) rx_link;	/* protected by cwt lock */
+
+	struct cxgbei_cmp_head *cmp_table;	/* protected by cmp_lock */
+	struct mtx cmp_lock;
+	unsigned long cmp_hash_mask;
 };
 
 static inline struct icl_cxgbei_conn *
@@ -128,5 +143,6 @@ int icl_cxgbei_mod_unload(void);
 struct icl_pdu *icl_cxgbei_new_pdu(int);
 void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *);
 void icl_cxgbei_conn_pdu_free(struct icl_conn *, struct icl_pdu *);
+struct cxgbei_cmp *cxgbei_find_cmp(struct icl_cxgbei_conn *, uint32_t);
 
 #endif
diff --git a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
index 17d5685f1c1a..b9f7c6355b6f 100644
--- a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
+++ b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
@@ -60,7 +60,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/sx.h>
 #include <sys/uio.h>
 #include <machine/bus.h>
-#include <vm/uma.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <netinet/in.h>
@@ -100,6 +99,16 @@ __FBSDID("$FreeBSD$");
 #include "tom/t4_tom.h"
 #include "cxgbei.h"
 
+/*
+ * Use the page pod tag for the TT hash.
+ */
+#define	TT_HASH(icc, tt)	(G_PPOD_TAG(tt) & (icc)->cmp_hash_mask)
+
+struct cxgbei_ddp_state {
+	struct ppod_reservation prsv;
+	struct cxgbei_cmp cmp;
+};
+
 static MALLOC_DEFINE(M_CXGBEI, "cxgbei", "cxgbei(4)");
 
 SYSCTL_NODE(_kern_icl, OID_AUTO, cxgbei, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
@@ -117,7 +126,6 @@ static int recvspace = 1048576;
 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, recvspace, CTLFLAG_RWTUN,
     &recvspace, 0, "Default receive socket buffer size");
 
-static uma_zone_t prsv_zone;
 static volatile u_int icl_cxgbei_ncons;
 
 #define ICL_CONN_LOCK(X)		mtx_lock(X->ic_lock)
@@ -555,6 +563,9 @@ icl_cxgbei_new_conn(const char *name, struct mtx *lock)
 	icc->icc_signature = CXGBEI_CONN_SIGNATURE;
 	STAILQ_INIT(&icc->rcvd_pdus);
 
+	icc->cmp_table = hashinit(64, M_CXGBEI, &icc->cmp_hash_mask);
+	mtx_init(&icc->cmp_lock, "cxgbei_cmp", NULL, MTX_DEF);
+
 	ic = &icc->ic;
 	ic->ic_lock = lock;
 
@@ -586,6 +597,8 @@ icl_cxgbei_conn_free(struct icl_conn *ic)
 	cv_destroy(&ic->ic_send_cv);
 	cv_destroy(&ic->ic_receive_cv);
 
+	mtx_destroy(&icc->cmp_lock);
+	hashdestroy(icc->cmp_table, M_CXGBEI, icc->cmp_hash_mask);
 	kobj_delete((struct kobj *)icc, M_CXGBE);
 	refcount_release(&icl_cxgbei_ncons);
 }
@@ -904,6 +917,61 @@ icl_cxgbei_conn_close(struct icl_conn *ic)
 	soclose(so);
 }
 
+static void
+cxgbei_insert_cmp(struct icl_cxgbei_conn *icc, struct cxgbei_cmp *cmp,
+    uint32_t tt)
+{
+#ifdef INVARIANTS
+	struct cxgbei_cmp *cmp2;
+#endif
+
+	cmp->tt = tt;
+
+	mtx_lock(&icc->cmp_lock);
+#ifdef INVARIANTS
+	LIST_FOREACH(cmp2, &icc->cmp_table[TT_HASH(icc, tt)], link) {
+		KASSERT(cmp2->tt != tt, ("%s: duplicate cmp", __func__));
+	}
+#endif
+	LIST_INSERT_HEAD(&icc->cmp_table[TT_HASH(icc, tt)], cmp, link);
+	mtx_unlock(&icc->cmp_lock);
+}
+
+struct cxgbei_cmp *
+cxgbei_find_cmp(struct icl_cxgbei_conn *icc, uint32_t tt)
+{
+	struct cxgbei_cmp *cmp;
+
+	mtx_lock(&icc->cmp_lock);
+	LIST_FOREACH(cmp, &icc->cmp_table[TT_HASH(icc, tt)], link) {
+		if (cmp->tt == tt)
+			break;
+	}
+	mtx_unlock(&icc->cmp_lock);
+	return (cmp);
+}
+
+static void
+cxgbei_rm_cmp(struct icl_cxgbei_conn *icc, struct cxgbei_cmp *cmp)
+{
+#ifdef INVARIANTS
+	struct cxgbei_cmp *cmp2;
+#endif
+
+	mtx_lock(&icc->cmp_lock);
+
+#ifdef INVARIANTS
+	LIST_FOREACH(cmp2, &icc->cmp_table[TT_HASH(icc, cmp->tt)], link) {
+		if (cmp2 == cmp)
+			goto found;
+	}
+	panic("%s: could not find cmp", __func__);
+found:
+#endif
+	LIST_REMOVE(cmp, link);
+	mtx_unlock(&icc->cmp_lock);
+}
+
 int
 icl_cxgbei_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip,
     struct ccb_scsiio *csio, uint32_t *ittp, void **arg)
@@ -913,6 +981,7 @@ icl_cxgbei_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip,
 	struct adapter *sc = icc->sc;
 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
 	struct ppod_region *pr = &ci->pr;
+	struct cxgbei_ddp_state *ddp;
 	struct ppod_reservation *prsv;
 	uint32_t itt;
 	int rc = 0;
@@ -943,30 +1012,32 @@ no_ddp:
 	 * Reserve resources for DDP, update the itt that should be used in the
 	 * PDU, and save DDP specific state for this I/O in *arg.
 	 */
-
-	prsv = uma_zalloc(prsv_zone, M_NOWAIT);
-	if (prsv == NULL) {
+	ddp = malloc(sizeof(*ddp), M_CXGBEI, M_NOWAIT | M_ZERO);
+	if (ddp == NULL) {
 		rc = ENOMEM;
 		goto no_ddp;
 	}
+	prsv = &ddp->prsv;
 
 	/* XXX add support for all CAM_DATA_ types */
 	MPASS((csio->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_VADDR);
 	rc = t4_alloc_page_pods_for_buf(pr, (vm_offset_t)csio->data_ptr,
 	    csio->dxfer_len, prsv);
 	if (rc != 0) {
-		uma_zfree(prsv_zone, prsv);
+		free(ddp, M_CXGBEI);
 		goto no_ddp;
 	}
 
 	rc = t4_write_page_pods_for_buf(sc, toep, prsv,
 	    (vm_offset_t)csio->data_ptr, csio->dxfer_len);
-	if (rc != 0) {
+	if (__predict_false(rc != 0)) {
 		t4_free_page_pods(prsv);
-		uma_zfree(prsv_zone, prsv);
+		free(ddp, M_CXGBEI);
 		goto no_ddp;
 	}
 
+	ddp->cmp.last_datasn = -1;
+	cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag);
 	*ittp = htobe32(prsv->prsv_tag);
 	*arg = prsv;
 	counter_u64_add(toep->ofld_rxq->rx_iscsi_ddp_setup_ok, 1);
@@ -978,10 +1049,11 @@ icl_cxgbei_conn_task_done(struct icl_conn *ic, void *arg)
 {
 
 	if (arg != NULL) {
-		struct ppod_reservation *prsv = arg;
+		struct cxgbei_ddp_state *ddp = arg;
 
-		t4_free_page_pods(prsv);
-		uma_zfree(prsv_zone, prsv);
+		cxgbei_rm_cmp(ic_to_icc(ic), &ddp->cmp);
+		t4_free_page_pods(&ddp->prsv);
+		free(ddp, M_CXGBEI);
 	}
 }
 
@@ -1009,7 +1081,7 @@ ddp_sgl_check(struct ctl_sg_entry *sg, int entries, int xferlen)
 
 /* XXXNP: PDU should be passed in as parameter, like on the initiator. */
 #define io_to_request_pdu(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr)
-#define io_to_ppod_reservation(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND2].ptr)
+#define io_to_ddp_state(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND2].ptr)
 
 int
 icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io,
@@ -1021,6 +1093,7 @@ icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io,
 	struct adapter *sc = icc->sc;
 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
 	struct ppod_region *pr = &ci->pr;
+	struct cxgbei_ddp_state *ddp;
 	struct ppod_reservation *prsv;
 	struct ctl_sg_entry *sgl, sg_entry;
 	int sg_entries = ctsio->kern_sg_entries;
@@ -1064,7 +1137,7 @@ no_ddp:
 			ttt = *tttp & M_PPOD_TAG;
 			ttt = V_PPOD_TAG(ttt) | pr->pr_invalid_bit;
 			*tttp = htobe32(ttt);
-			MPASS(io_to_ppod_reservation(io) == NULL);
+			MPASS(io_to_ddp_state(io) == NULL);
 			if (rc != 0)
 				counter_u64_add(
 				    toep->ofld_rxq->rx_iscsi_ddp_setup_error, 1);
@@ -1086,17 +1159,17 @@ no_ddp:
 		 * Reserve resources for DDP, update the ttt that should be used
 		 * in the PDU, and save DDP specific state for this I/O.
 		 */
-
-		MPASS(io_to_ppod_reservation(io) == NULL);
-		prsv = uma_zalloc(prsv_zone, M_NOWAIT);
-		if (prsv == NULL) {
+		MPASS(io_to_ddp_state(io) == NULL);
+		ddp = malloc(sizeof(*ddp), M_CXGBEI, M_NOWAIT | M_ZERO);
+		if (ddp == NULL) {
 			rc = ENOMEM;
 			goto no_ddp;
 		}
+		prsv = &ddp->prsv;
 
 		rc = t4_alloc_page_pods_for_sgl(pr, sgl, sg_entries, prsv);
 		if (rc != 0) {
-			uma_zfree(prsv_zone, prsv);
+			free(ddp, M_CXGBEI);
 			goto no_ddp;
 		}
 
@@ -1104,12 +1177,16 @@ no_ddp:
 		    xferlen);
 		if (__predict_false(rc != 0)) {
 			t4_free_page_pods(prsv);
-			uma_zfree(prsv_zone, prsv);
+			free(ddp, M_CXGBEI);
 			goto no_ddp;
 		}
 
+		ddp->cmp.next_buffer_offset = ctsio->kern_rel_offset +
+		    first_burst;
+		ddp->cmp.last_datasn = -1;
+		cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag);
 		*tttp = htobe32(prsv->prsv_tag);
-		io_to_ppod_reservation(io) = prsv;
+		io_to_ddp_state(io) = ddp;
 		*arg = ctsio;
 		counter_u64_add(toep->ofld_rxq->rx_iscsi_ddp_setup_ok, 1);
 		return (0);
@@ -1119,16 +1196,19 @@ no_ddp:
 	 * In the middle of an I/O.  A non-NULL page pod reservation indicates
 	 * that a DDP buffer is being used for the I/O.
 	 */
-
-	prsv = io_to_ppod_reservation(ctsio);
-	if (prsv == NULL)
+	ddp = io_to_ddp_state(ctsio);
+	if (ddp == NULL)
 		goto no_ddp;
+	prsv = &ddp->prsv;
 
 	alias = (prsv->prsv_tag & pr->pr_alias_mask) >> pr->pr_alias_shift;
 	alias++;
 	prsv->prsv_tag &= ~pr->pr_alias_mask;
 	prsv->prsv_tag |= alias << pr->pr_alias_shift & pr->pr_alias_mask;
 
+	ddp->cmp.next_datasn = 0;
+	ddp->cmp.last_datasn = -1;
+	cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag);
 	*tttp = htobe32(prsv->prsv_tag);
 	*arg = ctsio;
 
@@ -1140,16 +1220,19 @@ icl_cxgbei_conn_transfer_done(struct icl_conn *ic, void *arg)
 {
 	struct ctl_scsiio *ctsio = arg;
 
-	if (ctsio != NULL && (ctsio->kern_data_len == ctsio->ext_data_filled ||
-	    ic->ic_disconnecting)) {
-		struct ppod_reservation *prsv;
+	if (ctsio != NULL) {
+		struct cxgbei_ddp_state *ddp;
 
-		prsv = io_to_ppod_reservation(ctsio);
-		MPASS(prsv != NULL);
+		ddp = io_to_ddp_state(ctsio);
+		MPASS(ddp != NULL);
 
-		t4_free_page_pods(prsv);
-		uma_zfree(prsv_zone, prsv);
-		io_to_ppod_reservation(ctsio) = NULL;
+		cxgbei_rm_cmp(ic_to_icc(ic), &ddp->cmp);
+		if (ctsio->kern_data_len == ctsio->ext_data_filled ||
+		    ic->ic_disconnecting) {
+			t4_free_page_pods(&ddp->prsv);
+			free(ddp, M_CXGBEI);
+			io_to_ddp_state(ctsio) = NULL;
+		}
 	}
 }
 
@@ -1208,13 +1291,6 @@ icl_cxgbei_mod_load(void)
 {
 	int rc;
 
-	/*
-	 * Space to track pagepod reservations.
-	 */
-	prsv_zone = uma_zcreate("Pagepod reservations",
-	    sizeof(struct ppod_reservation), NULL, NULL, NULL, NULL,
-	    UMA_ALIGN_CACHE, 0);
-
 	refcount_init(&icl_cxgbei_ncons, 0);
 
 	rc = icl_register("cxgbei", false, -100, icl_cxgbei_limits,
@@ -1232,8 +1308,6 @@ icl_cxgbei_mod_unload(void)
 
 	icl_unregister("cxgbei", false);
 
-	uma_zdestroy(prsv_zone);
-
 	return (0);
 }
 #endif


More information about the dev-commits-src-main mailing list