git: 68e46643065b - stable/13 - cxgbei: Support DDP for target I/O S/G lists with more than one entry.

From: John Baldwin <jhb@FreeBSD.org>
Date: Fri, 29 Oct 2021 23:58:06 UTC
The branch stable/13 has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=68e46643065b08e01d5640723c7aefc02aebcf3f

commit 68e46643065b08e01d5640723c7aefc02aebcf3f
Author:     John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2021-05-14 19:17:06 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2021-10-29 22:53:37 +0000

    cxgbei: Support DDP for target I/O S/G lists with more than one entry.
    
    A CAM target layer I/O CCB can use an S/G list of virtual address ranges
    to describe its data buffer.  This change adds zero-copy receive support
    for such requests.
    
    Sponsored by:   Chelsio Communications
    Differential Revision:  https://reviews.freebsd.org/D29908
    
    (cherry picked from commit 46bee8043ee2bd352d420cd573e0364ca45f813e)
    (cherry picked from commit 8d2b4b2e7c1e0b10c4d49963753db31c4794dbc4)
---
 sys/dev/cxgbe/cxgbei/icl_cxgbei.c |  50 +++++++---
 sys/dev/cxgbe/tom/t4_ddp.c        | 191 ++++++++++++++++++++++++++++++++++++++
 sys/dev/cxgbe/tom/t4_tom.h        |   5 +
 3 files changed, 232 insertions(+), 14 deletions(-)
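
As a rough, standalone illustration (not part of the commit), the eligibility
rules that the new ddp_sgl_check() applies before attempting DDP on a target
S/G list can be sketched as below.  "struct sg_entry" is a simplified stand-in
for CTL's struct ctl_sg_entry, and the buffer layout is made up:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096
#define PAGE_MASK	(PAGE_SIZE - 1)

struct sg_entry {			/* stand-in for struct ctl_sg_entry */
	void	*addr;
	size_t	 len;
};

/*
 * Every entry except the last must start on a page boundary and span
 * whole pages; the last entry only needs a 4-byte-aligned address.
 */
static bool
sgl_ok_for_ddp(const struct sg_entry *sg, int entries)
{
	for (int i = 0; i < entries - 1; i++) {
		if (((uintptr_t)sg[i].addr & PAGE_MASK) != 0 ||
		    (sg[i].len % PAGE_SIZE) != 0)
			return (false);
	}
	return (((uintptr_t)sg[entries - 1].addr & 3) == 0);
}

int
main(void)
{
	static char buf[3 * PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
	struct sg_entry sgl[2] = {
		{ buf, 2 * PAGE_SIZE },		/* page-aligned, whole pages */
		{ buf + 2 * PAGE_SIZE, 512 },	/* last: only 4-byte alignment */
	};

	printf("DDP eligible: %s\n", sgl_ok_for_ddp(sgl, 2) ? "yes" : "no");
	return (0);
}

The real check additionally asserts that the entry lengths sum to the overall
transfer length; a list that fails the check simply falls back to the existing
non-DDP receive path (the no_ddp label in the diff below).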

diff --git a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
index 655cc1de1478..5770599eeeef 100644
--- a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
+++ b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
@@ -873,6 +873,28 @@ icl_cxgbei_conn_task_done(struct icl_conn *ic, void *arg)
 	}
 }
 
+static inline bool
+ddp_sgl_check(struct ctl_sg_entry *sg, int entries, int xferlen)
+{
+	int total_len = 0;
+
+	MPASS(entries > 0);
+	if (((vm_offset_t)sg[--entries].addr & 3U) != 0)
+		return (false);
+
+	total_len += sg[entries].len;
+
+	while (--entries >= 0) {
+		if (((vm_offset_t)sg[entries].addr & PAGE_MASK) != 0 ||
+		    (sg[entries].len % PAGE_SIZE) != 0)
+			return (false);
+		total_len += sg[entries].len;
+	}
+
+	MPASS(total_len == xferlen);
+	return (true);
+}
+
 /* XXXNP: PDU should be passed in as parameter, like on the initiator. */
 #define io_to_request_pdu(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr)
 #define io_to_ppod_reservation(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND2].ptr)
@@ -888,6 +910,8 @@ icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io,
 	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
 	struct ppod_region *pr = &ci->pr;
 	struct ppod_reservation *prsv;
+	struct ctl_sg_entry *sgl, sg_entry;
+	int sg_entries = ctsio->kern_sg_entries;
 	uint32_t ttt;
 	int xferlen, rc = 0, alias;
 
@@ -898,7 +922,6 @@ icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io,
 	if (ctsio->ext_data_filled == 0) {
 		int first_burst;
 		struct icl_pdu *ip = io_to_request_pdu(io);
-		vm_offset_t buf;
 #ifdef INVARIANTS
 		struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
 
@@ -931,18 +954,16 @@ no_ddp:
 			return (0);
 		}
 
-		if (ctsio->kern_sg_entries == 0)
-			buf = (vm_offset_t)ctsio->kern_data_ptr;
-		else if (ctsio->kern_sg_entries == 1) {
-			struct ctl_sg_entry *sgl = (void *)ctsio->kern_data_ptr;
+		if (sg_entries == 0) {
+			sgl = &sg_entry;
+			sgl->len = xferlen;
+			sgl->addr = (void *)ctsio->kern_data_ptr;
+			sg_entries = 1;
+		} else
+			sgl = (void *)ctsio->kern_data_ptr;
 
-			MPASS(sgl->len == xferlen);
-			buf = (vm_offset_t)sgl->addr;
-		} else {
-			rc = EAGAIN;	/* XXX implement */
+		if (!ddp_sgl_check(sgl, sg_entries, xferlen))
 			goto no_ddp;
-		}
-
 
 		/*
 		 * Reserve resources for DDP, update the ttt that should be used
@@ -956,14 +977,15 @@ no_ddp:
 			goto no_ddp;
 		}
 
-		rc = t4_alloc_page_pods_for_buf(pr, buf, xferlen, prsv);
+		rc = t4_alloc_page_pods_for_sgl(pr, sgl, sg_entries, prsv);
 		if (rc != 0) {
 			uma_zfree(prsv_zone, prsv);
 			goto no_ddp;
 		}
 
-		rc = t4_write_page_pods_for_buf(sc, toep, prsv, buf, xferlen);
-		if (rc != 0) {
+		rc = t4_write_page_pods_for_sgl(sc, toep, prsv, sgl, sg_entries,
+		    xferlen);
+		if (__predict_false(rc != 0)) {
 			t4_free_page_pods(prsv);
 			uma_zfree(prsv_zone, prsv);
 			goto no_ddp;
diff --git a/sys/dev/cxgbe/tom/t4_ddp.c b/sys/dev/cxgbe/tom/t4_ddp.c
index e87d013a0453..34c01674659a 100644
--- a/sys/dev/cxgbe/tom/t4_ddp.c
+++ b/sys/dev/cxgbe/tom/t4_ddp.c
@@ -62,6 +62,9 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 
+#include <cam/scsi/scsi_all.h>
+#include <cam/ctl/ctl_io.h>
+
 #ifdef TCP_OFFLOAD
 #include "common/common.h"
 #include "common/t4_msg.h"
@@ -981,6 +984,76 @@ have_pgsz:
 	return (0);
 }
 
+int
+t4_alloc_page_pods_for_sgl(struct ppod_region *pr, struct ctl_sg_entry *sgl,
+    int entries, struct ppod_reservation *prsv)
+{
+	int hcf, seglen, idx = 0, npages, nppods, i, len;
+	uintptr_t start_pva, end_pva, pva, p1 ;
+	vm_offset_t buf;
+	struct ctl_sg_entry *sge;
+
+	MPASS(entries > 0);
+	MPASS(sgl);
+
+	/*
+	 * The DDP page size is unrelated to the VM page size.	We combine
+	 * contiguous physical pages into larger segments to get the best DDP
+	 * page size possible.	This is the largest of the four sizes in
+	 * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes
+	 * in the page list.
+	 */
+	hcf = 0;
+	for (i = entries - 1; i >= 0; i--) {
+		sge = sgl + i;
+		buf = (vm_offset_t)sge->addr;
+		len = sge->len;
+		start_pva = trunc_page(buf);
+		end_pva = trunc_page(buf + len - 1);
+		pva = start_pva;
+		while (pva <= end_pva) {
+			seglen = PAGE_SIZE;
+			p1 = pmap_kextract(pva);
+			pva += PAGE_SIZE;
+			while (pva <= end_pva && p1 + seglen ==
+			    pmap_kextract(pva)) {
+				seglen += PAGE_SIZE;
+				pva += PAGE_SIZE;
+			}
+
+			hcf = calculate_hcf(hcf, seglen);
+			if (hcf < (1 << pr->pr_page_shift[1])) {
+				idx = 0;
+				goto have_pgsz; /* give up, short circuit */
+			}
+		}
+	}
+#define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
+	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
+	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
+		if ((hcf & PR_PAGE_MASK(idx)) == 0)
+			break;
+	}
+#undef PR_PAGE_MASK
+
+have_pgsz:
+	MPASS(idx <= M_PPOD_PGSZ);
+
+	npages = 0;
+	while (entries--) {
+		npages++;
+		start_pva = trunc_page((vm_offset_t)sgl->addr);
+		end_pva = trunc_page((vm_offset_t)sgl->addr + sgl->len - 1);
+		npages += (end_pva - start_pva) >> pr->pr_page_shift[idx];
+		sgl = sgl + 1;
+	}
+	nppods = howmany(npages, PPOD_PAGES);
+	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
+		return (ENOMEM);
+	MPASS(prsv->prsv_nppods > 0);
+	return (0);
+}
+
 void
 t4_free_page_pods(struct ppod_reservation *prsv)
 {
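
For reference (again a standalone sketch, not part of the commit), the
page-size selection performed by t4_alloc_page_pods_for_sgl() above can be
reproduced in userland.  The segment lengths and the four page-size shifts are
made-up inputs; the real values come from pmap_kextract() walks over the S/G
list and from the adapter's pr_page_shift[] table:

#include <stdio.h>

/* Highest common factor (GCD) of two lengths, as calculate_hcf() computes. */
static unsigned int
hcf(unsigned int a, unsigned int b)
{
	while (b != 0) {
		unsigned int t = a % b;
		a = b;
		b = t;
	}
	return (a);
}

int
main(void)
{
	/* Hypothetical physically contiguous segment lengths, in bytes. */
	const unsigned int seglen[] = { 64 * 1024, 16 * 1024, 192 * 1024 };
	/* Hypothetical pr_page_shift[]: 4K, 16K, 64K and 16M DDP pages. */
	const unsigned int page_shift[] = { 12, 14, 16, 24 };
	unsigned int h = 0;
	int idx;

	for (size_t i = 0; i < sizeof(seglen) / sizeof(seglen[0]); i++)
		h = hcf(h, seglen[i]);

	/* Largest advertised DDP page size that evenly divides the HCF. */
	for (idx = 3; idx > 0; idx--) {
		if ((h & ((1U << page_shift[idx]) - 1)) == 0)
			break;
	}
	printf("HCF %u bytes -> DDP page size %u bytes (index %d)\n",
	    h, 1U << page_shift[idx], idx);
	return (0);
}

With these inputs the HCF works out to 16 KB, so index 1 is chosen; the kernel
function also short-circuits straight to the smallest page size as soon as the
running HCF drops below 1 << pr_page_shift[1].
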
@@ -1197,6 +1270,124 @@ t4_write_page_pods_for_buf(struct adapter *sc, struct toepcb *toep,
 	return (0);
 }
 
+int
+t4_write_page_pods_for_sgl(struct adapter *sc, struct toepcb *toep,
+    struct ppod_reservation *prsv, struct ctl_sg_entry *sgl, int entries,
+    int xferlen)
+{
+	struct inpcb *inp = toep->inp;
+	struct ulp_mem_io *ulpmc;
+	struct ulptx_idata *ulpsc;
+	struct pagepod *ppod;
+	int i, j, k, n, chunk, len, ddp_pgsz;
+	u_int ppod_addr, offset, sg_offset = 0;
+	uint32_t cmd;
+	struct ppod_region *pr = prsv->prsv_pr;
+	uintptr_t pva, pa;
+	struct mbuf *m;
+	struct mbufq wrq;
+
+	MPASS(sgl != NULL);
+	MPASS(entries > 0);
+	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
+	if (is_t4(sc))
+		cmd |= htobe32(F_ULP_MEMIO_ORDER);
+	else
+		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
+	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
+	offset = (vm_offset_t)sgl->addr & PAGE_MASK;
+	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
+	pva = trunc_page((vm_offset_t)sgl->addr);
+	mbufq_init(&wrq, INT_MAX);
+	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
+
+		/* How many page pods are we writing in this cycle */
+		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
+		MPASS(n > 0);
+		chunk = PPOD_SZ(n);
+		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
+
+		m = alloc_raw_wr_mbuf(len);
+		if (m == NULL) {
+			mbufq_drain(&wrq);
+			return (ENOMEM);
+		}
+		ulpmc = mtod(m, struct ulp_mem_io *);
+
+		INIT_ULPTX_WR(ulpmc, len, 0, toep->tid);
+		ulpmc->cmd = cmd;
+		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
+		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
+		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
+
+		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
+		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
+		ulpsc->len = htobe32(chunk);
+
+		ppod = (struct pagepod *)(ulpsc + 1);
+		for (j = 0; j < n; i++, j++, ppod++) {
+			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
+			    V_PPOD_TID(toep->tid) |
+			    (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
+			ppod->len_offset = htobe64(V_PPOD_LEN(xferlen) |
+			    V_PPOD_OFST(offset));
+			ppod->rsvd = 0;
+
+			for (k = 0; k < nitems(ppod->addr); k++) {
+				if (entries != 0) {
+					pa = pmap_kextract(pva + sg_offset);
+					ppod->addr[k] = htobe64(pa);
+				} else
+					ppod->addr[k] = 0;
+
+#if 0
+				CTR5(KTR_CXGBE,
+				    "%s: tid %d ppod[%d]->addr[%d] = %p",
+				    __func__, toep->tid, i, k,
+				    htobe64(ppod->addr[k]));
+#endif
+
+				/*
+				 * If this is the last entry in a pod,
+				 * reuse the same entry for first address
+				 * in the next pod.
+				 */
+				if (k + 1 == nitems(ppod->addr))
+					break;
+
+				/*
+				 * Don't move to the next DDP page if the
+				 * sgl is already finished.
+				 */
+				if (entries == 0)
+					continue;
+
+				sg_offset += ddp_pgsz;
+				if (sg_offset == sgl->len) {
+					/*
+					 * This sgl entry is done.  Go
+					 * to the next.
+					 */
+					entries--;
+					sgl++;
+					sg_offset = 0;
+					if (entries != 0)
+						pva = trunc_page(
+						    (vm_offset_t)sgl->addr);
+				}
+			}
+		}
+
+		mbufq_enqueue(&wrq, m);
+	}
+
+	INP_WLOCK(inp);
+	mbufq_concat(&toep->ulp_pduq, &wrq);
+	INP_WUNLOCK(inp);
+
+	return (0);
+}
+
 /*
  * Prepare a pageset for DDP.  This sets up page pods.
  */
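
To round out the picture (one more standalone sketch, not from the commit),
the way t4_write_page_pods_for_sgl() above walks the S/G list while filling
pod address slots can be modeled as follows.  It assumes the usual Chelsio
layout of 4 DDP pages per pod plus a fifth slot that is repeated as the first
slot of the next pod, and it records (entry, offset) pairs where the real code
stores physical addresses from pmap_kextract():

#include <stdio.h>

#define NADDR		5		/* address slots per pod */
#define PPOD_PAGES	4		/* distinct DDP pages per pod */
#define DDP_PGSZ	4096		/* assumed DDP page size */

int
main(void)
{
	/* Hypothetical S/G entry lengths: four DDP pages, then one page. */
	unsigned int len[] = { 4 * DDP_PGSZ, DDP_PGSZ };
	int entries = 2, ent = 0, npages = 0, nppods;
	unsigned int sg_offset = 0;

	/* Same pod count t4_alloc_page_pods_for_sgl() would reserve. */
	for (int i = 0; i < entries; i++)
		npages += (len[i] + DDP_PGSZ - 1) / DDP_PGSZ;
	nppods = (npages + PPOD_PAGES - 1) / PPOD_PAGES;

	for (int pod = 0; pod < nppods; pod++) {
		printf("pod %d:", pod);
		for (int k = 0; k < NADDR; k++) {
			if (entries != 0)
				printf(" (entry %d, off %u)", ent, sg_offset);
			else
				printf(" (unused)");

			/* The last slot is reused as slot 0 of the next pod. */
			if (k + 1 == NADDR)
				break;
			/* Don't advance once the S/G list is exhausted. */
			if (entries == 0)
				continue;

			sg_offset += DDP_PGSZ;
			if (sg_offset == len[ent]) {
				/* This S/G entry is done; go to the next. */
				entries--;
				ent++;
				sg_offset = 0;
			}
		}
		printf("\n");
	}
	return (0);
}

With these two hypothetical entries, the first pod's fifth slot and the second
pod's first slot both map to (entry 1, off 0), and the remaining slots of the
second pod stay unused, mirroring the entries == 0 case in the loop above.
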
diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h
index f1129b47cbcf..c7984f838735 100644
--- a/sys/dev/cxgbe/tom/t4_tom.h
+++ b/sys/dev/cxgbe/tom/t4_tom.h
@@ -88,6 +88,7 @@ enum {
 	DDP_DEAD	= (1 << 6),	/* toepcb is shutting down */
 };
 
+struct ctl_sg_entry;
 struct sockopt;
 struct offload_settings;
 
@@ -437,10 +438,14 @@ void t4_free_ppod_region(struct ppod_region *);
 int t4_alloc_page_pods_for_ps(struct ppod_region *, struct pageset *);
 int t4_alloc_page_pods_for_buf(struct ppod_region *, vm_offset_t, int,
     struct ppod_reservation *);
+int t4_alloc_page_pods_for_sgl(struct ppod_region *, struct ctl_sg_entry *, int,
+    struct ppod_reservation *);
 int t4_write_page_pods_for_ps(struct adapter *, struct sge_wrq *, int,
     struct pageset *);
 int t4_write_page_pods_for_buf(struct adapter *, struct toepcb *,
     struct ppod_reservation *, vm_offset_t, int);
+int t4_write_page_pods_for_sgl(struct adapter *, struct toepcb *,
+    struct ppod_reservation *, struct ctl_sg_entry *, int, int);
 void t4_free_page_pods(struct ppod_reservation *);
 int t4_soreceive_ddp(struct socket *, struct sockaddr **, struct uio *,
     struct mbuf **, struct mbuf **, int *);