git: 0695b57a9875 - stable/13 - cxgbei: Fix a race between transfer setup and a peer reset.

From: John Baldwin <jhb_at_FreeBSD.org>
Date: Fri, 29 Oct 2021 23:58:20 UTC
The branch stable/13 has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=0695b57a987591b7b5e5ac0e78365c69faf11216

commit 0695b57a987591b7b5e5ac0e78365c69faf11216
Author:     John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2021-05-20 23:03:19 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2021-10-29 23:08:09 +0000

    cxgbei: Fix a race between transfer setup and a peer reset.
    
    In 4427ac3675f9, the TOM driver stopped sending work requests to
    program iSCSI page pods directly and instead queued them to be written
    asynchronously with iSCSI PDUs.  The queue of mbufs to send is
    protected by the inp lock.  However, the inp cannot be safely obtained
    from the toep since a RST from the remote peer might have cleared
    toep->inp asynchronously in an ithread.  To fix, obtain the inp from
    the socket as is already done in icl_cxgbei_conn_pdu_queue_cb() and
    fail the new transfer setup with ECONNRESET if the connection has been
    reset.
    
    To avoid passing sockets or inps into the page pod routines, pull the
    mbufq out of the two relevant page pod routines such that the routines
    queue new work request mbufs to a caller-supplied mbufq.
    
    Reported by:    Jithesh Arakkan @ Chelsio
    Fixes:          4427ac3675f91df039d54a23518132e0e0fede86
    
    (cherry picked from commit f949967c8eb3ab5e5a965e3cf07a726dfdc81263)
---
 sys/dev/cxgbe/cxgbei/icl_cxgbei.c | 44 +++++++++++++++++++++++++++++++++++++--
 sys/dev/cxgbe/tom/t4_ddp.c        | 31 +++++++--------------------
 sys/dev/cxgbe/tom/t4_tom.h        |  4 ++--
 3 files changed, 51 insertions(+), 28 deletions(-)

diff --git a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
index b9f7c6355b6f..01759d929c0e 100644
--- a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
+++ b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c
@@ -983,6 +983,8 @@ icl_cxgbei_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip,
 	struct ppod_region *pr = &ci->pr;
 	struct cxgbei_ddp_state *ddp;
 	struct ppod_reservation *prsv;
+	struct inpcb *inp;
+	struct mbufq mq;
 	uint32_t itt;
 	int rc = 0;
 
@@ -1028,14 +1030,32 @@ no_ddp:
 		goto no_ddp;
 	}
 
+	mbufq_init(&mq, INT_MAX);
 	rc = t4_write_page_pods_for_buf(sc, toep, prsv,
-	    (vm_offset_t)csio->data_ptr, csio->dxfer_len);
+	    (vm_offset_t)csio->data_ptr, csio->dxfer_len, &mq);
 	if (__predict_false(rc != 0)) {
+		mbufq_drain(&mq);
 		t4_free_page_pods(prsv);
 		free(ddp, M_CXGBEI);
 		goto no_ddp;
 	}
 
+	/*
+	 * Do not get inp from toep->inp as the toepcb might have
+	 * detached already.
+	 */
+	inp = sotoinpcb(ic->ic_socket);
+	INP_WLOCK(inp);
+	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) != 0) {
+		INP_WUNLOCK(inp);
+		mbufq_drain(&mq);
+		t4_free_page_pods(prsv);
+		free(ddp, M_CXGBEI);
+		return (ECONNRESET);
+	}
+	mbufq_concat(&toep->ulp_pduq, &mq);
+	INP_WUNLOCK(inp);
+
 	ddp->cmp.last_datasn = -1;
 	cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag);
 	*ittp = htobe32(prsv->prsv_tag);
@@ -1096,6 +1116,8 @@ icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io,
 	struct cxgbei_ddp_state *ddp;
 	struct ppod_reservation *prsv;
 	struct ctl_sg_entry *sgl, sg_entry;
+	struct inpcb *inp;
+	struct mbufq mq;
 	int sg_entries = ctsio->kern_sg_entries;
 	uint32_t ttt;
 	int xferlen, rc = 0, alias;
@@ -1173,14 +1195,32 @@ no_ddp:
 			goto no_ddp;
 		}
 
+		mbufq_init(&mq, INT_MAX);
 		rc = t4_write_page_pods_for_sgl(sc, toep, prsv, sgl, sg_entries,
-		    xferlen);
+		    xferlen, &mq);
 		if (__predict_false(rc != 0)) {
+			mbufq_drain(&mq);
 			t4_free_page_pods(prsv);
 			free(ddp, M_CXGBEI);
 			goto no_ddp;
 		}
 
+		/*
+		 * Do not get inp from toep->inp as the toepcb might
+		 * have detached already.
+		 */
+		inp = sotoinpcb(ic->ic_socket);
+		INP_WLOCK(inp);
+		if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) != 0) {
+			INP_WUNLOCK(inp);
+			mbufq_drain(&mq);
+			t4_free_page_pods(prsv);
+			free(ddp, M_CXGBEI);
+			return (ECONNRESET);
+		}
+		mbufq_concat(&toep->ulp_pduq, &mq);
+		INP_WUNLOCK(inp);
+
 		ddp->cmp.next_buffer_offset = ctsio->kern_rel_offset +
 		    first_burst;
 		ddp->cmp.last_datasn = -1;
diff --git a/sys/dev/cxgbe/tom/t4_ddp.c b/sys/dev/cxgbe/tom/t4_ddp.c
index 34c01674659a..2b58cb60d4fd 100644
--- a/sys/dev/cxgbe/tom/t4_ddp.c
+++ b/sys/dev/cxgbe/tom/t4_ddp.c
@@ -1175,9 +1175,9 @@ alloc_raw_wr_mbuf(int len)
 
 int
 t4_write_page_pods_for_buf(struct adapter *sc, struct toepcb *toep,
-    struct ppod_reservation *prsv, vm_offset_t buf, int buflen)
+    struct ppod_reservation *prsv, vm_offset_t buf, int buflen,
+    struct mbufq *wrq)
 {
-	struct inpcb *inp = toep->inp;
 	struct ulp_mem_io *ulpmc;
 	struct ulptx_idata *ulpsc;
 	struct pagepod *ppod;
@@ -1187,7 +1187,6 @@ t4_write_page_pods_for_buf(struct adapter *sc, struct toepcb *toep,
 	struct ppod_region *pr = prsv->prsv_pr;
 	uintptr_t end_pva, pva, pa;
 	struct mbuf *m;
-	struct mbufq wrq;
 
 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
 	if (is_t4(sc))
@@ -1199,7 +1198,6 @@ t4_write_page_pods_for_buf(struct adapter *sc, struct toepcb *toep,
 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
 	pva = trunc_page(buf);
 	end_pva = trunc_page(buf + buflen - 1);
-	mbufq_init(&wrq, INT_MAX);
 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
 
 		/* How many page pods are we writing in this cycle */
@@ -1209,10 +1207,8 @@ t4_write_page_pods_for_buf(struct adapter *sc, struct toepcb *toep,
 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
 
 		m = alloc_raw_wr_mbuf(len);
-		if (m == NULL) {
-			mbufq_drain(&wrq);
+		if (m == NULL)
 			return (ENOMEM);
-		}
 		ulpmc = mtod(m, struct ulp_mem_io *);
 
 		INIT_ULPTX_WR(ulpmc, len, 0, toep->tid);
@@ -1258,13 +1254,9 @@ t4_write_page_pods_for_buf(struct adapter *sc, struct toepcb *toep,
 			pva -= ddp_pgsz;
 		}
 
-		mbufq_enqueue(&wrq, m);
+		mbufq_enqueue(wrq, m);
 	}
 
-	INP_WLOCK(inp);
-	mbufq_concat(&toep->ulp_pduq, &wrq);
-	INP_WUNLOCK(inp);
-
 	MPASS(pva <= end_pva);
 
 	return (0);
@@ -1273,9 +1265,8 @@ t4_write_page_pods_for_buf(struct adapter *sc, struct toepcb *toep,
 int
 t4_write_page_pods_for_sgl(struct adapter *sc, struct toepcb *toep,
     struct ppod_reservation *prsv, struct ctl_sg_entry *sgl, int entries,
-    int xferlen)
+    int xferlen, struct mbufq *wrq)
 {
-	struct inpcb *inp = toep->inp;
 	struct ulp_mem_io *ulpmc;
 	struct ulptx_idata *ulpsc;
 	struct pagepod *ppod;
@@ -1285,7 +1276,6 @@ t4_write_page_pods_for_sgl(struct adapter *sc, struct toepcb *toep,
 	struct ppod_region *pr = prsv->prsv_pr;
 	uintptr_t pva, pa;
 	struct mbuf *m;
-	struct mbufq wrq;
 
 	MPASS(sgl != NULL);
 	MPASS(entries > 0);
@@ -1298,7 +1288,6 @@ t4_write_page_pods_for_sgl(struct adapter *sc, struct toepcb *toep,
 	offset = (vm_offset_t)sgl->addr & PAGE_MASK;
 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
 	pva = trunc_page((vm_offset_t)sgl->addr);
-	mbufq_init(&wrq, INT_MAX);
 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
 
 		/* How many page pods are we writing in this cycle */
@@ -1308,10 +1297,8 @@ t4_write_page_pods_for_sgl(struct adapter *sc, struct toepcb *toep,
 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
 
 		m = alloc_raw_wr_mbuf(len);
-		if (m == NULL) {
-			mbufq_drain(&wrq);
+		if (m == NULL)
 			return (ENOMEM);
-		}
 		ulpmc = mtod(m, struct ulp_mem_io *);
 
 		INIT_ULPTX_WR(ulpmc, len, 0, toep->tid);
@@ -1378,13 +1365,9 @@ t4_write_page_pods_for_sgl(struct adapter *sc, struct toepcb *toep,
 			}
 		}
 
-		mbufq_enqueue(&wrq, m);
+		mbufq_enqueue(wrq, m);
 	}
 
-	INP_WLOCK(inp);
-	mbufq_concat(&toep->ulp_pduq, &wrq);
-	INP_WUNLOCK(inp);
-
 	return (0);
 }
 
diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h
index c7984f838735..21cfb1df6e16 100644
--- a/sys/dev/cxgbe/tom/t4_tom.h
+++ b/sys/dev/cxgbe/tom/t4_tom.h
@@ -443,9 +443,9 @@ int t4_alloc_page_pods_for_sgl(struct ppod_region *, struct ctl_sg_entry *, int,
 int t4_write_page_pods_for_ps(struct adapter *, struct sge_wrq *, int,
     struct pageset *);
 int t4_write_page_pods_for_buf(struct adapter *, struct toepcb *,
-    struct ppod_reservation *, vm_offset_t, int);
+    struct ppod_reservation *, vm_offset_t, int, struct mbufq *);
 int t4_write_page_pods_for_sgl(struct adapter *, struct toepcb *,
-    struct ppod_reservation *, struct ctl_sg_entry *, int, int);
+    struct ppod_reservation *, struct ctl_sg_entry *, int, int, struct mbufq *);
 void t4_free_page_pods(struct ppod_reservation *);
 int t4_soreceive_ddp(struct socket *, struct sockaddr **, struct uio *,
     struct mbuf **, struct mbuf **, int *);