git: 2ff447ee3b6c - main - cxgbe: Enable TOE TLS RX when an RX key is provided via setsockopt().

From: John Baldwin <jhb@FreeBSD.org>
Date: Tue, 15 Nov 2022 20:11:50 UTC
The branch main has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=2ff447ee3b6c94a664d7604fb7b5334a702fb79e

commit 2ff447ee3b6c94a664d7604fb7b5334a702fb79e
Author:     John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2022-11-15 20:08:51 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2022-11-15 20:08:51 +0000

    cxgbe: Enable TOE TLS RX when an RX key is provided via setsockopt().
    
    Rather than requiring a socket to be created as a TLS socket from the
    get-go, switch a TOE socket from "plain" TOE to TLS mode when a
    receive key is added to the socket.
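    
    As an illustration (not part of this commit), here is a minimal
    sketch of the userspace side: a KTLS-aware application installs the
    receive key with setsockopt(TCP_RXTLS_ENABLE), which is the
    operation that now triggers the migration on a TOE connection.  The
    key material, lengths, and cipher choice are placeholders, and error
    handling plus some fields (e.g. the initial record sequence number)
    are omitted:
    
        #include <sys/types.h>
        #include <sys/socket.h>
        #include <sys/ktls.h>
        #include <netinet/in.h>
        #include <netinet/tcp.h>
        #include <crypto/cryptodev.h>
        #include <string.h>
    
        /* Enable kernel TLS RX on a connected TCP socket 's'. */
        static int
        enable_ktls_rx(int s, const uint8_t *key, size_t keylen,
            const uint8_t *salt, size_t saltlen)
        {
                struct tls_enable en;
    
                memset(&en, 0, sizeof(en));
                en.cipher_algorithm = CRYPTO_AES_NIST_GCM_16;
                en.cipher_key = key;
                en.cipher_key_len = keylen;     /* e.g. 16 or 32 bytes */
                en.iv = salt;
                en.iv_len = saltlen;            /* 4-byte TLS 1.2 GCM salt */
                en.tls_vmajor = TLS_MAJOR_VER_ONE;
                en.tls_vminor = TLS_MINOR_VER_TWO;
    
                return (setsockopt(s, IPPROTO_TCP, TCP_RXTLS_ENABLE,
                    &en, sizeof(en)));
        }
    
    In practice an application rarely issues this call itself; a
    KTLS-aware SSL library does it internally once the handshake
    completes.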
    
    The firmware is only able to switch a "plain" TOE connection to TLS
    mode if the head of the pending socket data is the start of a TLS
    record, so the connection is migrated to TLS mode as a multi-step
    process.
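    
    For reference, a record boundary can only be recognized by walking
    the 5-byte TLS record headers through the buffered bytes; the kernel
    does this with ktls_pending_rx_info() (used in t4_tls.c below).  A
    rough userland-style sketch of the check, assuming 'buf'/'len' hold
    the pending socket data:
    
        #include <stdbool.h>
        #include <stddef.h>
        #include <stdint.h>
    
        /* TLS record header: type (1), version (2), length (2). */
        #define TLS_HEADER_LEN  5
    
        /*
         * Walk the record headers through 'len' pending bytes and
         * report whether the data ends exactly on a record boundary.
         */
        static bool
        ends_on_record_boundary(const uint8_t *buf, size_t len)
        {
                size_t off = 0, reclen;
    
                while (off + TLS_HEADER_LEN <= len) {
                        /* Payload length is 16 bits, big-endian. */
                        reclen = (size_t)buf[off + 3] << 8 | buf[off + 4];
                        off += TLS_HEADER_LEN + reclen;
                }
                return (off == len);
        }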
    
    When TOE TLS RX is enabled, the associated connection's receive side
    is frozen via a flag in the TCB.  The state of the socket buffer is
    then examined to determine if the pending data in the socket buffer
    ends on a TLS record boundary.  If so, the connection is migrated to
    TLS mode and unfrozen.  Otherwise, the connection is unfrozen
    temporarily until more data arrives.  Once more data arrives, the
    receive queue is frozen again and rechecked.  This continues until the
    connection is paused at a record boundary.  Any records received
    before TLS mode is enabled are decrypted as software records.
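    
    Schematically, the loop just described (implemented by
    tls_received_starting_data() and tls_check_rx_sockbuf() in t4_tls.c
    below) behaves like this hypothetical state machine, shown here only
    to summarize the transitions:
    
        #include <stdbool.h>
    
        enum tls_rx_state { PLAIN_TOE, QUIESCED, TLS_MODE };
    
        static enum tls_rx_state
        tls_rx_step(enum tls_rx_state st, bool at_record_boundary)
        {
                switch (st) {
                case PLAIN_TOE:
                        /* RX key installed, or more data arrived:
                         * freeze receive and examine the sockbuf. */
                        return (QUIESCED);
                case QUIESCED:
                        if (at_record_boundary) {
                                /* Program the key, TLS_SEQ, and
                                 * ULP_MODE_TLS in the TCB, then
                                 * unfreeze in TLS mode. */
                                return (TLS_MODE);
                        }
                        /* Unfreeze and wait for more data; records
                         * arriving meanwhile are decrypted in
                         * software. */
                        return (PLAIN_TOE);
                case TLS_MODE:
                        return (TLS_MODE);
                }
                return (st);
        }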
    
    Note that this removes the 'rx_tls_ports' sysctl.  TOE TLS offload for
    receive is now enabled automatically on existing TOE connections when
    using a KTLS-aware SSL library, just as it was previously enabled
    automatically for TLS transmit.  This also enables TLS offload for TOE
    connections that enable TLS after passing initial data in the clear
    (e.g. STARTTLS with SMTP).
    
    Sponsored by:   Chelsio Communications
    Differential Revision:  https://reviews.freebsd.org/D37351
---
 sys/dev/cxgbe/offload.h       |   3 -
 sys/dev/cxgbe/t4_main.c       | 120 +--------
 sys/dev/cxgbe/tom/t4_cpl_io.c |  33 +--
 sys/dev/cxgbe/tom/t4_tls.c    | 550 +++++++++++++++++++++++++-----------------
 sys/dev/cxgbe/tom/t4_tls.h    |   7 +-
 sys/dev/cxgbe/tom/t4_tom.c    |  32 +--
 sys/dev/cxgbe/tom/t4_tom.h    |  10 +-
 7 files changed, 342 insertions(+), 413 deletions(-)

diff --git a/sys/dev/cxgbe/offload.h b/sys/dev/cxgbe/offload.h
index 6607d0c173e7..ff16f04e2dc2 100644
--- a/sys/dev/cxgbe/offload.h
+++ b/sys/dev/cxgbe/offload.h
@@ -225,9 +225,6 @@ struct tom_tunables {
 	int ddp;
 	int rx_coalesce;
 	int tls;
-	int tls_rx_timeout;
-	int *tls_rx_ports;
-	int num_tls_rx_ports;
 	int tx_align;
 	int tx_zcopy;
 	int cop_managed_offloading;
diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c
index 261fe16d1bc9..b854da46b146 100644
--- a/sys/dev/cxgbe/t4_main.c
+++ b/sys/dev/cxgbe/t4_main.c
@@ -414,11 +414,6 @@ SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 14, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[14], 0, "");
 SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 15, CTLFLAG_RDTUN,
     &t4_toe_rexmt_backoff[15], 0, "");
-
-static int t4_toe_tls_rx_timeout = 5;
-SYSCTL_INT(_hw_cxgbe_toe, OID_AUTO, tls_rx_timeout, CTLFLAG_RDTUN,
-    &t4_toe_tls_rx_timeout, 0,
-    "Timeout in seconds to downgrade TLS sockets to plain TOE");
 #endif
 
 #ifdef DEV_NETMAP
@@ -833,8 +828,6 @@ static int sysctl_cpus(SYSCTL_HANDLER_ARGS);
 static int sysctl_reset(SYSCTL_HANDLER_ARGS);
 #ifdef TCP_OFFLOAD
 static int sysctl_tls(SYSCTL_HANDLER_ARGS);
-static int sysctl_tls_rx_ports(SYSCTL_HANDLER_ARGS);
-static int sysctl_tls_rx_timeout(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_tick(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_dack_timer(SYSCTL_HANDLER_ARGS);
 static int sysctl_tp_timer(SYSCTL_HANDLER_ARGS);
@@ -1867,7 +1860,6 @@ t4_detach_common(device_t dev)
 	free(sc->tids.hpftid_tab, M_CXGBE);
 	free_hftid_hash(&sc->tids);
 	free(sc->tids.tid_tab, M_CXGBE);
-	free(sc->tt.tls_rx_ports, M_CXGBE);
 	t4_destroy_dma_tag(sc);
 
 	callout_drain(&sc->ktls_tick);
@@ -5743,10 +5735,9 @@ set_params__post_init(struct adapter *sc)
 	if (sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS &&
 	    sc->toecaps & FW_CAPS_CONFIG_TOE) {
 		/*
-		 * Limit TOE connections to 2 reassembly "islands".  This is
-		 * required for TOE TLS connections to downgrade to plain TOE
-		 * connections if an unsupported TLS version or ciphersuite is
-		 * used.
+		 * Limit TOE connections to 2 reassembly "islands".
+		 * This is required to permit migrating TOE
+		 * connections to ULP_MODE_TLS.
 		 */
 		t4_tp_wr_bits_indirect(sc, A_TP_FRAG_CONFIG,
 		    V_PASSMODE(M_PASSMODE), V_PASSMODE(2));
@@ -7683,17 +7674,6 @@ t4_sysctls(struct adapter *sc)
 		    CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, sysctl_tls, "I",
 		    "Inline TLS allowed");
 
-		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tls_rx_ports",
-		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
-		    sysctl_tls_rx_ports, "I",
-		    "TCP ports that use inline TLS+TOE RX");
-
-		sc->tt.tls_rx_timeout = t4_toe_tls_rx_timeout;
-		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tls_rx_timeout",
-		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
-		    sysctl_tls_rx_timeout, "I",
-		    "Timeout in seconds to downgrade TLS sockets to plain TOE");
-
 		sc->tt.tx_align = -1;
 		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_align",
 		    CTLFLAG_RW, &sc->tt.tx_align, 0, "chop and align payload");
@@ -11287,97 +11267,6 @@ sysctl_tls(SYSCTL_HANDLER_ARGS)
 
 }
 
-static int
-sysctl_tls_rx_ports(SYSCTL_HANDLER_ARGS)
-{
-	struct adapter *sc = arg1;
-	int *old_ports, *new_ports;
-	int i, new_count, rc;
-
-	if (req->newptr == NULL && req->oldptr == NULL)
-		return (SYSCTL_OUT(req, NULL, imax(sc->tt.num_tls_rx_ports, 1) *
-		    sizeof(sc->tt.tls_rx_ports[0])));
-
-	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tlsrx");
-	if (rc)
-		return (rc);
-
-	if (hw_off_limits(sc)) {
-		rc = ENXIO;
-		goto done;
-	}
-
-	if (sc->tt.num_tls_rx_ports == 0) {
-		i = -1;
-		rc = SYSCTL_OUT(req, &i, sizeof(i));
-	} else
-		rc = SYSCTL_OUT(req, sc->tt.tls_rx_ports,
-		    sc->tt.num_tls_rx_ports * sizeof(sc->tt.tls_rx_ports[0]));
-	if (rc == 0 && req->newptr != NULL) {
-		new_count = req->newlen / sizeof(new_ports[0]);
-		new_ports = malloc(new_count * sizeof(new_ports[0]), M_CXGBE,
-		    M_WAITOK);
-		rc = SYSCTL_IN(req, new_ports, new_count *
-		    sizeof(new_ports[0]));
-		if (rc)
-			goto err;
-
-		/* Allow setting to a single '-1' to clear the list. */
-		if (new_count == 1 && new_ports[0] == -1) {
-			ADAPTER_LOCK(sc);
-			old_ports = sc->tt.tls_rx_ports;
-			sc->tt.tls_rx_ports = NULL;
-			sc->tt.num_tls_rx_ports = 0;
-			ADAPTER_UNLOCK(sc);
-			free(old_ports, M_CXGBE);
-		} else {
-			for (i = 0; i < new_count; i++) {
-				if (new_ports[i] < 1 ||
-				    new_ports[i] > IPPORT_MAX) {
-					rc = EINVAL;
-					goto err;
-				}
-			}
-
-			ADAPTER_LOCK(sc);
-			old_ports = sc->tt.tls_rx_ports;
-			sc->tt.tls_rx_ports = new_ports;
-			sc->tt.num_tls_rx_ports = new_count;
-			ADAPTER_UNLOCK(sc);
-			free(old_ports, M_CXGBE);
-			new_ports = NULL;
-		}
-	err:
-		free(new_ports, M_CXGBE);
-	}
-done:
-	end_synchronized_op(sc, 0);
-	return (rc);
-}
-
-static int
-sysctl_tls_rx_timeout(SYSCTL_HANDLER_ARGS)
-{
-	struct adapter *sc = arg1;
-	int v, rc;
-
-	v = sc->tt.tls_rx_timeout;
-	rc = sysctl_handle_int(oidp, &v, 0, req);
-	if (rc != 0 || req->newptr == NULL)
-		return (rc);
-
-	if (v < 0)
-		return (EINVAL);
-
-	if (v != 0 && !(sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS))
-		return (ENOTSUP);
-
-	sc->tt.tls_rx_timeout = v;
-
-	return (0);
-
-}
-
 static void
 unit_conv(char *buf, size_t len, u_int val, u_int factor)
 {
@@ -12869,9 +12758,6 @@ tweak_tunables(void)
 
 	if (t4_pktc_idx_ofld < -1 || t4_pktc_idx_ofld >= SGE_NCOUNTERS)
 		t4_pktc_idx_ofld = PKTC_IDX_OFLD;
-
-	if (t4_toe_tls_rx_timeout < 0)
-		t4_toe_tls_rx_timeout = 0;
 #else
 	if (t4_rdmacaps_allowed == -1)
 		t4_rdmacaps_allowed = 0;
diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c
index 528af1414381..e1f709080f18 100644
--- a/sys/dev/cxgbe/tom/t4_cpl_io.c
+++ b/sys/dev/cxgbe/tom/t4_cpl_io.c
@@ -98,10 +98,6 @@ send_flowc_wr(struct toepcb *toep, struct tcpcb *tp)
 		nparams = 8;
 	else
 		nparams = 6;
-	if (ulp_mode(toep) == ULP_MODE_TLS)
-		nparams++;
-	if (toep->tls.fcplenmax != 0)
-		nparams++;
 	if (toep->params.tc_idx != -1) {
 		MPASS(toep->params.tc_idx >= 0 &&
 		    toep->params.tc_idx < sc->params.nsched_cls);
@@ -148,10 +144,6 @@ send_flowc_wr(struct toepcb *toep, struct tcpcb *tp)
 	    __func__, toep->tid, toep->params.emss, toep->params.sndbuf,
 	    tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0);
 
-	if (ulp_mode(toep) == ULP_MODE_TLS)
-		FLOWC_PARAM(ULP_MODE, ulp_mode(toep));
-	if (toep->tls.fcplenmax != 0)
-		FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax);
 	if (toep->params.tc_idx != -1)
 		FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx);
 #undef FLOWC_PARAM
@@ -395,9 +387,6 @@ make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt)
 	send_flowc_wr(toep, tp);
 
 	soisconnected(so);
-
-	if (ulp_mode(toep) == ULP_MODE_TLS)
-		tls_establish(toep);
 }
 
 int
@@ -421,23 +410,6 @@ send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
 	return (credits);
 }
 
-void
-send_rx_modulate(struct adapter *sc, struct toepcb *toep)
-{
-	struct wrqe *wr;
-	struct cpl_rx_data_ack *req;
-
-	wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
-	if (wr == NULL)
-		return;
-	req = wrtod(wr);
-
-	INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
-	req->credit_dack = htobe32(F_RX_MODULATE_RX);
-
-	t4_wrq_tx(sc, wr);
-}
-
 void
 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
 {
@@ -459,8 +431,7 @@ t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
 		rx_credits = send_rx_credits(sc, toep, rx_credits);
 		tp->rcv_wnd += rx_credits;
 		tp->rcv_adv += rx_credits;
-	} else if (toep->flags & TPF_FORCE_CREDITS)
-		send_rx_modulate(sc, toep);
+	}
 }
 
 void
@@ -1823,6 +1794,8 @@ do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 		    tid);
 		ddp_queue_toep(toep);
 	}
+	if (toep->flags & TPF_TLS_STARTING)
+		tls_received_starting_data(sc, toep, sb, len);
 	sorwakeup_locked(so);
 	SOCKBUF_UNLOCK_ASSERT(sb);
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
diff --git a/sys/dev/cxgbe/tom/t4_tls.c b/sys/dev/cxgbe/tom/t4_tls.c
index 017b13700db6..ae2f9ebaf91c 100644
--- a/sys/dev/cxgbe/tom/t4_tls.c
+++ b/sys/dev/cxgbe/tom/t4_tls.c
@@ -86,14 +86,14 @@ tls_tx_key(struct toepcb *toep)
 	return (tls_ofld->tx_key_addr >= 0);
 }
 
-/* Set TLS Key-Id in TCB */
+/* Set TF_RX_QUIESCE to pause receive. */
 static void
-t4_set_tls_keyid(struct toepcb *toep, unsigned int key_id)
+t4_set_rx_quiesce(struct toepcb *toep)
 {
+	struct adapter *sc = td_adapter(toep->td);
 
-	t4_set_tls_tcb_field(toep, W_TCB_RX_TLS_KEY_TAG,
-			 V_TCB_RX_TLS_KEY_TAG(M_TCB_RX_TLS_BUF_TAG),
-			 V_TCB_RX_TLS_KEY_TAG(key_id));
+	t4_set_tcb_field(sc, &toep->ofld_txq->wrq, toep, W_TCB_T_FLAGS,
+	    V_TF_RX_QUIESCE(1), V_TF_RX_QUIESCE(1), 1, CPL_COOKIE_TOM);
 }
 
 /* Clear TF_RX_QUIESCE to re-enable receive. */
@@ -104,27 +104,6 @@ t4_clear_rx_quiesce(struct toepcb *toep)
 	t4_set_tls_tcb_field(toep, W_TCB_T_FLAGS, V_TF_RX_QUIESCE(1), 0);
 }
 
-static void
-tls_clr_ofld_mode(struct toepcb *toep)
-{
-
-	tls_stop_handshake_timer(toep);
-
-	KASSERT(toep->tls.rx_key_addr == -1,
-	    ("%s: tid %d has RX key", __func__, toep->tid));
-
-	/* Switch to plain TOE mode. */
-	t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW,
-	    V_TCB_ULP_RAW(V_TF_TLS_ENABLE(1)),
-	    V_TCB_ULP_RAW(V_TF_TLS_ENABLE(0)));
-	t4_set_tls_tcb_field(toep, W_TCB_ULP_TYPE,
-	    V_TCB_ULP_TYPE(M_TCB_ULP_TYPE), V_TCB_ULP_TYPE(ULP_MODE_NONE));
-	t4_clear_rx_quiesce(toep);
-
-	toep->flags &= ~(TPF_FORCE_CREDITS | TPF_TLS_ESTABLISHED);
-	toep->params.ulp_mode = ULP_MODE_NONE;
-}
-
 /* TLS/DTLS content type  for CPL SFO */
 static inline unsigned char
 tls_content_type(unsigned char content_type)
@@ -226,88 +205,29 @@ tls_program_key_id(struct toepcb *toep, struct ktls_session *tls,
 	return (0);
 }
 
-/*
- * In some cases a client connection can hang without sending the
- * ServerHelloDone message from the NIC to the host.  Send a dummy
- * RX_DATA_ACK with RX_MODULATE to unstick the connection.
- */
-static void
-tls_send_handshake_ack(void *arg)
-{
-	struct toepcb *toep = arg;
-	struct tls_ofld_info *tls_ofld = &toep->tls;
-	struct adapter *sc = td_adapter(toep->td);
-
-	/* Bail without rescheduling if the connection has closed. */
-	if ((toep->flags & (TPF_FIN_SENT | TPF_ABORT_SHUTDOWN)) != 0)
-		return;
-
-	/*
-	 * If this connection has timed out without receiving more
-	 * data, downgrade to plain TOE mode and don't re-arm the
-	 * timer.
-	 */
-	if (sc->tt.tls_rx_timeout != 0) {
-		struct inpcb *inp;
-		struct tcpcb *tp;
-
-		inp = toep->inp;
-		tp = intotcpcb(inp);
-		if ((ticks - tp->t_rcvtime) >= sc->tt.tls_rx_timeout) {
-			CTR2(KTR_CXGBE, "%s: tid %d clr_ofld_mode", __func__,
-			    toep->tid);
-			tls_clr_ofld_mode(toep);
-			return;
-		}
-	}
-
-	/*
-	 * XXX: Does not have the t4_get_tcb() checks to refine the
-	 * workaround.
-	 */
-	callout_schedule(&tls_ofld->handshake_timer, TLS_SRV_HELLO_RD_TM * hz);
-
-	CTR2(KTR_CXGBE, "%s: tid %d sending RX_DATA_ACK", __func__, toep->tid);
-	send_rx_modulate(sc, toep);
-}
-
-static void
-tls_start_handshake_timer(struct toepcb *toep)
-{
-	struct tls_ofld_info *tls_ofld = &toep->tls;
-
-	INP_WLOCK_ASSERT(toep->inp);
-	callout_reset(&tls_ofld->handshake_timer, TLS_SRV_HELLO_BKOFF_TM * hz,
-	    tls_send_handshake_ack, toep);
-}
-
-void
-tls_stop_handshake_timer(struct toepcb *toep)
-{
-	struct tls_ofld_info *tls_ofld = &toep->tls;
-
-	INP_WLOCK_ASSERT(toep->inp);
-	callout_stop(&tls_ofld->handshake_timer);
-}
-
 int
 tls_alloc_ktls(struct toepcb *toep, struct ktls_session *tls, int direction)
 {
 	struct adapter *sc = td_adapter(toep->td);
-	int error, explicit_iv_size, key_offset, mac_first;
+	int error, explicit_iv_size, mac_first;
 
-	if (!can_tls_offload(td_adapter(toep->td)))
+	if (!can_tls_offload(sc))
 		return (EINVAL);
-	switch (ulp_mode(toep)) {
-	case ULP_MODE_TLS:
-		break;
-	case ULP_MODE_NONE:
-	case ULP_MODE_TCPDDP:
-		if (direction != KTLS_TX)
+
+	if (direction == KTLS_RX) {
+		if (ulp_mode(toep) != ULP_MODE_NONE)
 			return (EINVAL);
-		break;
-	default:
-		return (EINVAL);
+		if ((toep->flags & TPF_TLS_STARTING) != 0)
+			return (EINVAL);
+	} else {
+		switch (ulp_mode(toep)) {
+		case ULP_MODE_NONE:
+		case ULP_MODE_TLS:
+		case ULP_MODE_TCPDDP:
+			break;
+		default:
+			return (EINVAL);
+		}
 	}
 
 	switch (tls->params.cipher_algorithm) {
@@ -319,8 +239,7 @@ tls_alloc_ktls(struct toepcb *toep, struct ktls_session *tls, int direction)
 		case 256 / 8:
 			break;
 		default:
-			error = EINVAL;
-			goto clr_ofld;
+			return (EINVAL);
 		}
 		switch (tls->params.auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
@@ -328,16 +247,14 @@ tls_alloc_ktls(struct toepcb *toep, struct ktls_session *tls, int direction)
 		case CRYPTO_SHA2_384_HMAC:
 			break;
 		default:
-			error = EPROTONOSUPPORT;
-			goto clr_ofld;
+			return (EPROTONOSUPPORT);
 		}
 		explicit_iv_size = AES_BLOCK_LEN;
 		mac_first = 1;
 		break;
 	case CRYPTO_AES_NIST_GCM_16:
 		if (tls->params.iv_len != SALT_SIZE) {
-			error = EINVAL;
-			goto clr_ofld;
+			return (EINVAL);
 		}
 		switch (tls->params.cipher_key_len) {
 		case 128 / 8:
@@ -345,23 +262,20 @@ tls_alloc_ktls(struct toepcb *toep, struct ktls_session *tls, int direction)
 		case 256 / 8:
 			break;
 		default:
-			error = EINVAL;
-			goto clr_ofld;
+			return (EINVAL);
 		}
 		explicit_iv_size = 8;
 		mac_first = 0;
 		break;
 	default:
-		error = EPROTONOSUPPORT;
-		goto clr_ofld;
+		return (EPROTONOSUPPORT);
 	}
 
 	/* Only TLS 1.1 and TLS 1.2 are currently supported. */
 	if (tls->params.tls_vmajor != TLS_MAJOR_VER_ONE ||
 	    tls->params.tls_vminor < TLS_MINOR_VER_ONE ||
 	    tls->params.tls_vminor > TLS_MINOR_VER_TWO) {
-		error = EPROTONOSUPPORT;
-		goto clr_ofld;
+		return (EPROTONOSUPPORT);
 	}
 
 	/* Bail if we already have a key. */
@@ -374,11 +288,8 @@ tls_alloc_ktls(struct toepcb *toep, struct ktls_session *tls, int direction)
 	}
 
 	error = tls_program_key_id(toep, tls, direction);
-	if (error) {
-		if (direction == KTLS_RX)
-			goto clr_ofld;
+	if (error)
 		return (error);
-	}
 
 	if (direction == KTLS_TX) {
 		toep->tls.scmd0.seqno_numivs =
@@ -406,42 +317,16 @@ tls_alloc_ktls(struct toepcb *toep, struct ktls_session *tls, int direction)
 		    tls->params.max_frame_len;
 		toep->tls.tx_key_info_size = t4_tls_key_info_size(tls);
 	} else {
-		/* Stop timer on handshake completion */
-		tls_stop_handshake_timer(toep);
-
-		toep->flags &= ~TPF_FORCE_CREDITS;
-		toep->flags |= TPF_TLS_RECEIVE;
+		toep->flags |= TPF_TLS_STARTING | TPF_TLS_RX_QUIESCED;
 		toep->tls.rx_version = tls->params.tls_vmajor << 8 |
 		    tls->params.tls_vminor;
 
-		/*
-		 * RX key tags are an index into the key portion of MA
-		 * memory stored as an offset from the base address in
-		 * units of 64 bytes.
-		 */
-		key_offset = toep->tls.rx_key_addr - sc->vres.key.start;
-		t4_set_tls_keyid(toep, key_offset / 64);
-		t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW,
-				 V_TCB_ULP_RAW(M_TCB_ULP_RAW),
-				 V_TCB_ULP_RAW((V_TF_TLS_KEY_SIZE(3) |
-						V_TF_TLS_CONTROL(1) |
-						V_TF_TLS_ACTIVE(1) |
-						V_TF_TLS_ENABLE(1))));
-		t4_set_tls_tcb_field(toep, W_TCB_TLS_SEQ,
-				 V_TCB_TLS_SEQ(M_TCB_TLS_SEQ),
-				 V_TCB_TLS_SEQ(0));
-		t4_clear_rx_quiesce(toep);
+		CTR2(KTR_CXGBE, "%s: tid %d setting RX_QUIESCE", __func__,
+		    toep->tid);
+		t4_set_rx_quiesce(toep);
 	}
 
 	return (0);
-
-clr_ofld:
-	if (ulp_mode(toep) == ULP_MODE_TLS) {
-		CTR2(KTR_CXGBE, "%s: tid %d clr_ofld_mode", __func__,
-		    toep->tid);
-		tls_clr_ofld_mode(toep);
-	}
-	return (error);
 }
 
 void
@@ -453,42 +338,10 @@ tls_init_toep(struct toepcb *toep)
 	tls_ofld->tx_key_addr = -1;
 }
 
-void
-tls_establish(struct toepcb *toep)
-{
-
-	/*
-	 * Enable PDU extraction.
-	 *
-	 * XXX: Supposedly this should be done by the firmware when
-	 * the ULP_MODE FLOWC parameter is set in send_flowc_wr(), but
-	 * in practice this seems to be required.
-	 */
-	CTR2(KTR_CXGBE, "%s: tid %d setting TLS_ENABLE", __func__, toep->tid);
-	t4_set_tls_tcb_field(toep, W_TCB_ULP_RAW, V_TCB_ULP_RAW(M_TCB_ULP_RAW),
-	    V_TCB_ULP_RAW(V_TF_TLS_ENABLE(1)));
-
-	toep->flags |= TPF_FORCE_CREDITS | TPF_TLS_ESTABLISHED;
-
-	callout_init_rw(&toep->tls.handshake_timer, &toep->inp->inp_lock, 0);
-	tls_start_handshake_timer(toep);
-}
-
-void
-tls_detach(struct toepcb *toep)
-{
-
-	if (toep->flags & TPF_TLS_ESTABLISHED) {
-		tls_stop_handshake_timer(toep);
-		toep->flags &= ~TPF_TLS_ESTABLISHED;
-	}
-}
-
 void
 tls_uninit_toep(struct toepcb *toep)
 {
 
-	MPASS((toep->flags & TPF_TLS_ESTABLISHED) == 0);
 	clear_tls_keyid(toep);
 }
 
@@ -943,7 +796,7 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 	struct mbuf *tls_data;
 	struct tls_get_record *tgr;
 	struct mbuf *control;
-	int pdu_length, rx_credits;
+	int pdu_length, rx_credits, trailer_len;
 #if defined(KTR) || defined(INVARIANTS)
 	int len;
 #endif
@@ -1005,6 +858,9 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 
 	/* Report decryption errors as EBADMSG. */
 	if ((tls_hdr_pkt->res_to_mac_error & M_TLSRX_HDR_PKT_ERROR) != 0) {
+		CTR4(KTR_CXGBE, "%s: tid %u TLS error %#x ddp_vld %#x",
+		    __func__, toep->tid, tls_hdr_pkt->res_to_mac_error,
+		    be32toh(cpl->ddp_valid));
 		m_freem(m);
 		m_freem(tls_data);
 
@@ -1018,49 +874,16 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 		return (0);
 	}
 
-	/* Allocate the control message mbuf. */
-	control = sbcreatecontrol(NULL, sizeof(*tgr), TLS_GET_RECORD,
-	    IPPROTO_TCP, M_NOWAIT);
-	if (control == NULL) {
-		m_freem(m);
-		m_freem(tls_data);
-
-		CURVNET_SET(toep->vnet);
-		so->so_error = ENOBUFS;
-		sorwakeup(so);
-
-		INP_WUNLOCK(inp);
-		CURVNET_RESTORE();
-
-		return (0);
-	}
-
-	tgr = (struct tls_get_record *)
-	    CMSG_DATA(mtod(control, struct cmsghdr *));
-	memset(tgr, 0, sizeof(*tgr));
-	tgr->tls_type = tls_hdr_pkt->type;
-	tgr->tls_vmajor = be16toh(tls_hdr_pkt->version) >> 8;
-	tgr->tls_vminor = be16toh(tls_hdr_pkt->version) & 0xff;
-
-	m_freem(m);
-
-	if (tls_data != NULL) {
-		m_last(tls_data)->m_flags |= M_EOR;
-		tgr->tls_length = htobe16(tls_data->m_pkthdr.len);
-	} else
-		tgr->tls_length = 0;
-	m = tls_data;
-
+	/* Handle data received after the socket is closed. */
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
-
 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
 		struct epoch_tracker et;
 
 		CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
 		    __func__, tid, pdu_length);
 		m_freem(m);
-		m_freem(control);
+		m_freem(tls_data);
 		SOCKBUF_UNLOCK(sb);
 		INP_WUNLOCK(inp);
 
@@ -1068,7 +891,7 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 		NET_EPOCH_ENTER(et);
 		INP_WLOCK(inp);
 		tp = tcp_drop(tp, ECONNRESET);
-		if (tp)
+		if (tp != NULL)
 			INP_WUNLOCK(inp);
 		NET_EPOCH_EXIT(et);
 		CURVNET_RESTORE();
@@ -1077,10 +900,63 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 	}
 
 	/*
-	 * Not all of the bytes on the wire are included in the socket buffer
-	 * (e.g. the MAC of the TLS record).  However, those bytes are included
-	 * in the TCP sequence space.
+	 * If there is any data in the 'sb_mtls' chain of the socket
+	 * or we aren't able to allocate the control mbuf, append the
+	 * record as a CSUM_TLS_DECRYPTED packet to 'sb_mtls' rather
+	 * than as a decrypted record to 'sb_m'.
 	 */
+	if (sb->sb_mtls != NULL)
+		control = NULL;
+	else
+		control = sbcreatecontrol(NULL, sizeof(*tgr), TLS_GET_RECORD,
+		    IPPROTO_TCP, M_NOWAIT);
+
+	if (control != NULL) {
+		tgr = (struct tls_get_record *)
+		    CMSG_DATA(mtod(control, struct cmsghdr *));
+		memset(tgr, 0, sizeof(*tgr));
+		tgr->tls_type = tls_hdr_pkt->type;
+		tgr->tls_vmajor = be16toh(tls_hdr_pkt->version) >> 8;
+		tgr->tls_vminor = be16toh(tls_hdr_pkt->version) & 0xff;
+		if (tls_data != NULL) {
+			m_last(tls_data)->m_flags |= M_EOR;
+			tgr->tls_length = htobe16(tls_data->m_pkthdr.len);
+		} else
+			tgr->tls_length = 0;
+
+		m_freem(m);
+		m = tls_data;
+	} else {
+		M_ASSERTPKTHDR(m);
+
+		/* It's ok that any explicit IV is missing. */
+		m->m_len = sb->sb_tls_info->params.tls_hlen;
+		m->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED;
+		m->m_pkthdr.len = m->m_len;
+		if (tls_data != NULL) {
+			m->m_pkthdr.len += tls_data->m_pkthdr.len;
+			m_demote_pkthdr(tls_data);
+			m->m_next = tls_data;
+		}
+
+		/*
+		 * Grow the chain by the trailer, but without
+		 * contents.  The trailer will be thrown away by
+		 * ktls_decrypt.  Note that ktls_decrypt assumes the
+		 * trailer is tls_tlen bytes long, so append that many
+		 * bytes not the actual trailer size computed from
+		 * pdu_length.
+		 */
+		trailer_len = sb->sb_tls_info->params.tls_tlen;
+		if (tls_data != NULL) {
+			m_last(tls_data)->m_len += trailer_len;
+			tls_data = NULL;
+		} else
+			m->m_len += trailer_len;
+		m->m_pkthdr.len += trailer_len;
+		tls_hdr_pkt->length = htobe16(m->m_pkthdr.len -
+		    sizeof(struct tls_record_layer));
+	}
 
 	/* receive buffer autosize */
 	MPASS(toep->vnet == so->so_vnet);
@@ -1097,7 +973,10 @@ do_rx_tls_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 			sb->sb_flags &= ~SB_AUTOSIZE;
 	}
 
-	sbappendcontrol_locked(sb, m, control, 0);
+	if (control != NULL)
+		sbappendcontrol_locked(sb, m, control, 0);
+	else
+		sbappendstream_locked(sb, m, 0);
 	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
 #ifdef VERBOSE_TRACES
 	CTR4(KTR_CXGBE, "%s: tid %u rx_credits %u rcv_wnd %u",
@@ -1223,12 +1102,242 @@ out:
 	m_freem(m);
 }
 
+/* SET_TCB_FIELD sent as a ULP command looks like this */
+#define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
+    sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))
+
+static inline void *
+mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep,
+    uint64_t word, uint64_t mask, uint64_t val)
+{
+	struct ulptx_idata *ulpsc;
+	struct cpl_set_tcb_field_core *req;
+
+	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
+	ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));
+
+	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
+	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
+	ulpsc->len = htobe32(sizeof(*req));
+
+	req = (struct cpl_set_tcb_field_core *)(ulpsc + 1);
+	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid));
+	req->reply_ctrl = htobe16(V_NO_REPLY(1) |
+	    V_QUEUENO(toep->ofld_rxq->iq.abs_id));
+	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
+	req->mask = htobe64(mask);
+	req->val = htobe64(val);
+
+	ulpsc = (struct ulptx_idata *)(req + 1);
+	if (LEN__SET_TCB_FIELD_ULP % 16) {
+		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
+		ulpsc->len = htobe32(0);
+		return (ulpsc + 1);
+	}
+	return (ulpsc);
+}
+
+/*
+ * Send a work request setting multiple TCB fields to enable
+ * ULP_MODE_TLS.
+ */
+static void
+tls_update_tcb(struct adapter *sc, struct toepcb *toep, uint64_t seqno)
+{
+	struct wrqe *wr;
+	struct work_request_hdr *wrh;
+	struct ulp_txpkt *ulpmc;
+	int fields, key_offset, len;
+
+	KASSERT(ulp_mode(toep) == ULP_MODE_NONE,
+	    ("%s: tid %d already ULP_MODE_TLS", __func__, toep->tid));
+
+	fields = 0;
+
+	/* 2 writes for the overlay region */
+	fields += 2;
+
+	/* W_TCB_TLS_SEQ */
+	fields++;
+
+	/* W_TCB_ULP_RAW */
+	fields++;
+
+	/* W_TCB_ULP_TYPE */
+	fields++;
+
+	/* W_TCB_T_FLAGS */
+	fields++;
+
+	len = sizeof(*wrh) + fields * roundup2(LEN__SET_TCB_FIELD_ULP, 16);
+	KASSERT(len <= SGE_MAX_WR_LEN,
+	    ("%s: WR with %d TCB field updates too large", __func__, fields));
+
+	wr = alloc_wrqe(len, toep->ctrlq);
+	if (wr == NULL) {
+		/* XXX */
+		panic("%s: out of memory", __func__);
+	}
+
+	wrh = wrtod(wr);
+	INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
+	ulpmc = (struct ulp_txpkt *)(wrh + 1);
+
+	/*
+	 * Clear the TLS overlay region: 1023:832.
+	 *
+	 * Words 26/27 are always set to zero.  Words 28/29
+	 * contain seqno and are set when enabling TLS
+	 * decryption.  Word 30 is zero and Word 31 contains
+	 * the keyid.
+	 */
+	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, 26,
+	    0xffffffffffffffff, 0);
+
+	/*
+	 * RX key tags are an index into the key portion of MA
+	 * memory stored as an offset from the base address in
+	 * units of 64 bytes.
+	 */
+	key_offset = toep->tls.rx_key_addr - sc->vres.key.start;
+	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, 30,
+	    0xffffffffffffffff,
+	    (uint64_t)V_TCB_RX_TLS_KEY_TAG(key_offset / 64) << 32);
+
+	CTR3(KTR_CXGBE, "%s: tid %d enable TLS seqno %lu", __func__,
+	    toep->tid, seqno);
+	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_TLS_SEQ,
+	    V_TCB_TLS_SEQ(M_TCB_TLS_SEQ), V_TCB_TLS_SEQ(seqno));
+	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_ULP_RAW,
+	    V_TCB_ULP_RAW(M_TCB_ULP_RAW),
+	    V_TCB_ULP_RAW((V_TF_TLS_KEY_SIZE(3) | V_TF_TLS_CONTROL(1) |
+	    V_TF_TLS_ACTIVE(1) | V_TF_TLS_ENABLE(1))));
+
+	toep->flags &= ~TPF_TLS_STARTING;
+	toep->flags |= TPF_TLS_RECEIVE;
+
+	/* Set the ULP mode to ULP_MODE_TLS. */
+	toep->params.ulp_mode = ULP_MODE_TLS;
+	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_ULP_TYPE,
+	    V_TCB_ULP_TYPE(M_TCB_ULP_TYPE),
+	    V_TCB_ULP_TYPE(ULP_MODE_TLS));
+
+	/* Clear TF_RX_QUIESCE. */
+	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_T_FLAGS,
+	    V_TF_RX_QUIESCE(1), 0);
+
+	t4_wrq_tx(sc, wr);
+}
+
+/*
+ * Examine the pending data in the socket buffer and either enable TLS
+ * RX or request more encrypted data.
+ */
+static void
+tls_check_rx_sockbuf(struct adapter *sc, struct toepcb *toep,
+    struct sockbuf *sb)
+{
+	uint64_t seqno;
+	size_t resid;
+	bool have_header;
+
+	SOCKBUF_LOCK_ASSERT(sb);
+	MPASS(toep->tls.rx_resid == 0);
+
+	have_header = ktls_pending_rx_info(sb, &seqno, &resid);
+	CTR5(KTR_CXGBE, "%s: tid %d have_header %d seqno %lu resid %zu",
+	    __func__, toep->tid, have_header, seqno, resid);
+
+	/*
+	 * If we have a partial header or we need fewer bytes than the
+	 * size of a TLS record, re-enable receive and pause again once
+	 * we get more data to try again.
+	 */
+	if (!have_header || resid != 0) {
+		CTR(KTR_CXGBE, "%s: tid %d waiting for more data", __func__,
+		    toep->tid);
+		toep->flags &= ~TPF_TLS_RX_QUIESCED;
+		t4_clear_rx_quiesce(toep);
+		return;
+	}
+
+	tls_update_tcb(sc, toep, seqno);
+}
+
+void
+tls_received_starting_data(struct adapter *sc, struct toepcb *toep,
+    struct sockbuf *sb, int len)
+{
+	MPASS(toep->flags & TPF_TLS_STARTING);
+
+	/*
+	 * A previous call to tls_check_rx_sockbuf needed more data.
+	 * Now that more data has arrived, quiesce receive again and
+	 * check the state once the quiesce has completed.
+	 */
+	if ((toep->flags & TPF_TLS_RX_QUIESCED) == 0) {
+		CTR(KTR_CXGBE, "%s: tid %d quiescing", __func__, toep->tid);
+		toep->flags |= TPF_TLS_RX_QUIESCED;
+		t4_set_rx_quiesce(toep);
+		return;
+	}
+
+	KASSERT(len <= toep->tls.rx_resid,
+	    ("%s: received excess bytes %d (waiting for %zu)", __func__, len,
+	    toep->tls.rx_resid));
+	toep->tls.rx_resid -= len;
+	if (toep->tls.rx_resid != 0)
+		return;
+
+	tls_check_rx_sockbuf(sc, toep, sb);
+}
+
+static int
+do_tls_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
*** 192 LINES SKIPPED ***