git: c0e4090e3d43 - main - ktls: Accurately track if ifnet ktls is enabled

From: Andrew Gallatin <gallatin_at_FreeBSD.org>
Date: Thu, 09 Feb 2023 17:53:08 UTC
The branch main has been updated by gallatin:

URL: https://cgit.FreeBSD.org/src/commit/?id=c0e4090e3d43eeb86270dd35835862660b045c26

commit c0e4090e3d43eeb86270dd35835862660b045c26
Author:     Andrew Gallatin <gallatin@FreeBSD.org>
AuthorDate: 2023-02-08 20:37:08 +0000
Commit:     Andrew Gallatin <gallatin@FreeBSD.org>
CommitDate: 2023-02-09 17:44:44 +0000

    ktls: Accurately track if ifnet ktls is enabled
    
    This allows us to avoid spurious calls to ktls_disable_ifnet().
    
    When we implemented ifnet kTLS, we set a flag in the tx socket
    buffer (SB_TLS_IFNET) to indicate ifnet kTLS.  This flag meant that
    now, or in the past, ifnet ktls was active on a socket.  Later,
    I added code to switch ifnet ktls sessions to software in the case
    of lossy TCP connections that have a high retransmit rate.
    Because TCP was using SB_TLS_IFNET to know if it needed to do math
    to calculate the retransmit ratio and potentially call into
    ktls_disable_ifnet(), it was doing unneeded work long after
    a session was moved to software.
    
    This patch carefully tracks whether or not ifnet ktls is still enabled
    on a TCP connection.  Because the inp is now embedded in the tcpcb, and
    because TCP is the most frequent accessor of this state, it made sense to
    move this from the socket buffer flags to the tcpcb. Because we now need
    reliable access to the tcpcb, we take a ref on the inp when creating a tx
    ktls session.
    
    While here, I noticed that rack/bbr were incorrectly implementing
    tfb_hwtls_change(), and applying the change to all pending sends,
    when it should apply only to future sends.
    
    This change reduces spurious calls to ktls_disable_ifnet() by 95% or so
    in a Netflix CDN environment.
    
    Reviewed by: markj, rrs
    Sponsored by: Netflix
    Differential Revision: https://reviews.freebsd.org/D38380
---
 sys/kern/uipc_ktls.c          | 145 +++++++++++++++++++++++++++++++++---------
 sys/netinet/tcp_output.c      |   2 +-
 sys/netinet/tcp_ratelimit.c   |   4 +-
 sys/netinet/tcp_stacks/bbr.c  |   2 +-
 sys/netinet/tcp_stacks/rack.c |  14 +---
 sys/netinet/tcp_var.h         |   3 +
 sys/sys/ktls.h                |   3 +-
 sys/sys/sockbuf.h             |   2 +-
 8 files changed, 126 insertions(+), 49 deletions(-)

diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
index ac55268728e9..b3895aee9249 100644
--- a/sys/kern/uipc_ktls.c
+++ b/sys/kern/uipc_ktls.c
@@ -222,6 +222,11 @@ static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_ok);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_ok, CTLFLAG_RD,
     &ktls_ifnet_disable_ok, "TLS sessions able to switch to SW from ifnet");
 
+static COUNTER_U64_DEFINE_EARLY(ktls_destroy_task);
+SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, destroy_task, CTLFLAG_RD,
+    &ktls_destroy_task,
+    "Number of times ktls session was destroyed via taskqueue");
+
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "Software TLS session stats");
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
@@ -619,10 +624,14 @@ ktls_create_session(struct socket *so, struct tls_enable *en,
 	counter_u64_add(ktls_offload_active, 1);
 
 	refcount_init(&tls->refcount, 1);
-	if (direction == KTLS_RX)
+	if (direction == KTLS_RX) {
 		TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_receive_tag, tls);
-	else
+	} else {
 		TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls);
+		tls->inp = so->so_pcb;
+		in_pcbref(tls->inp);
+		tls->tx = true;
+	}
 
 	tls->wq_index = ktls_get_cpu(so);
 
@@ -757,12 +766,16 @@ ktls_clone_session(struct ktls_session *tls, int direction)
 	counter_u64_add(ktls_offload_active, 1);
 
 	refcount_init(&tls_new->refcount, 1);
-	if (direction == KTLS_RX)
+	if (direction == KTLS_RX) {
 		TASK_INIT(&tls_new->reset_tag_task, 0, ktls_reset_receive_tag,
 		    tls_new);
-	else
+	} else {
 		TASK_INIT(&tls_new->reset_tag_task, 0, ktls_reset_send_tag,
 		    tls_new);
+		tls_new->inp = tls->inp;
+		tls_new->tx = true;
+		in_pcbref(tls_new->inp);
+	}
 
 	/* Copy fields from existing session. */
 	tls_new->params = tls->params;
@@ -1272,6 +1285,7 @@ ktls_enable_tx(struct socket *so, struct tls_enable *en)
 {
 	struct ktls_session *tls;
 	struct inpcb *inp;
+	struct tcpcb *tp;
 	int error;
 
 	if (!ktls_offload_enable)
@@ -1336,8 +1350,13 @@ ktls_enable_tx(struct socket *so, struct tls_enable *en)
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_tls_seqno = be64dec(en->rec_seq);
 	so->so_snd.sb_tls_info = tls;
-	if (tls->mode != TCP_TLS_MODE_SW)
-		so->so_snd.sb_flags |= SB_TLS_IFNET;
+	if (tls->mode != TCP_TLS_MODE_SW) {
+		tp = intotcpcb(inp);
+		MPASS(tp->t_nic_ktls_xmit == 0);
+		tp->t_nic_ktls_xmit = 1;
+		if (tp->t_fb->tfb_hwtls_change != NULL)
+			(*tp->t_fb->tfb_hwtls_change)(tp, 1);
+	}
 	SOCKBUF_UNLOCK(&so->so_snd);
 	INP_WUNLOCK(inp);
 	SOCK_IO_SEND_UNLOCK(so);
@@ -1438,6 +1457,7 @@ ktls_set_tx_mode(struct socket *so, int mode)
 {
 	struct ktls_session *tls, *tls_new;
 	struct inpcb *inp;
+	struct tcpcb *tp;
 	int error;
 
 	if (SOLISTENING(so))
@@ -1452,6 +1472,20 @@ ktls_set_tx_mode(struct socket *so, int mode)
 
 	inp = so->so_pcb;
 	INP_WLOCK_ASSERT(inp);
+	tp = intotcpcb(inp);
+
+	if (mode == TCP_TLS_MODE_IFNET) {
+		/* Don't allow enabling ifnet ktls multiple times */
+		if (tp->t_nic_ktls_xmit)
+			return (EALREADY);
+		/*
+		 * Don't enable ifnet ktls if we disabled it due to an
+		 * excessive retransmission rate
+		 */
+		if (tp->t_nic_ktls_xmit_dis)
+			return (ENXIO);
+	}
+
 	SOCKBUF_LOCK(&so->so_snd);
 	tls = so->so_snd.sb_tls_info;
 	if (tls == NULL) {
@@ -1507,8 +1541,12 @@ ktls_set_tx_mode(struct socket *so, int mode)
 	INP_WLOCK(inp);
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_tls_info = tls_new;
-	if (tls_new->mode != TCP_TLS_MODE_SW)
-		so->so_snd.sb_flags |= SB_TLS_IFNET;
+	if (tls_new->mode != TCP_TLS_MODE_SW) {
+		MPASS(tp->t_nic_ktls_xmit == 0);
+		tp->t_nic_ktls_xmit = 1;
+		if (tp->t_fb->tfb_hwtls_change != NULL)
+			(*tp->t_fb->tfb_hwtls_change)(tp, 1);
+	}
 	SOCKBUF_UNLOCK(&so->so_snd);
 	SOCK_IO_SEND_UNLOCK(so);
 
@@ -1662,8 +1700,7 @@ ktls_reset_send_tag(void *context, int pending)
 		mtx_pool_lock(mtxpool_sleep, tls);
 		tls->reset_pending = false;
 		mtx_pool_unlock(mtxpool_sleep, tls);
-		if (!in_pcbrele_wlocked(inp))
-			INP_WUNLOCK(inp);
+		INP_WUNLOCK(inp);
 
 		counter_u64_add(ktls_ifnet_reset, 1);
 
@@ -1674,18 +1711,15 @@ ktls_reset_send_tag(void *context, int pending)
 	} else {
 		NET_EPOCH_ENTER(et);
 		INP_WLOCK(inp);
-		if (!in_pcbrele_wlocked(inp)) {
-			if (!(inp->inp_flags & INP_DROPPED)) {
-				tp = intotcpcb(inp);
-				CURVNET_SET(inp->inp_vnet);
-				tp = tcp_drop(tp, ECONNABORTED);
-				CURVNET_RESTORE();
-				if (tp != NULL)
-					INP_WUNLOCK(inp);
+		if (!(inp->inp_flags & INP_DROPPED)) {
+			tp = intotcpcb(inp);
+			CURVNET_SET(inp->inp_vnet);
+			tp = tcp_drop(tp, ECONNABORTED);
+			CURVNET_RESTORE();
+			if (tp != NULL)
 				counter_u64_add(ktls_ifnet_reset_dropped, 1);
-			} else
-				INP_WUNLOCK(inp);
 		}
+		INP_WUNLOCK(inp);
 		NET_EPOCH_EXIT(et);
 
 		counter_u64_add(ktls_ifnet_reset_failed, 1);
@@ -1746,8 +1780,6 @@ ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls)
 	mtx_pool_lock(mtxpool_sleep, tls);
 	if (!tls->reset_pending) {
 		(void) ktls_hold(tls);
-		in_pcbref(inp);
-		tls->inp = inp;
 		tls->reset_pending = true;
 		taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task);
 	}
@@ -1790,11 +1822,55 @@ ktls_modify_txrtlmt(struct ktls_session *tls, uint64_t max_pacing_rate)
 #endif
 #endif
 
+static void
+ktls_destroy_help(void *context, int pending __unused)
+{
+	ktls_destroy(context);
+}
+
 void
 ktls_destroy(struct ktls_session *tls)
 {
+	struct inpcb *inp;
+	struct tcpcb *tp;
+	bool wlocked;
+
 	MPASS(tls->refcount == 0);
 
+	inp = tls->inp;
+	if (tls->tx) {
+		wlocked = INP_WLOCKED(inp);
+		if (!wlocked && !INP_TRY_WLOCK(inp)) {
+			/*
+			 * rwlocks read locks are anonymous, and there
+			 * is no way to know if our current thread
+			 * holds an rlock on the inp.  As a rough
+			 * estimate, check to see if the thread holds
+			 * *any* rlocks at all.  If it does not, then we
+			 * know that we don't hold the inp rlock, and
+			 * can safely take the wlock
+			 */
+
+			if (curthread->td_rw_rlocks == 0) {
+				INP_WLOCK(inp);
+			} else {
+				/*
+				 * We might hold the rlock, so let's
+				 * do the destroy in a taskqueue
+				 * context to avoid a potential
+				 * deadlock.  This should be very
+				 * rare.
+				 */
+				counter_u64_add(ktls_destroy_task, 1);
+				TASK_INIT(&tls->destroy_task, 0,
+				    ktls_destroy_help, tls);
+				(void)taskqueue_enqueue(taskqueue_thread,
+				    &tls->destroy_task);
+				return;
+			}
+		}
+	}
+
 	if (tls->sequential_records) {
 		struct mbuf *m, *n;
 		int page_count;
@@ -1841,6 +1917,12 @@ ktls_destroy(struct ktls_session *tls)
 			m_snd_tag_rele(tls->snd_tag);
 		if (tls->rx_ifp != NULL)
 			if_rele(tls->rx_ifp);
+		if (tls->tx) {
+			INP_WLOCK_ASSERT(inp);
+			tp = intotcpcb(inp);
+			MPASS(tp->t_nic_ktls_xmit == 1);
+			tp->t_nic_ktls_xmit = 0;
+		}
 		break;
 #ifdef TCP_OFFLOAD
 	case TCP_TLS_MODE_TOE:
@@ -1870,6 +1952,11 @@ ktls_destroy(struct ktls_session *tls)
 		tls->params.cipher_key = NULL;
 		tls->params.cipher_key_len = 0;
 	}
+	if (tls->tx) {
+		INP_WLOCK_ASSERT(inp);
+		if (!in_pcbrele_wlocked(inp) && !wlocked)
+			INP_WUNLOCK(inp);
+	}
 	explicit_bzero(tls->params.iv, sizeof(tls->params.iv));
 
 	uma_zfree(ktls_session_zone, tls);
@@ -3213,8 +3300,7 @@ out:
 	CURVNET_SET(so->so_vnet);
 	sorele(so);
 	CURVNET_RESTORE();
-	if (!in_pcbrele_wlocked(inp))
-		INP_WUNLOCK(inp);
+	INP_WUNLOCK(inp);
 	ktls_free(tls);
 }
 
@@ -3245,22 +3331,19 @@ ktls_disable_ifnet(void *arg)
 	so = inp->inp_socket;
 	SOCK_LOCK(so);
 	tls = so->so_snd.sb_tls_info;
-	if (tls->disable_ifnet_pending) {
+	if (tp->t_nic_ktls_xmit_dis == 1) {
 		SOCK_UNLOCK(so);
 		return;
 	}
-
 	/*
-	 * note that disable_ifnet_pending is never cleared; disabling
-	 * ifnet can only be done once per session, so we never want
+	 * note that t_nic_ktls_xmit_dis is never cleared; disabling
+	 * ifnet can only be done once per connection, so we never want
 	 * to do it again
 	 */
 
 	(void)ktls_hold(tls);
-	in_pcbref(inp);
 	soref(so);
-	tls->disable_ifnet_pending = true;
-	tls->inp = inp;
+	tp->t_nic_ktls_xmit_dis = 1;
 	SOCK_UNLOCK(so);
 	TASK_INIT(&tls->disable_ifnet_task, 0, ktls_disable_ifnet_help, tls);
 	(void)taskqueue_enqueue(taskqueue_thread, &tls->disable_ifnet_task);
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index e0e8dfeb46ef..db9d96a1a58e 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -227,7 +227,7 @@ tcp_default_output(struct tcpcb *tp)
 	isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif
 #ifdef KERN_TLS
-	const bool hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0;
+	const bool hw_tls = tp->t_nic_ktls_xmit != 0;
 #else
 	const bool hw_tls = false;
 #endif
diff --git a/sys/netinet/tcp_ratelimit.c b/sys/netinet/tcp_ratelimit.c
index 82aea5bdf0de..1d3e185855db 100644
--- a/sys/netinet/tcp_ratelimit.c
+++ b/sys/netinet/tcp_ratelimit.c
@@ -1350,7 +1350,7 @@ tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
 		}
 #ifdef KERN_TLS
 		tls = NULL;
-		if (tptosocket(tp)->so_snd.sb_flags & SB_TLS_IFNET) {
+		if (tp->t_nic_ktls_xmit != 0) {
 			tls = tptosocket(tp)->so_snd.sb_tls_info;
 
 			if ((ifp->if_capenable & IFCAP_TXTLS_RTLMT) == 0 ||
@@ -1413,7 +1413,7 @@ tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
 	}
 
 #ifdef KERN_TLS
-	if (tptosocket(tp)->so_snd.sb_flags & SB_TLS_IFNET) {
+	if (tp->t_nic_ktls_xmit) {
 		tls = tptosocket(tp)->so_snd.sb_tls_info;
 		if (tls->mode != TCP_TLS_MODE_IFNET)
 			tls = NULL;
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index d54d213c82dd..0c266849ebd3 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -11861,7 +11861,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
 	inp = bbr->rc_inp;
 	so = inp->inp_socket;
 	sb = &so->so_snd;
- 	if (sb->sb_flags & SB_TLS_IFNET)
+	if (tp->t_nic_ktls_xmit)
  		hw_tls = 1;
  	else
  		hw_tls = 0;
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 17c671705cab..953b74470738 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -18202,7 +18202,7 @@ send:
 	 * and initialize the header from the template for sends on this
 	 * connection.
 	 */
-	hw_tls = (sb->sb_flags & SB_TLS_IFNET) != 0;
+	hw_tls = tp->t_nic_ktls_xmit != 0;
 	if (len) {
 		uint32_t max_val;
 		uint32_t moff;
@@ -20183,20 +20183,10 @@ rack_apply_deferred_options(struct tcp_rack *rack)
 static void
 rack_hw_tls_change(struct tcpcb *tp, int chg)
 {
-	/*
-	 * HW tls state has changed.. fix all
-	 * rsm's in flight.
-	 */
+	/* Update HW tls state */
 	struct tcp_rack *rack;
-	struct rack_sendmap *rsm;
 
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
-	RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
-		if (chg)
-			rsm->r_hw_tls = 1;
-		else
-			rsm->r_hw_tls = 0;
-	}
 	if (chg)
 		rack->r_ctl.fsb.hw_tls = 1;
 	else
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 68c2c3f9d9e6..c2d5c2c99587 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -209,6 +209,9 @@ struct tcpcb {
 	tcp_seq	snd_recover;		/* for use in NewReno Fast Recovery */
 	char	t_oobflags;		/* have some */
 	char	t_iobc;			/* input character */
+	uint8_t t_nic_ktls_xmit:1,	/* active nic ktls xmit sessions */
+		t_nic_ktls_xmit_dis:1,	/* disabled nic xmit ktls? */
+		t_nic_ktls_spare:6;	/* spare nic ktls */
 	int	t_rxtcur;		/* current retransmit value (ticks) */
 
 	int	t_rxtshift;		/* log(2) of rexmt exp. backoff */
diff --git a/sys/sys/ktls.h b/sys/sys/ktls.h
index 66b510ddfe40..909d5347bc47 100644
--- a/sys/sys/ktls.h
+++ b/sys/sys/ktls.h
@@ -194,13 +194,14 @@ struct ktls_session {
 	struct ifnet *rx_ifp;
 	u_short rx_vlan_id;
 	bool reset_pending;
-	bool disable_ifnet_pending;
+	bool tx;
 	bool sync_dispatch;
 	bool sequential_records;
 
 	/* Only used for TLS 1.0. */
 	uint64_t next_seqno;
 	STAILQ_HEAD(, mbuf) pending_records;
+	struct task destroy_task;
 } __aligned(CACHE_LINE_SIZE);
 
 extern unsigned int ktls_ifnet_max_rexmit_pct;
diff --git a/sys/sys/sockbuf.h b/sys/sys/sockbuf.h
index 33c0abb381a3..80ac5cacc796 100644
--- a/sys/sys/sockbuf.h
+++ b/sys/sys/sockbuf.h
@@ -52,7 +52,7 @@
 #define	SB_AUTOSIZE	0x800		/* automatically size socket buffer */
 #define	SB_STOP		0x1000		/* backpressure indicator */
 #define	SB_AIO_RUNNING	0x2000		/* AIO operation running */
-#define	SB_TLS_IFNET	0x4000		/* has used / is using ifnet KTLS */
+#define	SB_UNUSED	0x4000		/* previously used for SB_TLS_IFNET */
 #define	SB_TLS_RX_RESYNC 0x8000		/* KTLS RX lost HW sync */
 
 #define	SBS_CANTSENDMORE	0x0010	/* can't send more data to peer */