git: c0e4090e3d43 - main - ktls: Accurately track if ifnet ktls is enabled
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Thu, 09 Feb 2023 17:53:08 UTC
The branch main has been updated by gallatin: URL: https://cgit.FreeBSD.org/src/commit/?id=c0e4090e3d43eeb86270dd35835862660b045c26 commit c0e4090e3d43eeb86270dd35835862660b045c26 Author: Andrew Gallatin <gallatin@FreeBSD.org> AuthorDate: 2023-02-08 20:37:08 +0000 Commit: Andrew Gallatin <gallatin@FreeBSD.org> CommitDate: 2023-02-09 17:44:44 +0000 ktls: Accurately track if ifnet ktls is enabled This allows us to avoid spurious calls to ktls_disable_ifnet(). When we implemented ifnet kTLS, we set a flag in the tx socket buffer (SB_TLS_IFNET) to indicate ifnet kTLS. This flag meant that now, or in the past, ifnet ktls was active on a socket. Later, I added code to switch ifnet ktls sessions to software in the case of lossy TCP connections that have a high retransmit rate. Because TCP was using SB_TLS_IFNET to know if it needed to do math to calculate the retransmit ratio and potentially call into ktls_disable_ifnet(), it was doing unneeded work long after a session was moved to software. This patch carefully tracks whether or not ifnet ktls is still enabled on a TCP connection. Because the inp is now embedded in the tcpcb, and because TCP is the most frequent accessor of this state, it made sense to move this from the socket buffer flags to the tcpcb. Because we now need reliable access to the tcpcb, we take a ref on the inp when creating a tx ktls session. While here, I noticed that rack/bbr were incorrectly implementing tfb_hwtls_change(), and applying the change to all pending sends, when it should apply only to future sends. This change reduces spurious calls to ktls_disable_ifnet() by 95% or so in a Netflix CDN environment. 
Reviewed by: markj, rrs Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D38380 --- sys/kern/uipc_ktls.c | 145 +++++++++++++++++++++++++++++++++--------- sys/netinet/tcp_output.c | 2 +- sys/netinet/tcp_ratelimit.c | 4 +- sys/netinet/tcp_stacks/bbr.c | 2 +- sys/netinet/tcp_stacks/rack.c | 14 +--- sys/netinet/tcp_var.h | 3 + sys/sys/ktls.h | 3 +- sys/sys/sockbuf.h | 2 +- 8 files changed, 126 insertions(+), 49 deletions(-) diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c index ac55268728e9..b3895aee9249 100644 --- a/sys/kern/uipc_ktls.c +++ b/sys/kern/uipc_ktls.c @@ -222,6 +222,11 @@ static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_ok); SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_ok, CTLFLAG_RD, &ktls_ifnet_disable_ok, "TLS sessions able to switch to SW from ifnet"); +static COUNTER_U64_DEFINE_EARLY(ktls_destroy_task); +SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, destroy_task, CTLFLAG_RD, + &ktls_destroy_task, + "Number of times ktls session was destroyed via taskqueue"); + SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Software TLS session stats"); SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, @@ -619,10 +624,14 @@ ktls_create_session(struct socket *so, struct tls_enable *en, counter_u64_add(ktls_offload_active, 1); refcount_init(&tls->refcount, 1); - if (direction == KTLS_RX) + if (direction == KTLS_RX) { TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_receive_tag, tls); - else + } else { TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls); + tls->inp = so->so_pcb; + in_pcbref(tls->inp); + tls->tx = true; + } tls->wq_index = ktls_get_cpu(so); @@ -757,12 +766,16 @@ ktls_clone_session(struct ktls_session *tls, int direction) counter_u64_add(ktls_offload_active, 1); refcount_init(&tls_new->refcount, 1); - if (direction == KTLS_RX) + if (direction == KTLS_RX) { TASK_INIT(&tls_new->reset_tag_task, 0, ktls_reset_receive_tag, tls_new); - else + } else 
{ TASK_INIT(&tls_new->reset_tag_task, 0, ktls_reset_send_tag, tls_new); + tls_new->inp = tls->inp; + tls_new->tx = true; + in_pcbref(tls_new->inp); + } /* Copy fields from existing session. */ tls_new->params = tls->params; @@ -1272,6 +1285,7 @@ ktls_enable_tx(struct socket *so, struct tls_enable *en) { struct ktls_session *tls; struct inpcb *inp; + struct tcpcb *tp; int error; if (!ktls_offload_enable) @@ -1336,8 +1350,13 @@ ktls_enable_tx(struct socket *so, struct tls_enable *en) SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_tls_seqno = be64dec(en->rec_seq); so->so_snd.sb_tls_info = tls; - if (tls->mode != TCP_TLS_MODE_SW) - so->so_snd.sb_flags |= SB_TLS_IFNET; + if (tls->mode != TCP_TLS_MODE_SW) { + tp = intotcpcb(inp); + MPASS(tp->t_nic_ktls_xmit == 0); + tp->t_nic_ktls_xmit = 1; + if (tp->t_fb->tfb_hwtls_change != NULL) + (*tp->t_fb->tfb_hwtls_change)(tp, 1); + } SOCKBUF_UNLOCK(&so->so_snd); INP_WUNLOCK(inp); SOCK_IO_SEND_UNLOCK(so); @@ -1438,6 +1457,7 @@ ktls_set_tx_mode(struct socket *so, int mode) { struct ktls_session *tls, *tls_new; struct inpcb *inp; + struct tcpcb *tp; int error; if (SOLISTENING(so)) @@ -1452,6 +1472,20 @@ ktls_set_tx_mode(struct socket *so, int mode) inp = so->so_pcb; INP_WLOCK_ASSERT(inp); + tp = intotcpcb(inp); + + if (mode == TCP_TLS_MODE_IFNET) { + /* Don't allow enabling ifnet ktls multiple times */ + if (tp->t_nic_ktls_xmit) + return (EALREADY); + /* + * Don't enable ifnet ktls if we disabled it due to an + * excessive retransmission rate + */ + if (tp->t_nic_ktls_xmit_dis) + return (ENXIO); + } + SOCKBUF_LOCK(&so->so_snd); tls = so->so_snd.sb_tls_info; if (tls == NULL) { @@ -1507,8 +1541,12 @@ ktls_set_tx_mode(struct socket *so, int mode) INP_WLOCK(inp); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_tls_info = tls_new; - if (tls_new->mode != TCP_TLS_MODE_SW) - so->so_snd.sb_flags |= SB_TLS_IFNET; + if (tls_new->mode != TCP_TLS_MODE_SW) { + MPASS(tp->t_nic_ktls_xmit == 0); + tp->t_nic_ktls_xmit = 1; + if (tp->t_fb->tfb_hwtls_change != 
NULL) + (*tp->t_fb->tfb_hwtls_change)(tp, 1); + } SOCKBUF_UNLOCK(&so->so_snd); SOCK_IO_SEND_UNLOCK(so); @@ -1662,8 +1700,7 @@ ktls_reset_send_tag(void *context, int pending) mtx_pool_lock(mtxpool_sleep, tls); tls->reset_pending = false; mtx_pool_unlock(mtxpool_sleep, tls); - if (!in_pcbrele_wlocked(inp)) - INP_WUNLOCK(inp); + INP_WUNLOCK(inp); counter_u64_add(ktls_ifnet_reset, 1); @@ -1674,18 +1711,15 @@ ktls_reset_send_tag(void *context, int pending) } else { NET_EPOCH_ENTER(et); INP_WLOCK(inp); - if (!in_pcbrele_wlocked(inp)) { - if (!(inp->inp_flags & INP_DROPPED)) { - tp = intotcpcb(inp); - CURVNET_SET(inp->inp_vnet); - tp = tcp_drop(tp, ECONNABORTED); - CURVNET_RESTORE(); - if (tp != NULL) - INP_WUNLOCK(inp); + if (!(inp->inp_flags & INP_DROPPED)) { + tp = intotcpcb(inp); + CURVNET_SET(inp->inp_vnet); + tp = tcp_drop(tp, ECONNABORTED); + CURVNET_RESTORE(); + if (tp != NULL) counter_u64_add(ktls_ifnet_reset_dropped, 1); - } else - INP_WUNLOCK(inp); } + INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); counter_u64_add(ktls_ifnet_reset_failed, 1); @@ -1746,8 +1780,6 @@ ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls) mtx_pool_lock(mtxpool_sleep, tls); if (!tls->reset_pending) { (void) ktls_hold(tls); - in_pcbref(inp); - tls->inp = inp; tls->reset_pending = true; taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task); } @@ -1790,11 +1822,55 @@ ktls_modify_txrtlmt(struct ktls_session *tls, uint64_t max_pacing_rate) #endif #endif +static void +ktls_destroy_help(void *context, int pending __unused) +{ + ktls_destroy(context); +} + void ktls_destroy(struct ktls_session *tls) { + struct inpcb *inp; + struct tcpcb *tp; + bool wlocked; + MPASS(tls->refcount == 0); + inp = tls->inp; + if (tls->tx) { + wlocked = INP_WLOCKED(inp); + if (!wlocked && !INP_TRY_WLOCK(inp)) { + /* + * rwlocks read locks are anonymous, and there + * is no way to know if our current thread + * holds an rlock on the inp. 
As a rough + * estimate, check to see if the thread holds + * *any* rlocks at all. If it does not, then we + * know that we don't hold the inp rlock, and + * can safely take the wlock + */ + + if (curthread->td_rw_rlocks == 0) { + INP_WLOCK(inp); + } else { + /* + * We might hold the rlock, so let's + * do the destroy in a taskqueue + * context to avoid a potential + * deadlock. This should be very + * rare. + */ + counter_u64_add(ktls_destroy_task, 1); + TASK_INIT(&tls->destroy_task, 0, + ktls_destroy_help, tls); + (void)taskqueue_enqueue(taskqueue_thread, + &tls->destroy_task); + return; + } + } + } + if (tls->sequential_records) { struct mbuf *m, *n; int page_count; @@ -1841,6 +1917,12 @@ ktls_destroy(struct ktls_session *tls) m_snd_tag_rele(tls->snd_tag); if (tls->rx_ifp != NULL) if_rele(tls->rx_ifp); + if (tls->tx) { + INP_WLOCK_ASSERT(inp); + tp = intotcpcb(inp); + MPASS(tp->t_nic_ktls_xmit == 1); + tp->t_nic_ktls_xmit = 0; + } break; #ifdef TCP_OFFLOAD case TCP_TLS_MODE_TOE: @@ -1870,6 +1952,11 @@ ktls_destroy(struct ktls_session *tls) tls->params.cipher_key = NULL; tls->params.cipher_key_len = 0; } + if (tls->tx) { + INP_WLOCK_ASSERT(inp); + if (!in_pcbrele_wlocked(inp) && !wlocked) + INP_WUNLOCK(inp); + } explicit_bzero(tls->params.iv, sizeof(tls->params.iv)); uma_zfree(ktls_session_zone, tls); @@ -3213,8 +3300,7 @@ out: CURVNET_SET(so->so_vnet); sorele(so); CURVNET_RESTORE(); - if (!in_pcbrele_wlocked(inp)) - INP_WUNLOCK(inp); + INP_WUNLOCK(inp); ktls_free(tls); } @@ -3245,22 +3331,19 @@ ktls_disable_ifnet(void *arg) so = inp->inp_socket; SOCK_LOCK(so); tls = so->so_snd.sb_tls_info; - if (tls->disable_ifnet_pending) { + if (tp->t_nic_ktls_xmit_dis == 1) { SOCK_UNLOCK(so); return; } - /* - * note that disable_ifnet_pending is never cleared; disabling - * ifnet can only be done once per session, so we never want + * note that t_nic_ktls_xmit_dis is never cleared; disabling + * ifnet can only be done once per connection, so we never want * to do it again */ 
(void)ktls_hold(tls); - in_pcbref(inp); soref(so); - tls->disable_ifnet_pending = true; - tls->inp = inp; + tp->t_nic_ktls_xmit_dis = 1; SOCK_UNLOCK(so); TASK_INIT(&tls->disable_ifnet_task, 0, ktls_disable_ifnet_help, tls); (void)taskqueue_enqueue(taskqueue_thread, &tls->disable_ifnet_task); diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index e0e8dfeb46ef..db9d96a1a58e 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -227,7 +227,7 @@ tcp_default_output(struct tcpcb *tp) isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif #ifdef KERN_TLS - const bool hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0; + const bool hw_tls = tp->t_nic_ktls_xmit != 0; #else const bool hw_tls = false; #endif diff --git a/sys/netinet/tcp_ratelimit.c b/sys/netinet/tcp_ratelimit.c index 82aea5bdf0de..1d3e185855db 100644 --- a/sys/netinet/tcp_ratelimit.c +++ b/sys/netinet/tcp_ratelimit.c @@ -1350,7 +1350,7 @@ tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp, } #ifdef KERN_TLS tls = NULL; - if (tptosocket(tp)->so_snd.sb_flags & SB_TLS_IFNET) { + if (tp->t_nic_ktls_xmit != 0) { tls = tptosocket(tp)->so_snd.sb_tls_info; if ((ifp->if_capenable & IFCAP_TXTLS_RTLMT) == 0 || @@ -1413,7 +1413,7 @@ tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte, } #ifdef KERN_TLS - if (tptosocket(tp)->so_snd.sb_flags & SB_TLS_IFNET) { + if (tp->t_nic_ktls_xmit) { tls = tptosocket(tp)->so_snd.sb_tls_info; if (tls->mode != TCP_TLS_MODE_IFNET) tls = NULL; diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index d54d213c82dd..0c266849ebd3 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -11861,7 +11861,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) inp = bbr->rc_inp; so = inp->inp_socket; sb = &so->so_snd; - if (sb->sb_flags & SB_TLS_IFNET) + if (tp->t_nic_ktls_xmit) hw_tls = 1; else hw_tls = 0; diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index 
17c671705cab..953b74470738 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -18202,7 +18202,7 @@ send: * and initialize the header from the template for sends on this * connection. */ - hw_tls = (sb->sb_flags & SB_TLS_IFNET) != 0; + hw_tls = tp->t_nic_ktls_xmit != 0; if (len) { uint32_t max_val; uint32_t moff; @@ -20183,20 +20183,10 @@ rack_apply_deferred_options(struct tcp_rack *rack) static void rack_hw_tls_change(struct tcpcb *tp, int chg) { - /* - * HW tls state has changed.. fix all - * rsm's in flight. - */ + /* Update HW tls state */ struct tcp_rack *rack; - struct rack_sendmap *rsm; rack = (struct tcp_rack *)tp->t_fb_ptr; - RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { - if (chg) - rsm->r_hw_tls = 1; - else - rsm->r_hw_tls = 0; - } if (chg) rack->r_ctl.fsb.hw_tls = 1; else diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 68c2c3f9d9e6..c2d5c2c99587 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -209,6 +209,9 @@ struct tcpcb { tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ char t_oobflags; /* have some */ char t_iobc; /* input character */ + uint8_t t_nic_ktls_xmit:1, /* active nic ktls xmit sessions */ + t_nic_ktls_xmit_dis:1, /* disabled nic xmit ktls? */ + t_nic_ktls_spare:6; /* spare nic ktls */ int t_rxtcur; /* current retransmit value (ticks) */ int t_rxtshift; /* log(2) of rexmt exp. backoff */ diff --git a/sys/sys/ktls.h b/sys/sys/ktls.h index 66b510ddfe40..909d5347bc47 100644 --- a/sys/sys/ktls.h +++ b/sys/sys/ktls.h @@ -194,13 +194,14 @@ struct ktls_session { struct ifnet *rx_ifp; u_short rx_vlan_id; bool reset_pending; - bool disable_ifnet_pending; + bool tx; bool sync_dispatch; bool sequential_records; /* Only used for TLS 1.0. 
*/ uint64_t next_seqno; STAILQ_HEAD(, mbuf) pending_records; + struct task destroy_task; } __aligned(CACHE_LINE_SIZE); extern unsigned int ktls_ifnet_max_rexmit_pct; diff --git a/sys/sys/sockbuf.h b/sys/sys/sockbuf.h index 33c0abb381a3..80ac5cacc796 100644 --- a/sys/sys/sockbuf.h +++ b/sys/sys/sockbuf.h @@ -52,7 +52,7 @@ #define SB_AUTOSIZE 0x800 /* automatically size socket buffer */ #define SB_STOP 0x1000 /* backpressure indicator */ #define SB_AIO_RUNNING 0x2000 /* AIO operation running */ -#define SB_TLS_IFNET 0x4000 /* has used / is using ifnet KTLS */ +#define SB_UNUSED 0x4000 /* previously used for SB_TLS_IFNET */ #define SB_TLS_RX_RESYNC 0x8000 /* KTLS RX lost HW sync */ #define SBS_CANTSENDMORE 0x0010 /* can't send more data to peer */