git: 69c7c8119058 - main - Move access to tcp's t_logstate into inline functions and provide new tracepoint and bbpoint capabilities.
Date: Thu, 16 Mar 2023 15:44:56 UTC
The branch main has been updated by rrs:

URL: https://cgit.FreeBSD.org/src/commit/?id=69c7c81190580ade16e608c6edaaa9f63dcd9052

commit 69c7c81190580ade16e608c6edaaa9f63dcd9052
Author:     Randall Stewart <rrs@FreeBSD.org>
AuthorDate: 2023-03-16 15:43:16 +0000
Commit:     Randall Stewart <rrs@FreeBSD.org>
CommitDate: 2023-03-16 15:43:16 +0000

    Move access to tcp's t_logstate into inline functions and provide new tracepoint and bbpoint capabilities.

    The TCP stacks have long accessed t_logstate directly, but in order to
    do tracepoints and the new bbpoints we need to move to using the new
    inline functions. This adds them and moves rack to now use the
    tcp_tracepoints.

    Reviewed by: tuexen, gallatin
    Sponsored by: Netflix Inc
    Differential Revision: https://reviews.freebsd.org/D38831
---
 sys/conf/options                         |   1 +
 sys/netinet/cc/cc_cubic.c                |   2 +-
 sys/netinet/cc/cc_newreno.c              |   2 +-
 sys/netinet/tcp.h                        |  12 ++
 sys/netinet/tcp_accounting.h             |   8 -
 sys/netinet/tcp_hpts.c                   |   2 +-
 sys/netinet/tcp_input.c                  |   2 +-
 sys/netinet/tcp_log_buf.c                | 328 ++++++++++++++++++++++++++-----
 sys/netinet/tcp_log_buf.h                | 228 +++++++++++++++++++--
 sys/netinet/tcp_lro.c                    |   2 +-
 sys/netinet/tcp_output.c                 |   7 +-
 sys/netinet/tcp_ratelimit.c              |   2 +-
 sys/netinet/tcp_stacks/bbr.c             |  82 ++++----
 sys/netinet/tcp_stacks/rack.c            | 216 +++++----------------
 sys/netinet/tcp_stacks/rack_bbr_common.c |   2 +-
 sys/netinet/tcp_subr.c                   |  68 ++++++-
 sys/netinet/tcp_timer.c                  |   6 +-
 sys/netinet/tcp_usrreq.c                 |   6 +-
 sys/netinet/tcp_var.h                    |  14 +-
 19 files changed, 694 insertions(+), 296 deletions(-)

diff --git a/sys/conf/options b/sys/conf/options
index 37218f8a83d8..6dd19582f346 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -227,6 +227,7 @@ SYSVSEM		opt_sysvipc.h
 SYSVSHM		opt_sysvipc.h
 SW_WATCHDOG	opt_watchdog.h
 TCPHPTS		opt_inet.h
+TCP_ACCOUNTING	opt_inet.h
 TURNSTILE_PROFILING
 UMTX_PROFILING
 UMTX_CHAINS	opt_global.h
diff --git a/sys/netinet/cc/cc_cubic.c b/sys/netinet/cc/cc_cubic.c
index cb4c79378132..7257aea56028 100644
--- a/sys/netinet/cc/cc_cubic.c
+++ b/sys/netinet/cc/cc_cubic.c
@@ -129,7 +129,7 @@ cubic_log_hystart_event(struct cc_var *ccv, struct cubic *cubicd, uint8_t mod, u
 	if (hystart_bblogs == 0)
 		return;
 	tp = ccv->ccvc.tcp;
-	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
+	if (tcp_bblogging_on(tp)) {
 		union tcp_log_stackspecific log;
 		struct timeval tv;
diff --git a/sys/netinet/cc/cc_newreno.c b/sys/netinet/cc/cc_newreno.c
index fb5ffed43a15..79148e93bfe0 100644
--- a/sys/netinet/cc/cc_newreno.c
+++ b/sys/netinet/cc/cc_newreno.c
@@ -139,7 +139,7 @@ newreno_log_hystart_event(struct cc_var *ccv, struct newreno *nreno, uint8_t mod
 	if (hystart_bblogs == 0)
 		return;
 	tp = ccv->ccvc.tcp;
-	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
+	if (tcp_bblogging_on(tp)) {
 		union tcp_log_stackspecific log;
 		struct timeval tv;
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
index 371bb88b9b17..1c34442f2617 100644
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -330,6 +330,18 @@ struct tcphdr {
 /* Maximum length of log ID. */
 #define TCP_LOG_ID_LEN 64
 
+/* TCP accounting counters */
+#define TCP_NUM_PROC_COUNTERS 11
+#define TCP_NUM_CNT_COUNTERS 13
+
+/* Must match counter array sizes in tcpcb */
+struct tcp_perf_info {
+	uint64_t	tcp_cnt_counters[TCP_NUM_CNT_COUNTERS];
+	uint64_t	tcp_proc_time[TCP_NUM_CNT_COUNTERS];
+	uint64_t	timebase;	/* timebase for tcp_proc_time */
+	uint8_t		tb_is_stable;	/* timebase is stable/invariant */
+};
+
 /*
  * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits
  * the caller to query certain information about the state of a TCP
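The cc_cubic and cc_newreno hunks above show the conversion this commit applies throughout the tree: direct tests of t_logstate become calls to the new inline predicate. A minimal sketch of the resulting idiom in a stack-side logging helper follows; the function name, the flex-field use, and the event id are illustrative, not from the commit:

    static void
    example_log_event(struct tcpcb *tp, uint32_t value)
    {
        union tcp_log_stackspecific log;
        struct timeval tv;

        /* Was: if (tp->t_logstate != TCP_LOG_STATE_OFF) */
        if (!tcp_bblogging_on(tp))
            return;
        memset(&log, 0, sizeof(log));
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.flex1 = value;
        /* TCP_LOG_CONNEND stands in for whatever event the stack logs. */
        TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv,
            &tptosocket(tp)->so_snd, TCP_LOG_CONNEND, 0, 0, &log, false, &tv);
    }

Note that tcp_bblogging_on() deliberately returns false in TCP_LOG_VIA_BBPOINTS mode; point-gated logging goes through tcp_bblogging_point_on() instead (see the tcp_log_buf.h hunks below).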
diff --git a/sys/netinet/tcp_accounting.h b/sys/netinet/tcp_accounting.h
index a2e2d4edd46d..51ced29c217c 100644
--- a/sys/netinet/tcp_accounting.h
+++ b/sys/netinet/tcp_accounting.h
@@ -25,15 +25,7 @@
 #define CYC_HANDLE_MAP 11
 #define CYC_HANDLE_ACK 12
 
-/* Should the tp->xxx array's be alloc'ed? */
 /* #define TCP_NUM_PROC_COUNTERS 11 defined in tcp_var.h */
 /* #define TCP_NUM_CNT_COUNTERS 13 defined in tcp_var.h */
 
-#ifdef _KERNEL
-#ifdef TCP_ACCOUNTING
-extern counter_u64_t tcp_cnt_counters[TCP_NUM_CNT_COUNTERS];
-extern counter_u64_t tcp_proc_time[TCP_NUM_PROC_COUNTERS];
-#endif
-#endif
-
 #endif
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
index 25876ebd4105..5ea7bc3dc127 100644
--- a/sys/netinet/tcp_hpts.c
+++ b/sys/netinet/tcp_hpts.c
@@ -1344,7 +1344,7 @@ again:
 	}
 	CURVNET_SET(inp->inp_vnet);
 	/* Lets do any logging that we might want to */
-	if (hpts_does_tp_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
+	if (hpts_does_tp_logging && tcp_bblogging_on(tp)) {
 		tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout);
 	}
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 7b9c5668e888..992f8ea16e3d 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -104,10 +104,10 @@ __FBSDID("$FreeBSD$");
 #include <netinet6/nd6.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
-#include <netinet/tcp_log_buf.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
+#include <netinet/tcp_log_buf.h>
 #include <netinet6/tcp6_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/cc/cc.h>
diff --git a/sys/netinet/tcp_log_buf.c b/sys/netinet/tcp_log_buf.c
index a147b201b2fd..491e1c23588c 100644
--- a/sys/netinet/tcp_log_buf.c
+++ b/sys/netinet/tcp_log_buf.c
@@ -29,8 +29,10 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_inet.h"
 #include <sys/param.h>
 #include <sys/arb.h>
+#include <sys/hash.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
@@ -45,7 +47,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/tree.h>
 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
 #include <sys/counter.h>
-
 #include <dev/tcp_log/tcp_log_dev.h>
 
 #include <net/if.h>
@@ -57,6 +58,7 @@ __FBSDID("$FreeBSD$");
 #include <netinet/in_var.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_log_buf.h>
+#include <netinet/tcp_hpts.h>
 
 /* Default expiry time */
 #define TCP_LOG_EXPIRE_TIME ((sbintime_t)60 * SBT_1S)
@@ -85,6 +87,9 @@ RB_PROTOTYPE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp)
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, bb, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "TCP Black Box controls");
 
+SYSCTL_NODE(_net_inet_tcp_bb, OID_AUTO, tp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+    "TCP Black Box Trace Point controls");
+
 SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_verbose, CTLFLAG_RW, &tcp_log_verbose,
     0, "Force verbose logging for TCP traces");
 
@@ -92,6 +97,23 @@ SYSCTL_INT(_net_inet_tcp_bb, OID_AUTO, log_session_limit, CTLFLAG_RW,
     &tcp_log_session_limit, 0,
     "Maximum number of events maintained for each TCP session");
 
+uint32_t tcp_trace_point_config = 0;
+SYSCTL_U32(_net_inet_tcp_bb_tp, OID_AUTO, number, CTLFLAG_RW,
+    &tcp_trace_point_config, TCP_LOG_STATE_HEAD_AUTO,
+    "What is the trace point number to activate (0=none, 0xffffffff = all)?");
+
+uint32_t tcp_trace_point_bb_mode = TCP_LOG_STATE_CONTINUAL;
+SYSCTL_U32(_net_inet_tcp_bb_tp, OID_AUTO, bbmode, CTLFLAG_RW,
+    &tcp_trace_point_bb_mode, TCP_LOG_STATE_HEAD_AUTO,
+    "What is BB logging mode that is activated?");
+
+int32_t tcp_trace_point_count = 0;
+SYSCTL_U32(_net_inet_tcp_bb_tp, OID_AUTO, count, CTLFLAG_RW,
+    &tcp_trace_point_count, TCP_LOG_STATE_HEAD_AUTO,
+    "How many connections will have BB logging turned on that hit the tracepoint?");
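The three sysctls above are the whole trace point control surface: number selects which trace point arms logging, bbmode the BB logging state a matching connection is switched to, and count how many connections may be armed before the feature self-disables. A userland sketch that arms the RST-received trace point for the next five connections, assuming only these OIDs as added above (number is set last so mode and count are already in place):

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <err.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint32_t bbmode = 4;    /* TCP_LOG_STATE_CONTINUAL */
        int32_t count = 5;      /* arm the next five matching connections */
        uint32_t number = 0x6;  /* TCP_TP_RESET_RCV */

        if (sysctlbyname("net.inet.tcp.bb.tp.bbmode", NULL, NULL,
            &bbmode, sizeof(bbmode)) == -1)
            err(1, "bbmode");
        if (sysctlbyname("net.inet.tcp.bb.tp.count", NULL, NULL,
            &count, sizeof(count)) == -1)
            err(1, "count");
        if (sysctlbyname("net.inet.tcp.bb.tp.number", NULL, NULL,
            &number, sizeof(number)) == -1)
            err(1, "number");
        return (0);
    }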
+
+
 SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_global_limit, CTLFLAG_RW,
     &tcp_log_zone, "Maximum number of events maintained for all TCP sessions");
 
@@ -264,7 +286,7 @@ struct tcp_log_id_bucket
 	volatile u_int tlb_refcnt;
 	volatile u_int tlb_reqcnt;
 	uint32_t tlb_loglimit;
-	uint8_t tlb_logstate;
+	int8_t tlb_logstate;
 };
 
 struct tcp_log_id_node
@@ -514,6 +536,61 @@ tcp_log_increment_reqcnt(struct tcp_log_id_bucket *tlb)
 	atomic_fetchadd_int(&tlb->tlb_reqcnt, 1);
 }
 
+int
+tcp_log_apply_ratio(struct tcpcb *tp, int ratio)
+{
+	struct tcp_log_id_bucket *tlb;
+	struct inpcb *inp = tptoinpcb(tp);
+	uint32_t hash, ratio_hash_thresh;
+	int rv, tree_locked;
+
+	rv = 0;
+	tree_locked = TREE_UNLOCKED;
+	tlb = tp->t_lib;
+
+	INP_WLOCK_ASSERT(inp);
+	if (tlb == NULL) {
+		INP_WUNLOCK(inp);
+		return (EOPNOTSUPP);
+	}
+	ratio_hash_thresh = max(1, UINT32_MAX / ratio);
+	TCPID_BUCKET_REF(tlb);
+	INP_WUNLOCK(inp);
+	TCPID_BUCKET_LOCK(tlb);
+
+	hash = hash32_buf(tlb->tlb_id, strlen(tlb->tlb_id), 0);
+	if (hash > ratio_hash_thresh && tp->_t_logstate == TCP_LOG_STATE_OFF &&
+	    tlb->tlb_logstate == TCP_LOG_STATE_OFF) {
+		/*
+		 * Ratio decision not to log this log ID (and this connection by
+		 * way of association). We only apply a log ratio log disable
+		 * decision if it would not interfere with a log enable decision
+		 * made elsewhere e.g. tcp_log_selectauto() or setsockopt().
+		 */
+		tlb->tlb_logstate = TCP_LOG_STATE_RATIO_OFF;
+		INP_WLOCK(inp);
+		RECHECK_INP();
+		(void)tcp_log_state_change(tp, TCP_LOG_STATE_OFF);
+done:
+		INP_WUNLOCK(inp);
+	}
+
+	INP_UNLOCK_ASSERT(inp);
+	if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL))
+		TCPID_BUCKET_UNLOCK(tlb);
+
+	if (tree_locked == TREE_WLOCKED) {
+		TCPID_TREE_WLOCK_ASSERT();
+		TCPID_TREE_WUNLOCK();
+	} else if (tree_locked == TREE_RLOCKED) {
+		TCPID_TREE_RLOCK_ASSERT();
+		TCPID_TREE_RUNLOCK();
+	} else
+		TCPID_TREE_UNLOCK_ASSERT();
+
+	return (rv);
+}
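The sampling rule above is worth spelling out: ratio_hash_thresh is UINT32_MAX / ratio, and a bucket keeps logging only when the 32-bit hash of its log ID lands at or below that threshold, so roughly 1 in ratio IDs stay enabled (ratio = 100 keeps about 1%, since UINT32_MAX / 100 is roughly 42.9 million). A standalone restatement of the decision, mirroring the kernel logic (hash32_buf() is from sys/hash.h; the helper name is made up):

    static bool
    example_ratio_keeps_logging(const char *log_id, int ratio)
    {
        uint32_t hash, thresh;

        thresh = max(1, UINT32_MAX / ratio);
        hash = hash32_buf(log_id, strlen(log_id), 0);
        return (hash <= thresh);    /* true for roughly 1 in `ratio` IDs */
    }

Because the hash is computed over the log ID, every connection sharing an ID gets the same verdict, which is the point: whole log buckets are sampled rather than individual connections.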
+
 /*
  * Associate the specified tag with a particular TCP log ID.
  * Called with INPCB locked. Returns with it unlocked.
  */
@@ -565,7 +642,7 @@ tcp_log_set_id(struct tcpcb *tp, char *id)
 	struct tcp_log_id_node *tln;
 	struct inpcb *inp = tptoinpcb(tp);
 	int tree_locked, rv;
-	bool bucket_locked;
+	bool bucket_locked, same;
 
 	tlb = NULL;
 	tln = NULL;
@@ -574,17 +651,41 @@ restart:
 	INP_WLOCK_ASSERT(inp);
-	/* See if the ID is unchanged. */
-	if ((tp->t_lib != NULL && !strcmp(tp->t_lib->tlb_id, id)) ||
-	    (tp->t_lib == NULL && *id == 0)) {
+	same = ((tp->t_lib != NULL && !strcmp(tp->t_lib->tlb_id, id)) ||
+	    (tp->t_lib == NULL && *id == 0));
+	if (tp->_t_logstate && STAILQ_FIRST(&tp->t_logs) && !same) {
+		/*
+		 * There are residual logs left we may
+		 * be changing id's so dump what we can.
+		 */
+		switch(tp->_t_logstate) {
+		case TCP_LOG_STATE_HEAD_AUTO:
+			(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head at id switch",
+						     M_NOWAIT, false);
+			break;
+		case TCP_LOG_STATE_TAIL_AUTO:
+			(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail at id switch",
+						     M_NOWAIT, false);
+			break;
+		case TCP_LOG_STATE_CONTINUAL:
+			(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual at id switch",
+						     M_NOWAIT, false);
+			break;
+		case TCP_LOG_VIA_BBPOINTS:
+			(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from bbpoints at id switch",
+						     M_NOWAIT, false);
+			break;
+		}
+	}
+	if (same) {
 		if (tp->t_lib != NULL) {
 			tcp_log_increment_reqcnt(tp->t_lib);
-			if ((tp->t_lib->tlb_logstate) &&
+			if ((tp->t_lib->tlb_logstate > TCP_LOG_STATE_OFF) &&
 			    (tp->t_log_state_set == 0)) {
 				/* Clone in any logging */
-				tp->t_logstate = tp->t_lib->tlb_logstate;
+				tp->_t_logstate = tp->t_lib->tlb_logstate;
 			}
 			if ((tp->t_lib->tlb_loglimit) &&
 			    (tp->t_log_state_set == 0)) {
@@ -873,10 +974,10 @@ refind:
 		SLIST_INSERT_HEAD(&tlb->tlb_head, tln, tln_list);
 		tp->t_lib = tlb;
 		tp->t_lin = tln;
-		if (tp->t_lib->tlb_logstate) {
+		if (tp->t_lib->tlb_logstate > TCP_LOG_STATE_OFF) {
 			/* Clone in any logging */
-			tp->t_logstate = tp->t_lib->tlb_logstate;
+			tp->_t_logstate = tp->t_lib->tlb_logstate;
 		}
 		if (tp->t_lib->tlb_loglimit) {
 			/* The loglimit too */
@@ -1189,7 +1290,7 @@ tcp_log_tcpcbinit(struct tcpcb *tp)
 	if ((tcp_log_auto_all == true) &&
 	    tcp_log_auto_mode &&
 	    tcp_log_selectauto()) {
-		tp->t_logstate = tcp_log_auto_mode;
+		tp->_t_logstate = tcp_log_auto_mode;
 		tp->t_flags2 |= TF2_LOG_AUTO;
 	}
 }
@@ -1325,17 +1426,57 @@ tcp_log_tcpcbfini(struct tcpcb *tp)
 	struct tcp_log_mem *log_entry;
 	sbintime_t callouttime;
 
-	INP_WLOCK_ASSERT(tptoinpcb(tp));
-
-	TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_CONNEND, 0, 0, NULL, false);
+	INP_WLOCK_ASSERT(tptoinpcb(tp));
+#ifdef TCP_ACCOUNTING
+	if (tp->_t_logstate) {
+		struct tcp_log_buffer *lgb;
+		union tcp_log_stackspecific log;
+		struct timeval tv;
+		int i;
+
+		memset(&log, 0, sizeof(log));
+		if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
+			for (i = 0; i<TCP_NUM_CNT_COUNTERS; i++) {
+				log.u_raw.u64_flex[i] = tp->tcp_cnt_counters[i];
+			}
+			lgb = tcp_log_event(tp, NULL,
+					    NULL,
+					    NULL,
+					    TCP_LOG_ACCOUNTING, 0,
+					    0, &log, false, NULL, NULL, 0, &tv);
+			lgb->tlb_flex1 = TCP_NUM_CNT_COUNTERS;
+			lgb->tlb_flex2 = 1;
+			for (i = 0; i<TCP_NUM_CNT_COUNTERS; i++) {
+				log.u_raw.u64_flex[i] = tp->tcp_proc_time[i];
+			}
+			lgb = tcp_log_event(tp, NULL,
+					    NULL,
+					    NULL,
+					    TCP_LOG_ACCOUNTING, 0,
+					    0, &log, false, NULL, NULL, 0, &tv);
+			if (tptoinpcb(tp)->inp_flags2 & INP_MBUF_ACKCMP)
+				lgb->tlb_flex1 = TCP_NUM_CNT_COUNTERS;
+			else
+				lgb->tlb_flex1 = TCP_NUM_PROC_COUNTERS;
+			lgb->tlb_flex2 = 2;
+		}
+		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+		log.u_bbr.cur_del_rate = tp->t_end_info;
+		TCP_LOG_EVENTP(tp, NULL,
+			       NULL,
+			       NULL,
+			       TCP_LOG_CONNEND, 0,
+			       0, &log, false, &tv);
+	}
+#endif
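The block above emits two TCP_LOG_ACCOUNTING records at teardown, distinguished by tlb_flex2 (1 = event counts, 2 = cycle times), with tlb_flex1 giving the number of valid slots. A reader-side sketch for a dump consumer, assuming you already have decoded struct tcp_log_buffer records in hand (the helper is hypothetical; u64_raw aliases the same union storage the writer fills through u_raw):

    #include <stdio.h>
    #include <stdint.h>
    #include <netinet/tcp_log_buf.h>

    static void
    example_decode_accounting(const struct tcp_log_buffer *buf)
    {
        const uint64_t *vals = buf->tlb_stackinfo.u64_raw.u64_flex;
        uint32_t i, n = buf->tlb_flex1;    /* number of valid slots */

        for (i = 0; i < n; i++)
            printf("%s[%u] = %ju\n",
                (buf->tlb_flex2 == 1) ? "count" : "cycles",
                i, (uintmax_t)vals[i]);
    }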
 
 	/*
 	 * If we were gathering packets to be automatically dumped, try to do
 	 * it now. If this succeeds, the log information in the TCPCB will be
 	 * cleared. Otherwise, we'll handle the log information as we do
 	 * for other states.
 	 */
-	switch(tp->t_logstate) {
+	switch(tp->_t_logstate) {
 	case TCP_LOG_STATE_HEAD_AUTO:
 		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head",
 		    M_NOWAIT, false);
 		break;
@@ -1344,6 +1485,10 @@ tcp_log_tcpcbfini(struct tcpcb *tp)
 		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail",
 		    M_NOWAIT, false);
 		break;
+	case TCP_LOG_VIA_BBPOINTS:
+		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from bbpoints",
+		    M_NOWAIT, false);
+		break;
 	case TCP_LOG_STATE_CONTINUAL:
 		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
 		    M_NOWAIT, false);
 		break;
@@ -1493,7 +1638,7 @@ tcp_log_tcpcbfini(struct tcpcb *tp)
 	 * Change the log state to off (just in case anything tries to sneak
 	 * in a last-minute log).
 	 */
-	tp->t_logstate = TCP_LOG_STATE_OFF;
+	tp->_t_logstate = TCP_LOG_STATE_OFF;
 }
 
 static void
@@ -1510,7 +1655,7 @@ tcp_log_purge_tp_logbuf(struct tcpcb *tp)
 	KASSERT(tp->t_lognum == 0,
 	    ("%s: After freeing entries, tp->t_lognum=%d (expected 0)",
 		__func__, tp->t_lognum));
-	tp->t_logstate = TCP_LOG_STATE_OFF;
+	tp->_t_logstate = TCP_LOG_STATE_OFF;
 }
 
 /*
@@ -1520,7 +1665,7 @@ tcp_log_purge_tp_logbuf(struct tcpcb *tp)
  */
 
 struct tcp_log_buffer *
-tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf,
+tcp_log_event(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf,
     struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len,
     union tcp_log_stackspecific *stackinfo, int th_hostorder,
     const char *output_caller, const char *func, int line,
     const struct timeval *itv)
@@ -1547,13 +1692,14 @@ tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf,
 		tcp_log_purge_tp_logbuf(tp);
 		return (NULL);
 	}
-	KASSERT(tp->t_logstate == TCP_LOG_STATE_HEAD ||
-	    tp->t_logstate == TCP_LOG_STATE_TAIL ||
-	    tp->t_logstate == TCP_LOG_STATE_CONTINUAL ||
-	    tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO ||
-	    tp->t_logstate == TCP_LOG_STATE_TAIL_AUTO,
-	    ("%s called with unexpected tp->t_logstate (%d)", __func__,
-	    tp->t_logstate));
+	KASSERT(tp->_t_logstate == TCP_LOG_STATE_HEAD ||
+	    tp->_t_logstate == TCP_LOG_STATE_TAIL ||
+	    tp->_t_logstate == TCP_LOG_STATE_CONTINUAL ||
+	    tp->_t_logstate == TCP_LOG_STATE_HEAD_AUTO ||
+	    tp->_t_logstate == TCP_LOG_VIA_BBPOINTS ||
+	    tp->_t_logstate == TCP_LOG_STATE_TAIL_AUTO,
+	    ("%s called with unexpected tp->_t_logstate (%d)", __func__,
+	    tp->_t_logstate));
 
 	/*
 	 * Get the serial number. We do this early so it will
@@ -1589,7 +1735,7 @@ retry:
 			    "to set tp %p to TCP_LOG_STATE_CLEAR", __func__,
 			    __LINE__, tp);
 #endif
-			tp->t_logstate = TCP_LOG_STATE_OFF;
+			tp->_t_logstate = TCP_LOG_STATE_OFF;
 		}
 		return (NULL);
 	}
@@ -1598,12 +1744,12 @@ retry:
 	 * the buffers. If successful, deactivate tracing. Otherwise,
 	 * leave it active so we will retry.
 	 */
-	if (tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO &&
+	if (tp->_t_logstate == TCP_LOG_STATE_HEAD_AUTO &&
 	    !tcp_log_dump_tp_logbuf(tp, "auto-dumped from head",
 	    M_NOWAIT, false)) {
-		tp->t_logstate = TCP_LOG_STATE_OFF;
+		tp->_t_logstate = TCP_LOG_STATE_OFF;
 		return(NULL);
-	} else if ((tp->t_logstate == TCP_LOG_STATE_CONTINUAL) &&
+	} else if ((tp->_t_logstate == TCP_LOG_STATE_CONTINUAL) &&
 	    !tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
 	    M_NOWAIT, false)) {
 		if (attempt_count == 0) {
@@ -1614,15 +1760,26 @@ retry:
 		counter_u64_add(tcp_log_que_fail4, 1);
 #endif
 		return(NULL);
-	} else if (tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO)
+
+	} else if ((tp->_t_logstate == TCP_LOG_VIA_BBPOINTS) &&
+	    !tcp_log_dump_tp_logbuf(tp, "auto-dumped from bbpoints",
+	    M_NOWAIT, false)) {
+		if (attempt_count == 0) {
+			attempt_count++;
+			goto retry;
+		}
+#ifdef TCPLOG_DEBUG_COUNTERS
+		counter_u64_add(tcp_log_que_fail4, 1);
+#endif
+		return(NULL);
+	} else if (tp->_t_logstate == TCP_LOG_STATE_HEAD_AUTO)
 		return(NULL);
 
 	/* If in HEAD state, just deactivate the tracing and return. */
-	if (tp->t_logstate == TCP_LOG_STATE_HEAD) {
-		tp->t_logstate = TCP_LOG_STATE_OFF;
+	if (tp->_t_logstate == TCP_LOG_STATE_HEAD) {
+		tp->_t_logstate = TCP_LOG_STATE_OFF;
 		return(NULL);
 	}
-
 	/*
 	 * Get a buffer to reuse. If that fails, just give up.
 	 * (We can't log anything without a buffer in which to
@@ -1647,7 +1804,7 @@ retry:
 
 	/* Basic entries. */
 	if (itv == NULL)
-		getmicrouptime(&log_buf->tlb_tv);
+		microuptime(&log_buf->tlb_tv);
 	else
 		memcpy(&log_buf->tlb_tv, itv, sizeof(struct timeval));
 	log_buf->tlb_ticks = ticks;
@@ -1663,12 +1820,18 @@ retry:
 		log_buf->tlb_rxbuf.tls_sb_acc = rxbuf->sb_acc;
 		log_buf->tlb_rxbuf.tls_sb_ccc = rxbuf->sb_ccc;
 		log_buf->tlb_rxbuf.tls_sb_spare = 0;
+	} else {
+		log_buf->tlb_rxbuf.tls_sb_acc = 0;
+		log_buf->tlb_rxbuf.tls_sb_ccc = 0;
 	}
 	if (txbuf != NULL) {
 		log_buf->tlb_eventflags |= TLB_FLAG_TXBUF;
 		log_buf->tlb_txbuf.tls_sb_acc = txbuf->sb_acc;
 		log_buf->tlb_txbuf.tls_sb_ccc = txbuf->sb_ccc;
 		log_buf->tlb_txbuf.tls_sb_spare = 0;
+	} else {
+		log_buf->tlb_txbuf.tls_sb_acc = 0;
+		log_buf->tlb_txbuf.tls_sb_ccc = 0;
 	}
 	/* Copy values from tp to the log entry. */
 #define	COPY_STAT(f)	log_buf->tlb_ ## f = tp->f
@@ -1700,8 +1863,6 @@ retry:
 	COPY_STAT_T(fbyte_out);
 #undef COPY_STAT
 #undef COPY_STAT_T
-	log_buf->tlb_flex1 = 0;
-	log_buf->tlb_flex2 = 0;
 	/* Copy stack-specific info. */
 	if (stackinfo != NULL) {
 		memcpy(&log_buf->tlb_stackinfo, stackinfo,
@@ -1721,6 +1882,8 @@ retry:
 		optlen = (th->th_off << 2) - sizeof (struct tcphdr);
 		if (optlen > 0)
 			memcpy(log_buf->tlb_opts, th + 1, optlen);
+	} else {
+		memset(&log_buf->tlb_th, 0, sizeof(*th));
 	}
 
 	/* Verbose information */
@@ -1749,8 +1912,10 @@ int
 tcp_log_state_change(struct tcpcb *tp, int state)
 {
 	struct tcp_log_mem *log_entry;
+	int rv;
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
+	rv = 0;
 	switch(state) {
 	case TCP_LOG_STATE_CLEAR:
 		while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
@@ -1758,15 +1923,27 @@ tcp_log_state_change(struct tcpcb *tp, int state)
 
 		/* Fall through */
 
 	case TCP_LOG_STATE_OFF:
-		tp->t_logstate = TCP_LOG_STATE_OFF;
+		tp->_t_logstate = TCP_LOG_STATE_OFF;
 		break;
 
 	case TCP_LOG_STATE_TAIL:
 	case TCP_LOG_STATE_HEAD:
 	case TCP_LOG_STATE_CONTINUAL:
+	case TCP_LOG_VIA_BBPOINTS:
 	case TCP_LOG_STATE_HEAD_AUTO:
 	case TCP_LOG_STATE_TAIL_AUTO:
-		tp->t_logstate = state;
+		/*
+		 * When the RATIO_OFF state is set for the bucket, the log ID
+		 * this tp is associated with has been probabilistically opted
+		 * out of logging per tcp_log_apply_ratio().
+		 */
+		if (tp->t_lib == NULL ||
+		    tp->t_lib->tlb_logstate != TCP_LOG_STATE_RATIO_OFF) {
+			tp->_t_logstate = state;
+		} else {
+			rv = ECANCELED;
+			tp->_t_logstate = TCP_LOG_STATE_OFF;
+		}
 		break;
 
 	default:
@@ -1774,11 +1951,12 @@ tcp_log_state_change(struct tcpcb *tp, int state)
 	}
 	if (tcp_disable_all_bb_logs) {
 		/* We are prohibited from doing any logs */
-		tp->t_logstate = TCP_LOG_STATE_OFF;
+		tp->_t_logstate = TCP_LOG_STATE_OFF;
+		rv = EBUSY;
 	}
 	tp->t_flags2 &= ~(TF2_LOG_AUTO);
 
-	return (0);
+	return (rv);
 }
 
 /* If tcp_drain() is called, flush half the log entries. */
@@ -1792,13 +1970,26 @@ tcp_log_drain(struct tcpcb *tp)
 	if ((target = tp->t_lognum / 2) == 0)
 		return;
 
+	/*
+	 * XXXRRS: At this I don't think this is wise that
+	 * we do this. All that a drain call means is that
+	 * we are hitting one of the system mbuf limits. BB
+	 * logging, or freeing of them, will not create any
+	 * more mbufs and really has nothing to do with
+	 * the system running out of mbufs. For now I
+	 * am changing this to free any "AUTO" by dumping
+	 * them out. But this should either be changed
+	 * so that it gets called when we hit the BB limit
+	 * or it should just not get called (one of the two)
+	 * since I don't think the mbuf <-> BB log cleanup
+	 * is the right thing to do here.
+	 */
 	/*
 	 * If we are logging the "head" packets, we want to discard
 	 * from the tail of the queue. Otherwise, we want to discard
 	 * from the head.
 	 */
-	if (tp->t_logstate == TCP_LOG_STATE_HEAD ||
-	    tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO) {
+	if (tp->_t_logstate == TCP_LOG_STATE_HEAD) {
 		skip = tp->t_lognum - target;
 		STAILQ_FOREACH(log_entry, &tp->t_logs, tlm_queue)
 			if (!--skip)
@@ -1818,7 +2009,16 @@ tcp_log_drain(struct tcpcb *tp)
 		KASSERT(target == 0,
 		    ("%s: After removing from tail, target was %d",
 			__func__, target));
-	} else if (tp->t_logstate == TCP_LOG_STATE_CONTINUAL) {
+	} else if (tp->_t_logstate == TCP_LOG_STATE_HEAD_AUTO) {
+		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head at drain",
+		    M_NOWAIT, false);
+	} else if (tp->_t_logstate == TCP_LOG_STATE_TAIL_AUTO) {
+		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail at drain",
+		    M_NOWAIT, false);
+	} else if (tp->_t_logstate == TCP_LOG_VIA_BBPOINTS) {
+		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from bbpoints",
+		    M_NOWAIT, false);
+	} else if (tp->_t_logstate == TCP_LOG_STATE_CONTINUAL) {
 		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
 		    M_NOWAIT, false);
 	} else {
@@ -2181,7 +2381,7 @@ tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force)
 	KASSERT(tp->t_lognum == 0,
 	    ("%s: After freeing entries, tp->t_lognum=%d (expected 0)",
 		__func__, tp->t_lognum));
-	tp->t_logstate = TCP_LOG_STATE_OFF;
+	tp->_t_logstate = TCP_LOG_STATE_OFF;
 	return (0);
 }
 
@@ -2632,9 +2832,47 @@ tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason)
 void
 tcp_log_flowend(struct tcpcb *tp)
 {
-	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
+	if (tp->_t_logstate != TCP_LOG_STATE_OFF) {
 		struct socket *so = tptosocket(tp);
 		TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd,
 				TCP_LOG_FLOWEND, 0, 0, NULL, false);
 	}
 }
+
+void
+tcp_log_sendfile(struct socket *so, off_t offset, size_t nbytes, int flags)
+{
+	struct inpcb *inp;
+	struct tcpcb *tp;
+
+	inp = sotoinpcb(so);
+	KASSERT(inp != NULL, ("tcp_log_sendfile: inp == NULL"));
+
+	/* quick check to see if logging is enabled for this connection */
+	tp = intotcpcb(inp);
+	if ((inp->inp_flags & INP_DROPPED) ||
+	    (tp->_t_logstate == TCP_LOG_STATE_OFF)) {
+		return;
+	}
+
+	INP_WLOCK(inp);
+	/* double check log state now that we have the lock */
+	if (inp->inp_flags & INP_DROPPED)
+		goto done;
+	if (tp->_t_logstate != TCP_LOG_STATE_OFF) {
+		struct timeval tv;
+		tcp_log_eventspecific_t log;
+
+		microuptime(&tv);
+		log.u_sf.offset = offset;
+		log.u_sf.length = nbytes;
+		log.u_sf.flags = flags;
+
+		TCP_LOG_EVENTP(tp, NULL,
+		    &tptosocket(tp)->so_rcv,
+		    &tptosocket(tp)->so_snd,
+		    TCP_LOG_SENDFILE, 0, 0, &log, false, &tv);
+	}
+done:
+	INP_WUNLOCK(inp);
+}
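tcp_log_sendfile() above gives sendfile(2) activity its own BB record; the u_sf union member carries the file offset, length, and flags. A matching reader-side sketch, using the same headers as the accounting sketch earlier (the helper is hypothetical and assumes tlb_eventid carries the event type as it does for other records):

    static void
    example_print_sendfile(const struct tcp_log_buffer *buf)
    {
        const struct tcp_log_sendfile *sf = &buf->tlb_stackinfo.u_sf;

        if (buf->tlb_eventid != TCP_LOG_SENDFILE)
            return;
        printf("sendfile: off=%ju len=%ju flags=%#x\n",
            (uintmax_t)sf->offset, (uintmax_t)sf->length, sf->flags);
    }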
diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h
index f82d2bf3a9b3..2b708c6545ce 100644
--- a/sys/netinet/tcp_log_buf.h
+++ b/sys/netinet/tcp_log_buf.h
@@ -1,7 +1,7 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
- * Copyright (c) 2016-2018 Netflix, Inc.
+ * Copyright (c) 2016-2020 Netflix, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -101,13 +101,42 @@ struct tcp_log_bbr {
 	uint32_t pkt_epoch;
 };
 
-/* Per-stack stack-specific info. */
+/* shadows tcp_log_bbr struct element sizes */
+struct tcp_log_raw {
+	uint64_t u64_flex[4];
+	uint32_t u32_flex[14];
+	uint16_t u16_flex[3];
+	uint8_t u8_flex[6];
+	uint32_t u32_flex2[1];
+};
+
+struct tcp_log_uint64 {
+	uint64_t u64_flex[13];
+};
+
+struct tcp_log_sendfile {
+	uint64_t offset;
+	uint64_t length;
+	uint32_t flags;
+};
+
+/*
+ * tcp_log_stackspecific is currently being used as "event specific" log
+ * info by all stacks (i.e. struct tcp_log_bbr is used for generic event
+ * logging). Until this is cleaned up more generically and throughout,
+ * allow events to use the same space in the union.
+ */
 union tcp_log_stackspecific
 {
 	struct tcp_log_rack u_rack;
 	struct tcp_log_bbr u_bbr;
+	struct tcp_log_sendfile u_sf;
+	struct tcp_log_raw u_raw;	/* "raw" log access */
+	struct tcp_log_uint64 u64_raw;	/* just u64's - used by process info */
 };
 
+typedef union tcp_log_stackspecific tcp_log_eventspecific_t;
+
 struct tcp_log_buffer
 {
 	/* Event basics */
@@ -156,7 +185,6 @@ struct tcp_log_buffer
 	uint8_t		tlb_snd_scale:4,	/* TCPCB snd_scale */
 			tlb_rcv_scale:4;	/* TCPCB rcv_scale */
 	uint8_t		_pad[3];		/* Padding */
-
 	/* Per-stack info */
 	union tcp_log_stackspecific tlb_stackinfo;
 #define	tlb_rack	tlb_stackinfo.u_rack
@@ -245,8 +273,14 @@ enum tcp_log_events {
 };
 
 enum tcp_log_states {
-	TCP_LOG_STATE_CLEAR = -1,	/* Deactivate and clear tracing */
+	TCP_LOG_STATE_RATIO_OFF = -2,	/* Log ratio evaluation yielded an OFF
+					   result. Only used for tlb_logstate */
+	TCP_LOG_STATE_CLEAR = -1,	/* Deactivate and clear tracing. Passed
+					   to tcp_log_state_change() but never
+					   stored in any logstate variable */
 	TCP_LOG_STATE_OFF = 0,		/* Pause */
+
+	/* Positively numbered states represent active logging modes */
 	TCP_LOG_STATE_TAIL=1,		/* Keep the trailing events */
 	TCP_LOG_STATE_HEAD=2,		/* Keep the leading events */
 	TCP_LOG_STATE_HEAD_AUTO=3,	/* Keep the leading events, and
@@ -256,6 +290,7 @@ enum tcp_log_states {
 	TCP_LOG_STATE_TAIL_AUTO=5,	/* Keep the trailing events, and
 					   automatically dump them when the
 					   session ends */
+	TCP_LOG_VIA_BBPOINTS=6		/* Log only if the BB point has been configured */
 };
 
 /* Use this if we don't know whether the operation succeeded. */
@@ -298,8 +333,136 @@ struct tcp_log_dev_log_queue {
 #endif	/* _KERNEL */
 #endif	/* __tcp_log_dev_h__ */
 
+/*
+ * Defined BBPOINTS that can be used
+ * with TCP_LOG_VIA_BBPOINTS.
+ */
+#define	TCP_BBPOINT_NONE		0
+#define	TCP_BBPOINT_REQ_LEVEL_LOGGING	1
+
+/*********************/
+/* TCP Trace points  */
+/*********************/
+/*
+ * TCP trace points are interesting points within
+ * the TCP code that the author/debugger may want
+ * to have BB logging enabled if we hit that point.
+ * In order to enable a trace point you set the
+ * sysctl var net.inet.tcp.bb.tp.number to
+ * one of the numbers listed below. You also
+ * must make sure net.inet.tcp.bb.tp.bbmode is
+ * non-zero, the default is 4 for continuous tracing.
+ * You also set in the number of connections you want
+ * have get BB logs in net.inet.tcp.bb.tp.count.
+ *
+ * Count will decrement every time BB logging is assigned
+ * to a connection that hit your tracepoint.
+ *
+ * You can enable all trace points by setting the number
+ * to 0xffffffff. You can disable all trace points by
+ * setting number to zero (or count to 0).
+ *
+ * Below are the enumerated list of tracepoints that
+ * have currently been defined in the code. Add more
+ * as you add a call to rack_trace_point(rack, <name>);
+ * where <name> is defined below.
+ */
+#define	TCP_TP_HWENOBUF		0x00000001	/* When we are doing hardware pacing and hit enobufs */
+#define	TCP_TP_ENOBUF		0x00000002	/* When we hit enobufs with software pacing */
+#define	TCP_TP_COLLAPSED_WND	0x00000003	/* When a peer to collapses its rwnd on us */
+#define	TCP_TP_COLLAPSED_RXT	0x00000004	/* When we actually retransmit a collapsed window rsm */
+#define	TCP_TP_HTTP_LOG_FAIL	0x00000005	/* We tried to allocate a HTTP log but had no space */
+#define	TCP_TP_RESET_RCV	0x00000006	/* Triggers when we receive a RST */
+#define	TCP_TP_EXCESS_RXT	0x00000007	/* When we get excess RXT's clamping the cwnd */
+#define	TCP_TP_SAD_TRIGGERED	0x00000008	/* Sack Attack Detection triggers */
+
+#define	TCP_TP_SAD_SUSPECT	0x0000000a	/* A sack has supicious information in it */
+
 #ifdef _KERNEL
+extern uint32_t tcp_trace_point_config;
+extern uint32_t tcp_trace_point_bb_mode;
+extern int32_t tcp_trace_point_count;
+
+/*
+ * Returns true if any sort of BB logging is enabled,
+ * commonly used throughout the codebase.
+ */
+static inline int
+tcp_bblogging_on(struct tcpcb *tp)
+{
+	if (tp->_t_logstate <= TCP_LOG_STATE_OFF)
+		return (0);
+	if (tp->_t_logstate == TCP_LOG_VIA_BBPOINTS)
+		return (0);
+	return (1);
+}
+
+/*
+ * Returns true if we match a specific bbpoint when
+ * in TCP_LOG_VIA_BBPOINTS, but also returns true
+ * for all the other logging states.
+ */
+static inline int
+tcp_bblogging_point_on(struct tcpcb *tp, uint8_t bbpoint)
+{
+	if (tp->_t_logstate <= TCP_LOG_STATE_OFF)
+		return (0);
+	if ((tp->_t_logstate == TCP_LOG_VIA_BBPOINTS) &&
+	    (tp->_t_logpoint == bbpoint))
+		return (1);
+	else if (tp->_t_logstate == TCP_LOG_VIA_BBPOINTS)
+		return (0);
+	return (1);
+}
+
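tcp_bblogging_point_on() is the BB-point analogue of tcp_bblogging_on(): in TCP_LOG_VIA_BBPOINTS mode it fires only for the armed point, while in any ordinary active logging state it fires unconditionally. A hypothetical call site gated on the request-level point (the event id is a stand-in; the commit does not add one for this purpose):

    if (tcp_bblogging_point_on(tp, TCP_BBPOINT_REQ_LEVEL_LOGGING)) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log, 0, sizeof(log));
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        /* Event id chosen only for illustration. */
        TCP_LOG_EVENTP(tp, NULL, NULL, NULL, TCP_LOG_SENDFILE,
            0, 0, &log, false, &tv);
    }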
+static inline void
+tcp_set_bblog_state(struct tcpcb *tp, uint8_t ls, uint8_t bbpoint)
+{
+	if ((ls == TCP_LOG_VIA_BBPOINTS) &&
+	    (tp->_t_logstate <= TCP_LOG_STATE_OFF)){
+		/*
+		 * We don't allow a BBPOINTS set to override
+		 * other types of BB logging set by other means such
+		 * as the bb_ratio/bb_state URL parameters. In other
+		 * words BBlogging must be *off* in order to turn on
+		 * a BBpoint.
+		 */
+		tp->_t_logpoint = bbpoint;
+		tp->_t_logstate = ls;
+	} else if (ls != TCP_LOG_VIA_BBPOINTS) {
+		tp->_t_logpoint = 0;
+		if ((ls >= TCP_LOG_STATE_OFF) &&
+		    (ls < TCP_LOG_VIA_BBPOINTS))
+			tp->_t_logstate = ls;
+	}
+}
+
+static inline uint32_t
+tcp_get_bblog_state(struct tcpcb *tp)
+{
+	return (tp->_t_logstate);
+}
+
+static inline void
+tcp_trace_point(struct tcpcb *tp, int num)
+{
+	if (((tcp_trace_point_config == num) ||
+	     (tcp_trace_point_config == 0xffffffff)) &&
+	    (tcp_trace_point_bb_mode != 0) &&
+	    (tcp_trace_point_count > 0) &&
+	    (tcp_bblogging_on(tp) == 0)) {
+		int res;
+		res = atomic_fetchadd_int(&tcp_trace_point_count, -1);
+		if (res > 0) {
+			tcp_set_bblog_state(tp, tcp_trace_point_bb_mode, TCP_BBPOINT_NONE);
+		} else {
+			/* Loss a race assure its zero now */
+			tcp_trace_point_count = 0;
+		}
+	}
+}
+
 #define	TCP_LOG_BUF_DEFAULT_SESSION_LIMIT	5000
 #define	TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT	5000000
 
@@ -309,15 +472,51 @@ struct tcp_log_dev_log_queue {
  */
 #define	TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \
 	do { \
-		if (tp->t_logstate != TCP_LOG_STATE_OFF) \
-			tcp_log_event_(tp, th, rxbuf, txbuf, eventid, \
+		if (tcp_bblogging_on(tp)) \
+			tcp_log_event(tp, th, rxbuf, txbuf, eventid, \
 			    errornum, len, stackinfo, th_hostorder, \
 			    tp->t_output_caller, __func__, __LINE__, tv);\
 	} while (0)

*** 1563 LINES SKIPPED ***
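To use a trace point from a stack, the call site is a single line: tcp_trace_point() checks the three sysctls, decrements the remaining count atomically, and flips the connection into the configured BB mode. A hypothetical ENOBUFS path (the surrounding helper is illustrative, not from the commit):

    static void
    example_handle_send_error(struct tcpcb *tp, int error)
    {
        if (error == ENOBUFS) {
            /* Arms BB logging for this connection if the ENOBUF trace
             * point is configured and the count is still positive. */
            tcp_trace_point(tp, TCP_TP_ENOBUF);
        }
    }

Note the tcp_bblogging_on() guard inside tcp_trace_point(): a connection that already has BB logging active is left alone, so trace points never downgrade an existing logging state.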