git: 6edfc10ca5fb - main - tcp: adding a functionality to define "trace points" so that BB logging can be enabled at specific events.

From: Randall Stewart <rrs_at_FreeBSD.org>
Date: Thu, 14 Apr 2022 20:08:59 UTC
The branch main has been updated by rrs:

URL: https://cgit.FreeBSD.org/src/commit/?id=6edfc10ca5fbefa5ca6a3d72821ba15006c2d148

commit 6edfc10ca5fbefa5ca6a3d72821ba15006c2d148
Author:     Randall Stewart <rrs@FreeBSD.org>
AuthorDate: 2022-04-14 20:07:34 +0000
Commit:     Randall Stewart <rrs@FreeBSD.org>
CommitDate: 2022-04-14 20:07:34 +0000

    tcp: adding a functionality to define "trace points" so that BB logging can be enabled at specific events.
    
    This commit will add a new concept to rack, tracepoints. A tracepoint
    is a defined point inserted into the code (3 are included in this initial patch) that
    allows a developer to insert a point that might be of interest. The developer numbers
    the point in the tcp_rack.h file and then can use sysctl to enable that (or all) trace
    points. A limit is also given to how many BB logged connections will turn on
    so that a box is not overrun by BB logging.
    
    Reviewed by: tuexen
    Sponsored by: Netflix Inc.
    Differential Revision: https://reviews.freebsd.org/D34898
---
 sys/netinet/tcp_stacks/rack.c     | 56 +++++++++++++++++++++++++++++++++++++++
 sys/netinet/tcp_stacks/tcp_rack.h | 30 +++++++++++++++++++++
 2 files changed, 86 insertions(+)

diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 2de40c902162..30a23a578dd4 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -240,6 +240,7 @@ static int32_t rack_enobuf_hw_boost_mult = 2;	/* How many times the hw rate we b
 static int32_t rack_enobuf_hw_max = 12000;	/* 12 ms in usecs */
 static int32_t rack_enobuf_hw_min = 10000;	/* 10 ms in usecs */
 static int32_t rack_hw_rwnd_factor = 2;		/* How many max_segs the rwnd must be before we hold off sending */
+
 /*
  * Currently regular tcp has a rto_min of 30ms
  * the backoff goes 12 times so that ends up
@@ -326,6 +327,10 @@ static int32_t rack_timely_no_stopping = 0;
 static int32_t rack_down_raise_thresh = 100;
 static int32_t rack_req_segs = 1;
 static uint64_t rack_bw_rate_cap = 0;
+static uint32_t rack_trace_point_config = 0;	/* Trace point number to activate (0 = none, 0xffffffff = all) */
+static uint32_t rack_trace_point_bb_mode = 4;	/* BB logging mode set on a connection that hits an active trace point */
+static int32_t rack_trace_point_count = 0;	/* How many more connections may have BB logging turned on by a trace point */
+
 
 /* Weird delayed ack mode */
 static int32_t rack_use_imac_dack = 0;
@@ -547,6 +552,25 @@ rack_apply_deferred_options(struct tcp_rack *rack);
 
 int32_t rack_clear_counter=0;
 
+/*
+ * Possibly enable BB logging on this connection because trace point
+ * "num" (one of the RACK_TP_* values) was hit.
+ *
+ * Logging is turned on only when all of the following hold:
+ *  - "num" equals rack_trace_point_config, or the configured value is
+ *    0xffffffff (all trace points),
+ *  - rack_trace_point_bb_mode is non-zero,
+ *  - rack_trace_point_count is still positive, and
+ *  - the connection's t_logstate is 0 (presumably: logging not
+ *    already enabled on it).
+ *
+ * Each connection that gets logging enabled consumes one unit of
+ * rack_trace_point_count.
+ */
+static inline void
+rack_trace_point(struct tcp_rack *rack, int num)
+{
+	/*
+	 * The wildcard test must be a comparison. An assignment here
+	 * would always be true and would also overwrite the value the
+	 * administrator configured via sysctl.
+	 */
+	if (((rack_trace_point_config == num)  ||
+	     (rack_trace_point_config == 0xffffffff)) &&
+	    (rack_trace_point_bb_mode != 0) &&
+	    (rack_trace_point_count > 0) &&
+	    (rack->rc_tp->t_logstate == 0)) {
+		int res;
+
+		/* res is the count as it was before our decrement. */
+		res = atomic_fetchadd_int(&rack_trace_point_count, -1);
+		if (res > 0) {
+			rack->rc_tp->t_logstate = rack_trace_point_bb_mode;
+		} else {
+			/* Lost a race, make sure the count is zero now */
+			rack_trace_point_count = 0;
+		}
+	}
+}
+
 static void
 rack_set_cc_pacing(struct tcp_rack *rack)
 {
@@ -785,6 +809,7 @@ rack_init_sysctls(void)
 	struct sysctl_oid *rack_measure;
 	struct sysctl_oid *rack_probertt;
 	struct sysctl_oid *rack_hw_pacing;
+	struct sysctl_oid *rack_tracepoint;
 
 	rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
@@ -915,6 +940,28 @@ rack_init_sysctls(void)
 	    OID_AUTO, "hbp_threshold", CTLFLAG_RW,
 	    &rack_hbp_thresh, 3,
 	    "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold");
+
+	rack_tracepoint = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_sysctl_root),
+	    OID_AUTO,
+	    "tp",
+	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+	    "Rack tracepoint facility");
+	SYSCTL_ADD_U32(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_tracepoint),
+	    OID_AUTO, "number", CTLFLAG_RW,
+	    &rack_trace_point_config, 0,
+	    "What is the trace point number to activate (0=none, 0xffffffff = all)?");
+	SYSCTL_ADD_U32(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_tracepoint),
+	    OID_AUTO, "bbmode", CTLFLAG_RW,
+	    &rack_trace_point_bb_mode, 4,
+	    "What is BB logging mode that is activated?");
+	SYSCTL_ADD_S32(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_tracepoint),
+	    OID_AUTO, "count", CTLFLAG_RW,
+	    &rack_trace_point_count, 0,
+	    "How many connections will have BB logging turned on that hit the tracepoint?");
 	/* Pacing related sysctls */
 	rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
@@ -10286,6 +10333,7 @@ rack_collapsed_window(struct tcp_rack *rack)
 #endif
 	tcp_seq max_seq;
 
+	rack_trace_point(rack, RACK_TP_COLLAPSED_WND);
 	max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
 	memset(&fe, 0, sizeof(fe));
 	fe.r_start = max_seq;
@@ -15983,6 +16031,10 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
 	}
 	counter_u64_add(rack_fto_rsm_send, 1);
 	if (error && (error == ENOBUFS)) {
+		if (rack->r_ctl.crte != NULL) {
+			rack_trace_point(rack, RACK_TP_HWENOBUF);
+		} else
+			rack_trace_point(rack, RACK_TP_ENOBUF);
 		slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
 		if (rack->rc_enobuf < 0x7f)
 			rack->rc_enobuf++;
@@ -18839,6 +18891,10 @@ nomore:
 			 * Pace us right away to retry in a some
 			 * time
 			 */
+			if (rack->r_ctl.crte != NULL) {
+				rack_trace_point(rack, RACK_TP_HWENOBUF);
+			} else
+				rack_trace_point(rack, RACK_TP_ENOBUF);
 			slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
 			if (rack->rc_enobuf < 0x7f)
 				rack->rc_enobuf++;
diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h
index ad3c4d6883fb..91aefea0a81d 100644
--- a/sys/netinet/tcp_stacks/tcp_rack.h
+++ b/sys/netinet/tcp_stacks/tcp_rack.h
@@ -261,6 +261,36 @@ struct rack_opts_stats {
 #define RACK_QUALITY_PROBERTT	4	/* A measurement where we went into or exited probe RTT */
 #define RACK_QUALITY_ALLACKED	5	/* All data is now acknowledged */
 
+/*********************/
+/* Rack Trace points */
+/*********************/
+/*
+ * Rack trace points are interesting points within
+ * the rack code that the author/debugger may want
+ * to have BB logging enabled if we hit that point.
+ * In order to enable a trace point you set the
+ * sysctl var net.inet.tcp.<stack>.tp.number to
+ * one of the numbers listed below. You also
+ * must make sure net.inet.tcp.<stack>.tp.bbmode is
+ * non-zero, the default is 4 for continuous tracing.
+ * You also set the number of connections you want
+ * to get BB logs in net.inet.tcp.<stack>.tp.count.
+ * 
+ * Count will decrement every time BB logging is assigned
+ * to a connection that hit your tracepoint.
+ *
+ * You can enable all trace points by setting the number
+ * to 0xffffffff. You can disable all trace points by
+ * setting number to zero (or count to 0).
+ *
+ * Below is the enumerated list of tracepoints that
+ * have currently been defined in the code. Add more
+ * as you add a call to rack_trace_point(rack, <name>);
+ * where <name> is defined below.
+ */
+#define RACK_TP_HWENOBUF	0x00000001	/* When we are doing hardware pacing and hit enobufs */
+#define RACK_TP_ENOBUF		0x00000002	/* When we hit enobufs with software pacing */
+#define RACK_TP_COLLAPSED_WND	0x00000003	/* When a peer collapses its rwnd on us */
 
 #define MIN_GP_WIN 6	/* We need at least 6 MSS in a GP measurement */
 #ifdef _KERNEL