git: 73ee5756dee6 - main - Fixes in the tcp infrastructure with respect to stack changes as well as other infrastructure updates for incoming rack features.

From: Randall Stewart <rrs_at_FreeBSD.org>
Date: Tue, 04 Apr 2023 12:31:46 UTC
The branch main has been updated by rrs:

URL: https://cgit.FreeBSD.org/src/commit/?id=73ee5756dee6b2110eb6fb2b2ef3cde39a1fcb4f

commit 73ee5756dee6b2110eb6fb2b2ef3cde39a1fcb4f
Author:     Randall Stewart <rrs@FreeBSD.org>
AuthorDate: 2023-04-01 05:46:38 +0000
Commit:     Randall Stewart <rrs@FreeBSD.org>
CommitDate: 2023-04-01 05:46:38 +0000

    Fixes in the tcp infrastructure with respect to stack changes as well as other infrastructure updates for incoming rack features.
    
    So stack switching has always been a bit of an issue. We currently use a break before make setup which means that
    if something goes wrong you have to try to get back to a stack. This patch among a lot of other things changes that so
    that it is a make before break. We also expand some of the function blocks in prep for new features in rack that will allow
    more controlled pacing. We also add other abilities such as the pathway for a stack to query a previous stack to acquire from
    it critical state information so things in flight don't get dropped or mis-handled when switching stacks. We also add the
    concept of a timer granularity. This allows an alternate stack to change from the old ticks granularity to microseconds and
    of course this even gives us a pathway to go to nanosecond timekeeping if we need to (something for the data center to consider
    for sure).
    
    Once all this lands I will then update rack to begin using all these new features.
    
    Reviewed by: tuexen
    Sponsored by: Netflix Inc
    Differential Revision: https://reviews.freebsd.org/D39210
---
 sys/conf/options              |   1 +
 sys/kern/kern_sendfile.c      |   9 +
 sys/modules/tcp/rack/Makefile |   2 +-
 sys/netinet/tcp.h             |  66 ++++-
 sys/netinet/tcp_hpts.h        |   9 +
 sys/netinet/tcp_log_buf.c     |  89 +++++++
 sys/netinet/tcp_stacks/bbr.c  | 110 ++++++--
 sys/netinet/tcp_stacks/rack.c |  12 +-
 sys/netinet/tcp_subr.c        | 593 ++++++++++++++++++++++++++++++++++++++++--
 sys/netinet/tcp_syncache.c    |  29 ++-
 sys/netinet/tcp_usrreq.c      |  66 +++--
 sys/netinet/tcp_var.h         | 274 ++++++++++++++++++-
 sys/sys/mbuf.h                |  14 +-
 13 files changed, 1172 insertions(+), 102 deletions(-)

diff --git a/sys/conf/options b/sys/conf/options
index 173c56229084..40bb1e56e8b0 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -227,6 +227,7 @@ SYSVSEM		opt_sysvipc.h
 SYSVSHM		opt_sysvipc.h
 SW_WATCHDOG	opt_watchdog.h
 TCPHPTS         opt_inet.h
+TCP_REQUEST_TRK opt_global.h
 TCP_ACCOUNTING	opt_inet.h
 TURNSTILE_PROFILING
 UMTX_PROFILING
diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c
index 12842e3476e1..9804d14d675d 100644
--- a/sys/kern/kern_sendfile.c
+++ b/sys/kern/kern_sendfile.c
@@ -57,6 +57,9 @@ __FBSDID("$FreeBSD$");
 #include <net/vnet.h>
 #include <netinet/in.h>
 #include <netinet/tcp.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_log_buf.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
@@ -1188,6 +1191,12 @@ prepend_header:
 			    NULL, NULL, td);
 			sendfile_iodone(sfio, NULL, 0, error);
 		}
+#ifdef TCP_REQUEST_TRK
+		if (so->so_proto->pr_protocol == IPPROTO_TCP) {
+			/* log the sendfile call to the TCP log, if enabled */
+			tcp_log_sendfile(so, offset, nbytes, flags);
+		}
+#endif
 		CURVNET_RESTORE();
 
 		m = NULL;
diff --git a/sys/modules/tcp/rack/Makefile b/sys/modules/tcp/rack/Makefile
index 68ce40cc074e..cf95faa7fcfd 100644
--- a/sys/modules/tcp/rack/Makefile
+++ b/sys/modules/tcp/rack/Makefile
@@ -6,7 +6,7 @@
 
 STACKNAME=	rack
 KMOD=	tcp_${STACKNAME}
-SRCS=	rack.c sack_filter.c rack_bbr_common.c
+SRCS=	rack.c sack_filter.c rack_bbr_common.c #tailq_hash.c
 
 SRCS+=	opt_inet.h opt_inet6.h opt_ipsec.h
 SRCS+=	opt_kern_tls.h
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
index 1c34442f2617..bec1dc3552d1 100644
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -217,15 +217,15 @@ struct tcphdr {
 /* Options for Rack and BBR */
 #define	TCP_REUSPORT_LB_NUMA   1026	/* set listen socket numa domain */
 #define TCP_RACK_MBUF_QUEUE   1050 /* Do we allow mbuf queuing if supported */
-#define TCP_RACK_PROP	      1051 /* RACK proportional rate reduction (bool) */
+#define TCP_RACK_PROP	      1051 /* Not used */
 #define TCP_RACK_TLP_REDUCE   1052 /* RACK TLP cwnd reduction (bool) */
 #define TCP_RACK_PACE_REDUCE  1053 /* RACK Pacingv reduction factor (divisor) */
 #define TCP_RACK_PACE_MAX_SEG 1054 /* Max TSO size we will send  */
 #define TCP_RACK_PACE_ALWAYS  1055 /* Use the always pace method */
-#define TCP_RACK_PROP_RATE    1056 /* The proportional reduction rate */
+#define TCP_RACK_PROP_RATE    1056 /* Not used */
 #define TCP_RACK_PRR_SENDALOT 1057 /* Allow PRR to send more than one seg */
 #define TCP_RACK_MIN_TO       1058 /* Minimum time between rack t-o's in ms */
-#define TCP_RACK_EARLY_RECOV  1059 /* Should recovery happen early (bool) */
+#define TCP_RACK_EARLY_RECOV  1059 /* Not used */
 #define TCP_RACK_EARLY_SEG    1060 /* If early recovery max segments */
 #define TCP_RACK_REORD_THRESH 1061 /* RACK reorder threshold (shift amount) */
 #define TCP_RACK_REORD_FADE   1062 /* Does reordering fade after ms time */
@@ -309,12 +309,22 @@ struct tcphdr {
 #define TCP_REC_ABC_VAL 1134	/* Do we use the ABC value for recovery or the override one from sysctl  */
 #define TCP_RACK_MEASURE_CNT 1135 /* How many measurements are required in GP pacing */
 #define TCP_DEFER_OPTIONS 1136 /* Defer options until the proper number of measurements occur, does not defer TCP_RACK_MEASURE_CNT */
-#define TCP_FAST_RSM_HACK 1137 /* Do we do the broken thing where we don't twiddle the TLP bits properly in fast_rsm_output? */
+#define TCP_FAST_RSM_HACK 1137	/* Not used in modern stacks */
 #define TCP_RACK_PACING_BETA 1138	/* Changing the beta for pacing */
 #define TCP_RACK_PACING_BETA_ECN 1139	/* Changing the beta for ecn with pacing */
 #define TCP_RACK_TIMER_SLOP 1140	/* Set or get the timer slop used */
 #define TCP_RACK_DSACK_OPT 1141		/* How do we setup rack timer DSACK options bit 1/2 */
 #define TCP_RACK_ENABLE_HYSTART 1142	/* Do we allow hystart in the CC modules */
+#define TCP_RACK_SET_RXT_OPTIONS 1143	/* Set the bits in the retransmit options */
+#define TCP_RACK_HI_BETA 1144 /* Turn on/off high beta */
+#define TCP_RACK_SPLIT_LIMIT 1145	/* Set a split limit for split allocations */
+#define TCP_RACK_PACING_DIVISOR 1146 /* Pacing divisor given to rate-limit code for burst sizing */
+#define TCP_RACK_PACE_MIN_SEG 1147	/* Pacing min seg size rack will use */
+#define TCP_RACK_DGP_IN_REC 1148	/* Do we use full DGP in recovery? */
+#define TCP_RXT_CLAMP 1149 /* Do we apply a threshold to rack so if excess rxt clamp cwnd? */
+#define TCP_HYBRID_PACING   1150	/* Hybrid pacing enablement */
+#define TCP_PACING_DND	    1151	/* When pacing with rr_config=3 can sacks disturb us */
+
 /* Start of reserved space for third-party user-settable options. */
 #define	TCP_VENDOR	SO_VENDOR
 
@@ -447,6 +457,53 @@ struct tcp_function_set {
 #define	TLS_SET_RECORD_TYPE	1
 #define	TLS_GET_RECORD		2
 
+/*
+ * TCP log user opaque
+ */
+struct http_req {
+	uint64_t timestamp;
+	uint64_t start;
+	uint64_t end;
+	uint32_t flags;
+};
+
+union tcp_log_userdata {
+	struct http_req http_req;
+};
+
+struct tcp_log_user {
+	uint32_t type;
+	uint32_t subtype;
+	union tcp_log_userdata data;
+};
+
+/* user types, i.e. apps */
+#define TCP_LOG_USER_HTTPD	1
+
+/* user subtypes */
+#define TCP_LOG_HTTPD_TS	1	/* client timestamp */
+#define TCP_LOG_HTTPD_TS_REQ	2	/* client timestamp and request info */
+
+/* HTTPD REQ flags */
+#define TCP_LOG_HTTPD_RANGE_START	0x0001
+#define TCP_LOG_HTTPD_RANGE_END		0x0002
+
+/* Flags for hybrid pacing */
+#define TCP_HYBRID_PACING_CU		0x0001		/* Enable catch-up mode */
+#define TCP_HYBRID_PACING_DTL		0x0002		/* Enable Detailed logging */
+#define TCP_HYBRID_PACING_CSPR		0x0004		/* A client suggested rate is present  */
+#define TCP_HYBRID_PACING_H_MS		0x0008		/* A client hint for maxseg is present  */
+#define TCP_HYBRID_PACING_ENABLE	0x0010		/* We are enabling hybrid pacing else disable */
+#define TCP_HYBRID_PACING_S_MSS		0x0020		/* Client wants us to set the mss overriding gp est in CU */
+#define TCP_HYBRID_PACING_SETMSS	0x1000		/* Internal flag that tells us we set the mss on this entry */
+
+struct tcp_hybrid_req {
+	struct http_req req;
+	uint64_t cspr;
+	uint32_t hint_maxseg;
+	uint32_t hybrid_flags;
+};
+
 /*
  * TCP specific variables of interest for tp->t_stats stats(9) accounting.
  */
@@ -460,6 +517,7 @@ struct tcp_function_set {
 #define	VOI_TCP_CALCFRWINDIFF	7 /* Congestion avoidance LCWIN - FRWIN */
 #define	VOI_TCP_GPUT_ND		8 /* Goodput normalised delta */
 #define	VOI_TCP_ACKLEN		9 /* Average ACKed bytes per ACK */
+#define VOI_TCP_PATHRTT		10 /* The path RTT based on ACK arrival */
 
 #define TCP_REUSPORT_LB_NUMA_NODOM	(-2) /* remove numa binding */
 #define TCP_REUSPORT_LB_NUMA_CURDOM	(-1) /* bind to current domain */
diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h
index ebee6a01b983..51e6d62929d6 100644
--- a/sys/netinet/tcp_hpts.h
+++ b/sys/netinet/tcp_hpts.h
@@ -187,6 +187,15 @@ tcp_tv_to_lusectick(const struct timeval *sv)
 }
 
 #ifdef _KERNEL
+
+extern int32_t tcp_min_hptsi_time;
+
+static __inline int32_t
+get_hpts_min_sleep_time(void)
+{
+	return (tcp_min_hptsi_time + HPTS_TICKS_PER_SLOT);	/* HPTS minimum time plus one slot */
+}
+
 static __inline uint32_t
 tcp_gethptstick(struct timeval *sv)
 {
diff --git a/sys/netinet/tcp_log_buf.c b/sys/netinet/tcp_log_buf.c
index 491e1c23588c..5a16c7593cfc 100644
--- a/sys/netinet/tcp_log_buf.c
+++ b/sys/netinet/tcp_log_buf.c
@@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$");
 #include <netinet/in_var.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_log_buf.h>
+#include <netinet/tcp_seq.h>
 #include <netinet/tcp_hpts.h>
 
 /* Default expiry time */
@@ -2844,6 +2845,10 @@ tcp_log_sendfile(struct socket *so, off_t offset, size_t nbytes, int flags)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp;
+#ifdef TCP_REQUEST_TRK
+	struct http_sendfile_track *ent;
+	int i, fnd;
+#endif
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_log_sendfile: inp == NULL"));
@@ -2873,6 +2878,90 @@ tcp_log_sendfile(struct socket *so, off_t offset, size_t nbytes, int flags)
 		    &tptosocket(tp)->so_snd,
 		    TCP_LOG_SENDFILE, 0, 0, &log, false, &tv);
 	}
+#ifdef TCP_REQUEST_TRK
+	if (tp->t_http_req == 0) {
+		/* No http requests to track */
+		goto done;
+	}
+	fnd = 0;
+	if (tp->t_http_closed == 0) {
+		/* No closed end req to track */
+		goto skip_closed_req;
+	}
+	for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+		/* Lets see if this one can be found */
+		ent = &tp->t_http_info[i];
+		if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) {
+			/* Not used */
+			continue;
+		}
+		if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN) {
+			/* This pass does not consider open requests */
+			continue;
+		}
+		if (ent->flags & TCP_HTTP_TRACK_FLG_COMP) {
+			/* Don't look at what we have completed */
+			continue;
+		}
+		/* If we reach here its a allocated closed end request */
+		if ((ent->start == offset) || 
+		    ((offset > ent->start) && (offset < ent->end))){
+			/* Its within this request?? */
+			fnd = 1;
+		}
+		if (fnd) {
+			/*
+			 * It is at or past the end, its complete.
+			 */
+			ent->flags |= TCP_HTTP_TRACK_FLG_SEQV;
+			/*
+			 * When an entry completes we can take (snd_una + sb_cc) and know where
+			 * the end of the range really is. Note that this works since two
+			 * requests must be sequential and sendfile now is complete for *this* request.
+			 * we must use sb_ccc since the data may still be in-flight in TLS.
+			 *
+			 * We always cautiously move the end_seq only if our calculations
+			 * show it happened (just in case sf has the call to here at the wrong
+			 * place). When we go COMP we will stop coming here and hopefully be
+			 * left with the correct end_seq.
+			 */
+			if (SEQ_GT((tp->snd_una + so->so_snd.sb_ccc), ent->end_seq))
+				ent->end_seq = tp->snd_una + so->so_snd.sb_ccc;
+			if ((offset + nbytes) >= ent->end) {
+				ent->flags |= TCP_HTTP_TRACK_FLG_COMP;
+				tcp_http_log_req_info(tp, ent, i, TCP_HTTP_REQ_LOG_COMPLETE, offset, nbytes);
+			} else {
+				tcp_http_log_req_info(tp, ent, i, TCP_HTTP_REQ_LOG_MOREYET, offset, nbytes);
+			}
+			/* We assume that sendfile never sends overlapping requests */
+			goto done;
+		}
+	}
+skip_closed_req:
+	if (!fnd) {
+		/* Ok now lets look for open requests */
+		for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+			ent = &tp->t_http_info[i];
+			if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) {
+				/* Not used */
+				continue;
+			}
+			if ((ent->flags & TCP_HTTP_TRACK_FLG_OPEN) == 0)
+				continue;
+			/* If we reach here its an allocated open request */
+			if (ent->start == offset) {
+				/* It begins this request */
+				ent->start_seq = tp->snd_una +
+				    tptosocket(tp)->so_snd.sb_ccc;
+				ent->flags |= TCP_HTTP_TRACK_FLG_SEQV;
+				break;
+			} else if (offset > ent->start) {
+				ent->flags |= TCP_HTTP_TRACK_FLG_SEQV;
+				break;
+			}
+		}
+	}
+#endif
 done:
 	INP_WUNLOCK(inp);
 }
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index 66f19ccd6c2b..621357494a02 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -500,7 +500,7 @@ static void
 bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts,
 		  int32_t line);
 static void
-bbr_stop_all_timers(struct tcpcb *tp);
+bbr_stop_all_timers(struct tcpcb *tp, struct tcp_bbr *bbr);
 static void
 bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts);
 static void
@@ -1970,7 +1970,7 @@ bbr_log_type_enter_rec(struct tcp_bbr *bbr, uint32_t seq)
 static void
 bbr_log_msgsize_fail(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t len, uint32_t maxseg, uint32_t mtu, int32_t csum_flags, int32_t tso, uint32_t cts)
 {
-	if (tcp_bblogging_on(bbr->rc_tp)) {
+	if (tcp_bblogging_on(tp)) {
 		union tcp_log_stackspecific log;
 
 		bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
@@ -2669,7 +2669,7 @@ bbr_log_type_ltbw(struct tcp_bbr *bbr, uint32_t cts, int32_t reason,
 	uint32_t newbw, uint32_t obw, uint32_t diff,
 	uint32_t tim)
 {
-	if (tcp_bblogging_on(bbr->rc_tp)) {
+	if (/*bbr_verbose_logging && */tcp_bblogging_on(bbr->rc_tp)) {
 		union tcp_log_stackspecific log;
 
 		bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
@@ -2697,7 +2697,7 @@ bbr_log_type_ltbw(struct tcp_bbr *bbr, uint32_t cts, int32_t reason,
 static inline void
 bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line)
 {
-	if (tcp_bblogging_on(bbr->rc_tp)) {
+	if (bbr_verbose_logging && tcp_bblogging_on(bbr->rc_tp)) {
 		union tcp_log_stackspecific log;
 
 		bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
@@ -6281,6 +6281,9 @@ tcp_bbr_xmit_timer_commit(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts)
 		else
 			apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts);
 	}
+#ifdef STATS
+	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_PATHRTT, imax(0, rtt));
+#endif
 	if (bbr->rc_ack_was_delayed)
 		rtt += bbr->r_ctl.rc_ack_hdwr_delay;
 
@@ -9850,16 +9853,13 @@ bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
 }
 
 static void
-bbr_stop_all_timers(struct tcpcb *tp)
+bbr_stop_all_timers(struct tcpcb *tp, struct tcp_bbr *bbr)
 {
-	struct tcp_bbr *bbr;
-
 	/*
 	 * Assure no timers are running.
 	 */
 	if (tcp_timer_active(tp, TT_PERSIST)) {
 		/* We enter in persists, set the flag appropriately */
-		bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 		bbr->rc_in_persist = 1;
 	}
 }
@@ -9927,14 +9927,14 @@ bbr_google_mode_off(struct tcp_bbr *bbr)
  * which indicates the error (usually no memory).
  */
 static int
-bbr_init(struct tcpcb *tp)
+bbr_init(struct tcpcb *tp, void **ptr)
 {
 	struct inpcb *inp = tptoinpcb(tp);
 	struct tcp_bbr *bbr = NULL;
 	uint32_t cts;
 
-	tp->t_fb_ptr = uma_zalloc(bbr_pcb_zone, (M_NOWAIT | M_ZERO));
-	if (tp->t_fb_ptr == NULL) {
+	*ptr = uma_zalloc(bbr_pcb_zone, (M_NOWAIT | M_ZERO));
+	if (*ptr == NULL) {
 		/*
 		 * We need to allocate memory but cant. The INP and INP_INFO
 		 * locks and they are recursive (happens during setup. So a
@@ -9943,10 +9943,16 @@ bbr_init(struct tcpcb *tp)
 		 */
 		return (ENOMEM);
 	}
-	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+	bbr = (struct tcp_bbr *)*ptr;
 	bbr->rtt_valid = 0;
 	inp->inp_flags2 |= INP_CANNOT_DO_ECN;
 	inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
+	/* Take off any undesired flags */
+	inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
+	inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
+	inp->inp_flags2 &= ~INP_MBUF_ACKCMP;
+	inp->inp_flags2 &= ~INP_MBUF_L_ACKS;
+
 	TAILQ_INIT(&bbr->r_ctl.rc_map);
 	TAILQ_INIT(&bbr->r_ctl.rc_free);
 	TAILQ_INIT(&bbr->r_ctl.rc_tmap);
@@ -10074,8 +10080,8 @@ bbr_init(struct tcpcb *tp)
 
 		rsm = bbr_alloc(bbr);
 		if (rsm == NULL) {
-			uma_zfree(bbr_pcb_zone, tp->t_fb_ptr);
-			tp->t_fb_ptr = NULL;
+			uma_zfree(bbr_pcb_zone, *ptr);
+			*ptr = NULL;
 			return (ENOMEM);
 		}
 		rsm->r_rtt_not_allowed = 1;
@@ -10128,7 +10134,17 @@ bbr_init(struct tcpcb *tp)
 	 * the TCB on the hptsi wheel if a timer is needed with appropriate
 	 * flags.
 	 */
-	bbr_stop_all_timers(tp);
+	bbr_stop_all_timers(tp, bbr);
+	/* 
+	 * Validate the timers are not in usec, if they are convert.
+	 * BBR should in theory move to USEC and get rid of a
+	 * lot of the TICKS_2 calls.. but for now we stay
+	 * with tick timers.
+	 */
+	tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS);
+	TCPT_RANGESET(tp->t_rxtcur,
+	    ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
+	    tp->t_rttmin, TCPTV_REXMTMAX);
 	bbr_start_hpts_timer(bbr, tp, cts, 5, 0, 0);
 	return (0);
 }
@@ -10172,7 +10188,6 @@ static void
 bbr_fini(struct tcpcb *tp, int32_t tcb_is_purged)
 {
 	if (tp->t_fb_ptr) {
-		struct inpcb *inp = tptoinpcb(tp);
 		uint32_t calc;
 		struct tcp_bbr *bbr;
 		struct bbr_sendmap *rsm;
@@ -10182,10 +10197,6 @@ bbr_fini(struct tcpcb *tp, int32_t tcb_is_purged)
 			tcp_rel_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp);
 		bbr_log_flowend(bbr);
 		bbr->rc_tp = NULL;
-		/* Backout any flags2 we applied */
-		inp->inp_flags2 &= ~INP_CANNOT_DO_ECN;
-		inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
-		inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
 		if (bbr->bbr_hdrw_pacing)
 			counter_u64_add(bbr_flows_whdwr_pacing, -1);
 		else
@@ -11853,7 +11864,6 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
 	int32_t isipv6;
 #endif
 	uint8_t app_limited = BBR_JR_SENT_DATA;
-	uint8_t filled_all = 0;
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
 	/* We take a cache hit here */
 	memcpy(&bbr->rc_tv, tv, sizeof(struct timeval));
@@ -13162,7 +13172,7 @@ send:
 				if_hw_tsomaxsegsize, msb,
 				((rsm == NULL) ? hw_tls : 0)
 #ifdef NETFLIX_COPY_ARGS
-				, &filled_all
+				, NULL, NULL
 #endif
 				);
 			if (len <= maxseg) {
@@ -13474,7 +13484,7 @@ send:
 #endif
 
 	/* Log to the black box */
-	if (tcp_bblogging_on(bbr->rc_tp)) {
+	if (tcp_bblogging_on(tp)) {
 		union tcp_log_stackspecific log;
 
 		bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
@@ -13483,13 +13493,10 @@ send:
 		log.u_bbr.flex2 = (bbr->r_recovery_bw << 3);
 		log.u_bbr.flex3 = maxseg;
 		log.u_bbr.flex4 = delay_calc;
-		/* Encode filled_all into the upper flex5 bit */
 		log.u_bbr.flex5 = bbr->rc_past_init_win;
 		log.u_bbr.flex5 <<= 1;
 		log.u_bbr.flex5 |= bbr->rc_no_pacing;
 		log.u_bbr.flex5 <<= 29;
-		if (filled_all)
-			log.u_bbr.flex5 |= 0x80000000;
 		log.u_bbr.flex5 |= tp->t_maxseg;
 		log.u_bbr.flex6 = bbr->r_ctl.rc_pace_max_segs;
 		log.u_bbr.flex7 = (bbr->rc_bbr_state << 8) | bbr_state_val(bbr);
@@ -14073,6 +14080,56 @@ bbr_pru_options(struct tcpcb *tp, int flags)
 	return (0);
 }
 
+static void
+bbr_switch_failed(struct tcpcb *tp)
+{
+	/*
+	 * If a stack switch fails we only need to make sure
+	 * mbuf queuing is still flagged on the inp and that
+	 * we are still in ticks granularity (though we
+	 * should probably change bbr to go to USECs).
+	 * INP_CANNOT_DO_ECN is re-asserted as well.
+	 *
+	 * For timers: if the inp is still in the pacer (inp_in_hpts)
+	 * we are done; otherwise re-insert it using the time left until
+	 * rc_pacer_started or rc_timer_exp, or one slot if that passed.
+	 */
+	struct inpcb *inp = tptoinpcb(tp);
+	struct timeval tv;
+	uint32_t cts;
+	uint32_t toval;
+	struct tcp_bbr *bbr;
+	struct hpts_diag diag;
+
+	inp->inp_flags2 |= INP_CANNOT_DO_ECN;
+	inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
+	tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS);
+	if (inp->inp_in_hpts) {
+		return;
+	}
+	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+	cts = tcp_get_usecs(&tv);
+	if (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
+		if (TSTMP_GT(bbr->rc_pacer_started, cts)) {
+			toval = bbr->rc_pacer_started - cts;
+		} else {
+			/* Not in the future any more: ask for one slot */
+			toval = HPTS_TICKS_PER_SLOT;
+		}
+	} else if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
+		if (TSTMP_GT(bbr->r_ctl.rc_timer_exp, cts)) {
+			toval = bbr->r_ctl.rc_timer_exp - cts;
+		} else {
+			/* Not in the future any more: ask for one slot */
+			toval = HPTS_TICKS_PER_SLOT;
+		}
+	} else
+		toval = HPTS_TICKS_PER_SLOT;	/* neither output nor timer flagged */
+	(void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(toval),
+				   __LINE__, &diag);
+	bbr_log_hpts_diag(bbr, cts, &diag);
+}
+
 struct tcp_function_block __tcp_bbr = {
 	.tfb_tcp_block_name = __XSTRING(STACKNAME),
 	.tfb_tcp_output = bbr_output,
@@ -14087,6 +14144,7 @@ struct tcp_function_block __tcp_bbr = {
 	.tfb_tcp_handoff_ok = bbr_handoff_ok,
 	.tfb_tcp_mtu_chg = bbr_mtu_chg,
 	.tfb_pru_options = bbr_pru_options,
+	.tfb_switch_failed = bbr_switch_failed,
 	.tfb_flags = TCP_FUNC_OUTPUT_CANDROP,
 };
 
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index d4ba3771ab6e..8b205d12d7f7 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -458,7 +458,7 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
 static uint32_t
 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
 static int32_t rack_handoff_ok(struct tcpcb *tp);
-static int32_t rack_init(struct tcpcb *tp);
+static int32_t rack_init(struct tcpcb *tp, void **ptr);
 static void rack_init_sysctls(void);
 static void
 rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
@@ -12344,7 +12344,7 @@ rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack)
 }
 
 static int
-rack_init(struct tcpcb *tp)
+rack_init(struct tcpcb *tp, void **ptr)
 {
 	struct inpcb *inp = tptoinpcb(tp);
 	struct tcp_rack *rack = NULL;
@@ -12354,8 +12354,8 @@ rack_init(struct tcpcb *tp)
 	uint32_t iwin, snt, us_cts;
 	int err;
 
-	tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
-	if (tp->t_fb_ptr == NULL) {
+	*ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
+	if (*ptr == NULL) {
 		/*
 		 * We need to allocate memory but cant. The INP and INP_INFO
 		 * locks and they are recursive (happens during setup. So a
@@ -12364,9 +12364,9 @@ rack_init(struct tcpcb *tp)
 		return (ENOMEM);
 	}
-	memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
+	memset(*ptr, 0, sizeof(struct tcp_rack));	/* zero the allocation, not the caller's slot */
 
-	rack = (struct tcp_rack *)tp->t_fb_ptr;
+	rack = (struct tcp_rack *)*ptr;
 	RB_INIT(&rack->r_ctl.rc_mtree);
 	TAILQ_INIT(&rack->r_ctl.rc_free);
 	TAILQ_INIT(&rack->r_ctl.rc_tmap);
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 4abc0776b14e..1f2256c6b6f9 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -109,6 +109,7 @@ __FBSDID("$FreeBSD$");
 #include <netinet/tcp_log_buf.h>
 #include <netinet/tcp_syncache.h>
 #include <netinet/tcp_hpts.h>
+#include <netinet/tcp_lro.h>
 #include <netinet/cc/cc.h>
 #include <netinet/tcpip.h>
 #include <netinet/tcp_fastopen.h>
@@ -152,6 +153,11 @@ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, force_detection,
     CTLFLAG_RW,
     &tcp_force_detection, 0,
     "Do we force detection even if the INP has it off?");
+int32_t tcp_sad_limit = 10000;
+SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, limit,
+    CTLFLAG_RW,
+    &tcp_sad_limit, 10000,
+    "If SaD is enabled, what is the limit to sendmap entries (0 = unlimited)?");
 int32_t tcp_sack_to_ack_thresh = 700;	/* 70 % */
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sack_to_ack_thresh,
     CTLFLAG_RW,
@@ -363,7 +369,7 @@ VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]);
 VNET_DEFINE_STATIC(u_char, ts_offset_secret[TS_OFFSET_SECRET_LENGTH]);
 #define	V_ts_offset_secret	VNET(ts_offset_secret)
 
-static int	tcp_default_fb_init(struct tcpcb *tp);
+static int	tcp_default_fb_init(struct tcpcb *tp, void **ptr);
 static void	tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged);
 static int	tcp_default_handoff_ok(struct tcpcb *tp);
 static struct inpcb *tcp_notify(struct inpcb *, int);
@@ -519,18 +525,11 @@ void
 tcp_switch_back_to_default(struct tcpcb *tp)
 {
 	struct tcp_function_block *tfb;
+	void *ptr = NULL;
 
 	KASSERT(tp->t_fb != &tcp_def_funcblk,
 	    ("%s: called by the built-in default stack", __func__));
 
-	/*
-	 * Release the old stack. This function will either find a new one
-	 * or panic.
-	 */
-	if (tp->t_fb->tfb_tcp_fb_fini != NULL)
-		(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
-	refcount_release(&tp->t_fb->tfb_refcnt);
-
 	/*
 	 * Now, we'll find a new function block to use.
 	 * Start by trying the current user-selected
@@ -551,14 +550,20 @@ tcp_switch_back_to_default(struct tcpcb *tp)
 	/* Try to use that stack. */
 	if (tfb != NULL) {
 		/* Initialize the new stack. If it succeeds, we are done. */
-		tp->t_fb = tfb;
-		if (tp->t_fb->tfb_tcp_fb_init == NULL ||
-		    (*tp->t_fb->tfb_tcp_fb_init)(tp) == 0)
+		if (tfb->tfb_tcp_fb_init == NULL ||
+		    (*tfb->tfb_tcp_fb_init)(tp, &ptr) == 0) {
+			/* Release the old stack */
+			if (tp->t_fb->tfb_tcp_fb_fini != NULL)
+				(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+			refcount_release(&tp->t_fb->tfb_refcnt);
+			/* Now set in all the pointers */
+			tp->t_fb = tfb;
+			tp->t_fb_ptr = ptr;
 			return;
-
+		}
 		/*
 		 * Initialization failed. Release the reference count on
-		 * the stack.
+		 * the looked up default stack.
 		 */
 		refcount_release(&tfb->tfb_refcnt);
 	}
@@ -578,12 +583,18 @@ tcp_switch_back_to_default(struct tcpcb *tp)
 			panic("Default stack rejects a new session?");
 		}
 	}
-	tp->t_fb = tfb;
-	if (tp->t_fb->tfb_tcp_fb_init != NULL &&
-	    (*tp->t_fb->tfb_tcp_fb_init)(tp)) {
+	if (tfb->tfb_tcp_fb_init != NULL &&
+	    (*tfb->tfb_tcp_fb_init)(tp, &ptr)) {
 		/* The default stack cannot fail */
 		panic("Default stack initialization failed");
 	}
+	/* Now release the old stack */
+	if (tp->t_fb->tfb_tcp_fb_fini != NULL)
+		(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+	refcount_release(&tp->t_fb->tfb_refcnt);
+	/* And set in the pointers to the new */
+	tp->t_fb = tfb;
+	tp->t_fb_ptr = ptr;
 }
 
 static bool
@@ -1040,16 +1051,37 @@ tcp_default_handoff_ok(struct tcpcb *tp)
  * it is required to always succeed since it is the stack of last resort!
  */
 static int
-tcp_default_fb_init(struct tcpcb *tp)
+tcp_default_fb_init(struct tcpcb *tp, void **ptr)
 {
 	struct socket *so = tptosocket(tp);
+	int rexmt;
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
+	/* We don't use the pointer */
+	*ptr = NULL;
 
 	KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT,
 	    ("%s: connection %p in unexpected state %d", __func__, tp,
 	    tp->t_state));
 
+	/* Make sure we get no interesting mbuf queuing behavior */
+	/* All mbuf queue/ack compress flags should be off */
+	tcp_lro_features_off(tptoinpcb(tp));
+
+	/* Cancel the GP measurement in progress */
+	tp->t_flags &= ~TF_GPUTINPROG;
+	/* Validate the timers are not in usec, if they are convert */
+	tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS);
+	if ((tp->t_state == TCPS_SYN_SENT) ||
+	    (tp->t_state == TCPS_SYN_RECEIVED))
+		rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
+	else
+		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
+	if (tp->t_rxtshift == 0)
+		tp->t_rxtcur = rexmt;
+	else
+		TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX);
+
 	/*
 	 * Nothing to do for ESTABLISHED or LISTEN states. And, we don't
 	 * know what to do for unexpected states (which includes TIME_WAIT).
@@ -2240,6 +2272,8 @@ tcp_newtcpcb(struct inpcb *inp)
 	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->t_rcvtime = ticks;
+	/* We always start with ticks granularity */
+	tp->t_tmr_granularity = TCP_TMR_GRANULARITY_TICKS;
 	/*
 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
 	 * because the socket may be bound to an IPv6 wildcard address,
@@ -2265,7 +2299,7 @@ tcp_newtcpcb(struct inpcb *inp)
 #endif
 	tp->t_pacing_rate = -1;
 	if (tp->t_fb->tfb_tcp_fb_init) {
-		if ((*tp->t_fb->tfb_tcp_fb_init)(tp)) {
+		if ((*tp->t_fb->tfb_tcp_fb_init)(tp, &tp->t_fb_ptr)) {
 			refcount_release(&tp->t_fb->tfb_refcnt);
 			return (NULL);
 		}
@@ -4019,3 +4053,524 @@ tcp_do_ack_accounting(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, ui
 	}
 }
 #endif
+
+void
+tcp_change_time_units(struct tcpcb *tp, int granularity)
+{
+	if (tp->t_tmr_granularity == granularity) {
+		/* Already in the requested units, nothing to convert */
+		return;
+	}
+	if (granularity == TCP_TMR_GRANULARITY_USEC) {
+		KASSERT((tp->t_tmr_granularity == TCP_TMR_GRANULARITY_TICKS),
+			("Granularity is not TICKS its %u in tp:%p",
+			 tp->t_tmr_granularity, tp));
+		tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow);
+		if (tp->t_srtt > 1) {
+			uint32_t val, frac;
+
+			val = tp->t_srtt >> TCP_RTT_SHIFT;
+			frac = tp->t_srtt & 0x1f;
+			tp->t_srtt = TICKS_2_USEC(val);
+			/*
+			 * frac is the fractional part of the srtt (if any)
+			 * but it is in ticks and every bit represents
+			 * 1/32nd of a tick.
+			 */
+			if (frac) {
+				if (hz == 1000) {
+					frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
+				} else {
+					frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
+				}
+				tp->t_srtt += frac;
+			}
+		}
+		if (tp->t_rttvar) {
+			uint32_t val, frac;
+
+			val = tp->t_rttvar >> TCP_RTTVAR_SHIFT;
+			frac = tp->t_rttvar & 0x1f;
+			tp->t_rttvar = TICKS_2_USEC(val);
+			/*
+			 * frac is the fractional part of the rttvar (if any),
+			 * still in scaled ticks. NOTE(review): the 0x1f mask and
+			 * TCP_RTT_SCALE mirror srtt; confirm vs TCP_RTTVAR_SHIFT.
+			 */
+			if (frac) {
+				if (hz == 1000) {
+					frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
+				} else {
+					frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
+				}
+				tp->t_rttvar += frac;
+			}
+		}
+		tp->t_tmr_granularity = TCP_TMR_GRANULARITY_USEC;
+	} else if (granularity == TCP_TMR_GRANULARITY_TICKS) {
+		/* Convert back to ticks, keeping the scaled fraction */
+		KASSERT((tp->t_tmr_granularity == TCP_TMR_GRANULARITY_USEC),
+			("Granularity is not USEC its %u in tp:%p",
+			 tp->t_tmr_granularity, tp));
+		if (tp->t_srtt > 1) {
+			uint32_t val, frac;
+
+			val = USEC_2_TICKS(tp->t_srtt);
+			frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz);
+			tp->t_srtt = val << TCP_RTT_SHIFT;
+			/*
+			 * frac is the fractional part here is left
+			 * over from converting to hz and shifting.
+			 * We need to convert this to the 5 bit
+			 * remainder.
+			 */
+			if (frac) {
+				if (hz == 1000) {
+					frac = (((uint64_t)frac *  (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
+				} else {
+					frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
+				}
+				tp->t_srtt += frac;
+			}
+		}
+		if (tp->t_rttvar) {
+			uint32_t val, frac;
+
+			val = USEC_2_TICKS(tp->t_rttvar);
+			frac = tp->t_rttvar % (HPTS_USEC_IN_SEC / hz);
+			tp->t_rttvar = val <<  TCP_RTTVAR_SHIFT;
+			/*
+			 * frac is the sub-tick remainder of the rttvar
+			 * (in usec), taken from t_rttvar itself before it
+			 * was overwritten above. Convert it to the
+			 * scaled remainder.
+			 */
+			if (frac) {
+				if (hz == 1000) {
+					frac = (((uint64_t)frac *  (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
+				} else {
+					frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
+				}
+				tp->t_rttvar += frac;
+			}
+		}
+		tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow);
+		tp->t_tmr_granularity = TCP_TMR_GRANULARITY_TICKS;
+	}
+#ifdef INVARIANTS
+	else {
+		panic("Unknown granularity:%d tp:%p",
+		      granularity, tp);
+	}
+#endif
+}
+
+void
+tcp_handle_orphaned_packets(struct tcpcb *tp)
+{
+	struct mbuf *save, *m, *prev;
+	/*
+	 * Called when a stack switch is occurring from the fini()
+	 * of the old stack. We assume the init() of the new stack
+	 * has already been run and it has set the inp_flags2 to
+	 * what it supports. This function will then deal with any
+	 * differences i.e. cleanup packets that may be queued that
+	 * the new stack does not support.
+	 */
+
+	if (tptoinpcb(tp)->inp_flags2 & INP_MBUF_L_ACKS)
+		return;	/* presumably such a stack can take everything queued; confirm */
+	if ((tptoinpcb(tp)->inp_flags2 & INP_SUPPORTS_MBUFQ) == 0) {
+		/*
+		 * It is unsafe to process the packets since a
+		 * reset may be lurking in them (its rare but it
+		 * can occur). If we were to find a RST, then we
+		 * would end up dropping the connection and the
+		 * INP lock, so when we return the caller (tcp_usrreq)
+		 * will blow up when it tries to unlock the inp.
+		 * This new stack does not do any fancy LRO features
+		 * so all we can do is toss the packets.
+		 */
+		m = tp->t_in_pkt;
+		tp->t_in_pkt = NULL;
+		tp->t_tail_pkt = NULL;
+		while (m) {
+			save = m->m_nextpkt;
+			m->m_nextpkt = NULL;
+			m_freem(m);
+			m = save;
+		}
+	} else {
+		/*
+		 * Here we have a stack that does mbuf queuing but
+		 * does not support compressed ack's. We must
+		 * walk all the mbufs and discard any compressed acks.
+		 */
+		m = tp->t_in_pkt;
+		prev = NULL;
+		while (m) {
+			if (m->m_flags & M_ACKCMP) {
+				/* We must toss this packet */
+				if (tp->t_tail_pkt == m)
+					tp->t_tail_pkt = prev;
+				if (prev)
+					prev->m_nextpkt = m->m_nextpkt;
+				else
+					tp->t_in_pkt =  m->m_nextpkt;
+				m->m_nextpkt = NULL;
+				m_freem(m);
+				/* move forward to whatever now follows prev (or the new head) */
+				if (prev)
+					m = prev->m_nextpkt;
+				else
+					m = tp->t_in_pkt;
+			} else {
+				/* this one is ok */
+				prev = m;
+				m = m->m_nextpkt;
+			}
+		}
+	}
+}
+
+#ifdef TCP_REQUEST_TRK
+uint32_t
+tcp_estimate_tls_overhead(struct socket *so, uint64_t tls_usr_bytes)
+{
+#ifdef KERN_TLS
*** 914 LINES SKIPPED ***