git: 030434acaf46 - main - Update rack to the latest code used at NF.
Date: Tue, 04 Apr 2023 20:05:56 UTC
The branch main has been updated by rrs: URL: https://cgit.FreeBSD.org/src/commit/?id=030434acaf4631c4e205f8bccedcc7f845cbfcbf commit 030434acaf4631c4e205f8bccedcc7f845cbfcbf Author: Randall Stewart <rrs@FreeBSD.org> AuthorDate: 2023-04-04 20:05:46 +0000 Commit: Randall Stewart <rrs@FreeBSD.org> CommitDate: 2023-04-04 20:05:46 +0000 Update rack to the latest code used at NF. There have been many changes to rack over the last couple of years, including: a) Ability when switching stacks to have one stack query another. b) Internal use of micro-second timers instead of ticks. c) Many changes to pacing in forms of 1) Improvements to Dynamic Goodput Pacing (DGP) 2) Improvements to fixed rate paciing 3) A new feature called hybrid pacing where the requestor can get a combination of DGP and fixed rate pacing with deadlines for delivery that can dynamically speed things up. d) All kinds of bugs found during extensive testing and use of the rack stack for streaming video and in fact all data transferred by NF Reviewed by: glebius, gallatin, tuexen Sponsored By: Netflix Inc. Differential Revision: https://reviews.freebsd.org/D39402 --- sys/modules/tcp/rack/Makefile | 2 +- sys/netinet/tcp_stacks/rack.c | 6119 +++++++++++++++++++++++------- sys/netinet/tcp_stacks/rack_bbr_common.c | 34 - sys/netinet/tcp_stacks/rack_bbr_common.h | 3 - sys/netinet/tcp_stacks/tailq_hash.c | 344 ++ sys/netinet/tcp_stacks/tailq_hash.h | 73 + sys/netinet/tcp_stacks/tcp_rack.h | 165 +- sys/netinet/tcp_subr.c | 5 + 8 files changed, 5274 insertions(+), 1471 deletions(-) diff --git a/sys/modules/tcp/rack/Makefile b/sys/modules/tcp/rack/Makefile index cf95faa7fcfd..b80f34ba7ed4 100644 --- a/sys/modules/tcp/rack/Makefile +++ b/sys/modules/tcp/rack/Makefile @@ -6,7 +6,7 @@ STACKNAME= rack KMOD= tcp_${STACKNAME} -SRCS= rack.c sack_filter.c rack_bbr_common.c #tailq_hash.c +SRCS= rack.c sack_filter.c rack_bbr_common.c tailq_hash.c SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h SRCS+= opt_kern_tls.h diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index 8b205d12d7f7..514d10098ff6 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -129,6 +129,7 @@ __FBSDID("$FreeBSD$"); #endif #include "sack_filter.h" #include "tcp_rack.h" +#include "tailq_hash.h" #include "rack_bbr_common.h" uma_zone_t rack_zone; @@ -191,21 +192,38 @@ static int32_t rack_tlp_use_greater = 1; static int32_t rack_reorder_thresh = 2; static int32_t rack_reorder_fade = 60000000; /* 0 - never fade, def 60,000,000 * - 60 seconds */ +static uint32_t rack_clamp_ss_upper = 110; +static uint32_t rack_clamp_ca_upper = 105; +static uint32_t rack_rxt_min_rnds = 10; /* Min rounds if drastic rxt clamp is in place */ +static uint32_t rack_unclamp_round_thresh = 100; /* number of perfect rounds before we unclamp */ +static uint32_t rack_unclamp_rxt_thresh = 5; /* .5% and under */ +static uint64_t rack_rxt_clamp_thresh = 0; /* Do we do the rxt clamp thing */ +static int32_t rack_dnd_default = 0; /* For rr_conf = 3, what is the default for dnd */ +static int32_t rack_rxt_controls = 0; +static int32_t rack_fill_cw_state = 0; static uint8_t rack_req_measurements = 1; /* Attack threshold detections */ static uint32_t rack_highest_sack_thresh_seen = 0; static uint32_t rack_highest_move_thresh_seen = 0; +static uint32_t rack_merge_out_sacks_on_attack = 0; static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */ -static int32_t rack_hw_pace_extra_slots = 2; /* 2 extra MSS time betweens */ -static int32_t 
rack_hw_rate_caps = 1; /* 1; */ +static int32_t rack_hw_pace_extra_slots = 0; /* 2 extra MSS time betweens */ +static int32_t rack_hw_rate_caps = 0; /* 1; */ +static int32_t rack_hw_rate_cap_per = 0; /* 0 -- off */ static int32_t rack_hw_rate_min = 0; /* 1500000;*/ static int32_t rack_hw_rate_to_low = 0; /* 1200000; */ -static int32_t rack_hw_up_only = 1; +static int32_t rack_hw_up_only = 0; static int32_t rack_stats_gets_ms_rtt = 1; static int32_t rack_prr_addbackmax = 2; static int32_t rack_do_hystart = 0; static int32_t rack_apply_rtt_with_reduced_conf = 0; +static int32_t rack_hibeta_setting = 0; +static int32_t rack_default_pacing_divisor = 250; +static int32_t rack_uses_full_dgp_in_rec = 1; +static uint16_t rack_pacing_min_seg = 0; + +static uint32_t sad_seg_size_per = 800; /* 80.0 % */ static int32_t rack_pkt_delay = 1000; static int32_t rack_send_a_lot_in_prr = 1; static int32_t rack_min_to = 1000; /* Number of microsecond min timeout */ @@ -219,11 +237,13 @@ static int32_t rack_use_rsm_rfo = 1; static int32_t rack_max_abc_post_recovery = 2; static int32_t rack_client_low_buf = 0; static int32_t rack_dsack_std_based = 0x3; /* bit field bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */ +static int32_t rack_bw_multipler = 2; /* Limit on fill cw's jump up to be this x gp_est */ #ifdef TCP_ACCOUNTING static int32_t rack_tcp_accounting = 0; #endif static int32_t rack_limits_scwnd = 1; static int32_t rack_enable_mqueue_for_nonpaced = 0; +static int32_t rack_hybrid_allow_set_maxseg = 0; static int32_t rack_disable_prr = 0; static int32_t use_rack_rr = 1; static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */ @@ -233,11 +253,12 @@ static int32_t rack_sack_not_required = 1; /* set to one to allow non-sack to us static int32_t rack_default_init_window = 0; /* Use system default */ static int32_t rack_limit_time_with_srtt = 0; static int32_t rack_autosndbuf_inc = 20; /* In percentage form */ -static int32_t rack_enobuf_hw_boost_mult = 2; /* How many times the hw rate we boost slot using time_between */ +static int32_t rack_enobuf_hw_boost_mult = 0; /* How many times the hw rate we boost slot using time_between */ static int32_t rack_enobuf_hw_max = 12000; /* 12 ms in usecs */ static int32_t rack_enobuf_hw_min = 10000; /* 10 ms in usecs */ static int32_t rack_hw_rwnd_factor = 2; /* How many max_segs the rwnd must be before we hold off sending */ - +static int32_t rack_hw_check_queue = 0; /* Do we always pre-check queue depth of a hw queue */ +static int32_t rack_full_buffer_discount = 10; /* * Currently regular tcp has a rto_min of 30ms * the backoff goes 12 times so that ends up @@ -326,8 +347,6 @@ static int32_t rack_req_segs = 1; static uint64_t rack_bw_rate_cap = 0; -/* Weird delayed ack mode */ -static int32_t rack_use_imac_dack = 0; /* Rack specific counters */ counter_u64_t rack_saw_enobuf; counter_u64_t rack_saw_enobuf_hw; @@ -336,6 +355,7 @@ counter_u64_t rack_persists_sends; counter_u64_t rack_persists_acks; counter_u64_t rack_persists_loss; counter_u64_t rack_persists_lost_ends; +counter_u64_t rack_total_bytes; #ifdef INVARIANTS counter_u64_t rack_adjust_map_bw; #endif @@ -352,6 +372,8 @@ counter_u64_t rack_to_alloc_emerg; counter_u64_t rack_to_alloc_limited; counter_u64_t rack_alloc_limited_conns; counter_u64_t rack_split_limited; +counter_u64_t rack_rxt_clamps_cwnd; +counter_u64_t rack_rxt_clamps_cwnd_uniq; counter_u64_t rack_multi_single_eq; counter_u64_t rack_proc_non_comp_ack; @@ -367,6 +389,7 @@ 
counter_u64_t rack_sack_proc_short; counter_u64_t rack_sack_proc_restart; counter_u64_t rack_sack_attacks_detected; counter_u64_t rack_sack_attacks_reversed; +counter_u64_t rack_sack_attacks_suspect; counter_u64_t rack_sack_used_next_merge; counter_u64_t rack_sack_splits; counter_u64_t rack_sack_used_prev_merge; @@ -455,18 +478,25 @@ static int rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt); static void rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, int line, uint8_t quality); +static void +rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm); + static uint32_t rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss); static int32_t rack_handoff_ok(struct tcpcb *tp); static int32_t rack_init(struct tcpcb *tp, void **ptr); static void rack_init_sysctls(void); + static void rack_log_ack(struct tcpcb *tp, struct tcpopt *to, - struct tcphdr *th, int entered_rec, int dup_ack_struck); + struct tcphdr *th, int entered_rec, int dup_ack_struck, + int *dsack_seen, int *sacks_seen); static void rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t ts, - struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls); + struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls, int segsiz); + +static uint64_t rack_get_gp_est(struct tcp_rack *rack); static void rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, @@ -477,7 +507,7 @@ static int32_t rack_output(struct tcpcb *tp); static uint32_t rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, - uint32_t cts, int *moved_two); + uint32_t cts, int *no_extra, int *moved_two, uint32_t segsiz); static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq); static void rack_remxt_tmr(struct tcpcb *tp); static int rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt); @@ -486,10 +516,10 @@ static int32_t rack_stopall(struct tcpcb *tp); static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line); static uint32_t rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, - struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag); + struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag, int segsiz); static void rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, - struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag); + struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag, int segsiz); static int rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack); @@ -530,6 +560,7 @@ static int rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); +static void rack_chk_http_and_hybrid_on_out(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts); struct rack_sendmap * tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused); @@ -544,6 +575,26 @@ rack_apply_deferred_options(struct tcp_rack *rack); int32_t rack_clear_counter=0; +static uint64_t +rack_get_lt_bw(struct tcp_rack *rack) +{ + struct timeval tv; + uint64_t tim, bytes; + + tim = 
rack->r_ctl.lt_bw_time; + bytes = rack->r_ctl.lt_bw_bytes; + if (rack->lt_bw_up) { + /* Include all the current bytes too */ + microuptime(&tv); + bytes += (rack->rc_tp->snd_una - rack->r_ctl.lt_seq); + tim += (tcp_tv_to_lusectick(&tv) - rack->r_ctl.lt_timemark); + } + if ((bytes != 0) && (tim != 0)) + return ((bytes * (uint64_t)1000000) / tim); + else + return (0); +} + static void rack_swap_beta_values(struct tcp_rack *rack, uint8_t flex8) { @@ -645,7 +696,7 @@ rack_set_cc_pacing(struct tcp_rack *rack) rack->rc_pacing_cc_set = 1; rack_swap_beta_values(rack, 3); } - + static void rack_undo_cc_pacing(struct tcp_rack *rack) { @@ -659,6 +710,42 @@ rack_undo_cc_pacing(struct tcp_rack *rack) rack_swap_beta_values(rack, 4); } +static void +rack_log_gpset(struct tcp_rack *rack, uint32_t seq_end, uint32_t ack_end_t, + uint32_t send_end_t, int line, uint8_t mode, struct rack_sendmap *rsm) +{ + if (tcp_bblogging_on(rack->rc_tp)) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log, 0, sizeof(log)); + log.u_bbr.flex1 = seq_end; + log.u_bbr.flex2 = rack->rc_tp->gput_seq; + log.u_bbr.flex3 = ack_end_t; + log.u_bbr.flex4 = rack->rc_tp->gput_ts; + log.u_bbr.flex5 = send_end_t; + log.u_bbr.flex6 = rack->rc_tp->gput_ack; + log.u_bbr.flex7 = mode; + log.u_bbr.flex8 = 69; + log.u_bbr.rttProp = rack->r_ctl.rc_gp_cumack_ts; + log.u_bbr.delRate = rack->r_ctl.rc_gp_output_ts; + log.u_bbr.pkts_out = line; + log.u_bbr.cwnd_gain = rack->app_limited_needs_set; + log.u_bbr.pkt_epoch = rack->r_ctl.rc_app_limited_cnt; + if (rsm != NULL) { + log.u_bbr.applimited = rsm->r_start; + log.u_bbr.delivered = rsm->r_end; + log.u_bbr.epoch = rsm->r_flags; + } + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_HPTSI_CALC, 0, + 0, &log, false, &tv); + } +} + #ifdef NETFLIX_PEAKRATE static inline void rack_update_peakrate_thr(struct tcpcb *tp) @@ -697,6 +784,7 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS) counter_u64_zero(rack_saw_enobuf_hw); counter_u64_zero(rack_saw_enetunreach); counter_u64_zero(rack_persists_sends); + counter_u64_zero(rack_total_bytes); counter_u64_zero(rack_persists_acks); counter_u64_zero(rack_persists_loss); counter_u64_zero(rack_persists_lost_ends); @@ -719,10 +807,13 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS) counter_u64_zero(rack_to_alloc_limited); counter_u64_zero(rack_alloc_limited_conns); counter_u64_zero(rack_split_limited); + counter_u64_zero(rack_rxt_clamps_cwnd); + counter_u64_zero(rack_rxt_clamps_cwnd_uniq); counter_u64_zero(rack_multi_single_eq); counter_u64_zero(rack_proc_non_comp_ack); counter_u64_zero(rack_sack_attacks_detected); counter_u64_zero(rack_sack_attacks_reversed); + counter_u64_zero(rack_sack_attacks_suspect); counter_u64_zero(rack_sack_used_next_merge); counter_u64_zero(rack_sack_used_prev_merge); counter_u64_zero(rack_sack_splits); @@ -737,6 +828,18 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS) counter_u64_zero(rack_collapsed_win_rxt); counter_u64_zero(rack_collapsed_win_seen); counter_u64_zero(rack_collapsed_win_rxt_bytes); + } else if (stat == 2) { +#ifdef INVARIANTS + printf("Clearing RACK option array\n"); +#endif + COUNTER_ARRAY_ZERO(rack_opts_arry, RACK_OPTS_SIZE); + } else if (stat == 3) { + printf("Rack has no stats counters to clear (use 1 to clear all stats in sysctl node)\n"); + } else if (stat == 4) { +#ifdef INVARIANTS + printf("Clearing RACK out size array\n"); +#endif + COUNTER_ARRAY_ZERO(rack_out_size, TCP_MSS_ACCT_SIZE); } 
rack_clear_counter = 0; return (0); @@ -893,6 +996,36 @@ rack_init_sysctls(void) "pacing", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Pacing related Controls"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "fulldgpinrec", CTLFLAG_RW, + &rack_uses_full_dgp_in_rec, 1, + "Do we use all DGP features in recovery (fillcw, timely et.al.)?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "fullbufdisc", CTLFLAG_RW, + &rack_full_buffer_discount, 10, + "What percentage b/w reduction over the GP estimate for a full buffer (default=0 off)?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "fillcw", CTLFLAG_RW, + &rack_fill_cw_state, 0, + "Enable fillcw on new connections (default=0 off)?"); + SYSCTL_ADD_U16(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "min_burst", CTLFLAG_RW, + &rack_pacing_min_seg, 0, + "What is the min burst size for pacing (0 disables)?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "divisor", CTLFLAG_RW, + &rack_default_pacing_divisor, 4, + "What is the default divisor given to the rl code?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "fillcw_max_mult", CTLFLAG_RW, + &rack_bw_multipler, 2, + "What is the multiplier of the current gp_est that fillcw can increase the b/w too?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "max_pace_over", CTLFLAG_RW, @@ -900,9 +1033,9 @@ rack_init_sysctls(void) "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), - OID_AUTO, "pace_to_one", CTLFLAG_RW, + OID_AUTO, "allow1mss", CTLFLAG_RW, &rack_pace_one_seg, 0, - "Do we allow low b/w pacing of 1MSS instead of two"); + "Do we allow low b/w pacing of 1MSS instead of two (1.2Meg and less)?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "limit_wsrtt", CTLFLAG_RW, @@ -965,10 +1098,15 @@ rack_init_sysctls(void) OID_AUTO, "rwnd_factor", CTLFLAG_RW, &rack_hw_rwnd_factor, 2, "How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_hw_pacing), + OID_AUTO, "precheck", CTLFLAG_RW, + &rack_hw_check_queue, 0, + "Do we always precheck the hdwr pacing queue to avoid ENOBUF's?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_hw_pacing), OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW, - &rack_enobuf_hw_boost_mult, 2, + &rack_enobuf_hw_boost_mult, 0, "By how many time_betweens should we boost the pacing time if we see a ENOBUFS?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_hw_pacing), @@ -988,8 +1126,13 @@ rack_init_sysctls(void) SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_hw_pacing), OID_AUTO, "rate_cap", CTLFLAG_RW, - &rack_hw_rate_caps, 1, + &rack_hw_rate_caps, 0, "Does the highest hardware pacing rate cap the rate we will send at??"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_hw_pacing), + OID_AUTO, "uncap_per", CTLFLAG_RW, + &rack_hw_rate_cap_per, 0, + "If you go over b/w by this amount you will be uncapped (0 = never)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_hw_pacing), OID_AUTO, "rate_min", CTLFLAG_RW, @@ -1003,12 +1146,12 @@ rack_init_sysctls(void) SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_hw_pacing), OID_AUTO, "up_only", CTLFLAG_RW, - &rack_hw_up_only, 1, + &rack_hw_up_only, 0, "Do we allow hw pacing to 
lower the rate selected?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_hw_pacing), OID_AUTO, "extra_mss_precise", CTLFLAG_RW, - &rack_hw_pace_extra_slots, 2, + &rack_hw_pace_extra_slots, 0, "If the rates between software and hardware match precisely how many extra time_betweens do we get?"); rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), @@ -1287,6 +1430,16 @@ rack_init_sysctls(void) "features", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Feature controls"); + SYSCTL_ADD_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_features), + OID_AUTO, "rxt_clamp_thresh", CTLFLAG_RW, + &rack_rxt_clamp_thresh, 0, + "Bit encoded clamping setup bits CCCC CCCCC UUUU UULF PPPP PPPP PPPP PPPP"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_features), + OID_AUTO, "hybrid_set_maxseg", CTLFLAG_RW, + &rack_hybrid_allow_set_maxseg, 0, + "Should hybrid pacing allow the setmss command"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_features), OID_AUTO, "cmpack", CTLFLAG_RW, @@ -1331,6 +1484,26 @@ rack_init_sysctls(void) &rack_tcp_accounting, 0, "Should we turn on TCP accounting for all rack sessions?"); #endif + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "dnd", CTLFLAG_RW, + &rack_dnd_default, 0, + "Do not disturb default for rack_rrr = 3"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "sad_seg_per", CTLFLAG_RW, + &sad_seg_size_per, 800, + "Percentage of segment size needed in a sack 800 = 80.0?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "rxt_controls", CTLFLAG_RW, + &rack_rxt_controls, 0, + "Retransmit sending size controls (valid values 0, 1, 2 default=1)?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "rack_hibeta", CTLFLAG_RW, + &rack_hibeta_setting, 0, + "Do we ue a high beta (80 instead of 50)?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW, @@ -1371,11 +1544,6 @@ rack_init_sysctls(void) OID_AUTO, "limits_on_scwnd", CTLFLAG_RW, &rack_limits_scwnd, 1, "Should RACK place low end time limits on the shared cwnd feature"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_misc), - OID_AUTO, "iMac_dack", CTLFLAG_RW, - &rack_use_imac_dack, 0, - "Should RACK try to emulate iMac delayed ack"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), OID_AUTO, "no_prr", CTLFLAG_RW, @@ -1406,7 +1574,38 @@ rack_init_sysctls(void) OID_AUTO, "autoscale", CTLFLAG_RW, &rack_autosndbuf_inc, 20, "What percentage should rack scale up its snd buffer by?"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "rnds_for_rxt_clamp", CTLFLAG_RW, + &rack_rxt_min_rnds, 10, + "Number of rounds needed between RTT clamps due to high loss rates"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "rnds_for_unclamp", CTLFLAG_RW, + &rack_unclamp_round_thresh, 100, + "Number of rounds needed with no loss to unclamp"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "rxt_threshs_for_unclamp", CTLFLAG_RW, + &rack_unclamp_rxt_thresh, 5, + "Percentage of retransmits we need to be under to unclamp (5 = .5 percent)\n"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "clamp_ss_upper", CTLFLAG_RW, + &rack_clamp_ss_upper, 110, + "Clamp percentage ceiling in SS?"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "clamp_ca_upper", CTLFLAG_RW, + 
&rack_clamp_ca_upper, 110, + "Clamp percentage ceiling in CA?"); /* Sack Attacker detection stuff */ + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "merge_out", CTLFLAG_RW, + &rack_merge_out_sacks_on_attack, 0, + "Do we merge the sendmap when we decide we are being attacked?"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_attack), OID_AUTO, "detect_highsackratio", CTLFLAG_RW, @@ -1459,6 +1658,13 @@ rack_init_sysctls(void) OID_AUTO, "reversed", CTLFLAG_RD, &rack_sack_attacks_reversed, "Total number of SACK attackers that were later determined false positive"); + rack_sack_attacks_suspect = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_attack), + OID_AUTO, "suspect", CTLFLAG_RD, + &rack_sack_attacks_suspect, + "Total number of SACKs that triggered early detection"); + rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_attack), @@ -1472,6 +1678,12 @@ rack_init_sysctls(void) &rack_sack_used_prev_merge, "Total number of times we used the prev merge"); /* Counters */ + rack_total_bytes = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "totalbytes", CTLFLAG_RD, + &rack_total_bytes, + "Total number of bytes sent"); rack_fto_send = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), @@ -1599,6 +1811,18 @@ rack_init_sysctls(void) OID_AUTO, "split_limited", CTLFLAG_RD, &rack_split_limited, "Split allocations dropped due to limit"); + rack_rxt_clamps_cwnd = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "rxt_clamps_cwnd", CTLFLAG_RD, + &rack_rxt_clamps_cwnd, + "Number of times that excessive rxt clamped the cwnd down"); + rack_rxt_clamps_cwnd_uniq = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "rxt_clamps_cwnd_uniq", CTLFLAG_RD, + &rack_rxt_clamps_cwnd_uniq, + "Number of connections that have had excessive rxt clamped the cwnd down"); rack_persists_sends = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), @@ -1726,49 +1950,6 @@ rack_init_sysctls(void) &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); } -static __inline int -rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a) -{ - if (SEQ_GEQ(b->r_start, a->r_start) && - SEQ_LT(b->r_start, a->r_end)) { - /* - * The entry b is within the - * block a. i.e.: - * a -- |-------------| - * b -- |----| - * <or> - * b -- |------| - * <or> - * b -- |-----------| - */ - return (0); - } else if (SEQ_GEQ(b->r_start, a->r_end)) { - /* - * b falls as either the next - * sequence block after a so a - * is said to be smaller than b. - * i.e: - * a -- |------| - * b -- |--------| - * or - * b -- |-----| - */ - return (1); - } - /* - * Whats left is where a is - * larger than b. 
i.e: - * a -- |-------| - * b -- |---| - * or even possibly - * b -- |--------------| - */ - return (-1); -} - -RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); -RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); - static uint32_t rc_init_window(struct tcp_rack *rack) { @@ -1796,14 +1977,282 @@ rack_get_fixed_pacing_bw(struct tcp_rack *rack) return (rack->r_ctl.rc_fixed_pacing_rate_ca); } -static uint64_t -rack_get_bw(struct tcp_rack *rack) +static void +rack_log_hybrid_bw(struct tcp_rack *rack, uint32_t seq, uint64_t cbw, uint64_t tim, + uint64_t data, uint8_t mod, uint16_t aux, + struct http_sendfile_track *cur) { - if (rack->use_fixed_rate) { - /* Return the fixed pacing rate */ - return (rack_get_fixed_pacing_bw(rack)); +#ifdef TCP_REQUEST_TRK + int do_log = 0; + + /* + * The rate cap one is noisy and only should come out when normal BB logging + * is enabled, the other logs (not RATE_CAP and NOT CAP_CALC) only come out + * once per chunk and make up the BBpoint that can be turned on by the client. + */ + if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) { + if (rack_verbose_logging != 0) + do_log = tcp_bblogging_on(rack->rc_tp); + else + do_log = 0; + } else + do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING); + + if (do_log) { + union tcp_log_stackspecific log; + struct timeval tv; + uint64_t lt_bw; + + /* Convert our ms to a microsecond */ + memset(&log, 0, sizeof(log)); + + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.rttProp = tim; + log.u_bbr.bw_inuse = cbw; + log.u_bbr.delRate = rack_get_gp_est(rack); + lt_bw = rack_get_lt_bw(rack); + log.u_bbr.flex1 = seq; + log.u_bbr.pacing_gain = aux; + /* lt_bw = < flex3 | flex2 > */ + log.u_bbr.flex2 = (uint32_t)(lt_bw & 0x00000000ffffffff); + log.u_bbr.flex3 = (uint32_t)((lt_bw >> 32) & 0x00000000ffffffff); + /* Record the last obtained us rtt in inflight */ + if (cur == NULL) { + /* Make sure we are looking at the right log if an overide comes in */ + cur = rack->r_ctl.rc_last_sft; + } + if (rack->r_ctl.rack_rs.rs_flags != RACK_RTT_EMPTY) + log.u_bbr.inflight = rack->r_ctl.rack_rs.rs_us_rtt; + else { + /* Use the last known rtt i.e. 
the rack-rtt */ + log.u_bbr.inflight = rack->rc_rack_rtt; + } + if (cur != NULL) { + uint64_t off; + + log.u_bbr.cur_del_rate = cur->deadline; + if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) { + /* start = < lost | pkt_epoch > */ + log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff); + log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff); + log.u_bbr.flex6 = cur->start_seq; + log.u_bbr.pkts_out = cur->end_seq; + } else { + /* start = < lost | pkt_epoch > */ + log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff); + log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff); + /* end = < pkts_out | flex6 > */ + log.u_bbr.flex6 = (uint32_t)(cur->end & 0x00000000ffffffff); + log.u_bbr.pkts_out = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff); + } + /* first_send = <lt_epoch | epoch> */ + log.u_bbr.epoch = (uint32_t)(cur->first_send & 0x00000000ffffffff); + log.u_bbr.lt_epoch = (uint32_t)((cur->first_send >> 32) & 0x00000000ffffffff); + /* localtime = <delivered | applimited>*/ + log.u_bbr.applimited = (uint32_t)(cur->localtime & 0x00000000ffffffff); + log.u_bbr.delivered = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff); + off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_http_info[0]); + log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct http_sendfile_track)); + log.u_bbr.flex4 = (uint32_t)(rack->rc_tp->t_sndbytes - cur->sent_at_fs); + log.u_bbr.flex5 = (uint32_t)(rack->rc_tp->t_snd_rxt_bytes - cur->rxt_at_fs); + log.u_bbr.flex7 = (uint16_t)cur->hybrid_flags; + } else { + log.u_bbr.flex7 = 0xffff; + log.u_bbr.cur_del_rate = 0xffffffffffffffff; + } + /* + * Compose bbr_state to be a bit wise 0000ADHF + * where A is the always_pace flag + * where D is the dgp_on flag + * where H is the hybrid_mode on flag + * where F is the use_fixed_rate flag. + */ + log.u_bbr.bbr_state = rack->rc_always_pace; + log.u_bbr.bbr_state <<= 1; + log.u_bbr.bbr_state |= rack->dgp_on; + log.u_bbr.bbr_state <<= 1; + log.u_bbr.bbr_state |= rack->rc_hybrid_mode; + log.u_bbr.bbr_state <<= 1; + log.u_bbr.bbr_state |= rack->use_fixed_rate; + log.u_bbr.flex8 = mod; + tcp_log_event(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + TCP_HYBRID_PACING_LOG, 0, + 0, &log, false, NULL, __func__, __LINE__, &tv); + + } +#endif +} + +static inline uint64_t +rack_compensate_for_linerate(struct tcp_rack *rack, uint64_t bw) +{ + uint64_t ret_bw, ether; + uint64_t u_segsiz; + + ether = rack->rc_tp->t_maxseg + sizeof(struct tcphdr); + if (rack->r_is_v6){ +#ifdef INET6 + ether += sizeof(struct ip6_hdr); +#endif + ether += 14; /* eheader size 6+6+2 */ + } else { +#ifdef INET + ether += sizeof(struct ip); +#endif + ether += 14; /* eheader size 6+6+2 */ + } + u_segsiz = (uint64_t)min(ctf_fixed_maxseg(rack->rc_tp), rack->r_ctl.rc_pace_min_segs); + ret_bw = bw; + ret_bw *= ether; + ret_bw /= u_segsiz; + return (ret_bw); +} + +static void +rack_rate_cap_bw(struct tcp_rack *rack, uint64_t *bw, int *capped) +{ +#ifdef TCP_REQUEST_TRK + struct timeval tv; + uint64_t timenow, timeleft, lenleft, lengone, calcbw; +#endif + + if (rack->r_ctl.bw_rate_cap == 0) + return; +#ifdef TCP_REQUEST_TRK + if (rack->rc_catch_up && rack->rc_hybrid_mode && + (rack->r_ctl.rc_last_sft != NULL)) { + /* + * We have a dynamic cap. The original target + * is in bw_rate_cap, but we need to look at + * how long it is until we hit the deadline. 
+ */ + struct http_sendfile_track *ent; + + ent = rack->r_ctl.rc_last_sft; + microuptime(&tv); + timenow = tcp_tv_to_lusectick(&tv); + if (timenow >= ent->deadline) { + /* No time left we do DGP only */ + rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, + 0, 0, 0, HYBRID_LOG_OUTOFTIME, 0, ent); + rack->r_ctl.bw_rate_cap = 0; + return; + } + /* We have the time */ + timeleft = rack->r_ctl.rc_last_sft->deadline - timenow; + if (timeleft < HPTS_MSEC_IN_SEC) { + /* If there is less than a ms left just use DGPs rate */ + rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, + 0, timeleft, 0, HYBRID_LOG_OUTOFTIME, 0, ent); + rack->r_ctl.bw_rate_cap = 0; + return; + } + /* + * Now lets find the amount of data left to send. + * + * Now ideally we want to use the end_seq to figure out how much more + * but it might not be possible (only if we have the TRACK_FG_COMP on the entry.. + */ + if (ent->flags & TCP_HTTP_TRACK_FLG_COMP) { + if (SEQ_GT(ent->end_seq, rack->rc_tp->snd_una)) + lenleft = ent->end_seq - rack->rc_tp->snd_una; + else { + /* TSNH, we should catch it at the send */ + rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, + 0, timeleft, 0, HYBRID_LOG_CAPERROR, 0, ent); + rack->r_ctl.bw_rate_cap = 0; + return; + } + } else { + /* + * The hard way, figure out how much is gone and then + * take that away from the total the client asked for + * (thats off by tls overhead if this is tls). + */ + if (SEQ_GT(rack->rc_tp->snd_una, ent->start_seq)) + lengone = rack->rc_tp->snd_una - ent->start_seq; + else + lengone = 0; + if (lengone < (ent->end - ent->start)) + lenleft = (ent->end - ent->start) - lengone; + else { + /* TSNH, we should catch it at the send */ + rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, + 0, timeleft, lengone, HYBRID_LOG_CAPERROR, 0, ent); + rack->r_ctl.bw_rate_cap = 0; + return; + } + } + if (lenleft == 0) { + /* We have it all sent */ + rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, + 0, timeleft, lenleft, HYBRID_LOG_ALLSENT, 0, ent); + if (rack->r_ctl.bw_rate_cap) + goto normal_ratecap; + else + return; + } + calcbw = lenleft * HPTS_USEC_IN_SEC; + calcbw /= timeleft; + /* Now we must compensate for IP/TCP overhead */ + calcbw = rack_compensate_for_linerate(rack, calcbw); + /* Update the bit rate cap */ + rack->r_ctl.bw_rate_cap = calcbw; + if ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) && + (rack_hybrid_allow_set_maxseg == 1) && + ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) { + /* Lets set in a smaller mss possibly here to match our rate-cap */ + uint32_t orig_max; + + orig_max = rack->r_ctl.rc_pace_max_segs; + rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS; + rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, calcbw, ctf_fixed_maxseg(rack->rc_tp)); + rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5); + } + rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, + calcbw, timeleft, lenleft, HYBRID_LOG_CAP_CALC, 0, ent); + if ((calcbw > 0) && (*bw > calcbw)) { + rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, + *bw, ent->deadline, lenleft, HYBRID_LOG_RATE_CAP, 0, ent); + *capped = 1; + *bw = calcbw; + } + return; + } +normal_ratecap: +#endif + if ((rack->r_ctl.bw_rate_cap > 0) && (*bw > rack->r_ctl.bw_rate_cap)) { +#ifdef TCP_REQUEST_TRK + if (rack->rc_hybrid_mode && + rack->rc_catch_up && + (rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) && + (rack_hybrid_allow_set_maxseg == 1) && + ((rack->r_ctl.rc_last_sft->hybrid_flags & 
TCP_HYBRID_PACING_SETMSS) == 0)) { + /* Lets set in a smaller mss possibly here to match our rate-cap */ + uint32_t orig_max; + + orig_max = rack->r_ctl.rc_pace_max_segs; + rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS; + rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, rack->r_ctl.bw_rate_cap, ctf_fixed_maxseg(rack->rc_tp)); + rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5); + } +#endif + *capped = 1; + *bw = rack->r_ctl.bw_rate_cap; + rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, + *bw, 0, 0, + HYBRID_LOG_RATE_CAP, 1, NULL); } - if (rack->r_ctl.gp_bw == 0) { +} + +static uint64_t +rack_get_gp_est(struct tcp_rack *rack) +{ + uint64_t bw, lt_bw, ret_bw; + + if (rack->rc_gp_filled == 0) { /* * We have yet no b/w measurement, * if we have a user set initial bw @@ -1815,15 +2264,20 @@ rack_get_bw(struct tcp_rack *rack) * so if we have like IW=30, we are not * calculating a "huge" b/w. */ - uint64_t bw, srtt; + uint64_t srtt; + + lt_bw = rack_get_lt_bw(rack); + if (lt_bw) { + /* + * No goodput bw but a long-term b/w does exist + * lets use that. + */ + ret_bw = lt_bw; + goto compensate; + } if (rack->r_ctl.init_rate) return (rack->r_ctl.init_rate); - /* Has the user set a max peak rate? */ -#ifdef NETFLIX_PEAKRATE - if (rack->rc_tp->t_maxpeakrate) - return (rack->rc_tp->t_maxpeakrate); -#endif /* Ok lets come up with the IW guess, if we have a srtt */ if (rack->rc_tp->t_srtt == 0) { /* @@ -1837,32 +2291,71 @@ rack_get_bw(struct tcp_rack *rack) srtt = (uint64_t)rack->rc_tp->t_srtt; bw *= (uint64_t)USECS_IN_SECOND; bw /= srtt; - if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap)) - bw = rack->r_ctl.bw_rate_cap; - return (bw); + ret_bw = bw; + goto compensate; + + } + if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { + /* Averaging is done, we can return the value */ + bw = rack->r_ctl.gp_bw; } else { - uint64_t bw; + /* Still doing initial average must calculate */ + bw = rack->r_ctl.gp_bw / max(rack->r_ctl.num_measurements, 1); + } + lt_bw = rack_get_lt_bw(rack); + if (lt_bw == 0) { + /* If we don't have one then equate it to the gp_bw */ + lt_bw = rack->r_ctl.gp_bw; + } + if ((rack->r_cwnd_was_clamped == 1) && (rack->r_clamped_gets_lower > 0)){ + /* if clamped take the lowest */ + if (lt_bw < bw) + ret_bw = lt_bw; + else + ret_bw = bw; + } else { + /* If not set for clamped to get lowest, take the highest */ + if (lt_bw > bw) + ret_bw = lt_bw; + else + ret_bw = bw; + } *** 9510 LINES SKIPPED ***
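For reference, the new rack_get_lt_bw() helper shown in the hunks above accumulates bytes delivered since lt_seq over an interval kept in microseconds and converts that to bytes per second. A minimal stand-alone sketch of the same arithmetic (the function name here is illustrative, not the kernel's):

    #include <stdint.h>

    /*
     * Long-term bandwidth estimate mirroring the math in rack_get_lt_bw():
     * bytes accumulated over an interval measured in microseconds, returned
     * as bytes per second.  Returns 0 when either accumulator is empty,
     * just as the kernel helper does.
     */
    static uint64_t
    lt_bw_estimate(uint64_t bytes, uint64_t tim_usec)
    {
            if (bytes == 0 || tim_usec == 0)
                    return (0);
            return ((bytes * (uint64_t)1000000) / tim_usec);
    }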
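To try the updated stack, the module built by the Makefile above can be loaded with "kldload tcp_rack" and made the system default through the net.inet.tcp.functions_default sysctl (net.inet.tcp.functions_available lists the registered stacks). A single connection can also be switched with the generic TCP_FUNCTION_BLK socket option; the sketch below assumes the stock struct tcp_function_set from <netinet/tcp.h> and an already-created TCP socket fd, and is not part of this commit:

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <string.h>

    /*
     * Select the "rack" TCP stack for one socket via the generic
     * stack-switch interface.
     */
    static int
    use_rack_stack(int fd)
    {
            struct tcp_function_set tfs;

            memset(&tfs, 0, sizeof(tfs));
            strlcpy(tfs.function_set_name, "rack", sizeof(tfs.function_set_name));
            return (setsockopt(fd, IPPROTO_TCP, TCP_FUNCTION_BLK,
                &tfs, sizeof(tfs)));
    }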