git: 62ce18fc9a8e - main - tcp: Rack rwnd collapse.
Date: Tue, 23 Aug 2022 13:18:19 UTC
The branch main has been updated by rrs:

URL: https://cgit.FreeBSD.org/src/commit/?id=62ce18fc9a8e46ea72ce3a106e7b0cd1ad6a712b

commit 62ce18fc9a8e46ea72ce3a106e7b0cd1ad6a712b
Author:     Randall Stewart <rrs@FreeBSD.org>
AuthorDate: 2022-08-23 13:17:05 +0000
Commit:     Randall Stewart <rrs@FreeBSD.org>
CommitDate: 2022-08-23 13:17:05 +0000

    tcp: Rack rwnd collapse.

    Currently, when the peer collapses its rwnd, we mark packets to be
    retransmitted and use the must_retran flags, just as we do when the
    PMTU collapses, to retransmit the collapsed packets. However, this
    causes a problem with some middleboxes that play with the rwnd to
    control flow: as soon as the rwnd increases we start resending, which
    may be less than even an RTT later, and in fact the peer may already
    have received the packets. This means we gratuitously retransmit
    packets we should not. The fix here is to make sure that a rack time
    (the RACK retransmit threshold) has passed before retransmitting the
    packets. This ensures that the rwnd collapse was real and that the
    packets actually need retransmission.

    Reviewed by:            tuexen
    Sponsored by:           Netflix Inc
    Differential Revision:  https://reviews.freebsd.org/D35166
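[Illustration; not part of the commit.] The heart of the fix is a time gate
in the output path: a segment that was cut off by a window collapse is
retransmitted only once the peer's window has reopened far enough to cover
it, and at least one RACK threshold (roughly an SRTT plus reordering slack)
has elapsed since the segment was last sent. Below is a minimal,
self-contained sketch of that test; the toy_* names and the thresh_us
parameter are hypothetical stand-ins for the kernel's struct rack_sendmap
and rack_calc_thresh_rack(), and sequence-number wraparound is ignored for
brevity.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the kernel's struct rack_sendmap entry. */
struct toy_rsm {
    uint32_t r_start;        /* first sequence number covered */
    uint32_t r_end;          /* one past the last sequence number */
    uint32_t r_flags;
    uint32_t r_last_sent_us; /* time of last (re)transmission, usecs */
};

#define TOY_RWND_COLLAPSED 0x01 /* peer's rwnd collapsed over this entry */

/*
 * Mirrors the shape of rack_check_collapsed(): retransmit only if the
 * segment is marked collapsed, now fits inside the reopened window,
 * and a full RACK threshold has passed since it was last sent.
 */
static int
toy_collapsed_ready(const struct toy_rsm *rsm, uint32_t snd_una,
    uint32_t snd_wnd, uint32_t now_us, uint32_t thresh_us)
{
    if ((rsm->r_flags & TOY_RWND_COLLAPSED) == 0)
        return (0);     /* never collapsed, nothing to gate */
    if (rsm->r_end > snd_una + snd_wnd)
        return (0);     /* window still too small to carry it */
    return ((now_us - rsm->r_last_sent_us) > thresh_us);
}

int
main(void)
{
    struct toy_rsm rsm = { 1000, 2000, TOY_RWND_COLLAPSED, 50000 };

    /* Window just reopened: not even close to a threshold yet. */
    printf("%d\n", toy_collapsed_ready(&rsm, 500, 4000, 60000, 40000));
    /* Well past the threshold: the retransmit is now allowed. */
    printf("%d\n", toy_collapsed_ready(&rsm, 500, 4000, 100000, 40000));
    return (0);
}

Before this commit the equivalent path fired as soon as the window
reopened (via the must_retran machinery), which is exactly the
gratuitous-retransmit case the message above describes.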
---
 sys/netinet/tcp_log_buf.h         |   4 +-
 sys/netinet/tcp_stacks/rack.c     | 403 ++++++++++++++++++++++++++------------
 sys/netinet/tcp_stacks/tcp_rack.h |  19 +-
 3 files changed, 300 insertions(+), 126 deletions(-)

diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h
index 1290a8ce6b29..c11757099c5d 100644
--- a/sys/netinet/tcp_log_buf.h
+++ b/sys/netinet/tcp_log_buf.h
@@ -236,7 +236,9 @@ enum tcp_log_events {
 	TCP_LOG_FSB,		/* FSB information 63 */
 	RACK_DSACK_HANDLING,	/* Handling of DSACK in rack for reordering window 64 */
 	TCP_HYSTART,		/* TCP Hystart logging 65 */
-	TCP_LOG_END		/* End (keep at end) 66 */
+	TCP_CHG_QUERY,		/* Change query during fnc_init() 66 */
+	TCP_RACK_LOG_COLLAPSE,	/* Window collapse by peer 67 */
+	TCP_LOG_END		/* End (keep at end) 68 */
 };
 
 enum tcp_log_states {
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 0c91c9c6703f..ea370fe9247c 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -385,6 +385,9 @@ counter_u64_t rack_move_some;
 counter_u64_t rack_input_idle_reduces;
 counter_u64_t rack_collapsed_win;
+counter_u64_t rack_collapsed_win_seen;
+counter_u64_t rack_collapsed_win_rxt;
+counter_u64_t rack_collapsed_win_rxt_bytes;
 counter_u64_t rack_try_scwnd;
 counter_u64_t rack_hw_pace_init_fail;
 counter_u64_t rack_hw_pace_lost;
@@ -790,6 +793,9 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
 		counter_u64_zero(rack_move_some);
 		counter_u64_zero(rack_try_scwnd);
 		counter_u64_zero(rack_collapsed_win);
+		counter_u64_zero(rack_collapsed_win_rxt);
+		counter_u64_zero(rack_collapsed_win_seen);
+		counter_u64_zero(rack_collapsed_win_rxt_bytes);
 	}
 	rack_clear_counter = 0;
 	return (0);
@@ -1757,12 +1763,31 @@ rack_init_sysctls(void)
 	    OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
 	    &rack_input_idle_reduces,
 	    "Total number of idle reductions on input");
+	rack_collapsed_win_seen = counter_u64_alloc(M_WAITOK);
+	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_counters),
+	    OID_AUTO, "collapsed_win_seen", CTLFLAG_RD,
+	    &rack_collapsed_win_seen,
+	    "Total number of collapsed window events seen (where our window shrinks)");
 	rack_collapsed_win = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
 	    OID_AUTO, "collapsed_win", CTLFLAG_RD,
 	    &rack_collapsed_win,
-	    "Total number of collapsed windows");
+	    "Total number of collapsed window events where we mark packets");
+	rack_collapsed_win_rxt = counter_u64_alloc(M_WAITOK);
+	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_counters),
+	    OID_AUTO, "collapsed_win_rxt", CTLFLAG_RD,
+	    &rack_collapsed_win_rxt,
+	    "Total number of packets that were retransmitted");
+	rack_collapsed_win_rxt_bytes = counter_u64_alloc(M_WAITOK);
+	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_counters),
+	    OID_AUTO, "collapsed_win_bytes", CTLFLAG_RD,
+	    &rack_collapsed_win_rxt_bytes,
+	    "Total number of bytes that were retransmitted");
 	rack_try_scwnd = counter_u64_alloc(M_WAITOK);
 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_counters),
@@ -2772,6 +2797,9 @@ rack_counter_destroy(void)
 	counter_u64_free(rack_sack_splits);
 	counter_u64_free(rack_input_idle_reduces);
 	counter_u64_free(rack_collapsed_win);
+	counter_u64_free(rack_collapsed_win_rxt);
+	counter_u64_free(rack_collapsed_win_rxt_bytes);
+	counter_u64_free(rack_collapsed_win_seen);
 	counter_u64_free(rack_try_scwnd);
 	counter_u64_free(rack_persists_sends);
 	counter_u64_free(rack_persists_acks);
@@ -5295,7 +5323,9 @@ activate_rxt:
 			goto activate_rxt;
 		}
 		/* Convert from ms to usecs */
-		if ((rsm->r_flags & RACK_SACK_PASSED) || (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
+		if ((rsm->r_flags & RACK_SACK_PASSED) ||
+		    (rsm->r_flags & RACK_RWND_COLLAPSED) ||
+		    (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
 			if ((tp->t_flags & TF_SENTFIN) &&
 			    ((tp->snd_max - tp->snd_una) == 1) &&
 			    (rsm->r_flags & RACK_HAS_FIN)) {
@@ -5757,7 +5787,7 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
 			 * real pacing. And the tlp or rxt is smaller
 			 * than the pacing calculation. Lets not
 			 * pace that long since we know the calculation
-			 * so far is not accurate. 
+			 * so far is not accurate.
 			 */
 			slot = hpts_timeout;
 		}
@@ -6501,7 +6531,7 @@ rack_remxt_tmr(struct tcpcb *tp)
 		trsm = rsm;
 		if (rsm->r_flags & RACK_ACKED)
 			rsm->r_flags |= RACK_WAS_ACKED;
-		rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
+		rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED);
 		rsm->r_flags |= RACK_MUST_RXT;
 	}
 	/* Clear the count (we just un-acked them) */
@@ -8040,6 +8070,13 @@ rack_log_sack_passed(struct tcpcb *tp,
 			 */
 			continue;
 		}
+		if (nrsm->r_flags & RACK_RWND_COLLAPSED) {
+			/*
+			 * If the peer dropped the rwnd on
+			 * these then we don't worry about them.
+			 */
+			continue;
+		}
 		if (nrsm->r_flags & RACK_SACK_PASSED) {
 			/*
 			 * We found one that is already marked
@@ -9797,7 +9834,7 @@ rack_strike_dupack(struct tcp_rack *rack)
 		/* Sendmap entries that are marked to
 		 * be retransmitted do not need dupack's
 		 * struck. We get these marks for a number
-		 * of reasons (rxt timeout with no sack, 
+		 * of reasons (rxt timeout with no sack,
 		 * mtu change, or rwnd collapses). When
 		 * these events occur, we know we must retransmit
 		 * them and mark the sendmap entries. Dupack counting
@@ -10308,47 +10345,83 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	return (0);
 }
 
+
 static void
-rack_collapsed_window(struct tcp_rack *rack)
+rack_log_collapse(struct tcp_rack *rack, uint32_t cnt, uint32_t split, uint32_t out, int line,
+		  int dir, uint32_t flags, struct rack_sendmap *rsm)
+{
+	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+		union tcp_log_stackspecific log;
+		struct timeval tv;
+
+		memset(&log, 0, sizeof(log));
+		log.u_bbr.flex1 = cnt;
+		log.u_bbr.flex2 = split;
+		log.u_bbr.flex3 = out;
+		log.u_bbr.flex4 = line;
+		log.u_bbr.flex5 = rack->r_must_retran;
+		log.u_bbr.flex6 = flags;
+		log.u_bbr.flex7 = rack->rc_has_collapsed;
+		log.u_bbr.flex8 = dir;	/*
+					 * 1 is collapsed, 0 is uncollapsed,
+					 * 2 is log of a rsm being marked, 3 is a split.
+					 */
+		if (rsm == NULL)
+			log.u_bbr.rttProp = 0;
+		else
+			log.u_bbr.rttProp = (uint64_t)rsm;
+		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+		TCP_LOG_EVENTP(rack->rc_tp, NULL,
+		    &rack->rc_inp->inp_socket->so_rcv,
+		    &rack->rc_inp->inp_socket->so_snd,
+		    TCP_RACK_LOG_COLLAPSE, 0,
+		    0, &log, false, &tv);
+	}
+}
+
+static void
+rack_collapsed_window(struct tcp_rack *rack, uint32_t out, int line)
 {
 	/*
-	 * Now we must walk the
-	 * send map and divide the
-	 * ones left stranded. These
-	 * guys can't cause us to abort
-	 * the connection and are really
-	 * "unsent". However if a buggy
-	 * client actually did keep some
-	 * of the data i.e. collapsed the win
-	 * and refused to ack and then opened
-	 * the win and acked that data. We would
-	 * get into an ack war, the simplier
-	 * method then of just pretending we
-	 * did not send those segments something
-	 * won't work.
+	 * Here all we do is mark the collapsed point and set the flag.
+	 * This may happen again and again, but there is no
+	 * sense splitting our map until we know where the
+	 * peer finally lands in the collapse.
 	 */
-	struct rack_sendmap *rsm, *nrsm, fe;
+	rack_trace_point(rack, RACK_TP_COLLAPSED_WND);
+	if ((rack->rc_has_collapsed == 0) ||
+	    (rack->r_ctl.last_collapse_point != (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)))
+		counter_u64_add(rack_collapsed_win_seen, 1);
+	rack->r_ctl.last_collapse_point = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
+	rack->r_ctl.high_collapse_point = rack->rc_tp->snd_max;
+	rack->rc_has_collapsed = 1;
+	rack->r_collapse_point_valid = 1;
+	rack_log_collapse(rack, 0, 0, rack->r_ctl.last_collapse_point, line, 1, 0, NULL);
+}
+
+static void
+rack_un_collapse_window(struct tcp_rack *rack, int line)
+{
+	struct rack_sendmap *nrsm, *rsm, fe;
+	int cnt = 0, split = 0;
 #ifdef INVARIANTS
 	struct rack_sendmap *insret;
 #endif
-	tcp_seq max_seq;
 
-	rack_trace_point(rack, RACK_TP_COLLAPSED_WND);
-	max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
 	memset(&fe, 0, sizeof(fe));
-	fe.r_start = max_seq;
-	/* Find the first seq past or at maxseq */
+	rack->rc_has_collapsed = 0;
+	fe.r_start = rack->r_ctl.last_collapse_point;
 	rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
 	if (rsm == NULL) {
-		/* Nothing to do strange */
-		rack->rc_has_collapsed = 0;
+		/* Nothing to do maybe the peer ack'ed it all */
+		rack_log_collapse(rack, 0, 0, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL);
 		return;
 	}
-	/*
-	 * Now do we need to split at
-	 * the collapse point?
-	 */
-	if (SEQ_GT(max_seq, rsm->r_start)) {
+	/* Now do we need to split this one? */
+	if (SEQ_GT(rack->r_ctl.last_collapse_point, rsm->r_start)) {
+		rack_log_collapse(rack, rsm->r_start, rsm->r_end,
+		    rack->r_ctl.last_collapse_point, line, 3, rsm->r_flags, rsm);
 		nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
 		if (nrsm == NULL) {
 			/* We can't get a rsm, mark all? */
@@ -10356,7 +10429,8 @@ rack_collapsed_window(struct tcp_rack *rack)
 			goto no_split;
 		}
 		/* Clone it */
-		rack_clone_rsm(rack, nrsm, rsm, max_seq);
+		split = 1;
+		rack_clone_rsm(rack, nrsm, rsm, rack->r_ctl.last_collapse_point);
 #ifndef INVARIANTS
 		(void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
 #else
@@ -10366,7 +10440,8 @@ rack_collapsed_window(struct tcp_rack *rack)
 			      nrsm, insret, rack, rsm);
 		}
 #endif
-		rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT, max_seq, __LINE__);
+		rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT,
+		    rack->r_ctl.last_collapse_point, __LINE__);
 		if (rsm->r_in_tmap) {
 			TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
 			nrsm->r_in_tmap = 1;
@@ -10378,38 +10453,15 @@ rack_collapsed_window(struct tcp_rack *rack)
 		rsm = nrsm;
 	}
 no_split:
-	counter_u64_add(rack_collapsed_win, 1);
 	RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) {
 		nrsm->r_flags |= RACK_RWND_COLLAPSED;
+		rack_log_collapse(rack, nrsm->r_start, nrsm->r_end, 0, line, 4, nrsm->r_flags, nrsm);
+		cnt++;
 	}
-	rack->rc_has_collapsed = 1;
-}
-
-static void
-rack_un_collapse_window(struct tcp_rack *rack)
-{
-	struct rack_sendmap *rsm;
-	int cnt = 0;;
-
-	rack->r_ctl.rc_out_at_rto = 0;
-	rack->r_ctl.rc_snd_max_at_rto = rack->rc_tp->snd_una;
-	RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
-		if (rsm->r_flags & RACK_RWND_COLLAPSED) {
-			rsm->r_flags &= ~RACK_RWND_COLLAPSED;
-			rsm->r_flags |= RACK_MUST_RXT;
-			if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
-				rack->r_ctl.rc_snd_max_at_rto = rsm->r_end;
-				rack->r_ctl.rc_out_at_rto += (rsm->r_end - rsm->r_start);
-			}
-			cnt++;
-		}
-		else
-			break;
-	}
-	rack->rc_has_collapsed = 0;
 	if (cnt) {
-		rack->r_must_retran = 1;
+		counter_u64_add(rack_collapsed_win, 1);
 	}
+	rack_log_collapse(rack, cnt, split, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL);
 }
 
 static void
@@ -10518,9 +10570,12 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	}
 	if (tp->snd_wnd < ctf_outstanding(tp))
 		/* The peer collapsed the window */
-		rack_collapsed_window(rack);
+		rack_collapsed_window(rack, ctf_outstanding(tp), __LINE__);
 	else if (rack->rc_has_collapsed)
-		rack_un_collapse_window(rack);
+		rack_un_collapse_window(rack, __LINE__);
+	if ((rack->r_collapse_point_valid) &&
+	    (SEQ_GT(th->th_ack, rack->r_ctl.high_collapse_point)))
+		rack->r_collapse_point_valid = 0;
 	/* Was persist timer active and now we have window space? */
 	if ((rack->rc_in_persist != 0) &&
 	    (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
@@ -11076,10 +11131,12 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	 */
 	if (tp->snd_wnd < ctf_outstanding(tp)) {
 		/* The peer collapsed the window */
-		rack_collapsed_window(rack);
+		rack_collapsed_window(rack, ctf_outstanding(tp), __LINE__);
 	} else if (rack->rc_has_collapsed)
-		rack_un_collapse_window(rack);
-
+		rack_un_collapse_window(rack, __LINE__);
+	if ((rack->r_collapse_point_valid) &&
+	    (SEQ_GT(tp->snd_una, rack->r_ctl.high_collapse_point)))
+		rack->r_collapse_point_valid = 0;
 	/*
 	 * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
 	 */
@@ -13066,13 +13123,6 @@ rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uin
 		/* Not a valid win update */
 		return;
 	}
-	if (tp->snd_wnd > tp->max_sndwnd)
-		tp->max_sndwnd = tp->snd_wnd;
-	if (tp->snd_wnd < (tp->snd_max - high_seq)) {
-		/* The peer collapsed the window */
-		rack_collapsed_window(rack);
-	} else if (rack->rc_has_collapsed)
-		rack_un_collapse_window(rack);
 	/* Do we exit persists? */
 	if ((rack->rc_in_persist != 0) &&
 	    (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
@@ -13609,6 +13659,15 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
#ifdef TCP_ACCOUNTING
 	ts_val = get_cyclecount();
#endif
+	/* Tend to any collapsed window */
+	if (SEQ_GT(tp->snd_max, high_seq) && (tp->snd_wnd < (tp->snd_max - high_seq))) {
+		/* The peer collapsed the window */
+		rack_collapsed_window(rack, (tp->snd_max - high_seq), __LINE__);
+	} else if (rack->rc_has_collapsed)
+		rack_un_collapse_window(rack, __LINE__);
+	if ((rack->r_collapse_point_valid) &&
+	    (SEQ_GT(high_seq, rack->r_ctl.high_collapse_point)))
+		rack->r_collapse_point_valid = 0;
 	acked_amount = acked = (high_seq - tp->snd_una);
 	if (acked) {
 		/*
@@ -15930,6 +15989,11 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
 	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 
+		if (rsm->r_flags & RACK_RWND_COLLAPSED) {
+			rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm);
+			counter_u64_add(rack_collapsed_win_rxt, 1);
+			counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start));
+		}
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
 		if (rack->rack_no_prr)
@@ -16538,6 +16602,58 @@ failed:
 	return (-1);
 }
 
+static struct rack_sendmap *
+rack_check_collapsed(struct tcp_rack *rack, uint32_t cts)
+{
+	struct rack_sendmap *rsm = NULL;
+	struct rack_sendmap fe;
+	int thresh;
+
+restart:
+	fe.r_start = rack->r_ctl.last_collapse_point;
+	rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
+	if ((rsm == NULL) || ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0)) {
+		/* Nothing, strange turn off validity */
+		rack->r_collapse_point_valid = 0;
+		return (NULL);
+	}
+	/* Can we send it yet? */
+	if (rsm->r_end > (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)) {
+		/*
+		 * Receiver window has not grown enough for
+		 * the segment to be put on the wire.
+		 */
+		return (NULL);
+	}
+	if (rsm->r_flags & RACK_ACKED) {
+		/*
+		 * It has been sacked, lets move to the
+		 * next one if possible.
+		 */
+		rack->r_ctl.last_collapse_point = rsm->r_end;
+		/* Are we done? */
+		if (SEQ_GEQ(rack->r_ctl.last_collapse_point,
+			    rack->r_ctl.high_collapse_point)) {
+			rack->r_collapse_point_valid = 0;
+			return (NULL);
+		}
+		goto restart;
+	}
+	/* Now has it been long enough ? */
+	thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), cts);
+	if ((cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) > thresh) {
+		rack_log_collapse(rack, rsm->r_start,
+		    (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])),
+		    thresh, __LINE__, 6, rsm->r_flags, rsm);
+		return (rsm);
+	}
+	/* Not enough time */
+	rack_log_collapse(rack, rsm->r_start,
+	    (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])),
+	    thresh, __LINE__, 7, rsm->r_flags, rsm);
+	return (NULL);
+}
+
 static int
 rack_output(struct tcpcb *tp)
 {
@@ -16598,7 +16714,6 @@ rack_output(struct tcpcb *tp)
 	struct ip6_hdr *ip6 = NULL;
 	int32_t isipv6;
#endif
-	uint8_t filled_all = 0;
 	bool hw_tls = false;
 
 	/* setup and take the cache hits here */
@@ -16863,6 +16978,29 @@ again:
 		sb_offset = rsm->r_start - tp->snd_una;
 		if (len >= segsiz)
 			len = segsiz;
+	} else if (rack->r_collapse_point_valid &&
+		   ((rsm = rack_check_collapsed(rack, cts)) != NULL)) {
+		/*
+		 * If an RSM is returned then enough time has passed
+		 * for us to retransmit it. Move up the collapse point,
+		 * since this rsm has its chance to retransmit now.
+		 */
+		rack_trace_point(rack, RACK_TP_COLLAPSED_RXT);
+		rack->r_ctl.last_collapse_point = rsm->r_end;
+		/* Are we done? */
+		if (SEQ_GEQ(rack->r_ctl.last_collapse_point,
+			    rack->r_ctl.high_collapse_point))
+			rack->r_collapse_point_valid = 0;
+		sack_rxmit = 1;
+		/* We are not doing a TLP */
+		doing_tlp = 0;
+		len = rsm->r_end - rsm->r_start;
+		sb_offset = rsm->r_start - tp->snd_una;
+		sendalot = 0;
+		if ((rack->full_size_rxt == 0) &&
+		    (rack->shape_rxt_to_pacing_min == 0) &&
+		    (len >= segsiz))
+			len = segsiz;
 	} else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) {
 		/* We have a retransmit that takes precedence */
 		if ((!IN_FASTRECOVERY(tp->t_flags)) &&
@@ -16921,53 +17059,72 @@ again:
 	}
 	if (rack->r_must_retran &&
 	    (doing_tlp == 0) &&
+	    (SEQ_GT(tp->snd_max, tp->snd_una)) &&
 	    (rsm == NULL)) {
 		/*
-		 * Non-Sack and we had a RTO or Sack/non-Sack and a
-		 * MTU change, we need to retransmit until we reach
-		 * the former snd_max (rack->r_ctl.rc_snd_max_at_rto).
+		 * There are two different ways that we
+		 * can get into this block:
+		 * a) This is a non-sack connection, we had a time-out
+		 *    and thus r_must_retran was set and everything
+		 *    left outstanding as been marked for retransmit.
+		 * b) The MTU of the path shrank, so that everything
+		 *    was marked to be retransmitted with the smaller
+		 *    mtu and r_must_retran was set.
+		 *
+		 * This means that we expect the sendmap (outstanding)
+		 * to all be marked must. We can use the tmap to
+		 * look at them.
+		 *
 		 */
-		if (SEQ_GT(tp->snd_max, tp->snd_una)) {
-			int sendwin, flight;
-
-			sendwin = min(tp->snd_wnd, tp->snd_cwnd);
-			flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto);
-			if (flight >= sendwin) {
-				so = inp->inp_socket;
-				sb = &so->so_snd;
-				goto just_return_nolock;
-			}
-			rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
-			if (rsm == NULL) {
-				/* TSNH */
-				rack->r_must_retran = 0;
-				rack->r_ctl.rc_out_at_rto = 0;
-				so = inp->inp_socket;
-				sb = &so->so_snd;
-				goto just_return_nolock;
-			}
-			if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
-				/* It does not have the flag, we are done */
-				rack->r_must_retran = 0;
-				rack->r_ctl.rc_out_at_rto = 0;
-			} else {
-				sack_rxmit = 1;
-				len = rsm->r_end - rsm->r_start;
-				sendalot = 0;
-				sb_offset = rsm->r_start - tp->snd_una;
-				if (len >= segsiz)
-					len = segsiz;
-				/*
-				 * Delay removing the flag RACK_MUST_RXT so
-				 * that the fastpath for retransmit will
-				 * work with this rsm.
-				 */
+		int sendwin, flight;
 
-			}
-		} else {
-			/* We must be done if there is nothing outstanding */
+		sendwin = min(tp->snd_wnd, tp->snd_cwnd);
+		flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto);
+		if (flight >= sendwin) {
+			/*
+			 * We can't send yet.
+			 */
+			so = inp->inp_socket;
+			sb = &so->so_snd;
+			goto just_return_nolock;
+		}
+		/*
+		 * This is the case a/b mentioned above. All
+		 * outstanding/not-acked should be marked.
+		 * We can use the tmap to find them.
+		 */
+		rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
+		if (rsm == NULL) {
+			/* TSNH */
+			rack->r_must_retran = 0;
+			rack->r_ctl.rc_out_at_rto = 0;
+			so = inp->inp_socket;
+			sb = &so->so_snd;
+			goto just_return_nolock;
+		}
+		if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
+			/*
+			 * The first one does not have the flag, did we collapse
+			 * further up in our list?
+			 */
 			rack->r_must_retran = 0;
 			rack->r_ctl.rc_out_at_rto = 0;
+			rsm = NULL;
+			sack_rxmit = 0;
+		} else {
+			sack_rxmit = 1;
+			len = rsm->r_end - rsm->r_start;
+			sb_offset = rsm->r_start - tp->snd_una;
+			sendalot = 0;
+			if ((rack->full_size_rxt == 0) &&
+			    (rack->shape_rxt_to_pacing_min == 0) &&
+			    (len >= segsiz))
+				len = segsiz;
+			/*
+			 * Delay removing the flag RACK_MUST_RXT so
+			 * that the fastpath for retransmit will
+			 * work with this rsm.
+			 */
 		}
 	}
 	/*
@@ -18177,7 +18334,7 @@ send:
 					if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
 					((rsm == NULL) ? hw_tls : 0)
#ifdef NETFLIX_COPY_ARGS
-					, &filled_all
+					, &s_mb, &s_moff
#endif
 					);
 			if (len <= (tp->t_maxseg - optlen)) {
@@ -18548,15 +18705,17 @@ send:
 		log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
 		log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
 		log.u_bbr.flex4 = orig_len;
-		if (filled_all)
-			log.u_bbr.flex5 = 0x80000000;
-		else
-			log.u_bbr.flex5 = 0;
 		/* Save off the early/late values */
 		log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
 		log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
 		log.u_bbr.bw_inuse = rack_get_bw(rack);
-		if (rsm || sack_rxmit) {
+		log.u_bbr.flex8 = 0;
+		if (rsm) {
+			if (rsm->r_flags & RACK_RWND_COLLAPSED) {
+				rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm);
+				counter_u64_add(rack_collapsed_win_rxt, 1);
+				counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start));
+			}
 			if (doing_tlp)
 				log.u_bbr.flex8 = 2;
 			else
diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h
index e8560446b798..c747ceac7628 100644
--- a/sys/netinet/tcp_stacks/tcp_rack.h
+++ b/sys/netinet/tcp_stacks/tcp_rack.h
@@ -275,7 +275,7 @@ struct rack_opts_stats {
 * non-zero, the default is 4 for continuous tracing.
 * You also set in the number of connections you want
 * have get BB logs in net.inet.tcp.<stack>.tp.count.
- * 
+ *
 * Count will decrement every time BB logging is assigned
 * to a connection that hit your tracepoint.
 *
@@ -291,6 +291,7 @@ struct rack_opts_stats {
 #define RACK_TP_HWENOBUF	0x00000001	/* When we are doing hardware pacing and hit enobufs */
 #define RACK_TP_ENOBUF		0x00000002	/* When we hit enobufs with software pacing */
 #define RACK_TP_COLLAPSED_WND	0x00000003	/* When a peer to collapses its rwnd on us */
+#define RACK_TP_COLLAPSED_RXT	0x00000004	/* When we actually retransmit a collapsed window rsm */
 
 #define MIN_GP_WIN 6	/* We need at least 6 MSS in a GP measurement */
#ifdef _KERNEL
@@ -472,6 +473,8 @@ struct rack_control {
 	uint32_t roundends;		/* acked value above which round ends */
 	uint32_t num_dsack;		/* Count of dsack's seen (1 per window)*/
 	uint32_t forced_ack_ts;
+	uint32_t last_collapse_point;	/* Last point peer collapsed too */
+	uint32_t high_collapse_point;
 	uint32_t rc_lower_rtt_us_cts;	/* Time our GP rtt was last lowered */
 	uint32_t rc_time_probertt_entered;
 	uint32_t rc_time_probertt_starts;
@@ -546,7 +549,15 @@ struct tcp_rack {
 	struct inpcb *rc_inp;	/* The inpcb Lock(a) */
 	uint8_t rc_free_cnt;	/* Number of free entries on the rc_free list
 				 * Lock(a) */
-	uint8_t client_bufferlvl;	/* 0 - 5 normaly, less than or at 2 means its real low */
+	uint8_t client_bufferlvl : 4,	/* Expected range [0,5]: 0=unset, 1=low/empty */
+		rack_deferred_inited : 1,
+		/* ******************************************************************** */
+		/* Note for details of next two fields see rack_init_retransmit_rate()  */
+		/* ******************************************************************** */
+		full_size_rxt: 1,
+		shape_rxt_to_pacing_min : 1,
+		/* ******************************************************************** */
+		spare : 1;
 	uint8_t no_prr_addback : 1,
 		gp_ready : 1,
 		defer_options: 1,
@@ -647,7 +658,9 @@ struct tcp_rack {
 		r_late : 1,
 		r_wanted_output: 1,
 		r_rr_config : 2,
-		rc_avail_bit : 3;
+		r_persist_lt_bw_off : 1,
+		r_collapse_point_valid : 1,
+		rc_avail_bit : 2;
 	uint16_t rc_init_win : 8,
 		rc_gp_rtt_set : 1,
 		rc_gp_dyn_mul : 1,
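[Illustration; not part of the commit.] A closing note on the bookkeeping
above: rack_collapsed_window() no longer splits the sendmap on every
shrink; it only records last_collapse_point (snd_una + snd_wnd) and
high_collapse_point (snd_max). The map split and the RACK_RWND_COLLAPSED
marking are deferred to rack_un_collapse_window(), once the peer's final
landing point is known. Below is a self-contained sketch of that two-step
scheme; the toy_rack type is hypothetical, and the kernel's RB-tree
sendmap walk is reduced to a printf.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical, flattened version of fields from struct rack_control. */
struct toy_rack {
    uint32_t snd_una, snd_wnd, snd_max;
    uint32_t last_collapse_point;   /* lowest seq the peer chopped off */
    uint32_t high_collapse_point;   /* snd_max when the collapse hit */
    int has_collapsed;
    int collapse_point_valid;
};

/* Collapse: record the boundary only; no sendmap surgery yet. */
static void
toy_collapsed_window(struct toy_rack *rk)
{
    rk->last_collapse_point = rk->snd_una + rk->snd_wnd;
    rk->high_collapse_point = rk->snd_max;
    rk->has_collapsed = 1;
    rk->collapse_point_valid = 1;
}

/*
 * Un-collapse: only now is the recorded range flagged for possible
 * retransmit; actually sending still waits on the threshold test
 * (rack_check_collapsed() in the real code).
 */
static void
toy_un_collapse_window(struct toy_rack *rk)
{
    rk->has_collapsed = 0;
    printf("mark [%u, %u) RWND_COLLAPSED\n",
        (unsigned)rk->last_collapse_point,
        (unsigned)rk->high_collapse_point);
}

int
main(void)
{
    struct toy_rack rk = { .snd_una = 1000, .snd_max = 9000 };

    rk.snd_wnd = 2000;              /* peer shrank rwnd below what is out */
    toy_collapsed_window(&rk);      /* 9000 - 1000 > 2000: collapsed */
    rk.snd_wnd = 10000;             /* peer reopened the window */
    toy_un_collapse_window(&rk);    /* marks [3000, 9000) */
    return (0);
}

The payoff is that a middlebox flapping the window repeatedly costs one
cheap mark per shrink instead of a map split each time, and the new
collapsed_win_seen/collapsed_win_rxt counters make the two halves
separately observable.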