git: 7dc78150c730 - main - tcp: refactor cwnd during SACK transmissions to allow TSO
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Tue, 29 Oct 2024 19:27:11 UTC
The branch main has been updated by rscheff:

URL: https://cgit.FreeBSD.org/src/commit/?id=7dc78150c730e90fa2afdaba3aa645932b30c429

commit 7dc78150c730e90fa2afdaba3aa645932b30c429
Author:     Richard Scheffenegger <rscheff@FreeBSD.org>
AuthorDate: 2024-10-29 17:47:06 +0000
Commit:     Richard Scheffenegger <rscheff@FreeBSD.org>
CommitDate: 2024-10-29 18:04:12 +0000

    tcp: refactor cwnd during SACK transmissions to allow TSO

    Refactoring of cwnd and moving the adjustment for SACKed data
    into tcp_output() - cwnd tracking the maximum extent
    starting at snd_una - allows both SACK loss recovery as well as
    SACK transmissions after RTO during slow start and if allowed,
    the use of TSO while in loss recovery.

    Reviewed By: tuexen, cc, #transport
    Sponsored by: NetApp, Inc.
    Differential Revision: https://reviews.freebsd.org/D43470
---
 sys/netinet/tcp_input.c | 85 +++++++++++++++++++++++++++++-------------------
 sys/netinet/tcp_output.c | 69 +++++++++++++++++++++------------------
 sys/netinet/tcp_sack.c | 17 +++++-----
 3 files changed, 97 insertions(+), 74 deletions(-)

diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 5d14664f3fc0..e5f5e09e57d8 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -2653,13 +2653,14 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, tcp_do_prr_ack(tp, th, &to, sack_changed, &maxseg); } else if (tcp_is_sack_recovery(tp, &to) && - IN_FASTRECOVERY(tp->t_flags)) { + IN_FASTRECOVERY(tp->t_flags) && + (tp->snd_nxt == tp->snd_max)) { int awnd; /* * Compute the amount of data in flight first. * We can inject new data into the pipe iff - * we have less than 1/2 the original window's + * we have less than ssthresh * worth of data in flight. 
*/ if (V_tcp_do_newsack) { @@ -2669,10 +2670,18 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, tp->sackhint.sack_bytes_rexmit; } if (awnd < tp->snd_ssthresh) { - tp->snd_cwnd += maxseg; + tp->snd_cwnd += imax(maxseg, + imin(2 * maxseg, + tp->sackhint.delivered_data)); if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } + } else if (tcp_is_sack_recovery(tp, &to) && + IN_FASTRECOVERY(tp->t_flags) && + SEQ_LT(tp->snd_nxt, tp->snd_max)) { + tp->snd_cwnd += imax(maxseg, + imin(2 * maxseg, + tp->sackhint.delivered_data)); } else { tp->snd_cwnd += maxseg; } @@ -2696,14 +2705,13 @@ enter_recovery: tcp_seq onxt = tp->snd_nxt; /* - * If we're doing sack, or prr, check - * to see if we're already in sack + * If we're doing sack, check to + * see if we're already in sack * recovery. If we're not doing sack, * check to see if we're in newreno * recovery. */ - if (V_tcp_do_prr || - (tp->t_flags & TF_SACK_PERMIT)) { + if (tcp_is_sack_recovery(tp, &to)) { if (IN_FASTRECOVERY(tp->t_flags)) { tp->t_dupacks = 0; break; @@ -2723,29 +2731,40 @@ enter_recovery: tp->t_rtttime = 0; if (V_tcp_do_prr) { /* - * snd_ssthresh is already updated by - * cc_cong_signal. + * snd_ssthresh and snd_recover are + * already updated by cc_cong_signal. */ if (tcp_is_sack_recovery(tp, &to)) { /* - * Exclude Limited Transmit + * Include Limited Transmit * segments here */ tp->sackhint.prr_delivered = - maxseg; + imin(tp->snd_max - th->th_ack, + (tp->snd_limited + 1) * maxseg); } else { tp->sackhint.prr_delivered = - imin(tp->snd_max - tp->snd_una, - imin(INT_MAX / 65536, - tp->t_dupacks) * maxseg); + maxseg; } tp->sackhint.recover_fs = max(1, tp->snd_nxt - tp->snd_una); } + tp->snd_limited = 0; if (tcp_is_sack_recovery(tp, &to)) { TCPSTAT_INC(tcps_sack_recovery_episode); - tp->snd_cwnd = maxseg; + /* + * When entering LR after RTO due to + * Duplicate ACKs, retransmit existing + * holes from the scoreboard. 
+ */ + tcp_resend_sackholes(tp); + /* Avoid inflating cwnd in tcp_output */ + tp->snd_nxt = tp->snd_max; + tp->snd_cwnd = tcp_compute_pipe(tp) + + maxseg; (void) tcp_output(tp); + /* Set cwnd to the expected flightsize */ + tp->snd_cwnd = tp->snd_ssthresh; if (SEQ_GT(th->th_ack, tp->snd_una)) { goto resume_partialack; } @@ -2790,7 +2809,8 @@ enter_recovery: (tp->t_rxtshift == 0)) tp->snd_cwnd = SEQ_SUB(tp->snd_nxt, - tp->snd_una); + tp->snd_una) - + tcp_sack_adjust(tp); tp->snd_cwnd += (tp->t_dupacks - tp->snd_limited) * maxseg; @@ -3049,9 +3069,8 @@ process_ACK: SEQ_GEQ(th->th_ack, tp->snd_recover)) { cc_post_recovery(tp, th); } - if (tp->t_flags & TF_SACK_PERMIT) { - if (SEQ_GT(tp->snd_una, tp->snd_recover)) - tp->snd_recover = tp->snd_una; + if (SEQ_GT(tp->snd_una, tp->snd_recover)) { + tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; @@ -4138,9 +4157,7 @@ tcp_do_prr_ack(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, */ if (IN_FASTRECOVERY(tp->t_flags)) { if (tcp_is_sack_recovery(tp, to)) { - tp->snd_cwnd = tp->snd_nxt - tp->snd_recover + - tp->sackhint.sack_bytes_rexmit + - (snd_cnt * maxseg); + tp->snd_cwnd = pipe - del_data + (snd_cnt * maxseg); } else { tp->snd_cwnd = (tp->snd_max - tp->snd_una) + (snd_cnt * maxseg); @@ -4168,17 +4185,19 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; - tp->snd_nxt = th->th_ack; - /* - * Set snd_cwnd to one segment beyond acknowledged offset. - * (tp->snd_una has not yet been updated when this function is called.) - */ - tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th); - tp->t_flags |= TF_ACKNOW; - (void) tcp_output(tp); - tp->snd_cwnd = ocwnd; - if (SEQ_GT(onxt, tp->snd_nxt)) - tp->snd_nxt = onxt; + if (IN_FASTRECOVERY(tp->t_flags)) { + tp->snd_nxt = th->th_ack; + /* + * Set snd_cwnd to one segment beyond acknowledged offset. + * (tp->snd_una has not yet been updated when this function is called.) 
+ */ + tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th); + tp->t_flags |= TF_ACKNOW; + (void) tcp_output(tp); + tp->snd_cwnd = ocwnd; + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + } /* * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index a048abf5f97a..860b785b631b 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -266,8 +266,10 @@ again: * resending already delivered data. Adjust snd_nxt accordingly. */ if ((tp->t_flags & TF_SACK_PERMIT) && - SEQ_LT(tp->snd_nxt, tp->snd_max)) + (tp->sackhint.nexthole != NULL) && + !IN_FASTRECOVERY(tp->t_flags)) { sendwin = tcp_sack_adjust(tp); + } sendalot = 0; tso = 0; mtu = 0; @@ -292,10 +294,13 @@ again: if ((tp->t_flags & TF_SACK_PERMIT) && (IN_FASTRECOVERY(tp->t_flags) || SEQ_LT(tp->snd_nxt, tp->snd_max)) && (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { - uint32_t cwin; + int32_t cwin; - cwin = - imax(min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt, 0); + if (IN_FASTRECOVERY(tp->t_flags)) { + cwin = imax(sendwin - tcp_compute_pipe(tp), 0); + } else { + cwin = imax(sendwin - off, 0); + } /* Do not retransmit SACK segments beyond snd_recover */ if (SEQ_GT(p->end, tp->snd_recover)) { /* @@ -314,19 +319,34 @@ again: goto after_sack_rexmit; } else { /* Can rexmit part of the current hole */ - len = ((int32_t)ulmin(cwin, - SEQ_SUB(tp->snd_recover, p->rxmit))); + len = SEQ_SUB(tp->snd_recover, p->rxmit); + if (cwin <= len) { + len = cwin; + } else { + sendalot = 1; + } } } else { - len = ((int32_t)ulmin(cwin, - SEQ_SUB(p->end, p->rxmit))); + len = SEQ_SUB(p->end, p->rxmit); + if (cwin <= len) { + len = cwin; + } else { + sendalot = 1; + } } + /* we could have transmitted from the scoreboard, + * but sendwin (expected flightsize) - pipe didn't + * allow any transmission. + * Bypass recalculating the possible transmission + * length further down by setting sack_rxmit. 
+ * Wouldn't be here if there would have been + * nothing in the scoreboard to transmit. + */ + sack_rxmit = 1; if (len > 0) { off = SEQ_SUB(p->rxmit, tp->snd_una); KASSERT(off >= 0,("%s: sack block to the left of una : %d", __func__, off)); - sack_rxmit = 1; - sendalot = 1; } } after_sack_rexmit: @@ -390,34 +410,15 @@ after_sack_rexmit: */ if (sack_rxmit == 0) { if ((sack_bytes_rxmt == 0) || SEQ_LT(tp->snd_nxt, tp->snd_max)) { - len = ((int32_t)min(sbavail(&so->so_snd), sendwin) - - off); + len = imin(sbavail(&so->so_snd), sendwin) - off; } else { - int32_t cwin; - /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible in the scoreboard. */ - len = ((int32_t)min(sbavail(&so->so_snd), tp->snd_wnd) - - off); - /* - * Don't remove this (len > 0) check ! - * We explicitly check for len > 0 here (although it - * isn't really necessary), to work around a gcc - * optimization issue - to force gcc to compute - * len above. Without this check, the computation - * of len is bungled by the optimizer. - */ - if (len > 0) { - cwin = tp->snd_cwnd - imax(0, (int32_t) - (tp->snd_nxt - tp->snd_recover)) - - sack_bytes_rxmt; - if (cwin < 0) - cwin = 0; - len = imin(len, cwin); - } + len = imin(sbavail(&so->so_snd) - off, + sendwin - tcp_compute_pipe(tp)); } } @@ -1647,6 +1648,10 @@ timer: if ((error == 0) && sack_rxmit && SEQ_LT(tp->snd_nxt, SEQ_MIN(p->rxmit, p->end))) { + /* + * When transmitting from SACK scoreboard + * after an RTO, pull snd_nxt along. + */ tp->snd_nxt = SEQ_MIN(p->rxmit, p->end); } if (error) { diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c index 09e172ad4601..f642acd4c46a 100644 --- a/sys/netinet/tcp_sack.c +++ b/sys/netinet/tcp_sack.c @@ -961,16 +961,15 @@ tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th, u_int *maxsegp) /* Send one or 2 segments based on how much new data was acked. 
*/ if ((BYTES_THIS_ACK(tp, th) / maxseg) >= 2) num_segs = 2; - if (V_tcp_do_newsack) { - tp->snd_cwnd = imax(tp->snd_nxt - th->th_ack + - tp->sackhint.sack_bytes_rexmit - - tp->sackhint.sacked_bytes - - tp->sackhint.lost_bytes, maxseg) + - num_segs * maxseg; - } else { + if (tp->snd_nxt == tp->snd_max) { tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit + - imax(0, tp->snd_nxt - tp->snd_recover) + - num_segs * maxseg); + (tp->snd_nxt - tp->snd_recover) + num_segs * maxseg); + } else { + /* + * Since cwnd is not the expected flightsize during + * SACK LR, not deflating cwnd allows the partial + * ACKed amount to be sent. + */ } if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh;