git: 7dc78150c730 - main - tcp: refactor cwnd during SACK transmissions to allow TSO

From: Richard Scheffenegger <rscheff_at_FreeBSD.org>
Date: Tue, 29 Oct 2024 19:27:11 UTC
The branch main has been updated by rscheff:

URL: https://cgit.FreeBSD.org/src/commit/?id=7dc78150c730e90fa2afdaba3aa645932b30c429

commit 7dc78150c730e90fa2afdaba3aa645932b30c429
Author:     Richard Scheffenegger <rscheff@FreeBSD.org>
AuthorDate: 2024-10-29 17:47:06 +0000
Commit:     Richard Scheffenegger <rscheff@FreeBSD.org>
CommitDate: 2024-10-29 18:04:12 +0000

    tcp: refactor cwnd during SACK transmissions to allow TSO
    
    Refactor cwnd handling and move the adjustment for SACKed data into
    tcp_output(), with cwnd tracking the maximum extent starting at
    snd_una. This allows SACK loss recovery, SACK transmissions after
    an RTO during slow start, and, if allowed, the use of TSO while in
    loss recovery.
    
    Reviewed By:            tuexen, cc, #transport
    Sponsored by:           NetApp, Inc.
    Differential Revision:  https://reviews.freebsd.org/D43470
---
 sys/netinet/tcp_input.c  | 85 +++++++++++++++++++++++++++++-------------------
 sys/netinet/tcp_output.c | 69 +++++++++++++++++++++------------------
 sys/netinet/tcp_sack.c   | 17 +++++-----
 3 files changed, 97 insertions(+), 74 deletions(-)

diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 5d14664f3fc0..e5f5e09e57d8 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -2653,13 +2653,14 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
 						tcp_do_prr_ack(tp, th, &to,
 						    sack_changed, &maxseg);
 					} else if (tcp_is_sack_recovery(tp, &to) &&
-						    IN_FASTRECOVERY(tp->t_flags)) {
+						    IN_FASTRECOVERY(tp->t_flags) &&
+						    (tp->snd_nxt == tp->snd_max)) {
 						int awnd;
 
 						/*
 						 * Compute the amount of data in flight first.
 						 * We can inject new data into the pipe iff
-						 * we have less than 1/2 the original window's
+						 * we have less than ssthresh
 						 * worth of data in flight.
 						 */
 						if (V_tcp_do_newsack) {
@@ -2669,10 +2670,18 @@ tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
 								tp->sackhint.sack_bytes_rexmit;
 						}
 						if (awnd < tp->snd_ssthresh) {
-							tp->snd_cwnd += maxseg;
+							tp->snd_cwnd += imax(maxseg,
+							    imin(2 * maxseg,
+							    tp->sackhint.delivered_data));
 							if (tp->snd_cwnd > tp->snd_ssthresh)
 								tp->snd_cwnd = tp->snd_ssthresh;
 						}
+					} else if (tcp_is_sack_recovery(tp, &to) &&
+						    IN_FASTRECOVERY(tp->t_flags) &&
+						    SEQ_LT(tp->snd_nxt, tp->snd_max)) {
+						tp->snd_cwnd += imax(maxseg,
+						    imin(2 * maxseg,
+						    tp->sackhint.delivered_data));
 					} else {
 						tp->snd_cwnd += maxseg;
 					}
@@ -2696,14 +2705,13 @@ enter_recovery:
 					tcp_seq onxt = tp->snd_nxt;
 
 					/*
-					 * If we're doing sack, or prr, check
-					 * to see if we're already in sack
+					 * If we're doing sack, check to
+					 * see if we're already in sack
 					 * recovery. If we're not doing sack,
 					 * check to see if we're in newreno
 					 * recovery.
 					 */
-					if (V_tcp_do_prr ||
-					    (tp->t_flags & TF_SACK_PERMIT)) {
+					if (tcp_is_sack_recovery(tp, &to)) {
 						if (IN_FASTRECOVERY(tp->t_flags)) {
 							tp->t_dupacks = 0;
 							break;
@@ -2723,29 +2731,40 @@ enter_recovery:
 					tp->t_rtttime = 0;
 					if (V_tcp_do_prr) {
 						/*
-						 * snd_ssthresh is already updated by
-						 * cc_cong_signal.
+						 * snd_ssthresh and snd_recover are
+						 * already updated by cc_cong_signal.
 						 */
 						if (tcp_is_sack_recovery(tp, &to)) {
 							/*
-							 * Exclude Limited Transmit
+							 * Include Limited Transmit
 							 * segments here
 							 */
 							tp->sackhint.prr_delivered =
-							    maxseg;
+							    imin(tp->snd_max - th->th_ack,
+							    (tp->snd_limited + 1) * maxseg);
 						} else {
 							tp->sackhint.prr_delivered =
-							    imin(tp->snd_max - tp->snd_una,
-							    imin(INT_MAX / 65536,
-								tp->t_dupacks) * maxseg);
+							    maxseg;
 						}
 						tp->sackhint.recover_fs = max(1,
 						    tp->snd_nxt - tp->snd_una);
 					}
+					tp->snd_limited = 0;
 					if (tcp_is_sack_recovery(tp, &to)) {
 						TCPSTAT_INC(tcps_sack_recovery_episode);
-						tp->snd_cwnd = maxseg;
+						/*
+						 * When entering LR after RTO due to
+						 * Duplicate ACKs, retransmit existing
+						 * holes from the scoreboard.
+						 */
+						tcp_resend_sackholes(tp);
+						/* Avoid inflating cwnd in tcp_output */
+						tp->snd_nxt = tp->snd_max;
+						tp->snd_cwnd = tcp_compute_pipe(tp) +
+						    maxseg;
 						(void) tcp_output(tp);
+						/* Set cwnd to the expected flightsize */
+						tp->snd_cwnd = tp->snd_ssthresh;
 						if (SEQ_GT(th->th_ack, tp->snd_una)) {
 							goto resume_partialack;
 						}
@@ -2790,7 +2809,8 @@ enter_recovery:
 					    (tp->t_rxtshift == 0))
 						tp->snd_cwnd =
 						    SEQ_SUB(tp->snd_nxt,
-							    tp->snd_una);
+							    tp->snd_una) -
+							tcp_sack_adjust(tp);
 					tp->snd_cwnd +=
 					    (tp->t_dupacks - tp->snd_limited) *
 					    maxseg;
@@ -3049,9 +3069,8 @@ process_ACK:
 		    SEQ_GEQ(th->th_ack, tp->snd_recover)) {
 			cc_post_recovery(tp, th);
 		}
-		if (tp->t_flags & TF_SACK_PERMIT) {
-			if (SEQ_GT(tp->snd_una, tp->snd_recover))
-				tp->snd_recover = tp->snd_una;
+		if (SEQ_GT(tp->snd_una, tp->snd_recover)) {
+			tp->snd_recover = tp->snd_una;
 		}
 		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 			tp->snd_nxt = tp->snd_una;
@@ -4138,9 +4157,7 @@ tcp_do_prr_ack(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to,
 	 */
 	if (IN_FASTRECOVERY(tp->t_flags)) {
 		if (tcp_is_sack_recovery(tp, to)) {
-			tp->snd_cwnd = tp->snd_nxt - tp->snd_recover +
-					    tp->sackhint.sack_bytes_rexmit +
-					    (snd_cnt * maxseg);
+			tp->snd_cwnd = pipe - del_data + (snd_cnt * maxseg);
 		} else {
 			tp->snd_cwnd = (tp->snd_max - tp->snd_una) +
 					    (snd_cnt * maxseg);
@@ -4168,17 +4185,19 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
 
 	tcp_timer_activate(tp, TT_REXMT, 0);
 	tp->t_rtttime = 0;
-	tp->snd_nxt = th->th_ack;
-	/*
-	 * Set snd_cwnd to one segment beyond acknowledged offset.
-	 * (tp->snd_una has not yet been updated when this function is called.)
-	 */
-	tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th);
-	tp->t_flags |= TF_ACKNOW;
-	(void) tcp_output(tp);
-	tp->snd_cwnd = ocwnd;
-	if (SEQ_GT(onxt, tp->snd_nxt))
-		tp->snd_nxt = onxt;
+	if (IN_FASTRECOVERY(tp->t_flags)) {
+		tp->snd_nxt = th->th_ack;
+		/*
+		 * Set snd_cwnd to one segment beyond acknowledged offset.
+		 * (tp->snd_una has not yet been updated when this function is called.)
+		 */
+		tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th);
+		tp->t_flags |= TF_ACKNOW;
+		(void) tcp_output(tp);
+		tp->snd_cwnd = ocwnd;
+		if (SEQ_GT(onxt, tp->snd_nxt))
+			tp->snd_nxt = onxt;
+	}
 	/*
 	 * Partial window deflation.  Relies on fact that tp->snd_una
 	 * not updated yet.
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index a048abf5f97a..860b785b631b 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -266,8 +266,10 @@ again:
 	 * resending already delivered data.  Adjust snd_nxt accordingly.
 	 */
 	if ((tp->t_flags & TF_SACK_PERMIT) &&
-	    SEQ_LT(tp->snd_nxt, tp->snd_max))
+	    (tp->sackhint.nexthole != NULL) &&
+	    !IN_FASTRECOVERY(tp->t_flags)) {
 		sendwin = tcp_sack_adjust(tp);
+	}
 	sendalot = 0;
 	tso = 0;
 	mtu = 0;
@@ -292,10 +294,13 @@ again:
 	if ((tp->t_flags & TF_SACK_PERMIT) &&
 	    (IN_FASTRECOVERY(tp->t_flags) || SEQ_LT(tp->snd_nxt, tp->snd_max)) &&
 	    (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
-		uint32_t cwin;
+		int32_t cwin;
 
-		cwin =
-		    imax(min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt, 0);
+		if (IN_FASTRECOVERY(tp->t_flags)) {
+			cwin = imax(sendwin - tcp_compute_pipe(tp), 0);
+		} else {
+			cwin = imax(sendwin - off, 0);
+		}
 		/* Do not retransmit SACK segments beyond snd_recover */
 		if (SEQ_GT(p->end, tp->snd_recover)) {
 			/*
@@ -314,19 +319,34 @@ again:
 				goto after_sack_rexmit;
 			} else {
 				/* Can rexmit part of the current hole */
-				len = ((int32_t)ulmin(cwin,
-				    SEQ_SUB(tp->snd_recover, p->rxmit)));
+				len = SEQ_SUB(tp->snd_recover, p->rxmit);
+				if (cwin <= len) {
+					len = cwin;
+				} else {
+					sendalot = 1;
+				}
 			}
 		} else {
-			len = ((int32_t)ulmin(cwin,
-			    SEQ_SUB(p->end, p->rxmit)));
+			len = SEQ_SUB(p->end, p->rxmit);
+			if (cwin <= len) {
+				len = cwin;
+			} else {
+				sendalot = 1;
+			}
 		}
+		/* we could have transmitted from the scoreboard,
+		 * but sendwin (expected flightsize) - pipe didn't
+		 * allow any transmission.
+		 * Bypass recalculating the possible transmission
+		 * length further down by setting sack_rxmit.
+		 * We would not be here if there had been
+		 * nothing in the scoreboard to transmit.
+		 */
+		sack_rxmit = 1;
 		if (len > 0) {
 			off = SEQ_SUB(p->rxmit, tp->snd_una);
 			KASSERT(off >= 0,("%s: sack block to the left of una : %d",
 			    __func__, off));
-			sack_rxmit = 1;
-			sendalot = 1;
 		}
 	}
 after_sack_rexmit:
@@ -390,34 +410,15 @@ after_sack_rexmit:
 	 */
 	if (sack_rxmit == 0) {
 		if ((sack_bytes_rxmt == 0) || SEQ_LT(tp->snd_nxt, tp->snd_max)) {
-			len = ((int32_t)min(sbavail(&so->so_snd), sendwin) -
-			    off);
+			len = imin(sbavail(&so->so_snd), sendwin) - off;
 		} else {
-			int32_t cwin;
-
 			/*
 			 * We are inside of a SACK recovery episode and are
 			 * sending new data, having retransmitted all the
 			 * data possible in the scoreboard.
 			 */
-			len = ((int32_t)min(sbavail(&so->so_snd), tp->snd_wnd) -
-			    off);
-			/*
-			 * Don't remove this (len > 0) check !
-			 * We explicitly check for len > 0 here (although it
-			 * isn't really necessary), to work around a gcc
-			 * optimization issue - to force gcc to compute
-			 * len above. Without this check, the computation
-			 * of len is bungled by the optimizer.
-			 */
-			if (len > 0) {
-				cwin = tp->snd_cwnd - imax(0, (int32_t)
-					(tp->snd_nxt - tp->snd_recover)) -
-					sack_bytes_rxmt;
-				if (cwin < 0)
-					cwin = 0;
-				len = imin(len, cwin);
-			}
+			len = imin(sbavail(&so->so_snd) - off,
+				sendwin - tcp_compute_pipe(tp));
 		}
 	}
 
@@ -1647,6 +1648,10 @@ timer:
 	if ((error == 0) &&
 	    sack_rxmit &&
 	    SEQ_LT(tp->snd_nxt, SEQ_MIN(p->rxmit, p->end))) {
+		/*
+		 * When transmitting from SACK scoreboard
+		 * after an RTO, pull snd_nxt along.
+		 */
 		tp->snd_nxt = SEQ_MIN(p->rxmit, p->end);
 	}
 	if (error) {
diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c
index 09e172ad4601..f642acd4c46a 100644
--- a/sys/netinet/tcp_sack.c
+++ b/sys/netinet/tcp_sack.c
@@ -961,16 +961,15 @@ tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th, u_int *maxsegp)
 	/* Send one or 2 segments based on how much new data was acked. */
 	if ((BYTES_THIS_ACK(tp, th) / maxseg) >= 2)
 		num_segs = 2;
-	if (V_tcp_do_newsack) {
-		tp->snd_cwnd = imax(tp->snd_nxt - th->th_ack +
-				tp->sackhint.sack_bytes_rexmit -
-				tp->sackhint.sacked_bytes -
-				tp->sackhint.lost_bytes, maxseg) +
-				num_segs * maxseg;
-	} else {
+	if (tp->snd_nxt == tp->snd_max) {
 		tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit +
-		    imax(0, tp->snd_nxt - tp->snd_recover) +
-		    num_segs * maxseg);
+		    (tp->snd_nxt - tp->snd_recover) + num_segs * maxseg);
+	} else {
+		/*
+		 * Since cwnd is not the expected flightsize during
+		 * SACK LR, not deflating cwnd allows the partial
+		 * ACKed amount to be sent.
+		 */
 	}
 	if (tp->snd_cwnd > tp->snd_ssthresh)
 		tp->snd_cwnd = tp->snd_ssthresh;