svn commit: r360282 - stable/11/sys/netinet
Michael Tuexen
tuexen at FreeBSD.org
Fri Apr 24 21:30:32 UTC 2020
Author: tuexen
Date: Fri Apr 24 21:30:31 2020
New Revision: 360282
URL: https://svnweb.freebsd.org/changeset/base/360282
Log:
Improve the TCP SACK generation by reporting DSACKs
MFC r347382:
Receiver side DSACK implemenation.
This adds initial support for RFC 2883.
MFC r347407:
Don't use C++ style comments.
MFC r349987:
This is the second in a number of patches needed to
get BBRv1 into the tree. This fixes the DSACK bug but
is also needed by BBR. We have yet to go two more
one will be for the pacing code (tcp_ratelimit.c) and
the second will be for the new updated LRO code that
allows a transport to know the arrival times of packets
and (tcp_lro.c). After that we should finally be able
to get BBRv1 into head.
MFC r351725:
This patch improves the DSACK handling to conform with RFC 2883.
The lowest SACK block is used when multiple Blocks would be elegible as
DSACK blocks ACK blocks get reordered - while maintaining the ordering of
SACK blocks not relevant in the DSACK context is maintained.
MFC r351801:
Fix the SACK block generation in the base TCP stack by bringing it in
sync with the RACK stack.
MFC r352072:
Only update SACK/DSACK lists when a non-empty segment was received.
This fixes hitting a KASSERT with a valid packet exchange.
MFC r352386
Don't write to memory outside of the allocated array for SACK blocks.
MFC r356796:
Remove debug code not needed anymore.
MFC r357100:
The server side of TCP fast open relies on the delayed ACK timer to allow
including user data in the SYN-ACK. When DSACK support was added in
r347382, an immediate ACK was sent even for the received SYN with
user data. This patch fixes that and allows again to send user data with
the SYN-ACK.
This is joint work of rrs, rscheff, and tuexen.
Differential Revision: https://reviews.freebsd.org/D19334
Differential Revision: https://reviews.freebsd.org/D20908
Differential Revision: https://reviews.freebsd.org/D21038
Differential Revision: https://reviews.freebsd.org/D21513
Differential Revision: https://reviews.freebsd.org/D21567
Differential Revision: https://reviews.freebsd.org/D23208
Differential Revision: https://reviews.freebsd.org/D23212
Modified:
stable/11/sys/netinet/tcp_input.c
stable/11/sys/netinet/tcp_output.c
stable/11/sys/netinet/tcp_sack.c
stable/11/sys/netinet/tcp_var.h
Directory Properties:
stable/11/ (props changed)
Modified: stable/11/sys/netinet/tcp_input.c
==============================================================================
--- stable/11/sys/netinet/tcp_input.c Fri Apr 24 21:21:49 2020 (r360281)
+++ stable/11/sys/netinet/tcp_input.c Fri Apr 24 21:30:31 2020 (r360282)
@@ -1539,7 +1539,6 @@ tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, stru
} else {
tp->rfbuf_cnt += tlen; /* add up */
}
-
return (newsize);
}
@@ -2307,6 +2306,18 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, stru
TCPSTAT_INC(tcps_rcvpartduppack);
TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
}
+ /*
+ * DSACK - add SACK block for dropped range
+ */
+ if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) {
+ tcp_update_sack_list(tp, th->th_seq,
+ th->th_seq + todrop);
+ /*
+ * ACK now, as the next in-sequence segment
+ * will clear the DSACK block again
+ */
+ tp->t_flags |= TF_ACKNOW;
+ }
drop_hdrlen += todrop; /* drop from the top afterwards */
th->th_seq += todrop;
tlen -= todrop;
@@ -3035,6 +3046,8 @@ dodata: /* XXX */
if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
TCPS_HAVERCVDFIN(tp->t_state) == 0) {
tcp_seq save_start = th->th_seq;
+ tcp_seq save_rnxt = tp->rcv_nxt;
+ int save_tlen = tlen;
m_adj(m, drop_hdrlen); /* delayed header drop */
/*
* Insert segment which includes th into TCP reassembly queue
@@ -3074,11 +3087,41 @@ dodata: /* XXX */
* m_adj() doesn't actually frees any mbufs
* when trimming from the head.
*/
- thflags = tcp_reass(tp, th, &save_start, &tlen, m);
+ tcp_seq temp = save_start;
+ thflags = tcp_reass(tp, th, &temp, &tlen, m);
tp->t_flags |= TF_ACKNOW;
}
- if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
- tcp_update_sack_list(tp, save_start, save_start + tlen);
+ if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) {
+ if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
+ /*
+ * DSACK actually handled in the fastpath
+ * above.
+ */
+ tcp_update_sack_list(tp, save_start,
+ save_start + save_tlen);
+ } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
+ if ((tp->rcv_numsacks >= 1) &&
+ (tp->sackblks[0].end == save_start)) {
+ /*
+ * Partial overlap, recorded at todrop
+ * above.
+ */
+ tcp_update_sack_list(tp,
+ tp->sackblks[0].start,
+ tp->sackblks[0].end);
+ } else {
+ tcp_update_dsack_list(tp, save_start,
+ save_start + save_tlen);
+ }
+ } else if (tlen >= save_tlen) {
+ /* Update of sackblks. */
+ tcp_update_dsack_list(tp, save_start,
+ save_start + save_tlen);
+ } else if (tlen > 0) {
+ tcp_update_dsack_list(tp, save_start,
+ save_start + tlen);
+ }
+ }
#if 0
/*
* Note the amount of data that peer has sent into
Modified: stable/11/sys/netinet/tcp_output.c
==============================================================================
--- stable/11/sys/netinet/tcp_output.c Fri Apr 24 21:21:49 2020 (r360281)
+++ stable/11/sys/netinet/tcp_output.c Fri Apr 24 21:30:31 2020 (r360282)
@@ -1548,7 +1548,13 @@ timer:
if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
tp->snd_max = tp->snd_nxt + xlen;
}
-
+ if ((error == 0) &&
+ (TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (tp->t_flags & TF_SACK_PERMIT) &&
+ tp->rcv_numsacks > 0)) {
+ /* Clean up any DSACK's sent */
+ tcp_clean_dsack_blocks(tp);
+ }
if (error) {
/*
Modified: stable/11/sys/netinet/tcp_sack.c
==============================================================================
--- stable/11/sys/netinet/tcp_sack.c Fri Apr 24 21:21:49 2020 (r360281)
+++ stable/11/sys/netinet/tcp_sack.c Fri Apr 24 21:30:31 2020 (r360282)
@@ -150,7 +150,104 @@ SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalholes,
&VNET_NAME(tcp_sack_globalholes), 0,
"Global number of TCP SACK holes currently allocated");
+
/*
+ * This function will find overlaps with the currently stored sackblocks
+ * and add any overlap as a dsack block upfront
+ */
+void
+tcp_update_dsack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
+{
+ struct sackblk head_blk,mid_blk,saved_blks[MAX_SACK_BLKS];
+ int i, j, n, identical;
+ tcp_seq start, end;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end"));
+
+ if (SEQ_LT(rcv_end, tp->rcv_nxt) ||
+ ((rcv_end == tp->rcv_nxt) &&
+ (tp->rcv_numsacks > 0 ) &&
+ (tp->sackblks[0].end == tp->rcv_nxt))) {
+ saved_blks[0].start = rcv_start;
+ saved_blks[0].end = rcv_end;
+ } else {
+ saved_blks[0].start = saved_blks[0].end = 0;
+ }
+
+ head_blk.start = head_blk.end = 0;
+ mid_blk.start = rcv_start;
+ mid_blk.end = rcv_end;
+ identical = 0;
+
+ for (i = 0; i < tp->rcv_numsacks; i++) {
+ start = tp->sackblks[i].start;
+ end = tp->sackblks[i].end;
+ if (SEQ_LT(rcv_end, start)) {
+ /* pkt left to sack blk */
+ continue;
+ }
+ if (SEQ_GT(rcv_start, end)) {
+ /* pkt right to sack blk */
+ continue;
+ }
+ if (SEQ_GT(tp->rcv_nxt, end)) {
+ if ((SEQ_MAX(rcv_start, start) != SEQ_MIN(rcv_end, end)) &&
+ (SEQ_GT(head_blk.start, SEQ_MAX(rcv_start, start)) ||
+ (head_blk.start == head_blk.end))) {
+ head_blk.start = SEQ_MAX(rcv_start, start);
+ head_blk.end = SEQ_MIN(rcv_end, end);
+ }
+ continue;
+ }
+ if (((head_blk.start == head_blk.end) ||
+ SEQ_LT(start, head_blk.start)) &&
+ (SEQ_GT(end, rcv_start) &&
+ SEQ_LEQ(start, rcv_end))) {
+ head_blk.start = start;
+ head_blk.end = end;
+ }
+ mid_blk.start = SEQ_MIN(mid_blk.start, start);
+ mid_blk.end = SEQ_MAX(mid_blk.end, end);
+ if ((mid_blk.start == start) &&
+ (mid_blk.end == end))
+ identical = 1;
+ }
+ if (SEQ_LT(head_blk.start, head_blk.end)) {
+ /* store overlapping range */
+ saved_blks[0].start = SEQ_MAX(rcv_start, head_blk.start);
+ saved_blks[0].end = SEQ_MIN(rcv_end, head_blk.end);
+ }
+ n = 1;
+ /*
+ * Second, if not ACKed, store the SACK block that
+ * overlaps with the DSACK block unless it is identical
+ */
+ if ((SEQ_LT(tp->rcv_nxt, mid_blk.end) &&
+ !((mid_blk.start == saved_blks[0].start) &&
+ (mid_blk.end == saved_blks[0].end))) ||
+ identical == 1) {
+ saved_blks[n].start = mid_blk.start;
+ saved_blks[n++].end = mid_blk.end;
+ }
+ for (j = 0; (j < tp->rcv_numsacks) && (n < MAX_SACK_BLKS); j++) {
+ if (((SEQ_LT(tp->sackblks[j].end, mid_blk.start) ||
+ SEQ_GT(tp->sackblks[j].start, mid_blk.end)) &&
+ (SEQ_GT(tp->sackblks[j].start, tp->rcv_nxt))))
+ saved_blks[n++] = tp->sackblks[j];
+ }
+ j = 0;
+ for (i = 0; i < n; i++) {
+ /* we can end up with a stale inital entry */
+ if (SEQ_LT(saved_blks[i].start, saved_blks[i].end)) {
+ tp->sackblks[j++] = saved_blks[i];
+ }
+ }
+ tp->rcv_numsacks = j;
+}
+
+/*
* This function is called upon receipt of new valid data (while not in
* header prediction mode), and it updates the ordered list of sacks.
*/
@@ -169,11 +266,18 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_sta
INP_WLOCK_ASSERT(tp->t_inpcb);
/* Check arguments. */
- KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end"));
+ KASSERT(SEQ_LEQ(rcv_start, rcv_end), ("rcv_start <= rcv_end"));
- /* SACK block for the received segment. */
- head_blk.start = rcv_start;
- head_blk.end = rcv_end;
+ if ((rcv_start == rcv_end) &&
+ (tp->rcv_numsacks >= 1) &&
+ (rcv_end == tp->sackblks[0].end)) {
+ /* retaining DSACK block below rcv_nxt (todrop) */
+ head_blk = tp->sackblks[0];
+ } else {
+ /* SACK block for the received segment. */
+ head_blk.start = rcv_start;
+ head_blk.end = rcv_end;
+ }
/*
* Merge updated SACK blocks into head_blk, and save unchanged SACK
@@ -194,12 +298,54 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_sta
* Merge this SACK block into head_blk. This SACK
* block itself will be discarded.
*/
- if (SEQ_GT(head_blk.start, start))
+ /*
+ * |-|
+ * |---| merge
+ *
+ * |-|
+ * |---| merge
+ *
+ * |-----|
+ * |-| DSACK smaller
+ *
+ * |-|
+ * |-----| DSACK smaller
+ */
+ if (head_blk.start == end)
head_blk.start = start;
- if (SEQ_LT(head_blk.end, end))
+ else if (head_blk.end == start)
head_blk.end = end;
+ else {
+ if (SEQ_LT(head_blk.start, start)) {
+ tcp_seq temp = start;
+ start = head_blk.start;
+ head_blk.start = temp;
+ }
+ if (SEQ_GT(head_blk.end, end)) {
+ tcp_seq temp = end;
+ end = head_blk.end;
+ head_blk.end = temp;
+ }
+ if ((head_blk.start != start) ||
+ (head_blk.end != end)) {
+ if ((num_saved >= 1) &&
+ SEQ_GEQ(saved_blks[num_saved-1].start, start) &&
+ SEQ_LEQ(saved_blks[num_saved-1].end, end))
+ num_saved--;
+ saved_blks[num_saved].start = start;
+ saved_blks[num_saved].end = end;
+ num_saved++;
+ }
+ }
} else {
/*
+ * This block supercedes the prior block
+ */
+ if ((num_saved >= 1) &&
+ SEQ_GEQ(saved_blks[num_saved-1].start, start) &&
+ SEQ_LEQ(saved_blks[num_saved-1].end, end))
+ num_saved--;
+ /*
* Save this SACK block.
*/
saved_blks[num_saved].start = start;
@@ -212,7 +358,7 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_sta
* Update SACK list in tp->sackblks[].
*/
num_head = 0;
- if (SEQ_GT(head_blk.start, tp->rcv_nxt)) {
+ if (SEQ_LT(rcv_start, rcv_end)) {
/*
* The received data segment is an out-of-order segment. Put
* head_blk at the top of SACK list.
@@ -226,6 +372,10 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_sta
if (num_saved >= MAX_SACK_BLKS)
num_saved--;
}
+ if ((rcv_start == rcv_end) &&
+ (rcv_start == tp->sackblks[0].end)) {
+ num_head = 1;
+ }
if (num_saved > 0) {
/*
* Copy the saved SACK blocks back.
@@ -236,6 +386,45 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_sta
/* Save the number of SACK blocks. */
tp->rcv_numsacks = num_head + num_saved;
+}
+
+void
+tcp_clean_dsack_blocks(struct tcpcb *tp)
+{
+ struct sackblk saved_blks[MAX_SACK_BLKS];
+ int num_saved, i;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ /*
+ * Clean up any DSACK blocks that
+ * are in our queue of sack blocks.
+ *
+ */
+ num_saved = 0;
+ for (i = 0; i < tp->rcv_numsacks; i++) {
+ tcp_seq start = tp->sackblks[i].start;
+ tcp_seq end = tp->sackblks[i].end;
+ if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) {
+ /*
+ * Discard this D-SACK block.
+ */
+ continue;
+ }
+ /*
+ * Save this SACK block.
+ */
+ saved_blks[num_saved].start = start;
+ saved_blks[num_saved].end = end;
+ num_saved++;
+ }
+ if (num_saved > 0) {
+ /*
+ * Copy the saved SACK blocks back.
+ */
+ bcopy(saved_blks, &tp->sackblks[0],
+ sizeof(struct sackblk) * num_saved);
+ }
+ tp->rcv_numsacks = num_saved;
}
/*
Modified: stable/11/sys/netinet/tcp_var.h
==============================================================================
--- stable/11/sys/netinet/tcp_var.h Fri Apr 24 21:21:49 2020 (r360281)
+++ stable/11/sys/netinet/tcp_var.h Fri Apr 24 21:30:31 2020 (r360282)
@@ -831,7 +831,9 @@ uint32_t tcp_new_ts_offset(struct in_conninfo *);
tcp_seq tcp_new_isn(struct in_conninfo *);
int tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq);
+void tcp_update_dsack_list(struct tcpcb *, tcp_seq, tcp_seq);
void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend);
+void tcp_clean_dsack_blocks(struct tcpcb *tp);
void tcp_clean_sackreport(struct tcpcb *tp);
void tcp_sack_adjust(struct tcpcb *tp);
struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt);
More information about the svn-src-all
mailing list