PERFORCE change 159606 for review
Andre Oppermann
andre at FreeBSD.org
Sun Mar 22 05:20:30 PDT 2009
http://perforce.freebsd.org/chv.cgi?CH=159606
Change 159606 by andre at andre_t61 on 2009/03/22 12:20:02
Checkpoint WIP.
Affected files ...
.. //depot/projects/tcp_new/netinet/tcp_input.c#8 edit
.. //depot/projects/tcp_new/netinet/tcp_output.c#5 edit
.. //depot/projects/tcp_new/netinet/tcp_timer.c#2 edit
.. //depot/projects/tcp_new/netinet/tcp_var.h#4 edit
Differences ...
==== //depot/projects/tcp_new/netinet/tcp_input.c#8 (text+ko) ====
@@ -1686,7 +1686,7 @@
("%s: tlen < 0", __func__));
/*
- * If new data is received on a connection after the
+ * <<If new data is received on a connection after the
* socket is closed or the user process is gone, and
* doesn't has a file descriptor reference anymore,
* send an RST the other end. This is an artifact
@@ -1697,7 +1697,7 @@
* won't be delivering it to an application. And we
* can't just wait here and drop the data into a void
* until the other side gives up as that could go on
- * forever.
+ * forever.>>
* Stevens Vol.2: section 28.8, page 957, lines 687-696
*
* NB: Segments without any data but ack'ing our FIN are
@@ -1731,8 +1731,6 @@
/*
* Update send SACK information and tell us how much more
* data has left the network (relative to last SACK we got).
- * XXXAO: Determine if there was a duplicate ACK going on
- * based on the changes of the SACK information.
*/
if ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))
sacked = tcp_sack_doack(tp, &to, th->th_ack);
@@ -1755,13 +1753,20 @@
tcp_do_time(tp, th, &to, acked, tlen, sacked);
/*
+ * Process the ACK to advance the unacknowledged pointer,
+ * or to detect duplicate ACKs.
+ */
+ tcp_do_ack(tp, th, tiwin, acked, tlen, sacked);
+
+ /*
* Update congestion control information.
*/
- nudgeoutput |= tcp_congest(tp, th, tiwin, acked, tlen, sacked);
+ tcp_cc_ack(tp, th, tiwin, acked, tlen, sacked);
+ KASSERT(tp->snd_cwnd > tp->snd_mss,
+ ("%s: cwnd < 1*mss after congestion control function", __func__));
/*
- * Drop acknowledged data from send socket buffer
- * and advance the unacknowledged pointer.
+ * Drop acknowledged data from send socket buffer.
* RFC793: section 3.9, page 72, fifth check
*/
if (acked > 0)
@@ -1789,11 +1794,6 @@
}
/*
- * Advance the unacknowledged pointer.
- */
- tp->snd_una = th->th_ack;
-
- /*
* Wake up and inform any writers on the socket.
*
* NB: sowwakeup_locked() does an implicit unlock.
@@ -1811,6 +1811,10 @@
("%s: got ack for FIN but haven't sent FIN yet",
__func__));
+ KASSERT(!tcp_timer_active(TT_RXMIT),
+ ("%s: ourfinisacked but RXMIT still active",
+ __func__);
+
/*
* Handle ack'ed FIN according to previous state.
*/
@@ -1892,19 +1896,6 @@
* NB: Continue with segment.
*/
}
-
- /*
- * Stop the retransmit timer if all data we sent
- * has been acknowledged. Otherwise restart it
- * if we still have outstanding data.
- *
- * XXXAO: Refine the test. The TF_NEEDFIN may not
- * enough.
- */
- if (tp->snd_una == tp->snd_nxt && !(tp->t_flags & TF_NEEDFIN))
- tcp_timer_activate(TT_RXMIT, 0);
- else
- tcp_timer_activate(TT_RXMIT, tp->snd_rto);
}
/*
@@ -1947,7 +1938,7 @@
* segment with urgent that got pulled and now is zero
*/
if (!TCPS_HAVERCVDFIN(tp->t_state) &&
- (tlen > 0 || (tp->rcv_trq != NULL && th->th_flags & TH_FIN))) {
+ (tlen > 0 || (tp->rcv_trq != NULL && (th->th_flags & TH_FIN)))) {
int newsize = 0; /* Rcvbuf autoscaling. */
/*
@@ -2214,6 +2205,7 @@
* the ACK for our FIN.
*/
tcp_twstart(tp);
+ tp = NULL;
INP_INFO_WUNLOCK(&tcbinfo);
goto done;
@@ -2262,15 +2254,19 @@
* delayed ACK timer and be done.
*
* XXXAO: Multi-delack?
+ * XXXAO: Always call into tcp_output and have it decide what to do.
*/
+ (void)tcp_output(tp, TPO_TINPUT);
+#if 0
if ((tp->t_flags & TF_ACKNOW) || tp->snd_delack > 1 ||
nudgeoutput || (tp->t_flags & TF_RXWIN0SENT) ||
!tcp_delack_enabled) {
- (void) tcp_output(tp);
+ (void) tcp_output(tp, TPO_TINPUT);
} else if (SEQ_GT(tp->rcv_nxt, tp->snd_lastack)) {
tp->snd_delack++;
tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
}
+#endif
INP_UNLOCK(tp->t_inpcb);
return;
@@ -2295,7 +2291,7 @@
*/
tp->t_flags |= TF_ACKNOW;
m_freem(m);
- (void) tcp_output(tp);
+ (void) tcp_output(tp, TPO_TINPUT);
INP_UNLOCK(tp->t_inpcb);
return;
@@ -2848,7 +2844,7 @@
* on segments without ACK. The SYN_RECEIVED case is completely handled
* in syncache and the initialization is done there.
*/
-int
+static int
tcp_do_wu(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to,
int tiwin, int acked, int tlen, int sacked)
{
@@ -2912,8 +2908,8 @@
tp->snd_wnd = tiwin;
if (SEQ_GT(th->th_seq, tp->snd_wu_seq))
tp->snd_wu_seq = th->th_seq;
- if (tp->snd_wnd > tp->snd_maxwnd)
- tp->snd_maxwnd = tp->snd_wnd;
+ if (tp->snd_maxwnd < tiwin)
+ tp->snd_maxwnd = tiwin;
/*
* Force a call to tcp_output only if we have data to send.
@@ -2924,6 +2920,53 @@
return (0);
}
+static void
+tcp_do_ack(tp, th, tiwin, acked, tlen, sacked)
+{
+ /*
+ * Without SACK detecting a duplicate ACK is based on an
+ * empty segment with the same ACK as we already know and
+ * the same advertised receive window. Otherwise we could
+ * mistake a simple window update for a duplicate ACK.
+ *
+ * With SACK it gets much simpler. Any increase in the
+ * sack'ed data is equivalent to a duplicate ACK.
+ *
+ * Things become difficult when we have an ongoing two-way
+ * data exchange. Here the receiver seeing the loss has
+ * to add new SACK information or to prevent the transmission
+ * of new data to make the ACK segment detectable as duplicate
+ * ACK.
+ *
+ * XXXAO: This is not entirely correct as it allows for other
+ * packets between the duplicate ACKs.
+ */
+ if (sacked > 0 ||
+ (tlen == 0 && acked == 0 && SEQ_LT(tp->snd_una, tp->snd_nxt) && tp->snd_wnd == tiwin))
+ tp->snd_dupack += 1;
+ else if (acked > 0 && tp->snd_dupack > 0)
+ tp->snd_dupack = 0;
+
+ KASSERT(SEQ_LT(tp->snd_una, tp->snd_nxt) || tp->snd_dupack == 0,
+ ("%s: snd_dupack > 0 but snd_una == snd_nxt", __func__));
+
+ /*
+ * Advance the unacknowledged pointer.
+ */
+ tp->snd_una += acked;
+
+ /*
+ * Stop the retransmit timer if all data we sent has been
+ * acknowledged. Otherwise restart it if we still have
+ * outstanding data.
+ */
+ if (tp->snd_una == tp->snd_nxt)
+ tcp_timer_activate(TT_RXMIT, 0);
+ else if (acked > 0)
+ tcp_timer_activate(TT_RXMIT, tp->snd_rto);
+
+}
+
/*
* Process urgent data in TCP segments.
*
==== //depot/projects/tcp_new/netinet/tcp_output.c#5 (text+ko) ====
@@ -103,6 +103,13 @@
/*
* Tcp output routine: figure out what should be sent and send it.
*
+ * We get here through:
+ * 1. write/send/etc
+ * 2. tcp_input (not always)
+ * 3. read/recvfrom
+ * 4. delayed ACK, retransmission or persistent timeout
+ *
+ * Our work is to find out:
* 1. How much to send, if any
* 1.1 subject to nagles algorithm (don't send small segments)
* 1.2 subject to send window
@@ -111,14 +118,14 @@
* 3. Send an outstanding ACK
* 3.1 subject to delayed ack
* 4. Send a window update
- * 4.1 subject to silly window avoidance
+ * 4.1 subject to silly window avoidance (don't send small window updates)
* 4.2 subject to delayed ack
* 5. Send retransmit
* 6. Send urgent data
* 7. Send based on flags
*/
int
-tcp_output(struct tcpcb *tp)
+tcp_output(struct tcpcb *tp, int reason)
{
int off, flags, error, optlen;
tcp_win len, recwin, swin;
@@ -150,21 +157,23 @@
flags = tcp_outflags[tp->t_state];
/*
- * Determine length of data that should be transmitted,
- * and flags that will be used.
- * If there is some data or critical controls (SYN, RST)
- * to send, then transmit; otherwise, investigate further.
+ * Determine our current receive window.
+ * This value is used for the window field in the TCP
+ * header and to determine whether we have to send a
+ * window update.
+ *
+ * NB: rwin is already scaled.
*/
-
+ rwin = tcp_rcv_wnd(tp, so);
-
/*
* We have been idle for "a while" and no acks are
* expected to clock out any data we send --
* slow start to get ack "clock" running again.
+ * RFC2581: Restart window.
*
- * Set the slow-start flight size depending on whether
- * this is a local network or not.
+ * XXXAO: Use a decaying algorithm. It's not useful
+ * to have cwnd drop off a cliff.
*/
if (tp->snd_nxt == tp->snd_una &&
(ticks - tp->t_rcvtime) >= max(tp->t_rxtcur, tcp_min_idle)) {
@@ -172,12 +181,9 @@
}
/*
- * Compute our current receive window.
- * XXXAO: Handle window updates.
- */
- rwin = tcp_rcv_wnd(tp, so);
-
- /*
+ * Determine length of data that should be transmitted, if there
+ * is some data to send, then transmit; otherwise, investigate further.
+ *
* First step: how much to send.
*
* Check out our send window.
@@ -192,41 +198,45 @@
* c) how much data we have to send
* d) the pacing algorithm (optional)
*
- * XXXAO: Add output pacing where one can limit the amount
- * of data that is sent in a time period through a socket
- * option.
+ * duna = unacknowledged data in flight
+ * swnd = remaining space in send window as advertised by remote end
+ * cwnd = congestion window, remaining amount of data that can be unacknowledged in flight
+ * dlen = remaining amount of data in send buffer available for sending
+ * len = amount of data we have *and* can send right now
+ *
+ * <- duna -><- swnd ->
+ * <- cwnd ->
+ * <-dlen->
+ * seq .......|+++++++++xxxxxxxx---z---------|.......
+ * ^ ^
+ * snd_una snd_nxt
*
- * XXXAO
*/
- swin = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
- swin = tp->snd_wnd - tp->snd_inflight; /* XXXAO: Alternative, SACK */
+ duna = SEQ_DELTA(tp->snd_nxt, tp->snd_una);
+ swnd = imax(0, tp->snd_wnd - duna);
+ cwnd = imax(0, tp->snd_cwnd - duna);
+ dlen = so->so_snd.sb_cc - duna;
+ len = min(dlen, min(swnd, cwnd));
- len = min(swin, tp->snd_cwnd);
- len = so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una);
- len = tcp_snd_pace(tp, len); /* XXXAO: todo token bucket */
-
- if (tp->t_flags & TF_REXMT) {
- len = tcp_snd_rexmt(tp, len);
- goto send;
+ if (len > 0 && (tp->t_flags & TF_PACE)) {
+ len = tcp_snd_pace(tp, len); /* XXXAO: todo token bucket, mss sized */
+ if (len == 0)
+ return (0); /* next token is pending */
}
- /*
- * Second step: Do we send?
- */
- if (tp->t_flags & TF_ACKNOW)
- goto send;
+ inflight = duna - tp->snd_sacked;
/*
* Send out a SYN immediatly.
*/
- if (flags & TH_SYN)
+ if ((flags & TH_SYN) && !(tp->t_flags & TF_SENTSYN))
goto send;
/*
* If our state indicates that FIN should be sent
* and we have not yet done so, then we need to send.
*/
- if (flags & TH_FIN) {
+ if ((flags & TH_FIN) && !(tp->t_flags & TF_SENTFIN)) {
/*
* All data is already sent and only the FIN is outstanding.
*/
@@ -238,11 +248,23 @@
* if the window is big enough. Do not care about nagle
* and others. Otherwise things will go their normal way.
*/
- if (len <= snd_wnd)
+ if (len > 0)
goto send;
}
/*
+ * Pending ACK?
+ */
+ if (tp->t_flags & TF_ACKNOW)
+ goto send;
+ if (SEQ_LT(tp->snd_lastack, tp->snd_nxt) && !(tp->t_flags & TF_DELACK))
+ goto send;
+ if (tp->t_flags & TF_DUPACK) {
+ len = 0;
+ goto send;
+ }
+
+ /*
* Sender silly window avoidance. We transmit under the following
* conditions when len is non-zero:
*
@@ -254,11 +276,17 @@
* data (receiver may be limited the window size)
* - We need to retransmit
*
+ * The idea behind delayed ACK is twofold:
+ * a) aggregate multiple ACKs together
+ * b) aggregate the response from application with the ACK
+ * In both cases the events are probably very close together
+ * and thus the delayed ACK time should be very short.
+ *
* a) Nagle algorithm: tinygram problem
* b) silly window syndrome: buffer almost full
*
* Quoting Nagle:
- * The concept behind delayed ACKs is to bet, when receiving some data from the net,
+ * <<The concept behind delayed ACKs is to bet, when receiving some data from the net,
* that the local application will send a reply very soon. So there's no need to
* send an ACK immediately; the ACK can be piggybacked on the next data going the
* other way. If that doesn't happen, after a 500ms delay, an ACK is sent anyway.
@@ -277,49 +305,119 @@
* an ACK is a bet that the local application will reply to the data just received.
* Some apps, like character echo in Telnet servers, do respond every time. Others,
* like X-Windows "clients" (really servers, but X is backwards about this), only reply
- * some of the time.
+ * some of the time.>>
* http://developers.slashdot.org/comments.pl?sid=174457&threshold=1&commentsort=0&mode=thread&cid=14515105
*
* XXXAO: mss - options!
*/
if (len) {
+ /*
+ * Always send if there is no outstanding data in flight.
+ */
if (tp->snd_nxt == tp->snd_una)
goto send;
+
+ /*
+ * Always send if NODELAY is enabled. This gives at least
+ * one segment per application write no matter how small
+ * the amount of data.
+ */
if (tp->t_flags & TF_NODELAY)
goto send;
+
+ /*
+ * Always send if we have more than one MSS worth of data.
+ */
if (len >= tp->snd_mss)
goto send;
+
+ /*
+ * For small windows send if we have half a window worth
+ * of data.
+ */
if (tp->snd_maxwnd > 0 && len >= tp->snd_maxwnd / 2)
goto send;
}
/*
- * Send window update? We only send them if the window opened
- * up again either because the socket buffer was drained or
+ * Persistent mode.
+ * Send out probe byte if there is data available.
+ * RFC793: section 3.7, page 42-44
+ * RFC1122: section 4.2.2.17
+ */
+ if (swnd == 0 && dlen > 0 && (tp->t_flags & TF_FORCEDATA)) {
+ len = 1;
+ goto send;
+ }
+ if (swnd == 0 && duna > tp->snd_wnd) {
+ /*
+ * Window shrank
+ * after we sent into it. If window shrank to 0,
+ * cancel pending retransmit, pull snd_nxt back
+ * to (closed) window, and set the persist timer
+ * if it isn't already going. If the window didn't
+ * close completely, just wait for an ACK.
+ */
+ tcp_timer_activate(tp, TT_REXMT, 0);
+ tp->t_rxtshift = 0;
+ if (!tcp_timer_active(tp, TT_PERSIST))
+ tcp_setpersist(tp);
+ }
+
+ /*
+ * Send window update?
+ *
+ * The receive window informs the remote side about the
+ * remaining space in our receive buffer. We only send
+ * window updates if the socket buffer was drained or
* enlarged.
- * When the application reads data from the socket we get notified
- * to potentially inform the remote end about more receive space.
+ *
+ * When the application reads (and by it removes) data
+ * from the receive buffer we get notified and have to
+ * decide whether the change justifies a window update
+ * segment.
+ *
+ * We must avoid the silly window syndrome, where
+ * every read from the receive buffer, no matter how
+ * small, causes a window update to be sent.
+ *
+ * To prevent this we employ a silly window avoidance
+ * algorithm which causes updates to the window only
+ * when the new window is enlarged by at least two MSS
+ * sized segments. This part is done by tcp_rcv_wnd()
+ * and already incorporated into the rwin value we got.
*
- * XXXAO: Do not send many small window updates if we are not
- * expecting more data and there was enough space adversized
- * the last time.
+ * Our logic to determine whether to send an independent
+ * window update segment is more stringent. We only
+ * send window updates if the new space in the receive
+ * buffer is at least double the previous value. This
+ * prevents a flurry of independent window updates when
+ * the socket buffer has queued a lot of data and the
+ * application is doing small reads. This may leave
+ * some available space in the receive buffer not
+ * advertised to the remote side. As soon as it is
+ * sending data again our resulting ACKs will contain
+ * full value and no stalling will happen.
*
- * NB: Do not send window updates if the remote end won't send
+ * Independent window updates are not sent if a delayed
+ * ACK is pending. There we can simply piggy back the
+ * new window information on the pending ACK. Neither
+ * do we send window updates if we have received a FIN.
+ * It would be pointless as we are unable to receive
* more data.
+ *
+ * RFC793: section 3.7, page 42-44
+ * RFC1122: section 4.2.2.16
+ * Stevens Vol.2: section 26.3, page 858-861, figure 26.8
*/
- if (!TCPS_HAVERCVDFIN(tp->t_state) && rwin > tp->rcv_advwnd) {
- delta = rwin - tp->rcv_advwnd;
-
- if (delta >= 2 * tp->snd_mss)
- goto send;
- if (2 * delta >= (long)so->so_rcv.sb_hiwat)
+ if (tp->rcv_advwin < rwin && !(tp->t_flags & TF_DELACK) &&
+ !TCPS_HAVERCVDFIN(tp->t_state))
+ if (rwin >= 2 * tp->rcv_advwin)
goto send;
- }
/*
* No reason to send a segment, just return.
*/
- SOCKBUF_UNLOCK(&so->so_snd);
return (0);
send:
@@ -415,17 +513,20 @@
* According to RFC1323 the window field in a SYN (i.e., a <SYN>
* or <SYN,ACK>) segment itself is never scaled. The <SYN,ACK>
* case is handled in syncache.
+ *
+ * XXXAO: when sending dup-acks do not mix with window updates, otherwise
+ * the logic at the receiver may mistake the dup-ack
+ * XXXAO: rwin is already scaled.
*/
if (flags & TH_SYN)
th->th_win = (u_short)(min(rwin, TCP_MAXWIN));
+ else if (tp->t_flags & TF_DUPACK)
+ th->th_win = (u_short)tp->rcv_advwin;
else
th->th_win = (u_short)(rwin >> tp->rcv_scale);
/*
* Fill in fields.
- *
- * XXXAO: remembering maximum advertised window for
- * use in delaying messages about window sizes.
*/
if (tp->snd_nxt == tp->snd_rxmit) {
th->th_seq = tp->snd_nxt;
@@ -436,8 +537,7 @@
}
/*
- * If resending a FIN, be sure not to use a new sequence number.
- * XXXAO: Resending SYN?
+ * If resending a SYN or FIN, be sure not to use a new sequence number.
*/
if ((flags & TH_SYN) && (tp->t_flags & TF_SENTSYN))
th->th_seq--;
@@ -453,7 +553,10 @@
SOCKBUF_UNLOCK(&so->so_snd);
/*
- * NB: len > 0 means we sent this much data w/o an error.
+ * NB: len > 0 means we sent this much data w/o error.
+ * error == 0 means we sent at least a single segment w/o error.
+ *
+ * XXXAO: Avoid unconditional writes to the tcpcb.
*/
if (len > 0) {
/*
@@ -463,35 +566,45 @@
tp->snd_nxt += len;
else
tp->snd_rxmit += len;
+ }
+ if (error == 0) {
/*
- * Data sent (as far as we can tell).
- * If this advertises a larger window than any other segment,
- * then remember the size of the advertised window.
- * Any pending ACK has now been sent.
+ * Integrate FIN into sequence space.
*/
- if (rwin > 0 && SEQ_GT(tp->rcv_nxt + rwin, tp->rcv_adv))
- tp->rcv_adv = tp->rcv_nxt + rwin;
+ if ((flags & TH_FIN) && !(tp-t_flags & TF_SENTFIN)) {
+ tp->snd_nxt++;
+ tp->t_flags |= TF_SENTFIN;
+ }
}
- if (error == 0) {
+ if (len > 0 || error == 0) {
/*
- * Integrate SYN and FIN into sequence space.
- * XXXAO: If we send data with SYN this breaks.
+ * Integrate SYN into sequence space.
*/
if ((flags & TH_SYN) && !(tp->t_flags & TF_SENTSYN)) {
tp->snd_nxt++;
tp->t_flags |= TF_SENTSYN;
}
- if ((flags & TH_FIN) && !(tp-t_flags & TF_SENTFIN)) {
- tp->snd_nxt++;
- tp->t_flags |= TF_SENTFIN;
- }
+
+ /*
+ * Any pending ACK has been sent.
+ * Clear related flags and disarm the delayed ACK timer.
+ */
+ tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
+ if (SEQ_LT(tp->snd_lastack, tp->rcv_nxt)
+ tp->snd_lastack = tp->rcv_nxt;
+ if (tcp_timer_active(tp, TT_DELACK))
+ tcp_timer_activate(tp, TT_DELACK, 0);
/*
* Remember last advertised receive window.
+ * We need this information to send proper
+ * duplicate ACKs and to know whether we
+ * have to send a window update later on.
*/
- tp->rcv_advwnd = rwin;
+ if (tp->rcv_advwin != rwin)
+ tp->rcv_advwin = rwin;
/*
* Adjust the RXWIN0SENT flag - indicate that we have advertised
@@ -507,15 +620,6 @@
tp->t_flags &= ~TF_RXWIN0SENT;
}
- if (len > 0 || error == 0) {
- if (SEQ_LT(tp->last_ack_sent, tp->rcv_nxt)
- tp->last_ack_sent = tp->rcv_nxt;
-
- tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
- if (tcp_timer_active(tp, TT_DELACK))
- tcp_timer_activate(tp, TT_DELACK, 0);
- }
-
if (len > 0 && error == 0) {
if ((tp->t_flags & TF_FORCEDATA) == 0 ||
!tcp_timer_active(tp, TT_PERSIST))
@@ -639,7 +743,7 @@
}
tcpstat.tcps_sndtotal++;
- if (tp->t_flags & TF_ACKNOW) /* XXXAO: test whether we increased last_ack_sent */
+ if (tp->t_flags & TF_ACKNOW) /* XXXAO: test whether we increased snd_lastack */
tcpstat.tcps_sndacks++;
else if (flags & (TH_SYN|TH_FIN|TH_RST))
tcpstat.tcps_sndctrl++;
@@ -651,131 +755,12 @@
return (0);
}
-int
-tcp_junk()
-{
- /*
- * If in persist timeout with window of 0, send 1 byte.
- * Otherwise, if window is small but nonzero
- * and timer expired, we will send what we can
- * and go to transmit state.
- */
- if (tp->t_flags & TF_FORCEDATA) {
- if (snd_wnd == 0) {
- /*
- * If we still have some data to send, then
- * clear the FIN bit. Usually this would
- * happen below when it realizes that we
- * aren't sending all the data. However,
- * if we have exactly 1 byte of unsent data,
- * then it won't clear the FIN bit below,
- * and if we are in persist state, we wind
- * up sending the packet without recording
- * that we sent the FIN bit.
- *
- * We can't just blindly clear the FIN bit,
- * because if we don't have any more data
- * to send then the probe will be the FIN
- * itself.
- */
- if (off < so->so_snd.sb_cc)
- flags &= ~TH_FIN;
- snd_wnd = 1;
- } else {
- tcp_timer_activate(tp, TT_PERSIST, 0);
- tp->t_rxtshift = 0;
- }
-
- if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */
- goto send;
- /*
- * TCP window updates are not reliable, rather a polling protocol
- * using ``persist'' packets is used to insure receipt of window
- * updates. The three ``states'' for the output side are:
- * idle not doing retransmits or persists
- * persisting to move a small or zero window
- * (re)transmitting and thereby not persisting
- *
- * If send window is too small, there is data to transmit, and no
- * retransmit or persist is pending, then go to persist state.
- * If nothing happens soon, send when timer expires:
- * if window is nonzero, transmit what we can,
- * otherwise force out a byte.
- * XXX: We don't force anything here, only return!?
- */
- if (len > 0 && !tcp_timer_active(tp, TT_REXMT) &&
- !tcp_timer_active(tp, TT_PERSIST)) {
- tp->t_rxtshift = 0;
- tcp_setpersist(tp);
- }
- }
-
- /*
- * Urgent data pending.
- */
- if (SEQ_GT(tp->snd_up, tp->snd_una))
- goto send;
-
- if (len < 0) {
- /*
- * If FIN has been sent but not acked,
- * but we haven't been called to retransmit,
- * len will be < 0. Otherwise, window shrank
- * after we sent into it. If window shrank to 0,
- * cancel pending retransmit, pull snd_nxt back
- * to (closed) window, and set the persist timer
- * if it isn't already going. If the window didn't
- * close completely, just wait for an ACK.
- */
- len = 0;
- if (snd_wnd == 0) {
- tcp_timer_activate(tp, TT_REXMT, 0);
- tp->t_rxtshift = 0;
- if (!tcp_timer_active(tp, TT_PERSIST))
- tcp_setpersist(tp);
- }
- }
-
- /*
- * TSO may only be used if we are in a pure bulk sending state. The
- * presence of TCP-MD5, SACK retransmits, SACK advertizements and
- * IP options prevent using TSO. With TSO the TCP header is the same
- * (except for the sequence number) for all generated packets. This
- * makes it impossible to transmit any options which vary per generated
- * segment or packet.
- */
- if ((tp->t_flags & TF_TSO) && tcp_do_tso &&
- ((tp->t_flags & TF_SIGNATURE) == 0) &&
- inp->inp_options == NULL &&
- inp->in6p_options == NULL &&
- inp->inp_sp == NULL) /* XXXAO: update */
- tso = 1;
-
-#if 0
- /*
- * Urgent pointer handling.
- */
- if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
- th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
- th->th_flags |= TH_URG;
- } else {
- /*
- * If no urgent pointer to send, then we pull
- * the urgent pointer to the left edge of the send window
- * so that it doesn't drift into the send window on sequence
- * number wraparound.
- */
- tp->snd_up = tp->snd_una; /* drag it along */
- }
-#endif
-}
-
/*
* Do a retransmit from snd_nxt or a later point. This is separate
* from the normal transmit case as the logic is quite a bit different.
*/
static int
-tcp_do_retransmit()
+tcp_retransmit(struct tcpcb *tp, int len)
{
/*
@@ -786,27 +771,12 @@
*/
/*
* We have the following mechanisms:
- * 1. Fast retransmit: After we get three duplicate ACKs
- * 2. NewReno Fast recovery RFC3782
+ * 1. Fast recovery: After we get three duplicate ACKs RFC2581
+ * 2. NewReno RFC3782
* 3. Limited transmit RFC3042
* 4. SACK tells us where to send how much data RFC3517
*/
- /*
- * XXXAO: remembering maximum advertised window for
- * use in delaying messages about window sizes.
- */
- if (tp->snd_nxt == tp->snd_rxmit) {
- th->th_seq = tp->snd_nxt;
- off = tp->snd_nxt - tp->snd_una;
- } else {
- th->th_seq = tp->snd_rxmit;
- off = min(tp->snd_rxmit - tp->snd_una, so->so_snd.sb_cc);
- }
- /*
- * Check if we have to remove FIN on SACK retransmits.
- */
- if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
- flags &= ~TH_FIN;
+
}
/*
@@ -843,6 +813,9 @@
#ifdef FAST_IPSEC
/*
* NB: This is an expensive operation and involves memory allocation.
+ *
+ * XXXAO: If the IPSEC header size doesn't change during a session
+ * lifetime we could compute the number at establishment time.
*/
linkhdr += (int)ipsec_hdrsiz_tcp(tp);
#endif
@@ -1006,6 +979,14 @@
("%s: data beyond FIN", __func__);
/*
+ * Set the PUSH bit to indicate that we have reached
+ * the end of the send buffer.
+ */
+ if (off + slen == so->so_snd.sb_cc) {
+ th->th_flags =| TH_PSH;
+ }
+
+ /*
* If we're sending everything we've got, set PUSH.
* This will keep happy those implementations which
* only give data to the user when a buffer fills or
@@ -1124,49 +1105,86 @@
/*
- * Shall we send data or not?
- * And what window shall we advertize?
+ * Calculate and update our current receive window.
+ * Return the scaled receive window.
*/
-static int
+static u_int
tcp_rcv_wnd(struct tcpcb *tp, struct socket *so)
{
- int delta;
+ int delta, rwin;
KASSERT(SEQ_GEQ(tp->rcv_wnd, tp->rcv_nxt),
- ("%s: ", __func__));
-
- delta = sbspace(so->so_rcv) - SEQ_DELTA(tp->rcv_wnd, tp->rcv_nxt);
+ ("%s: receive window below rcv_nxt", __func__));
/*
- * Determine if we should send window update.
- * Silly window avoidance: Only send window update
- * if we've got at least two segments of space.
- * If the socket buffer was shrunk then delta is
- * a negative value.
+ * Calculate the amount of space in the receive buffer relative
+ * to the current end of the receive window. If the receive
+ * buffer was shrunk delta becomes negative.
+ *
+ * <- sb_hiwat ->
+ * <- sb_cc ->
+ * seq .......|++++++++++------------------z-|.......
+ * ^ ^
+ * rcv_nxt rcv_wnd
+ *
+ * XXXAO: To avoid the locking overhead tcp_usr_rcvd could update
+ * a rcv_read pointer.
*/
+ SB_LOCK(so->so_rcv);
+ if (so->so_rcv.sb_hiwat - so->so_rcv.sb_cc > 0)
+ delta = SEQ_DELTA(tp->rcv_wnd - so->so_rcv.sb_hiwat,
+ tp->rcv_nxt - so->so_rcv.sb_cc);
+ else
+ delta = so->so_rcv.sb_hiwat - so->so_rcv.sb_cc;
+ SB_UNLOCK(so->so_rcv);
+
/*
- * - if socket buffer is less than 1/4 free, send many updates
- * - piggy back window update on delayed ack
- * - if socket buffer > 1/4 free send updates only from time to time
- * - when sending dup-acks do not mix with window updates, otherwise
- * the logic at the receiver may mistake the dup-ack
- * - the new value must be larger than the minimal unscaled increment
- * - if delta is more than 50% or we reach the full window
+ * Silly window avoidance: Only grow the window if we've
+ * got at least two segments of additional space available.
+ * Take into account the granularity of the window scale
+ * shift.
+ *
+ * NB: We do not shrink the window even if the receive
+ * buffer was shrunk on us. We won't re-open the window
+ * as more data comes in though.
+ *
+ * RFC793: section 3.7, page 42-44
+ * RFC1122: section 4.2.2.16
+ * Stevens Vol.2: section 26.3, page 858-861
*/
if (delta > 0 && (delta >> tp->rcv_scale) > 0 &&
- delta >= 2 * tp->snd_mss) {
+ (tp->rcv_scale << (delta >> tp->rcv_scale)) >= 2 * tp->snd_mss)
tp->rcv_wnd += delta;
- }
+
+ /*
+ * Report shrunk socket buffers.
+ */
if (delta < 0)
tcp_log("our receive socket buffer was shrunk");
- rwin = (tp->rcv_wnd - tp->rcv_nxt) - so->so_rcv.sb_cc;
+ /*
+ * Our current open receive window to be advertized is
+ * the remaining space in the socket buffer.
+ */
+ rwin = SEQ_DELTA(tp->rcv_wnd - tp->rcv_nxt);
+
+ return (rwin >> tp->rcv_scale);
+}
- return (rwin);
+/*
+ * Pace the segment stream by limiting the amount of data
+ * that is sent per time unit (token bucket).
+ *
+ * NB: Never go below one MSS per time unit.
+ */
+static void
+tcp_snd_pace(struct tcpcp *tp)
+{
+ return;
}
-void
-tcp_snd_autoscale(struct tcpcb *tp)
+static void
+tcp_snd_autoscale(struct tcpcb *tp, int swnd)
{
/*
* Automatic sizing of send socket buffer. Often the send buffer
@@ -1219,7 +1237,7 @@
if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
so->so_snd.sb_cc < tcp_autosndbuf_max &&
- sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
+ swin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
if (!sbreserve_locked(&so->so_snd,
min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
tcp_autosndbuf_max), so, curthread))
>>> TRUNCATED FOR MAIL (1000 lines) <<<
More information about the p4-projects
mailing list