PERFORCE change 159606 for review

Andre Oppermann andre at FreeBSD.org
Sun Mar 22 05:20:30 PDT 2009


http://perforce.freebsd.org/chv.cgi?CH=159606

Change 159606 by andre at andre_t61 on 2009/03/22 12:20:02

	        Checkpoint WIP.

Affected files ...

.. //depot/projects/tcp_new/netinet/tcp_input.c#8 edit
.. //depot/projects/tcp_new/netinet/tcp_output.c#5 edit
.. //depot/projects/tcp_new/netinet/tcp_timer.c#2 edit
.. //depot/projects/tcp_new/netinet/tcp_var.h#4 edit

Differences ...

==== //depot/projects/tcp_new/netinet/tcp_input.c#8 (text+ko) ====

@@ -1686,7 +1686,7 @@
 	    ("%s: tlen < 0", __func__));
 
 	/*
-	 * If new data is received on a connection after the
+	 * <<If new data is received on a connection after the
 	 * socket is closed or the user process is gone, and
 	 * doesn't has a file descriptor reference anymore,
 	 * send an RST the other end.  This is an artifact
@@ -1697,7 +1697,7 @@
 	 * won't be delivering it to an application.  And we
 	 * can't just wait here and drop the data into a void
 	 * until the other side gives up as that could go on
-	 * forever.
+	 * forever.>>
 	 *  Stevens Vol.2: section 28.8, page 957, lines 687-696
 	 *
 	 * NB: Segments without any data but ack'ing our FIN are
@@ -1731,8 +1731,6 @@
 	/*
 	 * Update send SACK information and tell us how much more
 	 * data has left the network (relative to last SACK we got).
-	 * XXXAO: Determine if there was a duplicate ACK going on
-	 * based on the changes of the SACK information.
 	 */
 	if ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))
 		sacked = tcp_sack_doack(tp, &to, th->th_ack);
@@ -1755,13 +1753,20 @@
 	tcp_do_time(tp, th, &to, acked, tlen, sacked);
 
 	/*
+	 * Process the ACK to advance the unacknowledged pointer,
+	 * or to detect duplicate ACKs.
+	 */
+	tcp_do_ack(tp, th, tiwin, acked, tlen, sacked);
+
+	/*
 	 * Update congestion control information.
 	 */
-	nudgeoutput |= tcp_congest(tp, th, tiwin, acked, tlen, sacked);
+	tcp_cc_ack(tp, th, tiwin, acked, tlen, sacked);
+	KASSERT(tp->snd_cwnd > tp->snd_mss,
+	    ("%s: cwnd < 1*mss after congestion control function", __func__));
 
 	/*
-	 * Drop acknowledged data from send socket buffer
-	 * and advance the unacknowledged pointer.
+	 * Drop acknowledged data from send socket buffer.
 	 *  RFC793: section 3.9, page 72, fifth check
 	 */
 	if (acked > 0)
@@ -1789,11 +1794,6 @@
 		}
 
 		/*
-		 * Advance the unacknowledged pointer.
-		 */
-		tp->snd_una = th->th_ack;
-
-		/*
 		 * Wake up and inform any writers on the socket.
 		 *
 		 * NB: sowwakeup_locked() does an implicit unlock.
@@ -1811,6 +1811,10 @@
 			    ("%s: got ack for FIN but haven't sent FIN yet",
 			    __func__));
 
+			KASSERT(!tcp_timer_active(TT_RXMIT),
+			    ("%s: ourfinisacked but RXMIT still active",
+			    __func__);
+
 			/*
 			 * Handle ack'ed FIN according to previous state.
 			 */
@@ -1892,19 +1896,6 @@
 			 * NB: Continue with segment.
 			 */
 		}
-
-		/*
-		 * Stop the retransmit timer if all data we sent
-		 * has been acknowledged.  Otherwise restart it
-		 * if we still have outstanding data.
-		 *
-		 * XXXAO: Refine the test.  The TF_NEEDFIN may not
-		 * enough.
-		 */
-		if (tp->snd_una == tp->snd_nxt && !(tp->t_flags & TF_NEEDFIN))
-			tcp_timer_activate(TT_RXMIT, 0);
-		else
-			tcp_timer_activate(TT_RXMIT, tp->snd_rto);
 	}
 
 	/*
@@ -1947,7 +1938,7 @@
 	 *  segment with urgent that got pulled and now is zero
 	 */
 	if (!TCPS_HAVERCVDFIN(tp->t_state) &&
-	    (tlen > 0 || (tp->rcv_trq != NULL && th->th_flags & TH_FIN))) {
+	    (tlen > 0 || (tp->rcv_trq != NULL && (th->th_flags & TH_FIN)))) {
 		int newsize = 0;	/* Rcvbuf autoscaling. */
 
 		/*
@@ -2214,6 +2205,7 @@
 			 * the ACK for our FIN.
 			 */
 			tcp_twstart(tp);
+			tp = NULL;
 			INP_INFO_WUNLOCK(&tcbinfo);
 			goto done;
 
@@ -2262,15 +2254,19 @@
 	 * delayed ACK timer and be done.
 	 *
 	 * XXXAO: Multi-delack?
+	 * XXXAO: Always call into tcp_output and have it decide what to do.
 	 */
+	(void)tcp_output(tp, TPO_TINPUT);
+#if 0
 	if ((tp->t_flags & TF_ACKNOW) || tp->snd_delack > 1 ||
 	    nudgeoutput || (tp->t_flags & TF_RXWIN0SENT) ||
 	    !tcp_delack_enabled) {
-		(void) tcp_output(tp);
+		(void) tcp_output(tp, TPO_TINPUT);
 	} else if (SEQ_GT(tp->rcv_nxt, tp->snd_lastack)) {
 		tp->snd_delack++;
 		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
 	}
+#endif
 
 	INP_UNLOCK(tp->t_inpcb);
 	return;
@@ -2295,7 +2291,7 @@
 	 */
 	tp->t_flags |= TF_ACKNOW;
 	m_freem(m);
-	(void) tcp_output(tp);
+	(void) tcp_output(tp, TPO_TINPUT);
 	INP_UNLOCK(tp->t_inpcb);
 	return;
 
@@ -2848,7 +2844,7 @@
  * on segments without ACK.  The SYN_RECEIVED case is completely handled
  * in syncache and the initialization is done there.
  */
-int
+static int
 tcp_do_wu(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to,
     int tiwin, int acked, int tlen, int sacked)
 {
@@ -2912,8 +2908,8 @@
 		tp->snd_wnd = tiwin;
 		if (SEQ_GT(th->th_seq, tp->snd_wu_seq))
 			tp->snd_wu_seq = th->th_seq;
-		if (tp->snd_wnd > tp->snd_maxwnd)
-			tp->snd_maxwnd = tp->snd_wnd;
+		if (tp->snd_maxwnd < tiwin)
+			tp->snd_maxwnd = tiwin;
 
 		/*
 		 * Force a call to tcp_output only if we have data to send.
@@ -2924,6 +2920,53 @@
 	return (0);
 }
 
+static void
+tcp_do_ack(tp, th, tiwin, acked, tlen, sacked)
+{
+	/*
+	 * Without SACK detecting a duplicate ACK is based on an
+	 * empty segment with the same ACK as we already know and
+	 * the same advertised receive window.  Otherwise we could
+	 * mistake a simple window update for a duplicate ACK.
+	 *
+	 * With SACK it gets much simpler.  Any increase in the
+	 * sack'ed data is equivalent to a duplicate ACK.
+	 *
+	 * Things become difficult when we have an ongoing two-way
+	 * data exchange.  Here the receiver seeing the loss has to
+	 * add new SACK information or to hold back the transmission
+	 * of new data to make the ACK segment detectable as duplicate
+	 * ACK.
+	 *
+	 * XXXAO: This is not entirely correct as it allows for other
+	 * packets between the duplicate ACKs.
+	 */
+	if (sacked > 0 ||
+	    (tlen == 0 && acked == 0 && SEQ_LT(tp->snd_una, tp->snd_nxt) && tp->snd_wnd == tiwin))
+		tp->snd_dupack += 1;
+	else if (acked > 0 && tp->snd_dupack > 0)
+		tp->snd_dupack = 0;
+
+	KASSERT(SEQ_LT(tp->snd_una, tp->snd_nxt) || tp->snd_dupack == 0,
+	    ("%s: snd_dupack > 0 but snd_una == snd_nxt", __func__));
+
+	/*
+	 * Advance the unacknowledged pointer.
+	 */
+	tp->snd_una += acked;
+
+	/*
+	 * Stop the retransmit timer if all data we sent has been
+	 * acknowledged.  Otherwise restart it if we still have
+	 * outstanding data.
+	 */
+	if (tp->snd_una == tp->snd_nxt)
+		tcp_timer_activate(TT_RXMIT, 0);
+	else if (acked > 0)
+		tcp_timer_activate(TT_RXMIT, tp->snd_rto);
+
+}
+
 /*
  * Process urgent data in TCP segments.
  *

==== //depot/projects/tcp_new/netinet/tcp_output.c#5 (text+ko) ====

@@ -103,6 +103,13 @@
 /*
  * Tcp output routine: figure out what should be sent and send it.
  *
+ * We get here through:
+ * 1. write/send/etc
+ * 2. tcp_input (not always)
+ * 3. read/recvfrom
+ * 4. delayed ACK, retransmission or persistent timeout
+ *
+ * Our work is to find out:
  * 1. How much to send, if any
  *  1.1 subject to nagles algorithm (don't send small segments)
  *  1.2 subject to send window
@@ -111,14 +118,14 @@
  * 3. Send an outstanding ACK
  *  3.1 subject to delayed ack
  * 4. Send a window update
- *  4.1 subject to silly window avoidance
+ *  4.1 subject to silly window avoidance (don't send small window updates)
  *  4.2 subject to delayed ack
  * 5. Send retransmit
  * 6. Send urgent data
  * 7. Send based on flags
  */
 int
-tcp_output(struct tcpcb *tp)
+tcp_output(struct tcpcb *tp, int reason)
 {
 	int off, flags, error, optlen;
 	tcp_win len, recwin, swin;
@@ -150,21 +157,23 @@
 	flags = tcp_outflags[tp->t_state];
 
 	/*
-	 * Determine length of data that should be transmitted,
-	 * and flags that will be used.
-	 * If there is some data or critical controls (SYN, RST)
-	 * to send, then transmit; otherwise, investigate further.
+	 * Determine our current receive window.
+	 * This value is used for the window field in the TCP
+	 * header and to determine whether we have to send a
+	 * window update.
+	 *
+	 * NB: rwin is already scaled.
 	 */
-
+	rwin = tcp_rcv_wnd(tp, so);
 
-
 	/*
 	 * We have been idle for "a while" and no acks are
 	 * expected to clock out any data we send --
 	 * slow start to get ack "clock" running again.
+	 *  RFC2581: Restart window.
 	 *
-	 * Set the slow-start flight size depending on whether
-	 * this is a local network or not.
+	 * XXXAO: Use a decaying algorithm.  It's not useful
+	 * to have cwnd drop off a cliff.
 	 */
 	if (tp->snd_nxt == tp->snd_una &&
 	    (ticks - tp->t_rcvtime) >= max(tp->t_rxtcur, tcp_min_idle)) {
@@ -172,12 +181,9 @@
 	}
 
 	/*
-	 * Compute our current receive window.
-	 * XXXAO: Handle window updates.
-	 */
-	rwin = tcp_rcv_wnd(tp, so);
-
-	/*
+	 * Determine length of data that should be transmitted, if there
+	 * is some data to send, then transmit; otherwise, investigate further.
+	 *
 	 * First step: how much to send.
 	 *
 	 * Check out our send window.
@@ -192,41 +198,45 @@
 	 *  c) how much data we have to send
 	 *  d) the pacing algorithm (optional)
 	 *
-	 * XXXAO: Add output pacing where one can limit the amount
-	 * of data that is sent in a time period through a socket
-	 * option.
+	 * duna = unacknowledged data in flight
+	 * swnd = remaining space in send window as advertised by remote end
+	 * cwnd = congestion window, remaining amount of data that can be unacknowledged in flight
+	 * dlen = remaining amount of data in send buffer available for sending
+	 * len = amount of data we have *and* can send right now
+	 *
+	 *             <- duna -><-       swnd       ->
+	 *                       <-  cwnd  ->
+	 *                       <-dlen->
+	 * seq  .......|+++++++++xxxxxxxx---z---------|.......
+	 *             ^        ^
+	 *          snd_una  snd_nxt
 	 *
-	 * XXXAO
 	 */
-	swin = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
-	swin = tp->snd_wnd - tp->snd_inflight;	/* XXXAO: Alternative, SACK */
+	duna = SEQ_DELTA(tp->snd_nxt, tp->snd_una);
+	swnd = imax(0, tp->snd_wnd - duna);
+	cwnd = imax(0, tp->snd_cwnd - duna);
+	dlen = so->so_snd.sb_cc - duna;
+	len = min(dlen, min(swnd, cwnd));
 
-	len = min(swin, tp->snd_cwnd);
-	len = so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una);
-	len = tcp_snd_pace(tp, len);	/* XXXAO: todo token bucket */
-
-	if (tp->t_flags & TF_REXMT) {
-		len = tcp_snd_rexmt(tp, len);
-		goto send;
+	if (len > 0 && (tp->t_flags & TF_PACE)) {
+		len = tcp_snd_pace(tp, len);	/* XXXAO: todo token bucket, mss sized */
+		if (len == 0)
+			return (0);		/* next token is pending */
 	}
 
-	/*
-	 * Second step: Do we send?
-	 */
-	if (tp->t_flags & TF_ACKNOW)
-		goto send;
+	inflight = duna - tp->snd_sacked;
 
 	/*
 	 * Send out a SYN immediatly.
 	 */
-	if (flags & TH_SYN)
+	if ((flags & TH_SYN) && !(tp->t_flags & TF_SENTSYN))
 		goto send;
 
 	/*
 	 * If our state indicates that FIN should be sent
 	 * and we have not yet done so, then we need to send.
 	 */
-	if (flags & TH_FIN) {
+	if ((flags & TH_FIN) && !(tp->t_flags & TF_SENTFIN)) {
 		/*
 		 * All data is already sent and only the FIN is outstanding.
 		 */
@@ -238,11 +248,23 @@
 		 * if the window is big enough.  Do not care about nagle
 		 * and others.  Otherwise things will go their normal way.
 		 */
-		if (len <= snd_wnd)
+		if (len > 0)
 			goto send;
 	}
 
 	/*
+	 * Pending ACK?
+	 */
+	if (tp->t_flags & TF_ACKNOW)
+		goto send;
+	if (SEQ_LT(tp->snd_lastack, tp->snd_nxt) && !(tp->t_flags & TF_DELACK))
+		goto send;
+	if (tp->t_flags & TF_DUPACK) {
+		len = 0;
+		goto send;
+	}
+
+	/*
 	 * Sender silly window avoidance.   We transmit under the following
 	 * conditions when len is non-zero:
 	 *
@@ -254,11 +276,17 @@
 	 *   data (receiver may be limited the window size)
 	 * - We need to retransmit
 	 *
+	 * The idea behind delayed ACK is twofold:
+	 * a) aggregate multiple ACKs together
+	 * b) aggregate the response from application with the ACK
+	 * In both cases the events are probably very close together
+	 * and thus the delayed ACK time should be very short.
+	 *
 	 * a) Nagle algorithm: tinygram problem
 	 * b) silly window syndrome: buffer almost full
 	 *
 	 * Quoting Nagle:
-	 * The concept behind delayed ACKs is to bet, when receiving some data from the net,
+	 * <<The concept behind delayed ACKs is to bet, when receiving some data from the net,
 	 * that the local application will send a reply very soon.  So there's no need to
 	 * send an ACK immediately; the ACK can be piggybacked on the next data going the
 	 * other way. If that doesn't happen, after a 500ms delay, an ACK is sent anyway.
@@ -277,49 +305,119 @@
 	 * an ACK is a bet that the local application will reply to the data just received.
 	 * Some apps, like character echo in Telnet servers, do respond every time. Others,
 	 * like X-Windows "clients" (really servers, but X is backwards about this), only reply
-	 * some of the time. 
+	 * some of the time.>>
 	 * http://developers.slashdot.org/comments.pl?sid=174457&threshold=1&commentsort=0&mode=thread&cid=14515105
 	 *
 	 * XXXAO: mss - options!
 	 */
 	if (len) {
+		/*
+		 * Always send if there is no outstanding data in flight.
+		 */
 		if (tp->snd_nxt == tp->snd_una)
 			goto send;
+
+		/*
+		 * Always send if NODELAY is enabled.  This gives at least
+		 * one segment per application write no matter how small
+		 * the amount of data.
+		 */
 		if (tp->t_flags & TF_NODELAY)
 			goto send;
+
+		/*
+		 * Always send if we have more than one MSS worth of data.
+		 */
 		if (len >= tp->snd_mss)
 			goto send;
+
+		/*
+		 * For small windows send if we have half a window worth
+		 * of data.
+		 */
 		if (tp->snd_maxwnd > 0 && len >= tp->snd_maxwnd / 2)
 			goto send;
 	}
 
 	/*
-	 * Send window update?  We only send them if the window opened
-	 * up again either because the socket buffer was drained or
+	 * Persistent mode.
+	 * Send out probe byte if there is data available.
+	 *  RFC793: section 3.7, page 42-44
+	 *  RFC1122: section 4.2.2.17
+	 */
+	if (swnd == 0 && dlen > 0 && (tp->t_flags & TF_FORCEDATA)) {
+		len = 1;
+		goto send;
+	}
+	if (swnd == 0 && duna > tp->snd_wnd) {
+		/*
+		 * Window shrank
+		 * after we sent into it.  If window shrank to 0,
+		 * cancel pending retransmit, pull snd_nxt back
+		 * to (closed) window, and set the persist timer
+		 * if it isn't already going.  If the window didn't
+		 * close completely, just wait for an ACK.
+		 */
+		tcp_timer_activate(tp, TT_REXMT, 0);
+		tp->t_rxtshift = 0;
+		if (!tcp_timer_active(tp, TT_PERSIST))
+			tcp_setpersist(tp);
+	}
+
+	/*
+	 * Send window update?
+	 *
+	 * The receive window informs the remote side about the
+	 * remaining space in our receive buffer.  We only send
+	 * window updates if the socket buffer was drained or
 	 * enlarged.
-	 * When the application reads data from the socket we get notified
-	 * to potentially inform the remote end about more receive space.
+	 *
+	 * When the application reads (and by it removes) data
+	 * from the receive buffer we get notified and have to
+	 * decide whether the change justifies a window update
+	 * segment.
+	 *
+	 * We must avoid the silly window syndrome, where
+	 * every read from the receive buffer, no matter how
+	 * small, causes a window update to be sent.
+	 * 
+	 * To prevent this we employ a silly window avoidance
+	 * algorithm which causes updates to the window only
+	 * when the new window is enlarged by at least two MSS
+	 * sized segments.  This part is done by tcp_rcv_wnd()
+	 * and already incorporated into the rwin value we got.
 	 *
-	 * XXXAO: Do not send many small window updates if we are not
-	 * expecting more data and there was enough space adversized
-	 * the last time.
+	 * Our logic to determine whether to send an independent
+	 * window update segment is more stringent.  We only
+	 * send window updates if the new space in the receive
+	 * buffer is at least double the previous value.  This
+	 * prevents a flurry of independent window updates when
+	 * the socket buffer has queued a lot of data and the
+	 * application is doing small reads.  This may leave
+	 * some available space in the receive buffer not
+	 * advertised to the remote side.  As soon as it is
+	 * sending data again our resulting ACKs will contain the
+	 * full value and no stalling will happen.
 	 *
-	 * NB: Do not send window updates if the remote end won't send
+	 * Independent window updates are not sent if a delayed
+	 * ACK is pending.  There we can simply piggy back the
+	 * new window information on the pending ACK.  Neither
+	 * do we send window updates if we have received a FIN.
+	 * It would be pointless as we are unable to receive
 	 * more data.
+	 *
+	 *  RFC793: section 3.7, page 42-44
+	 *  RFC1122: section 4.2.2.16
+	 *  Stevens Vol.2: section 26.3, page 858-861, figure 26.8
 	 */
-	if (!TCPS_HAVERCVDFIN(tp->t_state) && rwin > tp->rcv_advwnd) {
-		delta = rwin - tp->rcv_advwnd;
-
-		if (delta >= 2 * tp->snd_mss)
-			goto send;
-		if (2 * delta >= (long)so->so_rcv.sb_hiwat)
+	if (tp->rcv_advwin < rwin && !(tp->t_flags & TF_DELACK) &&
+	    !TCPS_HAVERCVDFIN(tp->t_state))
+		if (rwin >= 2 * tp->rcv_advwin)
 			goto send;
-	}
 
 	/*
 	 * No reason to send a segment, just return.
 	 */
-	SOCKBUF_UNLOCK(&so->so_snd);
 	return (0);
 
 send:
@@ -415,17 +513,20 @@
 	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
 	 * or <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK>
 	 * case is handled in syncache.
+	 *
+	 * XXXAO: when sending dup-acks do not mix with window updates, otherwise
+	 *   the logic at the receiver may mistake the dup-ack for a window update
+	 * XXXAO: rwin is already scaled.
 	 */
 	if (flags & TH_SYN)
 		th->th_win = (u_short)(min(rwin, TCP_MAXWIN));
+	else if (tp->t_flags & TF_DUPACK)
+		th->th_win = (u_short)tp->rcv_advwin;
 	else
 		th->th_win = (u_short)(rwin >> tp->rcv_scale);
 
 	/*
 	 * Fill in fields.
-	 *
-	 * XXXAO: remembering maximum advertised window for
-	 * use in delaying messages about window sizes.
 	 */
 	if (tp->snd_nxt == tp->snd_rxmit) {
 		th->th_seq = tp->snd_nxt;
@@ -436,8 +537,7 @@
 	}
 
 	/*
-	 * If resending a FIN, be sure not to use a new sequence number.
-	 * XXXAO: Resending SYN?
+	 * If resending a SYN or FIN, be sure not to use a new sequence number.
 	 */
 	if ((flags & TH_SYN) && (tp->t_flags & TF_SENTSYN))
 		th->th_seq--;
@@ -453,7 +553,10 @@
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	/*
-	 * NB: len > 0 means we sent this much data w/o an error.
+	 * NB: len > 0 means we sent this much data w/o error.
+	 * error == 0 means we sent at least a single segment w/o error.
+	 *
+	 * XXXAO: Avoid unconditional writes to the tcpcb.
 	 */
 	if (len > 0) {
 		/*
@@ -463,35 +566,45 @@
 			tp->snd_nxt += len;
 		else
 			tp->snd_rxmit += len;
+	}
 
+	if (error == 0) {
 		/*
-		 * Data sent (as far as we can tell).
-		 * If this advertises a larger window than any other segment,
-		 * then remember the size of the advertised window.
-		 * Any pending ACK has now been sent.
+		 * Integrate FIN into sequence space.
 		 */
-		if (rwin > 0 && SEQ_GT(tp->rcv_nxt + rwin, tp->rcv_adv))
-			tp->rcv_adv = tp->rcv_nxt + rwin;
+		if ((flags & TH_FIN) && !(tp-t_flags & TF_SENTFIN)) {
+			tp->snd_nxt++;
+			tp->t_flags |= TF_SENTFIN;
+		}
 	}
 
-	if (error == 0) {
+	if (len > 0 || error == 0) {
 		/*
-		 * Integrate SYN and FIN into sequence space.
-		 * XXXAO: If we send data with SYN this breaks.
+		 * Integrate SYN into sequence space.
 		 */
 		if ((flags & TH_SYN) && !(tp->t_flags & TF_SENTSYN)) {
 			tp->snd_nxt++;
 			tp->t_flags |= TF_SENTSYN;
 		}
-		if ((flags & TH_FIN) && !(tp-t_flags & TF_SENTFIN)) {
-			tp->snd_nxt++;
-			tp->t_flags |= TF_SENTFIN;
-		}
+
+		/*
+		 * Any pending ACK has been sent.
+		 * Clear related flags and disarm the delayed ACK timer.
+		 */
+		tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
+		if (SEQ_LT(tp->snd_lastack, tp->rcv_nxt)
+			tp->snd_lastack = tp->rcv_nxt;
+		if (tcp_timer_active(tp, TT_DELACK))
+			tcp_timer_activate(tp, TT_DELACK, 0);
 
 		/*
 		 * Remember last advertised receive window.
+		 * We need this information to send proper
+		 * duplicate ACKs and to know whether we
+		 * have to send a window update later on.
 		 */
-		tp->rcv_advwnd = rwin;
+		if (tp->rcv_advwin != rwin)
+			tp->rcv_advwin = rwin;
 
 		/*
 		 * Adjust the RXWIN0SENT flag - indicate that we have advertised
@@ -507,15 +620,6 @@
 			tp->t_flags &= ~TF_RXWIN0SENT;
 	}
 
-	if (len > 0 || error == 0) {
-		if (SEQ_LT(tp->last_ack_sent, tp->rcv_nxt)
-			tp->last_ack_sent = tp->rcv_nxt;
-
-		tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
-		if (tcp_timer_active(tp, TT_DELACK))
-			tcp_timer_activate(tp, TT_DELACK, 0);
-	}
-
 	if (len > 0 && error == 0) {
 		if ((tp->t_flags & TF_FORCEDATA) == 0 || 
 		    !tcp_timer_active(tp, TT_PERSIST))
@@ -639,7 +743,7 @@
 	}
 	tcpstat.tcps_sndtotal++;
 
-	if (tp->t_flags & TF_ACKNOW)	/* XXXAO: test whether we increased last_ack_sent */
+	if (tp->t_flags & TF_ACKNOW)	/* XXXAO: test whether we increased snd_lastack */
 		tcpstat.tcps_sndacks++;
 	else if (flags & (TH_SYN|TH_FIN|TH_RST))
 		tcpstat.tcps_sndctrl++;
@@ -651,131 +755,12 @@
 	return (0);
 }
 
-int
-tcp_junk()
-{
-	/*
-	 * If in persist timeout with window of 0, send 1 byte.
-	 * Otherwise, if window is small but nonzero
-	 * and timer expired, we will send what we can
-	 * and go to transmit state.
-	 */
-	if (tp->t_flags & TF_FORCEDATA) {
-		if (snd_wnd == 0) {
-			/*
-			 * If we still have some data to send, then
-			 * clear the FIN bit.  Usually this would
-			 * happen below when it realizes that we
-			 * aren't sending all the data.  However,
-			 * if we have exactly 1 byte of unsent data,
-			 * then it won't clear the FIN bit below,
-			 * and if we are in persist state, we wind
-			 * up sending the packet without recording
-			 * that we sent the FIN bit.
-			 *
-			 * We can't just blindly clear the FIN bit,
-			 * because if we don't have any more data
-			 * to send then the probe will be the FIN
-			 * itself.
-			 */
-			if (off < so->so_snd.sb_cc)
-				flags &= ~TH_FIN;
-			snd_wnd = 1;
-		} else {
-			tcp_timer_activate(tp, TT_PERSIST, 0);
-			tp->t_rxtshift = 0;
-		}
-
-		if (tp->t_flags & TF_FORCEDATA)	/* typ. timeout case */
-			goto send;
-		/*
-		 * TCP window updates are not reliable, rather a polling protocol
-		 * using ``persist'' packets is used to insure receipt of window
-		 * updates.  The three ``states'' for the output side are:
-		 *	idle			not doing retransmits or persists
-		 *	persisting		to move a small or zero window
-		 *	(re)transmitting	and thereby not persisting
-		 *
-		 * If send window is too small, there is data to transmit, and no
-		 * retransmit or persist is pending, then go to persist state.
-		 * If nothing happens soon, send when timer expires:
-		 * if window is nonzero, transmit what we can,
-		 * otherwise force out a byte.
-		 * XXX: We don't force anything here, only return!?
-		 */
-		if (len > 0 && !tcp_timer_active(tp, TT_REXMT) &&
-		    !tcp_timer_active(tp, TT_PERSIST)) {
-			tp->t_rxtshift = 0;
-			tcp_setpersist(tp);
-		}
-	}
-
-	/*
-	 * Urgent data pending.
-	 */
-	if (SEQ_GT(tp->snd_up, tp->snd_una))
-		goto send;
-
-	if (len < 0) {
-		/*
-		 * If FIN has been sent but not acked,
-		 * but we haven't been called to retransmit,
-		 * len will be < 0.  Otherwise, window shrank
-		 * after we sent into it.  If window shrank to 0,
-		 * cancel pending retransmit, pull snd_nxt back
-		 * to (closed) window, and set the persist timer
-		 * if it isn't already going.  If the window didn't
-		 * close completely, just wait for an ACK.
-		 */
-		len = 0;
-		if (snd_wnd == 0) {
-			tcp_timer_activate(tp, TT_REXMT, 0);
-			tp->t_rxtshift = 0;
-			if (!tcp_timer_active(tp, TT_PERSIST))
-				tcp_setpersist(tp);
-		}
-	}
-
-	/*
-	 * TSO may only be used if we are in a pure bulk sending state.  The
-	 * presence of TCP-MD5, SACK retransmits, SACK advertizements and
-	 * IP options prevent using TSO.  With TSO the TCP header is the same
-	 * (except for the sequence number) for all generated packets.  This
-	 * makes it impossible to transmit any options which vary per generated
-	 * segment or packet.
-	 */
-	if ((tp->t_flags & TF_TSO) && tcp_do_tso &&
-	    ((tp->t_flags & TF_SIGNATURE) == 0) &&
-	    inp->inp_options == NULL &&
-	    inp->in6p_options == NULL &&
-	    inp->inp_sp == NULL)	/* XXXAO: update */
-		tso = 1;
-
-#if 0
-	/*
-	 * Urgent pointer handling.
-	 */
-	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
-		th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
-		th->th_flags |= TH_URG;
-	} else {
-		/*
-		 * If no urgent pointer to send, then we pull
-		 * the urgent pointer to the left edge of the send window
-		 * so that it doesn't drift into the send window on sequence
-		 * number wraparound.
-		 */
-		tp->snd_up = tp->snd_una;		/* drag it along */
-	}
-#endif
-}
-
 /*
  * Do a retransmit from snd_nxt or a later point.  This is separate
  * from the normal transmit case as the logic is quite a bit different.
  */
 static int
-tcp_do_retransmit()
+tcp_retransmit(struct tcpcb *tp, int len)
 {
 
 	/*
@@ -786,27 +771,12 @@
 	 */
 	/*
 	 * We have the following mechanisms:
-	 *  1. Fast retransmit: After we get three duplicate ACKs
-	 *  2. NewReno Fast recovery RFC3782
+	 *  1. Fast recovery: After we get three duplicate ACKs RFC2581
+	 *  2. NewReno RFC3782
 	 *  3. Limited transmit RFC3042
 	 *  4. SACK tells us where to send how much data RFC3517
 	 */
-	/*
-	 * XXXAO: remembering maximum advertised window for
-	 * use in delaying messages about window sizes.
-	 */
-	if (tp->snd_nxt == tp->snd_rxmit) {
-		th->th_seq = tp->snd_nxt;
-		off = tp->snd_nxt - tp->snd_una;
-	} else {
-		th->th_seq = tp->snd_rxmit;
-		off = min(tp->snd_rxmit - tp->snd_una, so->so_snd.sb_cc);
-	}
-	/*
-	 * Check if we have to remove FIN on SACK retransmits.
-	 */
-	if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
-		flags &= ~TH_FIN;
+
 }
 
 /*
@@ -843,6 +813,9 @@
 #ifdef FAST_IPSEC
 	/*
 	 * NB: This is an expensive operation and involves memory allocation.
+	 *
+	 * XXXAO: If the IPSEC header size doesn't change during a session
+	 * lifetime we could compute the number at establishment time.
 	 */
 	linkhdr += (int)ipsec_hdrsiz_tcp(tp);
 #endif
@@ -1006,6 +979,14 @@
 		    ("%s: data beyond FIN", __func__);
 
 		/*
+		 * Set the PUSH bit to indicate that we have reached
+		 * the end of the send buffer.
+		 */
+		if (off + slen == so->so_snd.sb_cc) {
+			th->th_flags =| TH_PSH;
+		}
+
+		/*
 		 * If we're sending everything we've got, set PUSH.
 		 * This will keep happy those implementations which
 		 * only give data to the user when a buffer fills or
@@ -1124,49 +1105,86 @@
 
 
 /*
- * Shall we send data or not?
- * And what window shall we advertize?
+ * Calculate and update our current receive window.
+ * Return the scaled receive window.
  */
-static int
+static u_int
 tcp_rcv_wnd(struct tcpcb *tp, struct socket *so)
 {
-	int delta;
+	int delta, rwin;
 
 	KASSERT(SEQ_GEQ(tp->rcv_wnd, tp->rcv_nxt),
-	    ("%s: ", __func__));
-
-	delta = sbspace(so->so_rcv) - SEQ_DELTA(tp->rcv_wnd, tp->rcv_nxt);
+	    ("%s: receive window below rcv_nxt", __func__));
 
 	/*
-	 * Determine if we should send window update.
-	 * Silly window avoidance: Only send window update
-	 * if we've got at least two segments of space.
-	 * If the socket buffer was shrunk then delta is
-	 * a negative value.
+	 * Calculate the amount of space in the receive buffer relative
+	 * to the current end of the receive window.  If the receive
+	 * buffer was shrunk delta becomes negative.
+	 *
+	 *             <-          sb_hiwat          ->
+	 *             <- sb_cc ->
+	 * seq  .......|++++++++++------------------z-|.......
+	 *                        ^                 ^
+	 *                     rcv_nxt           rcv_wnd
+	 *
+	 * XXXAO: To avoid the locking overhead tcp_usr_rcvd could update
+	 * a rcv_read pointer.
 	 */
+	SB_LOCK(so->so_rcv);
+	if (so->so_rcv.sb_hiwat - so->so_rcv.sb_cc > 0)
+		delta = SEQ_DELTA(tp->rcv_wnd - so->so_rcv.sb_hiwat,
+			    tp->rcv_nxt - so->so_rcv.sb_cc);
+	else
+		delta = so->so_rcv.sb_hiwat - so->so_rcv.sb_cc;
+	SB_UNLOCK(so->so_rcv);
+
 	/*
-	 * - if socket buffer is less than 1/4 free, send many updates
-	 * - piggy back window update on delayed ack
-	 * - if socket buffer > 1/4 free send updates only from time to time
-	 * - when sending dup-acks do not mix with window updates, otherwise
-	 *   the logic at the receiver may mistake the dup-ack
-	 * - the new value must be larger than the minimal unscaled increment
-	 * - if delta is more than 50% or we reach the full window
+	 * Silly window avoidance: Only grow the window if we've
+	 * got at least two segments of additional space available.
+	 * Take into account the granularity of the window scale
+	 * shift.
+	 *
+	 * NB: We do not shrink the window even if the receive
+	 * buffer was shrunk on us.  We won't re-open the window
+	 * as more data comes in though.
+	 *
+	 *  RFC793: section 3.7, page 42-44
+	 *  RFC1122: section 4.2.2.16
+	 *  Stevens Vol.2: section 26.3, page 858-861
 	 */
 	if (delta > 0 && (delta >> tp->rcv_scale) > 0 &&
-	    delta >= 2 * tp->snd_mss) {
+	    (tp->rcv_scale << (delta >> tp->rcv_scale)) >= 2 * tp->snd_mss)
 		tp->rcv_wnd += delta;
-	}
+
+	/*
+	 * Report shrunk socket buffers.
+	 */
 	if (delta < 0)
 		tcp_log("our receive socket buffer was shrunk");
 
-	rwin = (tp->rcv_wnd - tp->rcv_nxt) - so->so_rcv.sb_cc;
+	/*
+	 * Our current open receive window to be advertized is
+	 * the remaining space in the socket buffer.
+	 */
+	rwin = SEQ_DELTA(tp->rcv_wnd - tp->rcv_nxt);
+
+	return (rwin >> tp->rcv_scale);
+}
 
-	return (rwin);
+/*
+ * Pace the segment stream by limiting the amount of data
+ * that is sent per time unit (token bucket).
+ *
+ * NB: Never go below one MSS per time unit.
+ */
+static void
+tcp_snd_pace(struct tcpcp *tp)
+{
+	return;
 }
 
-void
-tcp_snd_autoscale(struct tcpcb *tp)
+static void
+tcp_snd_autoscale(struct tcpcb *tp, int swnd)
 {
 	/*
 	 * Automatic sizing of send socket buffer.  Often the send buffer
@@ -1219,7 +1237,7 @@
 		if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
 		    so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
 		    so->so_snd.sb_cc < tcp_autosndbuf_max &&
-		    sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
+		    swin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
 			if (!sbreserve_locked(&so->so_snd,
 			    min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
 			     tcp_autosndbuf_max), so, curthread))

>>> TRUNCATED FOR MAIL (1000 lines) <<<


More information about the p4-projects mailing list