PERFORCE change 129464 for review
Rui Paulo
rpaulo at FreeBSD.org
Sat Nov 24 11:13:03 PST 2007
http://perforce.freebsd.org/chv.cgi?CH=129464
Change 129464 by rpaulo at rpaulo_zoo on 2007/11/24 19:12:59
End host TCP ECN implementation. My Google Summer Of Code
project for 2006.
Obtained from: NetBSD
Affected files ...
.. //depot/projects/tcpecn/netinet/tcp_input.c#2 edit
.. //depot/projects/tcpecn/netinet/tcp_output.c#2 edit
.. //depot/projects/tcpecn/netinet/tcp_syncache.c#2 edit
.. //depot/projects/tcpecn/netinet/tcp_usrreq.c#2 edit
.. //depot/projects/tcpecn/netinet/tcp_var.h#2 edit
Differences ...
==== //depot/projects/tcpecn/netinet/tcp_input.c#2 (text+ko) ====
@@ -128,6 +128,14 @@
&tcp_do_rfc3390, 0,
"Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
+int tcp_do_ecn = 0;
+int tcp_ecn_maxretries = 1;
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");
+SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW,
+ &tcp_do_ecn, 0, "TCP ECN support");
+SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW,
+ &tcp_ecn_maxretries, 0, "Max retries before giving up on ECN");
+
static int tcp_insecure_rst = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW,
&tcp_insecure_rst, 0,
@@ -152,14 +160,32 @@
static void tcp_dooptions(struct tcpopt *, u_char *, int, int);
static void tcp_do_segment(struct mbuf *, struct tcphdr *,
- struct socket *, struct tcpcb *, int, int);
+ struct socket *, struct tcpcb *, int, int, uint8_t);
static void tcp_dropwithreset(struct mbuf *, struct tcphdr *,
struct tcpcb *, int, int);
static void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
static void tcp_xmit_timer(struct tcpcb *, int);
static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
+static void inline
+ tcp_congestion_exp(struct tcpcb *);
+static void inline
+tcp_congestion_exp(struct tcpcb *tp)
+{
+ u_int win;
+ /*
+ * Common congestion response for connection tp. It is invoked both
+ * when a segment carrying TH_ECE arrives outside of recovery and
+ * when the duplicate-ACK count reaches tcprexmtthresh.
+ *
+ * Take half of min(snd_wnd, snd_cwnd), counted in whole t_maxseg
+ * segments and never less than two segments, and store that value
+ * (multiplied back by t_maxseg) as the new slow-start threshold.
+ */
+ win = min(tp->snd_wnd, tp->snd_cwnd) /
+ 2 / tp->t_maxseg;
+ if (win < 2)
+ win = 2;
+ tp->snd_ssthresh = win * tp->t_maxseg;
+ ENTER_FASTRECOVERY(tp);
+ tp->snd_recover = tp->snd_max; /* further ECEs are ignored until snd_una reaches this */
+ if (tp->t_flags & TF_ECN_PERMIT)
+ tp->t_flags |= TF_ECN_SND_CWR; /* tcp_output sends TH_CWR, then clears this flag */
+}
+
/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
#ifdef INET6
#define ND6_HINT(tp) \
@@ -238,6 +264,7 @@
int drop_hdrlen;
int thflags;
int rstreason = 0; /* For badport_bandlim accounting purposes */
+ uint8_t iptos;
#ifdef IPFIREWALL_FORWARD
struct m_tag *fwd_tag;
#endif
@@ -347,6 +374,13 @@
ip->ip_v = IPVERSION;
}
+#ifdef INET6
+ if (isipv6)
+ iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
+ else
+#endif
+ iptos = ip->ip_tos;
+
/*
* Check that TCP offset makes sense,
* pull out TCP options and adjust length. XXX
@@ -642,7 +676,8 @@
* contains. tcp_do_segment() consumes
* the mbuf chain and unlocks the inpcb.
*/
- tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen);
+ tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
+ iptos);
INP_INFO_UNLOCK_ASSERT(&tcbinfo);
return;
}
@@ -842,7 +877,7 @@
* state. tcp_do_segment() always consumes the mbuf chain, unlocks
* the inpcb, and unlocks pcbinfo.
*/
- tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen);
+ tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos);
INP_INFO_UNLOCK_ASSERT(&tcbinfo);
return;
@@ -866,7 +901,7 @@
static void
tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
- struct tcpcb *tp, int drop_hdrlen, int tlen)
+ struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos)
{
int thflags, acked, ourfinisacked, needoutput = 0;
int headlocked = 1;
@@ -909,6 +944,35 @@
tiwin = th->th_win << tp->snd_scale;
/*
+ * TCP ECN processing.
+ */
+ if (tp->t_flags & TF_ECN_PERMIT) {
+ switch (iptos & IPTOS_ECN_MASK) {
+ case IPTOS_ECN_CE:
+ tp->t_flags |= TF_ECN_SND_ECE;
+ tcpstat.tcps_ecn_ce++;
+ break;
+ case IPTOS_ECN_ECT0:
+ tcpstat.tcps_ecn_ect0++;
+ break;
+ case IPTOS_ECN_ECT1:
+ tcpstat.tcps_ecn_ect1++;
+ break;
+ }
+
+ if (thflags & TH_CWR)
+ tp->t_flags &= ~TF_ECN_SND_ECE;
+
+ /*
+ * Congestion experienced.
+ * Ignore if we are already trying to recover.
+ */
+ if ((thflags & TH_ECE) &&
+ SEQ_GEQ(tp->snd_una, tp->snd_recover))
+ tcp_congestion_exp(tp);
+ }
+
+ /*
* Parse options on any incoming segment.
*/
tcp_dooptions(&to, (u_char *)(th + 1),
@@ -976,7 +1040,8 @@
*/
if (tp->t_state == TCPS_ESTABLISHED &&
th->th_seq == tp->rcv_nxt &&
- (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
+ (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE|TH_CWR))
+ == TH_ACK &&
tp->snd_nxt == tp->snd_max &&
tiwin && tiwin == tp->snd_wnd &&
((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
@@ -1253,6 +1318,8 @@
* Otherwise this is an acceptable SYN segment
* initialize tp->rcv_nxt and tp->irs
* if seg contains ack then advance tp->snd_una
+ * if seg contains an ECE and ECN support is enabled, the stream
+ * is ECN capable.
* if SYN has been acked change to ESTABLISHED else SYN_RCVD state
* arrange for segment to be acked (eventually)
* continue processing rest of data/controls, beginning with URG
@@ -1297,6 +1364,12 @@
tcp_delacktime);
else
tp->t_flags |= TF_ACKNOW;
+
+ if ((thflags & TH_ECE) && tcp_do_ecn) {
+ tp->t_flags |= TF_ECN_PERMIT;
+ tcpstat.tcps_ecn_shs++;
+ }
+
/*
* Received <SYN,ACK> in SYN_SENT[*] state.
* Transitions:
@@ -1758,6 +1831,9 @@
* so bump cwnd by the amount in the receiver
* to keep a constant cwnd packets in the
* network.
+ *
+ * When using TCP ECN, notify the peer that
+ * we reduced the cwnd.
*/
if (!tcp_timer_active(tp, TT_REXMT) ||
th->th_ack != tp->snd_una)
@@ -1789,7 +1865,6 @@
goto drop;
} else if (tp->t_dupacks == tcprexmtthresh) {
tcp_seq onxt = tp->snd_nxt;
- u_int win;
/*
* If we're doing sack, check to
@@ -1803,20 +1878,15 @@
tp->t_dupacks = 0;
break;
}
- } else if (tcp_do_newreno) {
+ } else if (tcp_do_newreno ||
+ tcp_do_ecn) {
if (SEQ_LEQ(th->th_ack,
tp->snd_recover)) {
tp->t_dupacks = 0;
break;
}
}
- win = min(tp->snd_wnd, tp->snd_cwnd) /
- 2 / tp->t_maxseg;
- if (win < 2)
- win = 2;
- tp->snd_ssthresh = win * tp->t_maxseg;
- ENTER_FASTRECOVERY(tp);
- tp->snd_recover = tp->snd_max;
+ tcp_congestion_exp(tp);
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rtttime = 0;
if (tp->t_flags & TF_SACK_PERMIT) {
==== //depot/projects/tcpecn/netinet/tcp_output.c#2 (text+ko) ====
@@ -884,6 +884,49 @@
tp->snd_nxt == tp->snd_max)
tp->snd_nxt--;
/*
+ * If we are starting a connection, send ECN setup
+ * SYN packet. If we are on a retransmit, we may
+ * resend those bits a number of times as per
+ * RFC 3168.
+ */
+ if (tp->t_state == TCPS_SYN_SENT && tcp_do_ecn) {
+ if (tp->t_rxtshift >= 1) {
+ if (tp->t_rxtshift <= tcp_ecn_maxretries)
+ flags |= TH_ECE|TH_CWR;
+ } else
+ flags |= TH_ECE|TH_CWR;
+ }
+
+ if (tp->t_state == TCPS_ESTABLISHED &&
+ (tp->t_flags & TF_ECN_PERMIT)) {
+ /*
+ * If the peer has ECN, mark data packets as
+ * ECN-Capable Transport (ECT).
+ * Ignore pure ack packets, retransmissions and window probes.
+ */
+ if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
+ !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
+#ifdef INET6
+ if (isipv6)
+ ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
+ else
+#endif
+ ip->ip_tos |= IPTOS_ECN_ECT0;
+ tcpstat.tcps_ecn_ect0++;
+ }
+
+ /*
+ * Reply with proper ECN notifications.
+ */
+ if (tp->t_flags & TF_ECN_SND_CWR) {
+ flags |= TH_CWR;
+ tp->t_flags &= ~TF_ECN_SND_CWR;
+ }
+ if (tp->t_flags & TF_ECN_SND_ECE)
+ flags |= TH_ECE;
+ }
+
+ /*
* If we are doing retransmissions, then snd_nxt will
* not reflect the first unsent octet. For ACK only
* packets, we do not want the sequence number of the
==== //depot/projects/tcpecn/netinet/tcp_syncache.c#2 (text+ko) ====
@@ -127,7 +127,7 @@
u_int8_t sc_ip_tos; /* IPv4 TOS */
u_int8_t sc_requested_s_scale:4,
sc_requested_r_scale:4;
- u_int8_t sc_flags;
+ u_int16_t sc_flags;
#define SCF_NOOPT 0x01 /* no TCP options */
#define SCF_WINSCALE 0x02 /* negotiated window scaling */
#define SCF_TIMESTAMP 0x04 /* negotiated timestamps */
@@ -135,6 +135,7 @@
#define SCF_UNREACH 0x10 /* icmp unreachable received */
#define SCF_SIGNATURE 0x20 /* send MD5 digests */
#define SCF_SACK 0x80 /* send SACK option */
+#define SCF_ECN 0x100 /* send ECN setup packet */
#ifdef MAC
struct label *sc_label; /* MAC label reference */
#endif
@@ -778,6 +779,9 @@
tp->t_flags |= TF_SACK_PERMIT;
}
+ if (sc->sc_flags & SCF_ECN)
+ tp->t_flags |= TF_ECN_PERMIT;
+
/*
* Set up MSS and get cached values from tcp_hostcache.
* This might overwrite some of the defaults we just set.
@@ -1190,7 +1194,9 @@
sc->sc_peer_mss = to->to_mss; /* peer mss may be zero */
if (noopt)
sc->sc_flags |= SCF_NOOPT;
-
+ if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn)
+ sc->sc_flags |= SCF_ECN;
+
if (tcp_syncookies) {
syncookie_generate(sch, sc, &flowtmp);
#ifdef INET6
@@ -1325,6 +1331,41 @@
th->th_win = htons(sc->sc_wnd);
th->th_urp = 0;
+ if (sc->sc_flags & SCF_ECN) {
+ th->th_flags |= TH_ECE;
+ tcpstat.tcps_ecn_shs++;
+
+ /*
+ * draft-ietf-tcpm-ecnsyn-00.txt
+ *
+ * "[...] a TCP node MAY respond to an ECN-setup
+ * SYN packet by setting ECT in the responding
+ * ECN-setup SYN/ACK packet, indicating to routers
+ * that the SYN/ACK packet is ECN-Capable.
+ * This allows a congested router along the path
+ * to mark the packet instead of dropping the
+ * packet as an indication of congestion."
+ *
+ * "[...] There can be a great benefit in setting
+ * an ECN-capable codepoint in SYN/ACK packets [...]
+ * Congestion is most likely to occur in
+ * the server-to-client direction. As a result,
+ * setting an ECN-capable codepoint in SYN/ACK
+ * packets can reduce the occurrence of three-second
+ * retransmit timeouts resulting from the drop
+ * of SYN/ACK packets."
+ *
+ * Pages 4 and 6, January 2006.
+ */
+#ifdef INET6
+ if (sc->sc_inc.inc_isipv6)
+ ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
+ else
+#endif
+ ip->ip_tos |= IPTOS_ECN_ECT0;
+ tcpstat.tcps_ecn_ect0++;
+ }
+
/* Tack on the TCP options. */
if ((sc->sc_flags & SCF_NOOPT) == 0) {
to.to_flags = 0;
==== //depot/projects/tcpecn/netinet/tcp_usrreq.c#2 (text+ko) ====
@@ -1712,6 +1712,10 @@
db_printf("%sTF_TSO", comma ? ", " : "");
comma = 1;
}
+ if (t_flags & TF_ECN_PERMIT) {
+ db_printf("%sTF_ECN_PERMIT", comma ? ", " : "");
+ comma = 1;
+ }
}
static void
==== //depot/projects/tcpecn/netinet/tcp_var.h#2 (text+ko) ====
@@ -123,6 +123,9 @@
#define TF_SIGNATURE 0x400000 /* require MD5 digests (RFC2385) */
#define TF_FORCEDATA 0x800000 /* force out a byte */
#define TF_TSO 0x1000000 /* TSO enabled on this connection */
+#define TF_ECN_PERMIT 0x2000000 /* connection ECN-ready */
+#define TF_ECN_SND_CWR 0x4000000 /* ECN CWR in queue */
+#define TF_ECN_SND_ECE 0x8000000 /* ECN ECE in queue */
tcp_seq snd_una; /* send unacknowledged */
tcp_seq snd_max; /* highest sequence number sent;
@@ -429,6 +432,12 @@
u_long tcps_sack_rcv_blocks; /* SACK blocks (options) received */
u_long tcps_sack_send_blocks; /* SACK blocks (options) sent */
u_long tcps_sack_sboverflow; /* times scoreboard overflowed */
+
+ /* ECN related stats */
+ u_long tcps_ecn_ce; /* ECN Congestion Experienced */
+ u_long tcps_ecn_ect0; /* ECN Capable Transport */
+ u_long tcps_ecn_ect1; /* ECN Capable Transport */
+ u_long tcps_ecn_shs; /* ECN successful handshakes */
};
/*
@@ -505,6 +514,8 @@
extern int tcp_do_sack; /* SACK enabled/disabled */
extern int tcp_sc_rst_sock_fail; /* RST on sock alloc failure */
+extern int tcp_do_ecn; /* TCP ECN enabled/disabled */
+extern int tcp_ecn_maxretries;
int tcp_addoptions(struct tcpopt *, u_char *);
struct tcpcb *
More information about the p4-projects
mailing list