svn commit: r303626 - in head/sys: netinet netinet6

Andrew Gallatin gallatin at FreeBSD.org
Mon Aug 1 17:02:22 UTC 2016


Author: gallatin
Date: Mon Aug  1 17:02:21 2016
New Revision: 303626
URL: https://svnweb.freebsd.org/changeset/base/303626

Log:
  Rework IPv6 TCP path MTU discovery to match IPv4
  
  - Rewrite tcp6_ctlinput() to closely mimic the IPv4 tcp_ctlinput()
  
  - Now that tcp6_ctlinput() updates t_maxseg, we can allow ip6_output()
    to send TCP packets without looking at the tcp host cache for every
    single transmit.
  
  - Make the icmp6 code mimic the IPv4 code & avoid returning
    PRC_HOSTDEAD because it is so expensive.
  
  Without these changes in place, every TCP6 pmtu discovery or host
  unreachable ICMP resulted in a call to in6_pcbnotify() which walks the
  tcbinfo table with the write lock held.  Because the tcbinfo table is
  shared between IPv4 and IPv6, this causes huge scalability issues on
  servers with lots of (~100K) TCP connections, to the point where even
  a small percentage of IPv6 traffic had a disproportionate impact on
  overall throughput.
  
  Reviewed by:	bz, rrs, ae (all earlier versions), lstewart (in Netflix's tree)
  Sponsored by:		Netflix
  Differential Revision:	https://reviews.freebsd.org/D7272

Modified:
  head/sys/netinet/tcp_subr.c
  head/sys/netinet6/icmp6.c
  head/sys/netinet6/ip6_output.c

Modified: head/sys/netinet/tcp_subr.c
==============================================================================
--- head/sys/netinet/tcp_subr.c	Mon Aug  1 16:40:42 2016	(r303625)
+++ head/sys/netinet/tcp_subr.c	Mon Aug  1 17:02:21 2016	(r303626)
@@ -78,6 +78,7 @@ __FBSDID("$FreeBSD$");
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
+#include <netinet/icmp6.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_pcb.h>
@@ -2040,72 +2041,146 @@ tcp_ctlinput(int cmd, struct sockaddr *s
 void
 tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
 {
-	struct tcphdr th;
+	struct in6_addr *dst;
+	struct tcphdr *th;
 	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
 	struct ip6_hdr *ip6;
 	struct mbuf *m;
+	struct inpcb *inp;
+	struct tcpcb *tp;
+	struct icmp6_hdr *icmp6;
 	struct ip6ctlparam *ip6cp = NULL;
 	const struct sockaddr_in6 *sa6_src = NULL;
-	int off;
-	struct tcp_portonly {
-		u_int16_t th_sport;
-		u_int16_t th_dport;
-	} *thp;
+	struct in_conninfo inc;
+	tcp_seq icmp_tcp_seq;
+	unsigned int mtu;
+	unsigned int off;
+
 
 	if (sa->sa_family != AF_INET6 ||
 	    sa->sa_len != sizeof(struct sockaddr_in6))
 		return;
 
-	if (cmd == PRC_MSGSIZE)
-		notify = tcp_mtudisc_notify;
-	else if (!PRC_IS_REDIRECT(cmd) &&
-		 ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
-		return;
-
 	/* if the parameter is from icmp6, decode it. */
 	if (d != NULL) {
 		ip6cp = (struct ip6ctlparam *)d;
+		icmp6 = ip6cp->ip6c_icmp6;
 		m = ip6cp->ip6c_m;
 		ip6 = ip6cp->ip6c_ip6;
 		off = ip6cp->ip6c_off;
 		sa6_src = ip6cp->ip6c_src;
+		dst = ip6cp->ip6c_finaldst;
 	} else {
 		m = NULL;
 		ip6 = NULL;
 		off = 0;	/* fool gcc */
 		sa6_src = &sa6_any;
+		dst = NULL;
 	}
 
-	if (ip6 != NULL) {
-		struct in_conninfo inc;
-		/*
-		 * XXX: We assume that when IPV6 is non NULL,
-		 * M and OFF are valid.
-		 */
+	if (cmd == PRC_MSGSIZE)
+		notify = tcp_mtudisc_notify;
+	else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
+		cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) &&
+		ip6 != NULL)
+		notify = tcp_drop_syn_sent;
 
-		/* check if we can safely examine src and dst ports */
-		if (m->m_pkthdr.len < off + sizeof(*thp))
-			return;
+	/*
+	 * Hostdead is ugly because it goes linearly through all PCBs.
+	 * XXX: We never get this from ICMP, otherwise it makes an
+	 * excellent DoS attack on machines with many connections.
+	 */
+	else if (cmd == PRC_HOSTDEAD)
+		ip6 = NULL;
+	else if ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)
+		return;
 
-		bzero(&th, sizeof(th));
-		m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
+	if (ip6 == NULL) {
+		in6_pcbnotify(&V_tcbinfo, sa, 0,
+			      (const struct sockaddr *)sa6_src,
+			      0, cmd, NULL, notify);
+		return;
+	}
 
-		in6_pcbnotify(&V_tcbinfo, sa, th.th_dport,
-		    (struct sockaddr *)ip6cp->ip6c_src,
-		    th.th_sport, cmd, NULL, notify);
+	/* Check if we can safely get the ports from the tcp hdr */
+	if (m == NULL ||
+	    (m->m_pkthdr.len <
+		(int32_t) (off + offsetof(struct tcphdr, th_seq)))) {
+		return;
+	}
 
+	th = (struct tcphdr *) mtodo(ip6cp->ip6c_m, ip6cp->ip6c_off);
+	INP_INFO_RLOCK(&V_tcbinfo);
+	inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, th->th_dport,
+	    &ip6->ip6_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL);
+	if (inp != NULL && PRC_IS_REDIRECT(cmd)) {
+		/* signal EHOSTDOWN, as it flushes the cached route */
+		inp = (*notify)(inp, EHOSTDOWN);
+		if (inp != NULL)
+			INP_WUNLOCK(inp);
+	} else if (inp != NULL)  {
+		if (!(inp->inp_flags & INP_TIMEWAIT) &&
+		    !(inp->inp_flags & INP_DROPPED) &&
+		    !(inp->inp_socket == NULL)) {
+			icmp_tcp_seq = ntohl(th->th_seq);
+			tp = intotcpcb(inp);
+			if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
+			    SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
+				if (cmd == PRC_MSGSIZE) {
+					/*
+					 * MTU discovery:
+					 * If we got a needfrag set the MTU
+					 * in the route to the suggested new
+					 * value (if given) and then notify.
+					 */
+					mtu = ntohl(icmp6->icmp6_mtu);
+					/*
+					 * If no alternative MTU was
+					 * proposed, or the proposed
+					 * MTU was too small, set to
+					 * the min.
+					 */
+					if (mtu < IPV6_MMTU)
+						mtu = IPV6_MMTU - 8;
+
+
+					bzero(&inc, sizeof(inc));
+					inc.inc_fibnum = M_GETFIB(m);
+					inc.inc_flags |= INC_ISIPV6;
+					inc.inc6_faddr = *dst;
+					if (in6_setscope(&inc.inc6_faddr,
+						m->m_pkthdr.rcvif, NULL))
+						goto unlock_inp;
+
+					/*
+					 * Only process the offered MTU if it
+					 * is smaller than the current one.
+					 */
+					if (mtu < tp->t_maxseg +
+					    (sizeof (*th) + sizeof (*ip6))) {
+						tcp_hc_updatemtu(&inc, mtu);
+						tcp_mtudisc(inp, mtu);
+						ICMP6STAT_INC(icp6s_pmtuchg);
+					}
+				} else
+					inp = (*notify)(inp,
+					    inet6ctlerrmap[cmd]);
+			}
+		}
+unlock_inp:
+		if (inp != NULL)
+			INP_WUNLOCK(inp);
+	} else {
 		bzero(&inc, sizeof(inc));
-		inc.inc_fport = th.th_dport;
-		inc.inc_lport = th.th_sport;
-		inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
-		inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
+		inc.inc_fibnum = M_GETFIB(m);
 		inc.inc_flags |= INC_ISIPV6;
-		INP_INFO_RLOCK(&V_tcbinfo);
-		syncache_unreach(&inc, &th);
-		INP_INFO_RUNLOCK(&V_tcbinfo);
-	} else
-		in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src,
-			      0, cmd, NULL, notify);
+		inc.inc_fport = th->th_dport;
+		inc.inc_lport = th->th_sport;
+		inc.inc6_faddr = *dst;
+		inc.inc6_laddr = ip6->ip6_src;
+		syncache_unreach(&inc, th);
+	}
+	INP_INFO_RUNLOCK(&V_tcbinfo);
 }
 #endif /* INET6 */
 

Modified: head/sys/netinet6/icmp6.c
==============================================================================
--- head/sys/netinet6/icmp6.c	Mon Aug  1 16:40:42 2016	(r303625)
+++ head/sys/netinet6/icmp6.c	Mon Aug  1 17:02:21 2016	(r303626)
@@ -485,15 +485,13 @@ icmp6_input(struct mbuf **mp, int *offp,
 		icmp6_ifstat_inc(ifp, ifs6_in_dstunreach);
 		switch (code) {
 		case ICMP6_DST_UNREACH_NOROUTE:
+		case ICMP6_DST_UNREACH_ADDR:	/* PRC_HOSTDEAD is a DOS */
 			code = PRC_UNREACH_NET;
 			break;
 		case ICMP6_DST_UNREACH_ADMIN:
 			icmp6_ifstat_inc(ifp, ifs6_in_adminprohib);
 			code = PRC_UNREACH_PROTOCOL; /* is this a good code? */
 			break;
-		case ICMP6_DST_UNREACH_ADDR:
-			code = PRC_HOSTDEAD;
-			break;
 		case ICMP6_DST_UNREACH_BEYONDSCOPE:
 			/* I mean "source address was incorrect." */
 			code = PRC_PARAMPROB;

Modified: head/sys/netinet6/ip6_output.c
==============================================================================
--- head/sys/netinet6/ip6_output.c	Mon Aug  1 16:40:42 2016	(r303625)
+++ head/sys/netinet6/ip6_output.c	Mon Aug  1 17:02:21 2016	(r303626)
@@ -150,9 +150,10 @@ static int ip6_insertfraghdr(struct mbuf
 static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
 static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
 static int ip6_getpmtu(struct route_in6 *, int,
-	struct ifnet *, const struct in6_addr *, u_long *, int *, u_int);
+	struct ifnet *, const struct in6_addr *, u_long *, int *, u_int,
+	u_int);
 static int ip6_calcmtu(struct ifnet *, const struct in6_addr *, u_long,
-	u_long *, int *);
+	u_long *, int *, u_int);
 static int ip6_getpmtu_ctl(u_int, const struct in6_addr *, u_long *);
 static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
 
@@ -718,7 +719,7 @@ again:
 
 	/* Determine path MTU. */
 	if ((error = ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &ip6->ip6_dst,
-	    &mtu, &alwaysfrag, fibnum)) != 0)
+		    &mtu, &alwaysfrag, fibnum, *nexthdrp)) != 0)
 		goto bad;
 
 	/*
@@ -1250,7 +1251,7 @@ ip6_getpmtu_ctl(u_int fibnum, const stru
 	ifp = nh6.nh_ifp;
 	mtu = nh6.nh_mtu;
 
-	error = ip6_calcmtu(ifp, dst, mtu, mtup, NULL);
+	error = ip6_calcmtu(ifp, dst, mtu, mtup, NULL, 0);
 	fib6_free_nh_ext(fibnum, &nh6);
 
 	return (error);
@@ -1269,7 +1270,7 @@ ip6_getpmtu_ctl(u_int fibnum, const stru
 static int
 ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup,
     struct ifnet *ifp, const struct in6_addr *dst, u_long *mtup,
-    int *alwaysfragp, u_int fibnum)
+    int *alwaysfragp, u_int fibnum, u_int proto)
 {
 	struct nhop6_basic nh6;
 	struct in6_addr kdst;
@@ -1307,7 +1308,7 @@ ip6_getpmtu(struct route_in6 *ro_pmtu, i
 	if (ro_pmtu->ro_rt)
 		mtu = ro_pmtu->ro_rt->rt_mtu;
 
-	return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp));
+	return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp, proto));
 }
 
 /*
@@ -1319,7 +1320,7 @@ ip6_getpmtu(struct route_in6 *ro_pmtu, i
  */
 static int
 ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu,
-    u_long *mtup, int *alwaysfragp)
+    u_long *mtup, int *alwaysfragp, u_int proto)
 {
 	u_long mtu = 0;
 	int alwaysfrag = 0;
@@ -1334,7 +1335,11 @@ ip6_calcmtu(struct ifnet *ifp, const str
 		inc.inc6_faddr = *dst;
 
 		ifmtu = IN6_LINKMTU(ifp);
-		mtu = tcp_hc_getmtu(&inc);
+
+		/* TCP is known to react to pmtu changes so skip hc */
+		if (proto != IPPROTO_TCP)
+			mtu = tcp_hc_getmtu(&inc);
+
 		if (mtu)
 			mtu = min(mtu, rt_mtu);
 		else


More information about the svn-src-head mailing list