svn commit: r303626 - in head/sys: netinet netinet6
Andrew Gallatin
gallatin at FreeBSD.org
Mon Aug 1 17:02:22 UTC 2016
Author: gallatin
Date: Mon Aug 1 17:02:21 2016
New Revision: 303626
URL: https://svnweb.freebsd.org/changeset/base/303626
Log:
Rework IPv6 TCP path MTU discovery to match IPv4
- Re-write tcp6_ctlinput() to closely mimic the IPv4 tcp_ctlinput()
- Now that tcp6_ctlinput() updates t_maxseg, we can allow ip6_output()
to send TCP packets without looking at the tcp host cache for every
single transmit.
- Make the icmp6 code mimic the IPv4 code & avoid returning
PRC_HOSTDEAD because it is so expensive.
Without these changes in place, every TCP6 pmtu discovery or host
unreachable ICMP resulted in a call to in6_pcbnotify() which walks the
tcbinfo table with the write lock held. Because the tcbinfo table is
shared between IPv4 and IPv6, this causes huge scalability issues on
servers with lots of (~100K) TCP connections, to the point where even
a small percentage of IPv6 traffic had a disproportionate impact on
overall throughput.
Reviewed by: bz, rrs, ae (all earlier versions), lstewart (in Netflix's tree)
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D7272
Modified:
head/sys/netinet/tcp_subr.c
head/sys/netinet6/icmp6.c
head/sys/netinet6/ip6_output.c
Modified: head/sys/netinet/tcp_subr.c
==============================================================================
--- head/sys/netinet/tcp_subr.c Mon Aug 1 16:40:42 2016 (r303625)
+++ head/sys/netinet/tcp_subr.c Mon Aug 1 17:02:21 2016 (r303626)
@@ -78,6 +78,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip_icmp.h>
#include <netinet/ip_var.h>
#ifdef INET6
+#include <netinet/icmp6.h>
#include <netinet/ip6.h>
#include <netinet6/in6_fib.h>
#include <netinet6/in6_pcb.h>
@@ -2040,72 +2041,146 @@ tcp_ctlinput(int cmd, struct sockaddr *s
void
tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
{
- struct tcphdr th;
+ struct in6_addr *dst;
+ struct tcphdr *th;
struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
struct ip6_hdr *ip6;
struct mbuf *m;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct icmp6_hdr *icmp6;
struct ip6ctlparam *ip6cp = NULL;
const struct sockaddr_in6 *sa6_src = NULL;
- int off;
- struct tcp_portonly {
- u_int16_t th_sport;
- u_int16_t th_dport;
- } *thp;
+ struct in_conninfo inc;
+ tcp_seq icmp_tcp_seq;
+ unsigned int mtu;
+ unsigned int off;
+
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return;
- if (cmd == PRC_MSGSIZE)
- notify = tcp_mtudisc_notify;
- else if (!PRC_IS_REDIRECT(cmd) &&
- ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
- return;
-
/* if the parameter is from icmp6, decode it. */
if (d != NULL) {
ip6cp = (struct ip6ctlparam *)d;
+ icmp6 = ip6cp->ip6c_icmp6;
m = ip6cp->ip6c_m;
ip6 = ip6cp->ip6c_ip6;
off = ip6cp->ip6c_off;
sa6_src = ip6cp->ip6c_src;
+ dst = ip6cp->ip6c_finaldst;
} else {
m = NULL;
ip6 = NULL;
off = 0; /* fool gcc */
sa6_src = &sa6_any;
+ dst = NULL;
}
- if (ip6 != NULL) {
- struct in_conninfo inc;
- /*
- * XXX: We assume that when IPV6 is non NULL,
- * M and OFF are valid.
- */
+ if (cmd == PRC_MSGSIZE)
+ notify = tcp_mtudisc_notify;
+ else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
+ cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) &&
+ ip6 != NULL)
+ notify = tcp_drop_syn_sent;
- /* check if we can safely examine src and dst ports */
- if (m->m_pkthdr.len < off + sizeof(*thp))
- return;
+ /*
+ * Hostdead is ugly because it goes linearly through all PCBs.
+ * XXX: We never get this from ICMP, otherwise it makes an
+ * excellent DoS attack on machines with many connections.
+ */
+ else if (cmd == PRC_HOSTDEAD)
+ ip6 = NULL;
+ else if ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)
+ return;
- bzero(&th, sizeof(th));
- m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
+ if (ip6 == NULL) {
+ in6_pcbnotify(&V_tcbinfo, sa, 0,
+ (const struct sockaddr *)sa6_src,
+ 0, cmd, NULL, notify);
+ return;
+ }
- in6_pcbnotify(&V_tcbinfo, sa, th.th_dport,
- (struct sockaddr *)ip6cp->ip6c_src,
- th.th_sport, cmd, NULL, notify);
+ /* Check if we can safely get the ports from the tcp hdr */
+ if (m == NULL ||
+ (m->m_pkthdr.len <
+ (int32_t) (off + offsetof(struct tcphdr, th_seq)))) {
+ return;
+ }
+ th = (struct tcphdr *) mtodo(ip6cp->ip6c_m, ip6cp->ip6c_off);
+ INP_INFO_RLOCK(&V_tcbinfo);
+ inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, th->th_dport,
+ &ip6->ip6_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL);
+ if (inp != NULL && PRC_IS_REDIRECT(cmd)) {
+ /* signal EHOSTDOWN, as it flushes the cached route */
+ inp = (*notify)(inp, EHOSTDOWN);
+ if (inp != NULL)
+ INP_WUNLOCK(inp);
+ } else if (inp != NULL) {
+ if (!(inp->inp_flags & INP_TIMEWAIT) &&
+ !(inp->inp_flags & INP_DROPPED) &&
+ !(inp->inp_socket == NULL)) {
+ icmp_tcp_seq = ntohl(th->th_seq);
+ tp = intotcpcb(inp);
+ if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
+ SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
+ if (cmd == PRC_MSGSIZE) {
+ /*
+ * MTU discovery:
+ * If we got a needfrag set the MTU
+ * in the route to the suggested new
+ * value (if given) and then notify.
+ */
+ mtu = ntohl(icmp6->icmp6_mtu);
+ /*
+ * If no alternative MTU was
+ * proposed, or the proposed
+ * MTU was too small, set to
+ * the min.
+ */
+ if (mtu < IPV6_MMTU)
+ mtu = IPV6_MMTU - 8;
+
+
+ bzero(&inc, sizeof(inc));
+ inc.inc_fibnum = M_GETFIB(m);
+ inc.inc_flags |= INC_ISIPV6;
+ inc.inc6_faddr = *dst;
+ if (in6_setscope(&inc.inc6_faddr,
+ m->m_pkthdr.rcvif, NULL))
+ goto unlock_inp;
+
+ /*
+ * Only process the offered MTU if it
+ * is smaller than the current one.
+ */
+ if (mtu < tp->t_maxseg +
+ (sizeof (*th) + sizeof (*ip6))) {
+ tcp_hc_updatemtu(&inc, mtu);
+ tcp_mtudisc(inp, mtu);
+ ICMP6STAT_INC(icp6s_pmtuchg);
+ }
+ } else
+ inp = (*notify)(inp,
+ inet6ctlerrmap[cmd]);
+ }
+ }
+unlock_inp:
+ if (inp != NULL)
+ INP_WUNLOCK(inp);
+ } else {
bzero(&inc, sizeof(inc));
- inc.inc_fport = th.th_dport;
- inc.inc_lport = th.th_sport;
- inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
- inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
+ inc.inc_fibnum = M_GETFIB(m);
inc.inc_flags |= INC_ISIPV6;
- INP_INFO_RLOCK(&V_tcbinfo);
- syncache_unreach(&inc, &th);
- INP_INFO_RUNLOCK(&V_tcbinfo);
- } else
- in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src,
- 0, cmd, NULL, notify);
+ inc.inc_fport = th->th_dport;
+ inc.inc_lport = th->th_sport;
+ inc.inc6_faddr = *dst;
+ inc.inc6_laddr = ip6->ip6_src;
+ syncache_unreach(&inc, th);
+ }
+ INP_INFO_RUNLOCK(&V_tcbinfo);
}
#endif /* INET6 */
Modified: head/sys/netinet6/icmp6.c
==============================================================================
--- head/sys/netinet6/icmp6.c Mon Aug 1 16:40:42 2016 (r303625)
+++ head/sys/netinet6/icmp6.c Mon Aug 1 17:02:21 2016 (r303626)
@@ -485,15 +485,13 @@ icmp6_input(struct mbuf **mp, int *offp,
icmp6_ifstat_inc(ifp, ifs6_in_dstunreach);
switch (code) {
case ICMP6_DST_UNREACH_NOROUTE:
+ case ICMP6_DST_UNREACH_ADDR: /* PRC_HOSTDEAD is a DOS */
code = PRC_UNREACH_NET;
break;
case ICMP6_DST_UNREACH_ADMIN:
icmp6_ifstat_inc(ifp, ifs6_in_adminprohib);
code = PRC_UNREACH_PROTOCOL; /* is this a good code? */
break;
- case ICMP6_DST_UNREACH_ADDR:
- code = PRC_HOSTDEAD;
- break;
case ICMP6_DST_UNREACH_BEYONDSCOPE:
/* I mean "source address was incorrect." */
code = PRC_PARAMPROB;
Modified: head/sys/netinet6/ip6_output.c
==============================================================================
--- head/sys/netinet6/ip6_output.c Mon Aug 1 16:40:42 2016 (r303625)
+++ head/sys/netinet6/ip6_output.c Mon Aug 1 17:02:21 2016 (r303626)
@@ -150,9 +150,10 @@ static int ip6_insertfraghdr(struct mbuf
static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
static int ip6_getpmtu(struct route_in6 *, int,
- struct ifnet *, const struct in6_addr *, u_long *, int *, u_int);
+ struct ifnet *, const struct in6_addr *, u_long *, int *, u_int,
+ u_int);
static int ip6_calcmtu(struct ifnet *, const struct in6_addr *, u_long,
- u_long *, int *);
+ u_long *, int *, u_int);
static int ip6_getpmtu_ctl(u_int, const struct in6_addr *, u_long *);
static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
@@ -718,7 +719,7 @@ again:
/* Determine path MTU. */
if ((error = ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &ip6->ip6_dst,
- &mtu, &alwaysfrag, fibnum)) != 0)
+ &mtu, &alwaysfrag, fibnum, *nexthdrp)) != 0)
goto bad;
/*
@@ -1250,7 +1251,7 @@ ip6_getpmtu_ctl(u_int fibnum, const stru
ifp = nh6.nh_ifp;
mtu = nh6.nh_mtu;
- error = ip6_calcmtu(ifp, dst, mtu, mtup, NULL);
+ error = ip6_calcmtu(ifp, dst, mtu, mtup, NULL, 0);
fib6_free_nh_ext(fibnum, &nh6);
return (error);
@@ -1269,7 +1270,7 @@ ip6_getpmtu_ctl(u_int fibnum, const stru
static int
ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup,
struct ifnet *ifp, const struct in6_addr *dst, u_long *mtup,
- int *alwaysfragp, u_int fibnum)
+ int *alwaysfragp, u_int fibnum, u_int proto)
{
struct nhop6_basic nh6;
struct in6_addr kdst;
@@ -1307,7 +1308,7 @@ ip6_getpmtu(struct route_in6 *ro_pmtu, i
if (ro_pmtu->ro_rt)
mtu = ro_pmtu->ro_rt->rt_mtu;
- return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp));
+ return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp, proto));
}
/*
@@ -1319,7 +1320,7 @@ ip6_getpmtu(struct route_in6 *ro_pmtu, i
*/
static int
ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu,
- u_long *mtup, int *alwaysfragp)
+ u_long *mtup, int *alwaysfragp, u_int proto)
{
u_long mtu = 0;
int alwaysfrag = 0;
@@ -1334,7 +1335,11 @@ ip6_calcmtu(struct ifnet *ifp, const str
inc.inc6_faddr = *dst;
ifmtu = IN6_LINKMTU(ifp);
- mtu = tcp_hc_getmtu(&inc);
+
+ /* TCP is known to react to pmtu changes so skip hc */
+ if (proto != IPPROTO_TCP)
+ mtu = tcp_hc_getmtu(&inc);
+
if (mtu)
mtu = min(mtu, rt_mtu);
else
More information about the svn-src-head
mailing list