git: d88eb4654f37 - main - tcp: address a wire level race with 2 ACKs at the end of TCP handshake
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Wed, 10 Aug 2022 14:32:53 UTC
The branch main has been updated by glebius: URL: https://cgit.FreeBSD.org/src/commit/?id=d88eb4654f372d0451139a1dbf525a8f2cad1cf8 commit d88eb4654f372d0451139a1dbf525a8f2cad1cf8 Author: Gleb Smirnoff <glebius@FreeBSD.org> AuthorDate: 2022-08-10 14:32:37 +0000 Commit: Gleb Smirnoff <glebius@FreeBSD.org> CommitDate: 2022-08-10 14:32:37 +0000 tcp: address a wire level race with 2 ACKs at the end of TCP handshake Imagine we are in SYN-RCVD state and two ACKs arrive at the same time, both valid, e.g. coming from the same host and with valid sequence. First packet would locate the listening socket in the inpcb database, write-lock it and start expanding the syncache entry into a socket. Meanwhile second packet would wait on the write lock of the listening socket. First packet will create a new ESTABLISHED socket, free the syncache entry and unlock the listening socket. Second packet would call into syncache_expand(), but this time it will fail as there is no syncache entry. Second packet would generate RST, effectively resetting the remote connection. It seems to me, that it is impossible to solve this problem with just rearranging locks, as the race happens at a wire level. To solve the problem, for an ACK packet arrived on a listening socket, that failed syncache lookup, perform a second non-wildcard lookup right away. That lookup may find the new born socket. Otherwise, we indeed send RST. Tested by: kp Reviewed by: tuexen, rrs PR: 265154 Differential revision: https://reviews.freebsd.org/D36066 --- sys/netinet/tcp_input.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index a4647297c521..8aed90be69a0 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -833,8 +833,9 @@ tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port) * PCB, be it a listening one or a synchronized one. The packet * shall not modify its state. */ - lookupflag = (thflags & (TH_ACK|TH_SYN)) == TH_SYN ? - INPLOOKUP_RLOCKPCB : INPLOOKUP_WLOCKPCB; + lookupflag = INPLOOKUP_WILDCARD | + ((thflags & (TH_ACK|TH_SYN)) == TH_SYN ? + INPLOOKUP_RLOCKPCB : INPLOOKUP_WLOCKPCB); findpcb: #ifdef INET6 if (isipv6 && fwd_tag != NULL) { @@ -857,13 +858,12 @@ findpcb: inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &next_hop6->sin6_addr, next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) : - th->th_dport, INPLOOKUP_WILDCARD | lookupflag, - m->m_pkthdr.rcvif); + th->th_dport, lookupflag, m->m_pkthdr.rcvif); } } else if (isipv6) { inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, - th->th_sport, &ip6->ip6_dst, th->th_dport, - INPLOOKUP_WILDCARD | lookupflag, m->m_pkthdr.rcvif, m); + th->th_sport, &ip6->ip6_dst, th->th_dport, lookupflag, + m->m_pkthdr.rcvif, m); } #endif /* INET6 */ #if defined(INET6) && defined(INET) @@ -889,13 +889,12 @@ findpcb: inp = in_pcblookup(&V_tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? ntohs(next_hop->sin_port) : - th->th_dport, INPLOOKUP_WILDCARD | lookupflag, - m->m_pkthdr.rcvif); + th->th_dport, lookupflag, m->m_pkthdr.rcvif); } } else inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, - th->th_sport, ip->ip_dst, th->th_dport, - INPLOOKUP_WILDCARD | lookupflag, m->m_pkthdr.rcvif, m); + th->th_sport, ip->ip_dst, th->th_dport, lookupflag, + m->m_pkthdr.rcvif, m); #endif /* INET */ /* @@ -904,6 +903,11 @@ findpcb: * XXX MRT Send RST using which routing table? */ if (inp == NULL) { + if (rstreason != 0) { + /* We came here after second (safety) lookup. */ + MPASS((lookupflag & INPLOOKUP_WILDCARD) == 0); + goto dropwithreset; + } /* * Log communication attempts to ports that are not * in use. @@ -1095,13 +1099,26 @@ findpcb: goto dropunlock; } else if (rstreason == 0) { /* - * No syncache entry or ACK was not - * for our SYN/ACK. Send a RST. + * No syncache entry, or ACK was not for our + * SYN/ACK. Do our protection against double + * ACK. If peer sent us 2 ACKs, then for the + * first one syncache_expand() successfully + * converted syncache entry into a socket, + * while we were waiting on the inpcb lock. We + * don't want to sent RST for the second ACK, + * so we perform second lookup without wildcard + * match, hoping to find the new socket. If + * the ACK is stray indeed, rstreason would + * hint the above code that the lookup was a + * second attempt. + * * NB: syncache did its own logging * of the failure cause. */ + INP_WUNLOCK(inp); rstreason = BANDLIM_RST_OPENPORT; - goto dropwithreset; + lookupflag &= ~INPLOOKUP_WILDCARD; + goto findpcb; } tfo_socket_result: if (so == NULL) { @@ -1390,7 +1407,7 @@ tfo_socket_result: * to upgrade the lock, because calling convention for stacks is * write-lock on PCB. If upgrade fails, drop the SYN. */ - if (lookupflag == INPLOOKUP_RLOCKPCB && INP_TRY_UPGRADE(inp) == 0) + if ((lookupflag & INPLOOKUP_RLOCKPCB) && INP_TRY_UPGRADE(inp) == 0) goto dropunlock; tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos);