git: 5f610cf1ec62 - stable/14 - inpcb: Close some SO_REUSEPORT_LB races
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Tue, 14 Jan 2025 14:43:32 UTC
The branch stable/14 has been updated by markj: URL: https://cgit.FreeBSD.org/src/commit/?id=5f610cf1ec62ee7c9843a2eee5010901f3ebdbe6 commit 5f610cf1ec62ee7c9843a2eee5010901f3ebdbe6 Author: Mark Johnston <markj@FreeBSD.org> AuthorDate: 2024-12-12 14:02:12 +0000 Commit: Mark Johnston <markj@FreeBSD.org> CommitDate: 2025-01-14 14:14:24 +0000 inpcb: Close some SO_REUSEPORT_LB races For a long time, the inpcb lookup path has been lockless in the common case: we use net_epoch to synchronize lookups. However, the routines which update lbgroups were not careful to synchronize with unlocked lookups. I believe that in the worst case this can result in spurious connection aborts (I have a regression test case to exercise this), but it's hard to be certain. Modify in_pcblbgroup* routines to synchronize with unlocked lookup: - When removing inpcbs from an lbgroup, do not shrink the array. The maximum number of lbgroup entries is INPCBLBGROUP_SIZMAX (256), and it doesn't seem worth the complexity to shrink the array when a socket is removed. - When resizing an lbgroup, do not insert it into the hash table until it is fully initialized; otherwise lookups may observe a partially constructed lbgroup. - When adding an inpcb to the group, increment the counter after adding the array entry, using a release store. Otherwise it's possible for lookups to observe a null array slot. - When looking up an entry, use a corresponding acquire load. Reviewed by: ae, glebius MFC after: 1 month Sponsored by: Klara, Inc. Sponsored by: Stormshield Differential Revision: https://reviews.freebsd.org/D48020 (cherry picked from commit a600aabe9b04f0906069a8fb1f8d696ad186080f) --- sys/netinet/in_pcb.c | 94 ++++++++++++++++++++++++++++---------------------- sys/netinet6/in6_pcb.c | 13 +++++-- 2 files changed, 63 insertions(+), 44 deletions(-) diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 331804545bee..15729bcd1ce3 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -255,9 +255,8 @@ static void in_pcbremhash(struct inpcb *); */ static struct inpcblbgroup * -in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, struct ucred *cred, - u_char vflag, uint16_t port, const union in_dependaddr *addr, int size, - uint8_t numa_domain) +in_pcblbgroup_alloc(struct ucred *cred, u_char vflag, uint16_t port, + const union in_dependaddr *addr, int size, uint8_t numa_domain) { struct inpcblbgroup *grp; size_t bytes; @@ -272,7 +271,6 @@ in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, struct ucred *cred, grp->il_numa_domain = numa_domain; grp->il_dependladdr = *addr; grp->il_inpsiz = size; - CK_LIST_INSERT_HEAD(hdr, grp, il_list); return (grp); } @@ -294,6 +292,24 @@ in_pcblbgroup_free(struct inpcblbgroup *grp) NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx); } +static void +in_pcblbgroup_insert(struct inpcblbgroup *grp, struct inpcb *inp) +{ + KASSERT(grp->il_inpcnt < grp->il_inpsiz, + ("invalid local group size %d and count %d", grp->il_inpsiz, + grp->il_inpcnt)); + INP_WLOCK_ASSERT(inp); + + inp->inp_flags |= INP_INLBGROUP; + grp->il_inp[grp->il_inpcnt] = inp; + + /* + * Synchronize with in_pcblookup_lbgroup(): make sure that we don't + * expose a null slot to the lookup path. + */ + atomic_store_rel_int(&grp->il_inpcnt, grp->il_inpcnt + 1); +} + static struct inpcblbgroup * in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, struct inpcblbgroup *old_grp, int size) @@ -301,7 +317,7 @@ in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, struct inpcblbgroup *grp; int i; - grp = in_pcblbgroup_alloc(hdr, old_grp->il_cred, old_grp->il_vflag, + grp = in_pcblbgroup_alloc(old_grp->il_cred, old_grp->il_vflag, old_grp->il_lport, &old_grp->il_dependladdr, size, old_grp->il_numa_domain); if (grp == NULL) @@ -314,34 +330,11 @@ in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, for (i = 0; i < old_grp->il_inpcnt; ++i) grp->il_inp[i] = old_grp->il_inp[i]; grp->il_inpcnt = old_grp->il_inpcnt; + CK_LIST_INSERT_HEAD(hdr, grp, il_list); in_pcblbgroup_free(old_grp); return (grp); } -/* - * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i] - * and shrink group if possible. - */ -static void -in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp, - int i) -{ - struct inpcblbgroup *grp, *new_grp; - - grp = *grpp; - for (; i + 1 < grp->il_inpcnt; ++i) - grp->il_inp[i] = grp->il_inp[i + 1]; - grp->il_inpcnt--; - - if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN && - grp->il_inpcnt <= grp->il_inpsiz / 4) { - /* Shrink this group. */ - new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2); - if (new_grp != NULL) - *grpp = new_grp; - } -} - /* * Add PCB to load balance group for SO_REUSEPORT_LB option. */ @@ -386,11 +379,13 @@ in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) } if (grp == NULL) { /* Create new load balance group. */ - grp = in_pcblbgroup_alloc(hdr, inp->inp_cred, inp->inp_vflag, + grp = in_pcblbgroup_alloc(inp->inp_cred, inp->inp_vflag, inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, INPCBLBGROUP_SIZMIN, numa_domain); if (grp == NULL) return (ENOBUFS); + in_pcblbgroup_insert(grp, inp); + CK_LIST_INSERT_HEAD(hdr, grp, il_list); } else if (grp->il_inpcnt == grp->il_inpsiz) { if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { if (ratecheck(&lastprint, &interval)) @@ -403,15 +398,10 @@ in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); if (grp == NULL) return (ENOBUFS); + in_pcblbgroup_insert(grp, inp); + } else { + in_pcblbgroup_insert(grp, inp); } - - KASSERT(grp->il_inpcnt < grp->il_inpsiz, - ("invalid local group size %d and count %d", grp->il_inpsiz, - grp->il_inpcnt)); - - grp->il_inp[grp->il_inpcnt] = inp; - grp->il_inpcnt++; - inp->inp_flags |= INP_INLBGROUP; return (0); } @@ -443,8 +433,17 @@ in_pcbremlbgrouphash(struct inpcb *inp) /* We are the last, free this local group. */ in_pcblbgroup_free(grp); } else { - /* Pull up inpcbs, shrink group if possible. */ - in_pcblbgroup_reorder(hdr, &grp, i); + KASSERT(grp->il_inpcnt >= 2, + ("invalid local group count %d", + grp->il_inpcnt)); + grp->il_inp[i] = + grp->il_inp[grp->il_inpcnt - 1]; + + /* + * Synchronize with in_pcblookup_lbgroup(). + */ + atomic_store_rel_int(&grp->il_inpcnt, + grp->il_inpcnt - 1); } inp->inp_flags &= ~INP_INLBGROUP; return; @@ -2112,8 +2111,11 @@ in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, const struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild; + struct inpcb *inp; + u_int count; INP_HASH_LOCK_ASSERT(pcbinfo); + NET_EPOCH_ASSERT(); hdr = &pcbinfo->ipi_lbgrouphashbase[ INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; @@ -2172,9 +2174,17 @@ in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, grp = local_wild; if (grp == NULL) return (NULL); + out: - return (grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % - grp->il_inpcnt]); + /* + * Synchronize with in_pcblbgroup_insert(). + */ + count = atomic_load_acq_int(&grp->il_inpcnt); + if (count == 0) + return (NULL); + inp = grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count]; + KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); + return (inp); } static bool diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c index 04b87819d629..b3b2f03451aa 100644 --- a/sys/netinet6/in6_pcb.c +++ b/sys/netinet6/in6_pcb.c @@ -922,6 +922,8 @@ in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, const struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild; + struct inpcb *inp; + u_int count; INP_HASH_LOCK_ASSERT(pcbinfo); @@ -983,8 +985,15 @@ in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, if (grp == NULL) return (NULL); out: - return (grp->il_inp[INP6_PCBLBGROUP_PKTHASH(faddr, lport, fport) % - grp->il_inpcnt]); + /* + * Synchronize with in_pcblbgroup_insert(). + */ + count = atomic_load_acq_int(&grp->il_inpcnt); + if (count == 0) + return (NULL); + inp = grp->il_inp[INP6_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count]; + KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); + return (inp); } static bool