git: 93c67567e015 - main - Remove "options PCBGROUP"
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Thu, 02 Dec 2021 18:49:04 UTC
The branch main has been updated by glebius: URL: https://cgit.FreeBSD.org/src/commit/?id=93c67567e01582731f3075bc68b1fc5f1fd5e5a2 commit 93c67567e01582731f3075bc68b1fc5f1fd5e5a2 Author: Gleb Smirnoff <glebius@FreeBSD.org> AuthorDate: 2021-12-02 18:48:48 +0000 Commit: Gleb Smirnoff <glebius@FreeBSD.org> CommitDate: 2021-12-02 18:48:48 +0000 Remove "options PCBGROUP" With upcoming changes to the inpcb synchronisation, PCBGROUP is going to be broken. Even its current status after the move of PCB synchronization to the network epoch is very questionable. This experimental feature was sponsored by Juniper but ended up never being used at Juniper and doesn't exist in their source tree [sjg@, stevek@, jtl@]. In the past (AFAIK, pre-epoch times) it was tried out at Netflix [gallatin@, rrs@] with no positive result and at Yandex [ae@, melifaro@]. I'm open to resurrecting it if anybody is interested. Reviewed by: rrs Differential revision: https://reviews.freebsd.org/D33020 --- sys/conf/files | 2 - sys/conf/options | 1 - sys/net/rss_config.c | 1 - sys/netinet/in_pcb.c | 316 +------------------------ sys/netinet/in_pcb.h | 66 +----- sys/netinet/in_pcbgroup.c | 566 -------------------------------------------- sys/netinet/in_rss.c | 1 - sys/netinet/tcp_syncache.c | 1 - sys/netinet6/in6_pcb.c | 252 -------------------- sys/netinet6/in6_pcb.h | 10 - sys/netinet6/in6_pcbgroup.c | 153 ------------ sys/netinet6/in6_rss.c | 1 - 12 files changed, 4 insertions(+), 1366 deletions(-) diff --git a/sys/conf/files b/sys/conf/files index 0c54622e2563..9b3683a45d80 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -4323,7 +4323,6 @@ netinet/ip_id.c optional inet netinet/in_jail.c optional inet netinet/in_mcast.c optional inet netinet/in_pcb.c optional inet | inet6 -netinet/in_pcbgroup.c optional inet pcbgroup | inet6 pcbgroup netinet/in_prot.c optional inet | inet6 netinet/in_proto.c optional inet | inet6 netinet/in_rmx.c optional inet @@ -4411,7 +4410,6 @@ netinet6/in6_ifattach.c 
optional inet6 netinet6/in6_jail.c optional inet6 netinet6/in6_mcast.c optional inet6 netinet6/in6_pcb.c optional inet6 -netinet6/in6_pcbgroup.c optional inet6 pcbgroup netinet6/in6_proto.c optional inet6 netinet6/in6_rmx.c optional inet6 netinet6/in6_rss.c optional inet6 rss diff --git a/sys/conf/options b/sys/conf/options index 123a770ed74c..faa37d5bc67d 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -458,7 +458,6 @@ MBUF_PROFILING MBUF_STRESS_TEST MROUTING opt_mrouting.h NFSLOCKD -PCBGROUP opt_pcbgroup.h PF_DEFAULT_TO_DROP opt_pf.h ROUTE_MPATH opt_route.h ROUTETABLES opt_route.h diff --git a/sys/net/rss_config.c b/sys/net/rss_config.c index 5efa1ab3b5c1..ee15ed3da2bf 100644 --- a/sys/net/rss_config.c +++ b/sys/net/rss_config.c @@ -32,7 +32,6 @@ __FBSDID("$FreeBSD$"); #include "opt_inet6.h" -#include "opt_pcbgroup.h" #include <sys/param.h> #include <sys/mbuf.h> diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 9dd2aee11bf0..f1ac46b28477 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -45,7 +45,6 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ratelimit.h" -#include "opt_pcbgroup.h" #include "opt_route.h" #include "opt_rss.h" @@ -542,9 +541,6 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, &pcbinfo->ipi_porthashmask); pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_lbgrouphashmask); -#ifdef PCBGROUP - in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); -#endif pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0); uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); @@ -567,9 +563,6 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) pcbinfo->ipi_porthashmask); hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, pcbinfo->ipi_lbgrouphashmask); -#ifdef PCBGROUP - in_pcbgroup_destroy(pcbinfo); -#endif uma_zdestroy(pcbinfo->ipi_zone); INP_LIST_LOCK_DESTROY(pcbinfo); 
INP_HASH_LOCK_DESTROY(pcbinfo); @@ -1522,8 +1515,7 @@ in_pcbdetach(struct inpcb *inp) * in_pcbref() bumps the reference count on an inpcb in order to maintain * stability of an inpcb pointer despite the inpcb lock being released. This * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, - * but where the inpcb lock may already held, or when acquiring a reference - * via a pcbgroup. + * but where the inpcb lock may already held. * * in_pcbref() should be used only to provide brief memory stability, and * must always be followed by a call to INP_WLOCK() and in_pcbrele() to @@ -1783,9 +1775,6 @@ in_pcbdrop(struct inpcb *inp) } INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; -#ifdef PCBGROUP - in_pcbgroup_remove(inp); -#endif } } @@ -2097,241 +2086,6 @@ in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, return (local_wild); } -#ifdef PCBGROUP -/* - * Lookup PCB in hash list, using pcbgroup tables. - */ -static struct inpcb * -in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, - struct in_addr faddr, u_int fport_arg, struct in_addr laddr, - u_int lport_arg, int lookupflags, struct ifnet *ifp) -{ - struct inpcbhead *head; - struct inpcb *inp, *tmpinp; - u_short fport = fport_arg, lport = lport_arg; - bool locked; - - /* - * First look for an exact match. - */ - tmpinp = NULL; - INP_GROUP_LOCK(pcbgroup); - head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, - pcbgroup->ipg_hashmask)]; - CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) { -#ifdef INET6 - /* XXX inp locking */ - if ((inp->inp_vflag & INP_IPV4) == 0) - continue; -#endif - if (inp->inp_faddr.s_addr == faddr.s_addr && - inp->inp_laddr.s_addr == laddr.s_addr && - inp->inp_fport == fport && - inp->inp_lport == lport) { - /* - * XXX We should be able to directly return - * the inp here, without any checks. - * Well unless both bound with SO_REUSEPORT? 
- */ - if (prison_flag(inp->inp_cred, PR_IP4)) - goto found; - if (tmpinp == NULL) - tmpinp = inp; - } - } - if (tmpinp != NULL) { - inp = tmpinp; - goto found; - } - -#ifdef RSS - /* - * For incoming connections, we may wish to do a wildcard - * match for an RSS-local socket. - */ - if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { - struct inpcb *local_wild = NULL, *local_exact = NULL; -#ifdef INET6 - struct inpcb *local_wild_mapped = NULL; -#endif - struct inpcb *jail_wild = NULL; - struct inpcbhead *head; - int injail; - - /* - * Order of socket selection - we always prefer jails. - * 1. jailed, non-wild. - * 2. jailed, wild. - * 3. non-jailed, non-wild. - * 4. non-jailed, wild. - */ - - head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY, - lport, 0, pcbgroup->ipg_hashmask)]; - CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) { -#ifdef INET6 - /* XXX inp locking */ - if ((inp->inp_vflag & INP_IPV4) == 0) - continue; -#endif - if (inp->inp_faddr.s_addr != INADDR_ANY || - inp->inp_lport != lport) - continue; - - injail = prison_flag(inp->inp_cred, PR_IP4); - if (injail) { - if (prison_check_ip4(inp->inp_cred, - &laddr) != 0) - continue; - } else { - if (local_exact != NULL) - continue; - } - - if (inp->inp_laddr.s_addr == laddr.s_addr) { - if (injail) - goto found; - else - local_exact = inp; - } else if (inp->inp_laddr.s_addr == INADDR_ANY) { -#ifdef INET6 - /* XXX inp locking, NULL check */ - if (inp->inp_vflag & INP_IPV6PROTO) - local_wild_mapped = inp; - else -#endif - if (injail) - jail_wild = inp; - else - local_wild = inp; - } - } /* LIST_FOREACH */ - - inp = jail_wild; - if (inp == NULL) - inp = local_exact; - if (inp == NULL) - inp = local_wild; -#ifdef INET6 - if (inp == NULL) - inp = local_wild_mapped; -#endif - if (inp != NULL) - goto found; - } -#endif - - /* - * Then look for a wildcard match, if requested. 
- */ - if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { - struct inpcb *local_wild = NULL, *local_exact = NULL; -#ifdef INET6 - struct inpcb *local_wild_mapped = NULL; -#endif - struct inpcb *jail_wild = NULL; - struct inpcbhead *head; - int injail; - - /* - * Order of socket selection - we always prefer jails. - * 1. jailed, non-wild. - * 2. jailed, wild. - * 3. non-jailed, non-wild. - * 4. non-jailed, wild. - */ - head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, - 0, pcbinfo->ipi_wildmask)]; - CK_LIST_FOREACH(inp, head, inp_pcbgroup_wild) { -#ifdef INET6 - /* XXX inp locking */ - if ((inp->inp_vflag & INP_IPV4) == 0) - continue; -#endif - if (inp->inp_faddr.s_addr != INADDR_ANY || - inp->inp_lport != lport) - continue; - - injail = prison_flag(inp->inp_cred, PR_IP4); - if (injail) { - if (prison_check_ip4(inp->inp_cred, - &laddr) != 0) - continue; - } else { - if (local_exact != NULL) - continue; - } - - if (inp->inp_laddr.s_addr == laddr.s_addr) { - if (injail) - goto found; - else - local_exact = inp; - } else if (inp->inp_laddr.s_addr == INADDR_ANY) { -#ifdef INET6 - /* XXX inp locking, NULL check */ - if (inp->inp_vflag & INP_IPV6PROTO) - local_wild_mapped = inp; - else -#endif - if (injail) - jail_wild = inp; - else - local_wild = inp; - } - } /* LIST_FOREACH */ - inp = jail_wild; - if (inp == NULL) - inp = local_exact; - if (inp == NULL) - inp = local_wild; -#ifdef INET6 - if (inp == NULL) - inp = local_wild_mapped; -#endif - if (inp != NULL) - goto found; - } /* if (lookupflags & INPLOOKUP_WILDCARD) */ - INP_GROUP_UNLOCK(pcbgroup); - return (NULL); - -found: - if (lookupflags & INPLOOKUP_WLOCKPCB) - locked = INP_TRY_WLOCK(inp); - else if (lookupflags & INPLOOKUP_RLOCKPCB) - locked = INP_TRY_RLOCK(inp); - else - panic("%s: locking bug", __func__); - if (__predict_false(locked && (inp->inp_flags2 & INP_FREED))) { - if (lookupflags & INPLOOKUP_WLOCKPCB) - INP_WUNLOCK(inp); - else - INP_RUNLOCK(inp); - return (NULL); - } else if (!locked) - 
in_pcbref(inp); - INP_GROUP_UNLOCK(pcbgroup); - if (!locked) { - if (lookupflags & INPLOOKUP_WLOCKPCB) { - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) - return (NULL); - } else { - INP_RLOCK(inp); - if (in_pcbrele_rlocked(inp)) - return (NULL); - } - } -#ifdef INVARIANTS - if (lookupflags & INPLOOKUP_WLOCKPCB) - INP_WLOCK_ASSERT(inp); - else - INP_RLOCK_ASSERT(inp); -#endif - return (inp); -} -#endif /* PCBGROUP */ - /* * Lookup PCB in hash list, using pcbinfo tables. This variation assumes * that the caller has locked the hash list, and will not perform any further @@ -2497,40 +2251,17 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. - * - * Possibly more of this logic should be in in_pcbgroup.c. */ struct inpcb * in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { -#if defined(PCBGROUP) && !defined(RSS) - struct inpcbgroup *pcbgroup; -#endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); - /* - * When not using RSS, use connection groups in preference to the - * reservation table when looking up 4-tuples. When using RSS, just - * use the reservation table, due to the cost of the Toeplitz hash - * in software. - * - * XXXRW: This policy belongs in the pcbgroup code, as in principle - * we could be doing RSS with a non-Toeplitz hash that is affordable - * in software. 
- */ -#if defined(PCBGROUP) && !defined(RSS) - if (in_pcbgroup_enabled(pcbinfo)) { - pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, - fport); - return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, - laddr, lport, lookupflags, ifp)); - } -#endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp, M_NODOM)); } @@ -2540,39 +2271,12 @@ in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp, struct mbuf *m) { -#ifdef PCBGROUP - struct inpcbgroup *pcbgroup; -#endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); -#ifdef PCBGROUP - /* - * If we can use a hardware-generated hash to look up the connection - * group, use that connection group to find the inpcb. Otherwise - * fall back on a software hash -- or the reservation table if we're - * using RSS. - * - * XXXRW: As above, that policy belongs in the pcbgroup code. 
- */ - if (in_pcbgroup_enabled(pcbinfo) && - !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) { - pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), - m->m_pkthdr.flowid); - if (pcbgroup != NULL) - return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, - fport, laddr, lport, lookupflags, ifp)); -#ifndef RSS - pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, - fport); - return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, - laddr, lport, lookupflags, ifp)); -#endif - } -#endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp, m->m_pkthdr.numa_domain)); } @@ -2647,13 +2351,7 @@ in_pcbinshash_internal(struct inpcb *inp, struct mbuf *m) CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); inp->inp_flags |= INP_INHASHLIST; -#ifdef PCBGROUP - if (m != NULL) { - in_pcbgroup_update_mbuf(inp, m); - } else { - in_pcbgroup_update(inp); - } -#endif + return (0); } @@ -2702,13 +2400,6 @@ in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m) CK_LIST_REMOVE(inp, inp_hash); CK_LIST_INSERT_HEAD(head, inp, inp_hash); - -#ifdef PCBGROUP - if (m != NULL) - in_pcbgroup_update_mbuf(inp, m); - else - in_pcbgroup_update(inp); -#endif } void @@ -2749,9 +2440,6 @@ in_pcbremlists(struct inpcb *inp) } CK_LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; -#ifdef PCBGROUP - in_pcbgroup_remove(inp); -#endif } /* diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index d6a335236599..813c87559de3 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -156,7 +156,6 @@ struct in_conninfo { * (b) - Protected by the hpts lock. 
* (c) - Constant after initialization * (e) - Protected by the net_epoch_prempt epoch - * (g) - Protected by the pcbgroup lock * (i) - Protected by the inpcb lock * (p) - Protected by the pcbinfo lock for the inpcb * (l) - Protected by the pcblist lock for the inpcb @@ -231,7 +230,6 @@ struct m_snd_tag; struct inpcb { /* Cache line #1 (amd64) */ CK_LIST_ENTRY(inpcb) inp_hash; /* [w](h/i) [r](e/i) hash list */ - CK_LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */ struct rwlock inp_lock; /* Cache line #2 (amd64) */ #define inp_start_zero inp_hpts @@ -276,8 +274,6 @@ struct inpcb { uint32_t inp_hpts_drop_reas; /* reason we are dropping the PCB (lock i&b) */ TAILQ_ENTRY(inpcb) inp_input; /* pacing in queue next lock(b) */ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ - struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */ - CK_LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */ struct ucred *inp_cred; /* (c) cache of socket cred */ u_int32_t inp_flow; /* (i) IPv6 flow information */ u_char inp_vflag; /* (i) IP version flag (v4/v6) */ @@ -423,7 +419,6 @@ struct inpcbport { * ipi_lock (before) * inpcb locks (before) * ipi_list locks (before) - * {ipi_hash_lock, pcbgroup locks} * * Locking key: * @@ -432,7 +427,6 @@ struct inpcbport { * (g) Locked by ipi_lock * (l) Locked by ipi_list_lock * (h) Read using either net_epoch_preempt or inpcb lock; write requires both ipi_hash_lock and inpcb lock - * (p) Protected by one or more pcbgroup locks * (x) Synchronisation properties poorly defined */ struct inpcbinfo { @@ -466,16 +460,7 @@ struct inpcbinfo { struct uma_zone *ipi_zone; /* (c) */ /* - * Connection groups associated with this protocol. These fields are - * constant, but pcbgroup structures themselves are protected by - * per-pcbgroup locks. 
- */ - struct inpcbgroup *ipi_pcbgroups; /* (c) */ - u_int ipi_npcbgroups; /* (c) */ - u_int ipi_hashfields; /* (c) */ - - /* - * Global lock protecting modification non-pcbgroup hash lookup tables. + * Global lock protecting modification hash lookup tables. */ struct mtx ipi_hash_lock; @@ -492,14 +477,6 @@ struct inpcbinfo { struct inpcbporthead *ipi_porthashbase; /* (h) */ u_long ipi_porthashmask; /* (h) */ - /* - * List of wildcard inpcbs for use with pcbgroups. In the past, was - * per-pcbgroup but is now global. All pcbgroup locks must be held - * to modify the list, so any is sufficient to read it. - */ - struct inpcbhead *ipi_wildbase; /* (p) */ - u_long ipi_wildmask; /* (p) */ - /* * Load balance groups used for the SO_REUSEPORT_LB option, * hashed by local port. @@ -524,31 +501,6 @@ struct inpcbinfo { }; #ifdef _KERNEL -/* - * Connection groups hold sets of connections that have similar CPU/thread - * affinity. Each connection belongs to exactly one connection group. - */ -struct inpcbgroup { - /* - * Per-connection group hash of inpcbs, hashed by local and foreign - * addresses and port numbers. - */ - struct inpcbhead *ipg_hashbase; /* (c) */ - u_long ipg_hashmask; /* (c) */ - - /* - * Notional affinity of this pcbgroup. - */ - u_int ipg_cpu; /* (p) */ - - /* - * Per-connection group lock, not to be confused with ipi_lock. - * Protects the hash table hung off the group, but also the global - * wildcard list in inpcbinfo. - */ - struct mtx ipg_lock; -} __aligned(CACHE_LINE_SIZE); - /* * Load balance groups used for the SO_REUSEPORT_LB socket option. 
Each group * (or unique address:port combination) can be re-used at most @@ -728,7 +680,7 @@ int inp_so_options(const struct inpcb *inp); */ #define INP_MBUF_L_ACKS 0x00000001 /* We need large mbufs for ack compression */ #define INP_MBUF_ACKCMP 0x00000002 /* TCP mbuf ack compression ok */ -#define INP_PCBGROUPWILD 0x00000004 /* in pcbgroup wildcard list */ +/* 0x00000004 */ #define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */ #define INP_FREED 0x00000010 /* inp itself is not valid */ #define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */ @@ -809,20 +761,6 @@ void in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *, int in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi); -struct inpcbgroup * - in_pcbgroup_byhash(struct inpcbinfo *, u_int, uint32_t); -struct inpcbgroup * - in_pcbgroup_byinpcb(struct inpcb *); -struct inpcbgroup * - in_pcbgroup_bytuple(struct inpcbinfo *, struct in_addr, u_short, - struct in_addr, u_short); -void in_pcbgroup_destroy(struct inpcbinfo *); -int in_pcbgroup_enabled(struct inpcbinfo *); -void in_pcbgroup_init(struct inpcbinfo *, u_int, int); -void in_pcbgroup_remove(struct inpcb *); -void in_pcbgroup_update(struct inpcb *); -void in_pcbgroup_update_mbuf(struct inpcb *, struct mbuf *); - void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); int in_pcballoc(struct socket *, struct inpcbinfo *); int in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *); diff --git a/sys/netinet/in_pcbgroup.c b/sys/netinet/in_pcbgroup.c deleted file mode 100644 index 11ed75be1198..000000000000 --- a/sys/netinet/in_pcbgroup.c +++ /dev/null @@ -1,566 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 2010-2011 Juniper Networks, Inc. - * All rights reserved. - * - * This software was developed by Robert N. M. Watson under contract - * to Juniper Networks, Inc. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include <sys/cdefs.h> - -__FBSDID("$FreeBSD$"); - -#include "opt_inet6.h" -#include "opt_rss.h" - -#include <sys/param.h> -#include <sys/lock.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/mutex.h> -#include <sys/smp.h> -#include <sys/socket.h> -#include <sys/socketvar.h> - -#include <net/rss_config.h> - -#include <netinet/in.h> - -#include <netinet/in_pcb.h> -#include <netinet/in_rss.h> -#ifdef INET6 -#include <netinet6/in6_pcb.h> -#endif /* INET6 */ - -/* - * pcbgroups, or "connection groups" are based on Willman, Rixner, and Cox's - * 2006 USENIX paper, "An Evaluation of Network Stack Parallelization - * Strategies in Modern Operating Systems". 
This implementation differs - * significantly from that described in the paper, in that it attempts to - * introduce not just notions of affinity for connections and distribute work - * so as to reduce lock contention, but also align those notions with - * hardware work distribution strategies such as RSS. In this construction, - * connection groups supplement, rather than replace, existing reservation - * tables for protocol 4-tuples, offering CPU-affine lookup tables with - * minimal cache line migration and lock contention during steady state - * operation. - * - * Hardware-offloaded checksums are often inefficient in software -- for - * example, Toeplitz, specified by RSS, introduced a significant overhead if - * performed during per-packge processing. It is therefore desirable to fall - * back on traditional reservation table lookups without affinity where - * hardware-offloaded checksums aren't available, such as for traffic over - * non-RSS interfaces. - * - * Internet protocols, such as UDP and TCP, register to use connection groups - * by providing an ipi_hashfields value other than IPI_HASHFIELDS_NONE; this - * indicates to the connection group code whether a 2-tuple or 4-tuple is - * used as an argument to hashes that assign a connection to a particular - * group. This must be aligned with any hardware offloaded distribution - * model, such as RSS or similar approaches taken in embedded network boards. - * Wildcard sockets require special handling, as in Willman 2006, and are - * shared between connection groups -- while being protected by group-local - * locks. This means that connection establishment and teardown can be - * signficantly more expensive than without connection groups, but that - * steady-state processing can be significantly faster. - * - * When RSS is used, certain connection group parameters, such as the number - * of groups, are provided by the RSS implementation, found in in_rss.c. 
- * Otherwise, in_pcbgroup.c selects possible sensible parameters - * corresponding to the degree of parallelism exposed by netisr. - * - * Most of the implementation of connection groups is in this file; however, - * connection group lookup is implemented in in_pcb.c alongside reservation - * table lookups -- see in_pcblookup_group(). - * - * TODO: - * - * Implement dynamic rebalancing of buckets with connection groups; when - * load is unevenly distributed, search for more optimal balancing on - * demand. This might require scaling up the number of connection groups - * by <<1. - * - * Provide an IP 2-tuple or 4-tuple netisr m2cpu handler based on connection - * groups for ip_input and ip6_input, allowing non-offloaded work - * distribution. - * - * Expose effective CPU affinity of connections to userspace using socket - * options. - * - * Investigate per-connection affinity overrides based on socket options; an - * option could be set, certainly resulting in work being distributed - * differently in software, and possibly propagated to supporting hardware - * with TCAMs or hardware hash tables. This might require connections to - * exist in more than one connection group at a time. - * - * Hook netisr thread reconfiguration events, and propagate those to RSS so - * that rebalancing can occur when the thread pool grows or shrinks. - * - * Expose per-pcbgroup statistics to userspace monitoring tools such as - * netstat, in order to allow better debugging and profiling. - */ - -void -in_pcbgroup_init(struct inpcbinfo *pcbinfo, u_int hashfields, - int hash_nelements) -{ - struct inpcbgroup *pcbgroup; - u_int numpcbgroups, pgn; - - /* - * Only enable connection groups for a protocol if it has been - * specifically requested. - */ - if (hashfields == IPI_HASHFIELDS_NONE) - return; - - /* - * Connection groups are about multi-processor load distribution, - * lock contention, and connection CPU affinity. 
As such, no point - * in turning them on for a uniprocessor machine, it only wastes - * memory. - */ - if (mp_ncpus == 1) - return; - -#ifdef RSS - /* - * If we're using RSS, then RSS determines the number of connection - * groups to use: one connection group per RSS bucket. If for some - * reason RSS isn't able to provide a number of buckets, disable - * connection groups entirely. - * - * XXXRW: Can this ever happen? - */ - numpcbgroups = rss_getnumbuckets(); - if (numpcbgroups == 0) - return; -#else - /* - * Otherwise, we'll just use one per CPU for now. If we decide to - * do dynamic rebalancing a la RSS, we'll need similar logic here. - */ - numpcbgroups = mp_ncpus; -#endif - - pcbinfo->ipi_hashfields = hashfields; - pcbinfo->ipi_pcbgroups = malloc(numpcbgroups * - sizeof(*pcbinfo->ipi_pcbgroups), M_PCB, M_WAITOK | M_ZERO); - pcbinfo->ipi_npcbgroups = numpcbgroups; - pcbinfo->ipi_wildbase = hashinit(hash_nelements, M_PCB, - &pcbinfo->ipi_wildmask); - for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { - pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; - pcbgroup->ipg_hashbase = hashinit(hash_nelements, M_PCB, - &pcbgroup->ipg_hashmask); - INP_GROUP_LOCK_INIT(pcbgroup, "pcbgroup"); - - /* - * Initialise notional affinity of the pcbgroup -- for RSS, - * we want the same notion of affinity as NICs to be used. In - * the non-RSS case, just round robin for the time being. - * - * XXXRW: The notion of a bucket to CPU mapping is common at - * both pcbgroup and RSS layers -- does that mean that we - * should migrate it all from RSS to here, and just leave RSS - * responsible only for providing hashing and mapping functions? 
- */ -#ifdef RSS - pcbgroup->ipg_cpu = rss_getcpu(pgn); -#else - pcbgroup->ipg_cpu = (pgn % mp_ncpus); -#endif - } -} - -void -in_pcbgroup_destroy(struct inpcbinfo *pcbinfo) -{ - struct inpcbgroup *pcbgroup; - u_int pgn; - - if (pcbinfo->ipi_npcbgroups == 0) - return; - - for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) { - pcbgroup = &pcbinfo->ipi_pcbgroups[pgn]; - KASSERT(CK_LIST_EMPTY(pcbinfo->ipi_listhead), - ("in_pcbinfo_destroy: listhead not empty")); - INP_GROUP_LOCK_DESTROY(pcbgroup); - hashdestroy(pcbgroup->ipg_hashbase, M_PCB, - pcbgroup->ipg_hashmask); - } - hashdestroy(pcbinfo->ipi_wildbase, M_PCB, pcbinfo->ipi_wildmask); - free(pcbinfo->ipi_pcbgroups, M_PCB); - pcbinfo->ipi_pcbgroups = NULL; - pcbinfo->ipi_npcbgroups = 0; - pcbinfo->ipi_hashfields = 0; -} - -/* - * Given a hash of whatever the covered tuple might be, return a pcbgroup - * index. Where RSS is supported, try to align bucket selection with RSS CPU - * affinity strategy. - */ -static __inline u_int -in_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash) -{ - -#ifdef RSS - return (rss_getbucket(hash)); -#else - return (hash % pcbinfo->ipi_npcbgroups); -#endif -} - -/* - * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash - * information is insufficient to identify the pcbgroup. This might occur if - * a TCP packet turns up with a 2-tuple hash, or if an RSS hash is present but - * RSS is not compiled into the kernel. 
- */ -struct inpcbgroup * -in_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash) -{ - -#ifdef RSS - if ((pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE && - hashtype == M_HASHTYPE_RSS_TCP_IPV4) || - (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE && - hashtype == M_HASHTYPE_RSS_UDP_IPV4) || - (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_2TUPLE && - hashtype == M_HASHTYPE_RSS_IPV4)) - return (&pcbinfo->ipi_pcbgroups[ - in_pcbgroup_getbucket(pcbinfo, hash)]); -#endif - return (NULL); -} - -static struct inpcbgroup * -in_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m) -{ - - return (in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), - m->m_pkthdr.flowid)); -} - -struct inpcbgroup * -in_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr, - u_short lport, struct in_addr faddr, u_short fport) -{ - uint32_t hash; - - /* - * RSS note: we pass foreign addr/port as source, and local addr/port - * as destination, as we want to align with what the hardware is - * doing. - */ - switch (pcbinfo->ipi_hashfields) { - case IPI_HASHFIELDS_4TUPLE: -#ifdef RSS - hash = rss_hash_ip4_4tuple(faddr, fport, laddr, lport); -#else - hash = faddr.s_addr ^ fport; -#endif - break; - - case IPI_HASHFIELDS_2TUPLE: -#ifdef RSS - hash = rss_hash_ip4_2tuple(faddr, laddr); -#else - hash = faddr.s_addr ^ laddr.s_addr; -#endif - break; - - default: - hash = 0; - } - return (&pcbinfo->ipi_pcbgroups[in_pcbgroup_getbucket(pcbinfo, - hash)]); -} - -struct inpcbgroup * -in_pcbgroup_byinpcb(struct inpcb *inp) -{ -#ifdef RSS - /* - * Listen sockets with INP_RSS_BUCKET_SET set have a pre-determined - * RSS bucket and thus we should use this pcbgroup, rather than - * using a tuple or hash. - * - * XXX should verify that there's actually pcbgroups and inp_rss_listen_bucket - * fits in that! 
- */ - if (inp->inp_flags2 & INP_RSS_BUCKET_SET) - return (&inp->inp_pcbinfo->ipi_pcbgroups[inp->inp_rss_listen_bucket]); -#endif - - return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr, - inp->inp_lport, inp->inp_faddr, inp->inp_fport)); -} - -static void -in_pcbwild_add(struct inpcb *inp) -{ - struct inpcbinfo *pcbinfo; - struct inpcbhead *head; - u_int pgn; - - INP_WLOCK_ASSERT(inp); - KASSERT(!(inp->inp_flags2 & INP_PCBGROUPWILD), - ("%s: is wild",__func__)); - - pcbinfo = inp->inp_pcbinfo; - for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) - INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]); - head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, inp->inp_lport, - 0, pcbinfo->ipi_wildmask)]; - CK_LIST_INSERT_HEAD(head, inp, inp_pcbgroup_wild); - inp->inp_flags2 |= INP_PCBGROUPWILD; - for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) *** 740 LINES SKIPPED ***