svn commit: r186686 - projects/pnet/sys/netinet
Robert Watson
rwatson at FreeBSD.org
Thu Jan 1 22:11:45 UTC 2009
Author: rwatson
Date: Thu Jan 1 22:11:44 2009
New Revision: 186686
URL: http://svn.freebsd.org/changeset/base/186686
Log:
Add IP_SUBSET patch to pnet branch: the IP_SUBSET socket option allows
identically bound UDP sockets to balance load between them using various
strategies, including random assignment, flow-based assignment,
CPU-based assignment, and kernel thread ID-based assignment.
UDP applications, such as BIND, memcached, etc, can create multiple
sockets, each with SO_REUSEPORT set, followed by specifying their index
among a set of matching sockets all servicing the same port number.
Modified:
projects/pnet/sys/netinet/in.h
projects/pnet/sys/netinet/in_pcb.c
projects/pnet/sys/netinet/in_pcb.h
projects/pnet/sys/netinet/in_proto.c
projects/pnet/sys/netinet/udp_usrreq.c
projects/pnet/sys/netinet/udp_var.h
Modified: projects/pnet/sys/netinet/in.h
==============================================================================
--- projects/pnet/sys/netinet/in.h Thu Jan 1 20:47:09 2009 (r186685)
+++ projects/pnet/sys/netinet/in.h Thu Jan 1 22:11:44 2009 (r186686)
@@ -486,6 +486,21 @@ __END_DECLS
#define MCAST_BLOCK_SOURCE 84 /* block a source */
#define MCAST_UNBLOCK_SOURCE 85 /* unblock a source */
+/* Binding subsets. */
+#define IP_SUBSET 86 /* get/set binding subset */
+
+/*
+ * Argument structure for the IP_SUBSET socket option.  For the flow and
+ * random strategies, setting the option is rejected with EINVAL when
+ * is_count is zero or is_member is not less than is_count.
+ */
+struct ip_subset {
+ u_int is_strategy; /* one of the IP_SUBSET_STRATEGY_* values */
+ u_int is_count; /* modulus applied to the per-datagram key */
+ u_int is_member; /* residue this socket accepts */
+};
+
+#define IP_SUBSET_STRATEGY_DISABLED 0
+#define IP_SUBSET_STRATEGY_FLOW 1
+#define IP_SUBSET_STRATEGY_RANDOM 2
+#define IP_SUBSET_STRATEGY_THREADID 3
+#define IP_SUBSET_STRATEGY_CPU 4
+
/*
* Defaults and limits for options
*/
Modified: projects/pnet/sys/netinet/in_pcb.c
==============================================================================
--- projects/pnet/sys/netinet/in_pcb.c Thu Jan 1 20:47:09 2009 (r186685)
+++ projects/pnet/sys/netinet/in_pcb.c Thu Jan 1 22:11:44 2009 (r186686)
@@ -204,6 +204,7 @@ in_pcballoc(struct socket *so, struct in
inp->inp_socket = so;
inp->inp_cred = crhold(so->so_cred);
inp->inp_inc.inc_fibnum = so->so_fibnum;
+ inp->inp_subset_strategy = IP_SUBSET_STRATEGY_DISABLED;
#ifdef MAC
error = mac_inpcb_init(inp, M_NOWAIT);
if (error != 0)
@@ -1284,12 +1285,114 @@ in_pcblookup_local(struct inpcbinfo *pcb
#undef INP_LOOKUP_MAPPED_PCB_COST
/*
+ * Implement various subsetting strategies: determine whether a particular
+ * inpcb, implementing a particular strategy, matches the passed tuple or
+ * not.
+ *
+ * Each strategy derives an unsigned key, either from the datagram or from
+ * the context processing it; the inpcb matches when
+ * (key % inp_subset_count) == inp_subset_member.  Returns 1 on a match and
+ * 0 otherwise.  The lookup code only calls this for inpcbs whose strategy
+ * is not IP_SUBSET_STRATEGY_DISABLED; a strategy that is not handled below
+ * panics.  inp_subset_count must be non-zero for any enabled strategy, as
+ * it is used as a modulus.
+ */
+static int
+in_subset_match(struct inpcb *inp, struct in_addr faddr, u_short fport,
+    struct in_addr laddr, u_short lport, u_short ip_id, u_int32_t flowid)
+{
+	u_int32_t key;
+
+	switch (inp->inp_subset_strategy) {
+	case IP_SUBSET_STRATEGY_FLOW:
+		/*
+		 * If the packet has a flow tag, use that, but otherwise,
+		 * calculate our own flow tag using the IP/port tuple.
+		 *
+		 * XXXRW: This hash is not the hash that you are looking
+		 * for.
+		 */
+		if (flowid != 0)
+			key = flowid;
+		else
+			key = faddr.s_addr ^ laddr.s_addr ^ fport ^ lport;
+		break;
+
+	case IP_SUBSET_STRATEGY_RANDOM:
+		/*
+		 * If there is a flow tag, use that and the IP ID as a source
+		 * of entropy.  Otherwise, calculate our own flow tag as
+		 * above and combine with the IP ID.
+		 *
+		 * XXXRW: This hash is also not the hash that you are looking
+		 * for.
+		 */
+		if (flowid != 0)
+			key = flowid ^ ip_id;
+		else
+			key = faddr.s_addr ^ laddr.s_addr ^ fport ^ lport ^
+			    ip_id;
+		break;
+
+	case IP_SUBSET_STRATEGY_THREADID:
+		/*
+		 * Experiment: pick the socket to use based on the kernel
+		 * thread ID processing the packet.  This will be fixed for
+		 * particular RSS input queues, so will assign work to a
+		 * particular socket based on which input queue it came from.
+		 * This doesn't attempt to balance the work at all, simply
+		 * ensure that datagrams local to a particular CPU are
+		 * assigned to the same socket consistently.
+		 */
+		key = curthread->td_tid;
+		break;
+
+	case IP_SUBSET_STRATEGY_CPU:
+		/*
+		 * Experimental: packets from the same CPU will always get
+		 * assigned to the same socket.  Doesn't attempt to load
+		 * balance or maintain ordering, as source threads may not
+		 * always be on the same CPU.  However, may achieve a more
+		 * even or predictable balance than
+		 * IP_SUBSET_STRATEGY_THREADID.
+		 *
+		 * This might be quite a bit more interesting if sockets had
+		 * a formal affinity themselves, as then we could direct
+		 * datagrams to that explicitly.
+		 */
+		key = curcpu;
+		break;
+
+	/* case IP_SUBSET_STRATEGY_FILLSOCK: */
+		/*
+		 * In this theoretical mode, we attempt to fill sockets in
+		 * the order they are matched, and don't move onto the next
+		 * socket unless the previous one is filled.  This requires
+		 * us to peek up a layer and see if there is room for the
+		 * current datagram; this proves somewhat tricky as we need
+		 * to make sure we don't return ICMP when the last one proves
+		 * full, so we don't try to do that yet.
+		 */
+
+	default:
+		/* inp_subset_strategy is a u_int, so print it unsigned. */
+		panic("in_subset_match: strategy %u",
+		    inp->inp_subset_strategy);
+	}
+
+	/* One membership test shared by every strategy. */
+	return ((key % inp->inp_subset_count) == inp->inp_subset_member);
+}
+
+/*
+
+/*
* Lookup PCB in hash list.
*/
struct inpcb *
-in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
- u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
- struct ifnet *ifp)
+in_pcblookup_hash_full(struct inpcbinfo *pcbinfo, struct in_addr faddr,
+ u_int fport_arg, struct in_addr laddr, u_int lport_arg, u_short ip_id,
+ u_int32_t flowid, int wildcard, struct ifnet *ifp)
{
struct inpcbhead *head;
struct inpcb *inp, *tmpinp;
@@ -1309,20 +1412,25 @@ in_pcblookup_hash(struct inpcbinfo *pcbi
if ((inp->inp_vflag & INP_IPV4) == 0)
continue;
#endif
- if (inp->inp_faddr.s_addr == faddr.s_addr &&
- inp->inp_laddr.s_addr == laddr.s_addr &&
- inp->inp_fport == fport &&
- inp->inp_lport == lport) {
- /*
- * XXX We should be able to directly return
- * the inp here, without any checks.
- * Well unless both bound with SO_REUSEPORT?
- */
- if (jailed(inp->inp_cred))
- return (inp);
- if (tmpinp == NULL)
- tmpinp = inp;
- }
+ if (inp->inp_faddr.s_addr != faddr.s_addr ||
+ inp->inp_laddr.s_addr != laddr.s_addr ||
+ inp->inp_fport != fport ||
+ inp->inp_lport != lport)
+ continue;
+ if (inp->inp_subset_strategy != IP_SUBSET_STRATEGY_DISABLED
+ && !in_subset_match(inp, faddr, fport, laddr, lport,
+ ip_id, flowid))
+ continue;
+
+ /*
+ * XXX We should be able to directly return
+ * the inp here, without any checks.
+ * Well unless both bound with SO_REUSEPORT?
+ */
+ if (jailed(inp->inp_cred))
+ return (inp);
+ if (tmpinp == NULL)
+ tmpinp = inp;
}
if (tmpinp != NULL)
return (tmpinp);
@@ -1372,6 +1480,12 @@ in_pcblookup_hash(struct inpcbinfo *pcbi
continue;
}
+ if (inp->inp_subset_strategy !=
+ IP_SUBSET_STRATEGY_DISABLED &&
+ !in_subset_match(inp, faddr, fport, laddr, lport,
+ ip_id, flowid))
+ continue;
+
if (inp->inp_laddr.s_addr == laddr.s_addr) {
if (injail)
return (inp);
@@ -1405,6 +1519,16 @@ in_pcblookup_hash(struct inpcbinfo *pcbi
return (NULL);
}
+/*
+ * Lookup PCB in hash list for callers that have no per-packet IP ID or
+ * flow ID to offer: forwards to in_pcblookup_hash_full() with both values
+ * passed as 0 and every other argument unchanged.
+ */
+struct inpcb *
+in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
+ u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
+ struct ifnet *ifp)
+{
+
+ return (in_pcblookup_hash_full(pcbinfo, faddr, fport_arg, laddr,
+ lport_arg, 0, 0, wildcard, ifp));
+}
+
/*
* Insert PCB onto various hash lists.
*/
Modified: projects/pnet/sys/netinet/in_pcb.h
==============================================================================
--- projects/pnet/sys/netinet/in_pcb.h Thu Jan 1 20:47:09 2009 (r186685)
+++ projects/pnet/sys/netinet/in_pcb.h Thu Jan 1 22:11:44 2009 (r186686)
@@ -199,6 +199,9 @@ struct inpcb {
} inp_depend6;
LIST_ENTRY(inpcb) inp_portlist; /* (i/p) */
struct inpcbport *inp_phd; /* (i/p) head of this list */
+ u_int inp_subset_strategy;
+ u_int inp_subset_count;
+ u_int inp_subset_member;
#define inp_zero_size offsetof(struct inpcb, inp_gencnt)
inp_gen_t inp_gencnt; /* (c) generation count */
struct rwlock inp_lock;
@@ -493,6 +496,11 @@ struct inpcb *
struct inpcb *
in_pcblookup_hash(struct inpcbinfo *, struct in_addr, u_int,
struct in_addr, u_int, int, struct ifnet *);
+struct inpcb *
+ in_pcblookup_hash_full(struct inpcbinfo *pcbinfo,
+ struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
+ u_int lport_arg, u_short ip_id, u_int32_t flowid, int wildcard,
+ struct ifnet *ifp);
void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr,
int, struct inpcb *(*)(struct inpcb *, int));
void in_pcbref(struct inpcb *);
Modified: projects/pnet/sys/netinet/in_proto.c
==============================================================================
--- projects/pnet/sys/netinet/in_proto.c Thu Jan 1 20:47:09 2009 (r186685)
+++ projects/pnet/sys/netinet/in_proto.c Thu Jan 1 22:11:44 2009 (r186686)
@@ -124,7 +124,7 @@ struct protosw inetsw[] = {
.pr_flags = PR_ATOMIC|PR_ADDR,
.pr_input = udp_input,
.pr_ctlinput = udp_ctlinput,
- .pr_ctloutput = ip_ctloutput,
+ .pr_ctloutput = udp_ctloutput,
.pr_init = udp_init,
.pr_usrreqs = &udp_usrreqs
},
Modified: projects/pnet/sys/netinet/udp_usrreq.c
==============================================================================
--- projects/pnet/sys/netinet/udp_usrreq.c Thu Jan 1 20:47:09 2009 (r186685)
+++ projects/pnet/sys/netinet/udp_usrreq.c Thu Jan 1 22:11:44 2009 (r186686)
@@ -526,8 +526,8 @@ udp_input(struct mbuf *m, int off)
/*
* Locate pcb for datagram.
*/
- inp = in_pcblookup_hash(&V_udbinfo, ip->ip_src, uh->uh_sport,
- ip->ip_dst, uh->uh_dport, 1, ifp);
+ inp = in_pcblookup_hash_full(&V_udbinfo, ip->ip_src, uh->uh_sport,
+ ip->ip_dst, uh->uh_dport, ip->ip_id, m->m_pkthdr.flowid, 1, ifp);
if (inp == NULL) {
if (udp_log_in_vain) {
char buf[4*sizeof "123"];
@@ -621,6 +621,9 @@ udp_ctlinput(int cmd, struct sockaddr *s
*
* XXX: We never get this from ICMP, otherwise it makes an excellent
* DoS attack on machines with many connections.
+ *
+ * XXXRW: With subsetting, we should deliver this to all matching
+ * connections for the specific tuple.
*/
if (cmd == PRC_HOSTDEAD)
ip = NULL;
@@ -644,6 +647,67 @@ udp_ctlinput(int cmd, struct sockaddr *s
udp_notify);
}
+/*
+ * Socket option handler for UDP sockets.
+ *
+ * Requests whose level is not IPPROTO_UDP are handed to ip_ctloutput().
+ * At the IPPROTO_UDP level the only option handled is IP_SUBSET:
+ *
+ *   SOPT_GET copies the inpcb's current strategy, count and member out to
+ *   the caller as a struct ip_subset.
+ *
+ *   SOPT_SET copies a struct ip_subset in, validates it, and stores it in
+ *   the inpcb.  Every strategy other than IP_SUBSET_STRATEGY_DISABLED
+ *   requires is_count != 0 and is_member < is_count, since
+ *   in_subset_match() uses the count as a modulus.
+ *
+ * Returns 0 on success, EINVAL for an unknown strategy or an invalid
+ * count/member pair, the error from sooptcopyin()/sooptcopyout() if the
+ * copy fails, and ENOPROTOOPT for any other option or direction.
+ */
+int
+udp_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+	INIT_VNET_INET(so->so_vnet);
+	struct ip_subset is;
+	struct inpcb *inp;
+	int error;
+
+	inp = sotoinpcb(so);
+	KASSERT(inp != NULL, ("udp_ctloutput: inp == NULL"));
+
+	if (sopt->sopt_level != IPPROTO_UDP)
+		return (ip_ctloutput(so, sopt));
+
+	switch (sopt->sopt_dir) {
+	case SOPT_GET:
+		switch (sopt->sopt_name) {
+		case IP_SUBSET:
+			/* Zero first so no stack bytes reach userspace. */
+			bzero(&is, sizeof(is));
+			INP_RLOCK(inp);
+			is.is_strategy = inp->inp_subset_strategy;
+			is.is_count = inp->inp_subset_count;
+			is.is_member = inp->inp_subset_member;
+			INP_RUNLOCK(inp);
+			return (sooptcopyout(sopt, &is, sizeof(is)));
+		}
+		break;
+
+	case SOPT_SET:
+		switch (sopt->sopt_name) {
+		case IP_SUBSET:
+			error = sooptcopyin(sopt, &is, sizeof(is),
+			    sizeof(is));
+			if (error)
+				return (error);
+			switch (is.is_strategy) {
+			case IP_SUBSET_STRATEGY_DISABLED:
+				break;
+
+			/*
+			 * Every strategy implemented by in_subset_match()
+			 * is accepted here, under the same constraints:
+			 * a non-zero count and a member index below it.
+			 */
+			case IP_SUBSET_STRATEGY_FLOW:
+			case IP_SUBSET_STRATEGY_RANDOM:
+			case IP_SUBSET_STRATEGY_THREADID:
+			case IP_SUBSET_STRATEGY_CPU:
+				if (is.is_count == 0 ||
+				    is.is_member >= is.is_count)
+					return (EINVAL);
+				break;
+
+			default:
+				return (EINVAL);
+			}
+			INP_WLOCK(inp);
+			inp->inp_subset_strategy = is.is_strategy;
+			inp->inp_subset_count = is.is_count;
+			inp->inp_subset_member = is.is_member;
+			INP_WUNLOCK(inp);
+			return (0);
+		}
+		break;
+	}
+	return (ENOPROTOOPT);
+}
+
static int
udp_pcblist(SYSCTL_HANDLER_ARGS)
{
@@ -758,6 +822,11 @@ udp_getcred(SYSCTL_HANDLER_ARGS)
error = SYSCTL_IN(req, addrs, sizeof(addrs));
if (error)
return (error);
+
+ /*
+ * XXXRW: with IP subsetting, potentially more than one socket may
+ * match, so we just return the cred for the first one.
+ */
INP_INFO_RLOCK(&V_udbinfo);
inp = in_pcblookup_hash(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
addrs[0].sin_addr, addrs[0].sin_port, 1, NULL);
Modified: projects/pnet/sys/netinet/udp_var.h
==============================================================================
--- projects/pnet/sys/netinet/udp_var.h Thu Jan 1 20:47:09 2009 (r186685)
+++ projects/pnet/sys/netinet/udp_var.h Thu Jan 1 22:11:44 2009 (r186686)
@@ -106,6 +106,7 @@ extern u_long udp_recvspace;
extern int udp_log_in_vain;
void udp_ctlinput(int, struct sockaddr *, void *);
+int udp_ctloutput(struct socket *so, struct sockopt *sopt);
void udp_init(void);
void udp_input(struct mbuf *, int);
struct inpcb *udp_notify(struct inpcb *inp, int errno);
More information about the svn-src-projects
mailing list