svn commit: r261601 - in head: sys/net sys/netinet sys/netinet6 usr.bin/netstat
Gleb Smirnoff
glebius at FreeBSD.org
Fri Feb 7 15:18:23 UTC 2014
Author: glebius
Date: Fri Feb 7 15:18:23 2014
New Revision: 261601
URL: http://svnweb.freebsd.org/changeset/base/261601
Log:
o Revamp API between flowtable and netinet, netinet6.
- ip_output() and ip_output6() simply call flowtable_lookup(),
passing mbuf and address family. That's the only code under
#ifdef FLOWTABLE in the protocols code now.
o Revamp statistics gathering and export.
- Remove hand made pcpu stats, and utilize counter(9).
- Snapshot of statistics is available via 'netstat -rs'.
- All sysctls are moved into net.flowtable namespace, since
spreading them over net.inet isn't correct.
o Properly separate at compile time INET and INET6 parts.
o General cleanup.
- Remove chain of multiple flowtables. We simply have one for
IPv4 and one for IPv6.
- Flowtables are allocated in flowtable.c, symbols are static.
- With proper argument to SYSINIT() we no longer need flowtable_ready.
- Hash salt doesn't need to be per-VNET.
- Removed rudimentary debugging, which was quite useless in the dtrace era.
The runtime behavior of flowtable shouldn't be changed by this commit.
Sponsored by: Netflix
Sponsored by: Nginx, Inc.
Added:
head/usr.bin/netstat/flowtable.c (contents, props changed)
Modified:
head/sys/net/flowtable.c
head/sys/net/flowtable.h
head/sys/net/route.c
head/sys/netinet/ip_input.c
head/sys/netinet/ip_output.c
head/sys/netinet6/in6_proto.c
head/sys/netinet6/ip6_input.c
head/sys/netinet6/ip6_output.c
head/usr.bin/netstat/Makefile
head/usr.bin/netstat/main.c
head/usr.bin/netstat/netstat.h
Modified: head/sys/net/flowtable.c
==============================================================================
--- head/sys/net/flowtable.c Fri Feb 7 15:10:24 2014 (r261600)
+++ head/sys/net/flowtable.c Fri Feb 7 15:18:23 2014 (r261601)
@@ -146,23 +146,13 @@ union flentryp {
struct flentry **pcpu[MAXCPU];
};
-struct flowtable_stats {
- uint64_t ft_collisions;
- uint64_t ft_allocated;
- uint64_t ft_misses;
- uint64_t ft_max_depth;
- uint64_t ft_free_checks;
- uint64_t ft_frees;
- uint64_t ft_hits;
- uint64_t ft_lookups;
-} __aligned(CACHE_LINE_SIZE);
-
struct flowtable {
- struct flowtable_stats ft_stats[MAXCPU];
+ counter_u64_t *ft_stat;
+ uma_zone_t ft_zone;
int ft_size;
int ft_lock_count;
uint32_t ft_flags;
- char *ft_name;
+ uint32_t ft_max_depth;
fl_lock_t *ft_lock;
fl_lock_t *ft_unlock;
fl_rtalloc_t *ft_rtalloc;
@@ -173,9 +163,7 @@ struct flowtable {
union flentryp ft_table;
bitstr_t *ft_masks[MAXCPU];
bitstr_t *ft_tmpmask;
- struct flowtable *ft_next;
- uint32_t ft_count __aligned(CACHE_LINE_SIZE);
uint32_t ft_udp_idle __aligned(CACHE_LINE_SIZE);
uint32_t ft_fin_wait_idle;
uint32_t ft_syn_idle;
@@ -183,17 +171,12 @@ struct flowtable {
boolean_t ft_full;
} __aligned(CACHE_LINE_SIZE);
-static struct proc *flowcleanerproc;
-static VNET_DEFINE(struct flowtable *, flow_list_head);
-static VNET_DEFINE(uint32_t, flow_hashjitter);
-static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
-static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);
-
-#define V_flow_list_head VNET(flow_list_head)
-#define V_flow_hashjitter VNET(flow_hashjitter)
-#define V_flow_ipv4_zone VNET(flow_ipv4_zone)
-#define V_flow_ipv6_zone VNET(flow_ipv6_zone)
+#define FLOWSTAT_ADD(ft, name, v) \
+ counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v))
+#define FLOWSTAT_INC(ft, name) FLOWSTAT_ADD(ft, name, 1)
+static struct proc *flowcleanerproc;
+static uint32_t flow_hashjitter;
static struct cv flowclean_f_cv;
static struct cv flowclean_c_cv;
@@ -201,24 +184,8 @@ static struct mtx flowclean_lock;
static uint32_t flowclean_cycles;
static uint32_t flowclean_freq;
-#ifdef FLOWTABLE_DEBUG
-#define FLDPRINTF(ft, flags, fmt, ...) \
-do { \
- if ((ft)->ft_flags & (flags)) \
- printf((fmt), __VA_ARGS__); \
-} while (0); \
-
-#else
-#define FLDPRINTF(ft, flags, fmt, ...)
-
-#endif
-
-
/*
* TODO:
- * - Make flowtable stats per-cpu, aggregated at sysctl call time,
- * to avoid extra cache evictions caused by incrementing a shared
- * counter
* - add sysctls to resize && flush flow tables
* - Add per flowtable sysctls for statistics and configuring timeouts
* - add saturation counter to rtentry to support per-packet load-balancing
@@ -230,148 +197,51 @@ do { \
* - support explicit connection state (currently only ad-hoc for DSR)
* - idetach() cleanup for options VIMAGE builds.
*/
-VNET_DEFINE(int, flowtable_enable) = 1;
-static VNET_DEFINE(int, flowtable_debug);
+#ifdef INET
+static VNET_DEFINE(struct flowtable, ip4_ft);
+#define V_ip4_ft VNET(ip4_ft)
+static uma_zone_t flow_ipv4_zone;
+#endif
+#ifdef INET6
+static VNET_DEFINE(struct flowtable, ip6_ft);
+#define V_ip6_ft VNET(ip6_ft)
+static uma_zone_t flow_ipv6_zone;
+#endif
+
+static VNET_DEFINE(int, flowtable_enable) = 1;
static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
-static VNET_DEFINE(int, flowtable_nmbflows);
-static VNET_DEFINE(int, flowtable_ready) = 0;
#define V_flowtable_enable VNET(flowtable_enable)
-#define V_flowtable_debug VNET(flowtable_debug)
#define V_flowtable_syn_expire VNET(flowtable_syn_expire)
#define V_flowtable_udp_expire VNET(flowtable_udp_expire)
#define V_flowtable_fin_wait_expire VNET(flowtable_fin_wait_expire)
#define V_flowtable_tcp_expire VNET(flowtable_tcp_expire)
-#define V_flowtable_nmbflows VNET(flowtable_nmbflows)
-#define V_flowtable_ready VNET(flowtable_ready)
-static SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
+static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
"flowtable");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
- &VNET_NAME(flowtable_debug), 0, "print debug info.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
+SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_RW,
&VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
/*
* XXX This does not end up updating timeouts at runtime
* and only reflects the value for the last table added :-/
*/
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
+SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
&VNET_NAME(flowtable_syn_expire), 0,
"seconds after which to remove syn allocated flow.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
+SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
&VNET_NAME(flowtable_udp_expire), 0,
"seconds after which to remove flow allocated to UDP.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
+SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
&VNET_NAME(flowtable_fin_wait_expire), 0,
"seconds after which to remove a flow in FIN_WAIT.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
+SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
&VNET_NAME(flowtable_tcp_expire), 0,
"seconds after which to remove flow allocated to a TCP connection.");
-
-/*
- * Maximum number of flows that can be allocated of a given type.
- *
- * The table is allocated at boot time (for the pure caching case
- * there is no reason why this could not be changed at runtime)
- * and thus (currently) needs to be set with a tunable.
- */
-static int
-sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
-{
- int error, newnmbflows;
-
- newnmbflows = V_flowtable_nmbflows;
- error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
- if (error == 0 && req->newptr) {
- if (newnmbflows > V_flowtable_nmbflows) {
- V_flowtable_nmbflows = newnmbflows;
- uma_zone_set_max(V_flow_ipv4_zone,
- V_flowtable_nmbflows);
- uma_zone_set_max(V_flow_ipv6_zone,
- V_flowtable_nmbflows);
- } else
- error = EINVAL;
- }
- return (error);
-}
-SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
- CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
- "Maximum number of flows allowed");
-
-
-
-#define FS_PRINT(sb, field) sbuf_printf((sb), "\t%s: %jd\n", #field, fs->ft_##field)
-
-static void
-fs_print(struct sbuf *sb, struct flowtable_stats *fs)
-{
-
- FS_PRINT(sb, collisions);
- FS_PRINT(sb, allocated);
- FS_PRINT(sb, misses);
- FS_PRINT(sb, max_depth);
- FS_PRINT(sb, free_checks);
- FS_PRINT(sb, frees);
- FS_PRINT(sb, hits);
- FS_PRINT(sb, lookups);
-}
-
-static void
-flowtable_show_stats(struct sbuf *sb, struct flowtable *ft)
-{
- int i;
- struct flowtable_stats fs, *pfs;
-
- if (ft->ft_flags & FL_PCPU) {
- bzero(&fs, sizeof(fs));
- pfs = &fs;
- CPU_FOREACH(i) {
- pfs->ft_collisions += ft->ft_stats[i].ft_collisions;
- pfs->ft_allocated += ft->ft_stats[i].ft_allocated;
- pfs->ft_misses += ft->ft_stats[i].ft_misses;
- pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks;
- pfs->ft_frees += ft->ft_stats[i].ft_frees;
- pfs->ft_hits += ft->ft_stats[i].ft_hits;
- pfs->ft_lookups += ft->ft_stats[i].ft_lookups;
- if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth)
- pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth;
- }
- } else {
- pfs = &ft->ft_stats[0];
- }
- fs_print(sb, pfs);
-}
-
-static int
-sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS)
-{
- struct flowtable *ft;
- struct sbuf *sb;
- int error;
-
- sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN);
-
- ft = V_flow_list_head;
- while (ft != NULL) {
- sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name);
- flowtable_show_stats(sb, ft);
- ft = ft->ft_next;
- }
- sbuf_finish(sb);
- error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
- sbuf_delete(sb);
-
- return (error);
-}
-SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
- NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics");
-
-
#ifndef RADIX_MPATH
static void
rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
@@ -419,12 +289,8 @@ flowtable_pcpu_unlock(struct flowtable *
#define FL_STALE (1<<8)
#define FL_OVERWRITE (1<<10)
-void
-flow_invalidate(struct flentry *fle)
-{
-
- fle->f_flags |= FL_STALE;
-}
+static struct flentry *flowtable_lookup_common(struct flowtable *,
+ struct sockaddr_storage *, struct sockaddr_storage *, struct mbuf *, int);
static __inline int
proto_to_flags(uint8_t proto)
@@ -495,8 +361,8 @@ ipv4_flow_print_tuple(int flags, int pro
#endif
static int
-ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
- struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags)
+ipv4_mbuf_demarshal(struct mbuf *m, struct sockaddr_in *ssin,
+ struct sockaddr_in *dsin, uint16_t *flags)
{
struct ip *ip;
uint8_t proto;
@@ -516,11 +382,8 @@ ipv4_mbuf_demarshal(struct flowtable *ft
ssin->sin_addr = ip->ip_src;
proto = ip->ip_p;
- if ((*flags & FL_HASH_ALL) == 0) {
- FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ",
- *flags);
+ if ((*flags & FL_HASH_ALL) == 0)
goto skipports;
- }
iphlen = ip->ip_hl << 2; /* XXX options? */
@@ -544,7 +407,6 @@ ipv4_mbuf_demarshal(struct flowtable *ft
dport = sh->dest_port;
break;
default:
- FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto);
return (ENOTSUP);
/* no port - hence not a protocol we care about */
break;
@@ -559,7 +421,7 @@ skipports:
}
static uint32_t
-ipv4_flow_lookup_hash_internal(
+ipv4_flow_lookup_hash(
struct sockaddr_in *ssin, struct sockaddr_in *dsin,
uint32_t *key, uint16_t flags)
{
@@ -567,8 +429,6 @@ ipv4_flow_lookup_hash_internal(
uint8_t proto;
int offset = 0;
- if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
- return (0);
proto = flags_to_proto(flags);
sport = dport = key[2] = key[1] = key[0] = 0;
if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
@@ -583,13 +443,13 @@ ipv4_flow_lookup_hash_internal(
((uint16_t *)key)[0] = sport;
((uint16_t *)key)[1] = dport;
} else
- offset = V_flow_hashjitter + proto;
+ offset = flow_hashjitter + proto;
return (jenkins_hash32(key, 3, offset));
}
static struct flentry *
-flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m)
+flowtable_lookup_ipv4(struct mbuf *m)
{
struct sockaddr_storage ssa, dsa;
uint16_t flags;
@@ -599,11 +459,11 @@ flowtable_lookup_mbuf4(struct flowtable
ssin = (struct sockaddr_in *)&ssa;
bzero(dsin, sizeof(*dsin));
bzero(ssin, sizeof(*ssin));
- flags = ft->ft_flags;
- if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0)
+ flags = V_ip4_ft.ft_flags;
+ if (ipv4_mbuf_demarshal(m, ssin, dsin, &flags) != 0)
return (NULL);
- return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
+ return (flowtable_lookup_common(&V_ip4_ft, &ssa, &dsa, m, flags));
}
void
@@ -644,8 +504,8 @@ do { \
#define UDP(p) ((struct udphdr *)(p))
static int
-ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
- struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags)
+ipv6_mbuf_demarshal(struct mbuf *m, struct sockaddr_in6 *ssin6,
+ struct sockaddr_in6 *dsin6, uint16_t *flags)
{
struct ip6_hdr *ip6;
uint8_t proto;
@@ -763,7 +623,7 @@ do { \
} while (0)
static uint32_t
-ipv6_flow_lookup_hash_internal(
+ipv6_flow_lookup_hash(
struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6,
uint32_t *key, uint16_t flags)
{
@@ -771,9 +631,6 @@ ipv6_flow_lookup_hash_internal(
uint8_t proto;
int offset = 0;
- if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
- return (0);
-
proto = flags_to_proto(flags);
zero_key(key);
sport = dport = 0;
@@ -789,13 +646,13 @@ ipv6_flow_lookup_hash_internal(
((uint16_t *)key)[0] = sport;
((uint16_t *)key)[1] = dport;
} else
- offset = V_flow_hashjitter + proto;
+ offset = flow_hashjitter + proto;
return (jenkins_hash32(key, 9, offset));
}
static struct flentry *
-flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m)
+flowtable_lookup_ipv6(struct mbuf *m)
{
struct sockaddr_storage ssa, dsa;
struct sockaddr_in6 *dsin6, *ssin6;
@@ -805,12 +662,12 @@ flowtable_lookup_mbuf6(struct flowtable
ssin6 = (struct sockaddr_in6 *)&ssa;
bzero(dsin6, sizeof(*dsin6));
bzero(ssin6, sizeof(*ssin6));
- flags = ft->ft_flags;
+ flags = V_ip6_ft.ft_flags;
- if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0)
+ if (ipv6_mbuf_demarshal(m, ssin6, dsin6, &flags) != 0)
return (NULL);
- return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
+ return (flowtable_lookup_common(&V_ip6_ft, &ssa, &dsa, m, flags));
}
void
@@ -910,43 +767,19 @@ flowtable_set_hashkey(struct flentry *fl
hashkey[i] = key[i];
}
-static struct flentry *
-flow_alloc(struct flowtable *ft)
-{
- struct flentry *newfle;
- uma_zone_t zone;
-
- newfle = NULL;
- zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
-
- newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO);
- if (newfle != NULL)
- atomic_add_int(&ft->ft_count, 1);
- return (newfle);
-}
-
-static void
-flow_free(struct flentry *fle, struct flowtable *ft)
-{
- uma_zone_t zone;
-
- zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
- atomic_add_int(&ft->ft_count, -1);
- uma_zfree(zone, fle);
-}
-
static int
flow_full(struct flowtable *ft)
{
boolean_t full;
- uint32_t count;
+ int count, max;
full = ft->ft_full;
- count = ft->ft_count;
+ count = uma_zone_get_cur(ft->ft_zone);
+ max = uma_zone_get_max(ft->ft_zone);
- if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3))))
+ if (full && (count < (max - (max >> 3))))
ft->ft_full = FALSE;
- else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5))))
+ else if (!full && (count > (max - (max >> 5))))
ft->ft_full = TRUE;
if (full && !ft->ft_full) {
@@ -970,12 +803,11 @@ flowtable_insert(struct flowtable *ft, u
uint32_t fibnum, struct route *ro, uint16_t flags)
{
struct flentry *fle, *fletail, *newfle, **flep;
- struct flowtable_stats *fs = &ft->ft_stats[curcpu];
int depth;
bitstr_t *mask;
uint8_t proto;
- newfle = flow_alloc(ft);
+ newfle = uma_zalloc(ft->ft_zone, M_NOWAIT | M_ZERO);
if (newfle == NULL)
return (ENOMEM);
@@ -994,7 +826,7 @@ flowtable_insert(struct flowtable *ft, u
}
depth = 0;
- fs->ft_collisions++;
+ FLOWSTAT_INC(ft, ft_collisions);
/*
* find end of list and make sure that we were not
* preempted by another thread handling this flow
@@ -1006,7 +838,7 @@ flowtable_insert(struct flowtable *ft, u
* or we lost a race to insert
*/
FL_ENTRY_UNLOCK(ft, hash);
- flow_free(newfle, ft);
+ uma_zfree(ft->ft_zone, newfle);
if (flags & FL_OVERWRITE)
goto skip;
@@ -1022,8 +854,8 @@ flowtable_insert(struct flowtable *ft, u
fle = fle->f_next;
}
- if (depth > fs->ft_max_depth)
- fs->ft_max_depth = depth;
+ if (depth > ft->ft_max_depth)
+ ft->ft_max_depth = depth;
fletail->f_next = newfle;
fle = newfle;
skip:
@@ -1039,35 +871,6 @@ skip:
return (0);
}
-int
-kern_flowtable_insert(struct flowtable *ft,
- struct sockaddr_storage *ssa, struct sockaddr_storage *dsa,
- struct route *ro, uint32_t fibnum, int flags)
-{
- uint32_t key[9], hash;
-
- flags = (ft->ft_flags | flags | FL_OVERWRITE);
- hash = 0;
-
-#ifdef INET
- if (ssa->ss_family == AF_INET)
- hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa,
- (struct sockaddr_in *)dsa, key, flags);
-#endif
-#ifdef INET6
- if (ssa->ss_family == AF_INET6)
- hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa,
- (struct sockaddr_in6 *)dsa, key, flags);
-#endif
- if (ro->ro_rt == NULL || ro->ro_lle == NULL)
- return (EINVAL);
-
- FLDPRINTF(ft, FL_DEBUG,
- "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n",
- key[0], key[1], key[2], hash, fibnum, flags);
- return (flowtable_insert(ft, hash, key, fibnum, ro, flags));
-}
-
static int
flowtable_key_equal(struct flentry *fle, uint32_t *key)
{
@@ -1090,49 +893,54 @@ flowtable_key_equal(struct flentry *fle,
}
struct flentry *
-flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af)
+flowtable_lookup(sa_family_t sa, struct mbuf *m)
{
- struct flentry *fle = NULL;
+ switch (sa) {
#ifdef INET
- if (af == AF_INET)
- fle = flowtable_lookup_mbuf4(ft, m);
+ case AF_INET:
+ return (flowtable_lookup_ipv4(m));
#endif
#ifdef INET6
- if (af == AF_INET6)
- fle = flowtable_lookup_mbuf6(ft, m);
-#endif
- if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) {
- m->m_flags |= M_FLOWID;
- m->m_pkthdr.flowid = fle->f_fhash;
+ case AF_INET6:
+ return (flowtable_lookup_ipv6(m));
+#endif
+ default:
+ panic("%s: sa %d", __func__, sa);
}
- return (fle);
}
-
-struct flentry *
-flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
- struct sockaddr_storage *dsa, uint32_t fibnum, int flags)
+
+static struct flentry *
+flowtable_lookup_common(struct flowtable *ft, struct sockaddr_storage *ssa,
+ struct sockaddr_storage *dsa, struct mbuf *m, int flags)
{
- uint32_t key[9], hash;
+ struct route_in6 sro6;
+ struct route sro, *ro;
struct flentry *fle;
- struct flowtable_stats *fs = &ft->ft_stats[curcpu];
- uint8_t proto = 0;
- int error = 0;
struct rtentry *rt;
struct llentry *lle;
- struct route sro, *ro;
- struct route_in6 sro6;
+ struct sockaddr_storage *l3addr;
+ struct ifnet *ifp;
+ uint32_t key[9], hash, fibnum;
+ uint8_t proto;
+
+ if (V_flowtable_enable == 0)
+ return (NULL);
sro.ro_rt = sro6.ro_rt = NULL;
sro.ro_lle = sro6.ro_lle = NULL;
- ro = NULL;
- hash = 0;
flags |= ft->ft_flags;
proto = flags_to_proto(flags);
+ fibnum = M_GETFIB(m);
+
+ switch (ssa->ss_family) {
#ifdef INET
- if (ssa->ss_family == AF_INET) {
+ case AF_INET: {
struct sockaddr_in *ssin, *dsin;
+ KASSERT(dsa->ss_family == AF_INET,
+ ("%s: dsa family %d\n", __func__, dsa->ss_family));
+
ro = &sro;
memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
/*
@@ -1150,13 +958,17 @@ flowtable_lookup(struct flowtable *ft, s
(ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
return (NULL);
- hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags);
+ hash = ipv4_flow_lookup_hash(ssin, dsin, key, flags);
+ break;
}
#endif
#ifdef INET6
- if (ssa->ss_family == AF_INET6) {
+ case AF_INET6: {
struct sockaddr_in6 *ssin6, *dsin6;
+ KASSERT(dsa->ss_family == AF_INET6,
+ ("%s: dsa family %d\n", __func__, dsa->ss_family));
+
ro = (struct route *)&sro6;
memcpy(&sro6.ro_dst, dsa,
sizeof(struct sockaddr_in6));
@@ -1165,19 +977,24 @@ flowtable_lookup(struct flowtable *ft, s
ssin6 = (struct sockaddr_in6 *)ssa;
flags |= FL_IPV6;
- hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags);
+ hash = ipv6_flow_lookup_hash(ssin6, dsin6, key, flags);
+ break;
}
#endif
+ default:
+ panic("%s: ssa family %d", __func__, ssa->ss_family);
+ }
+
/*
* Ports are zero and this isn't a transmit cache
* - thus not a protocol for which we need to keep
* state
* FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP
*/
- if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL)))
+ if (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL))
return (NULL);
- fs->ft_lookups++;
+ FLOWSTAT_INC(ft, ft_lookups);
FL_ENTRY_LOCK(ft, hash);
if ((fle = FL_ENTRY(ft, hash)) == NULL) {
FL_ENTRY_UNLOCK(ft, hash);
@@ -1195,11 +1012,11 @@ keycheck:
&& (rt->rt_flags & RTF_UP)
&& (rt->rt_ifp != NULL)
&& (lle->la_flags & LLE_VALID)) {
- fs->ft_hits++;
+ FLOWSTAT_INC(ft, ft_hits);
fle->f_uptime = time_uptime;
fle->f_flags |= flags;
FL_ENTRY_UNLOCK(ft, hash);
- return (fle);
+ goto success;
} else if (fle->f_next != NULL) {
fle = fle->f_next;
goto keycheck;
@@ -1209,7 +1026,7 @@ uncached:
if (flags & FL_NOAUTO || flow_full(ft))
return (NULL);
- fs->ft_misses++;
+ FLOWSTAT_INC(ft, ft_misses);
/*
* This bit of code ends up locking the
* same route 3 times (just like ip_output + ether_output)
@@ -1222,73 +1039,66 @@ uncached:
* receive the route locked
*/
-#ifdef INVARIANTS
- if ((ro->ro_dst.sa_family != AF_INET) &&
- (ro->ro_dst.sa_family != AF_INET6))
- panic("sa_family == %d\n", ro->ro_dst.sa_family);
-#endif
-
ft->ft_rtalloc(ro, hash, fibnum);
if (ro->ro_rt == NULL)
- error = ENETUNREACH;
- else {
- struct llentry *lle = NULL;
- struct sockaddr_storage *l3addr;
- struct rtentry *rt = ro->ro_rt;
- struct ifnet *ifp = rt->rt_ifp;
+ return (NULL);
- if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
- RTFREE(rt);
- ro->ro_rt = NULL;
- return (NULL);
- }
-#ifdef INET6
- if (ssa->ss_family == AF_INET6) {
- struct sockaddr_in6 *dsin6;
+ rt = ro->ro_rt;
+ ifp = rt->rt_ifp;
- dsin6 = (struct sockaddr_in6 *)dsa;
- if (in6_localaddr(&dsin6->sin6_addr)) {
- RTFREE(rt);
- ro->ro_rt = NULL;
- return (NULL);
- }
+ if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
+ RTFREE(rt);
+ return (NULL);
+ }
- if (rt->rt_flags & RTF_GATEWAY)
- l3addr = (struct sockaddr_storage *)rt->rt_gateway;
-
- else
- l3addr = (struct sockaddr_storage *)&ro->ro_dst;
- lle = llentry_alloc(ifp, LLTABLE6(ifp), l3addr);
- }
-#endif
+ switch (ssa->ss_family) {
#ifdef INET
- if (ssa->ss_family == AF_INET) {
- if (rt->rt_flags & RTF_GATEWAY)
- l3addr = (struct sockaddr_storage *)rt->rt_gateway;
- else
- l3addr = (struct sockaddr_storage *)&ro->ro_dst;
- lle = llentry_alloc(ifp, LLTABLE(ifp), l3addr);
- }
-
+ case AF_INET:
+ if (rt->rt_flags & RTF_GATEWAY)
+ l3addr = (struct sockaddr_storage *)rt->rt_gateway;
+ else
+ l3addr = (struct sockaddr_storage *)&ro->ro_dst;
+ lle = llentry_alloc(ifp, LLTABLE(ifp), l3addr);
+ break;
#endif
- ro->ro_lle = lle;
+#ifdef INET6
+ case AF_INET6: {
+ struct sockaddr_in6 *dsin6;
- if (lle == NULL) {
+ dsin6 = (struct sockaddr_in6 *)dsa;
+ if (in6_localaddr(&dsin6->sin6_addr)) {
RTFREE(rt);
- ro->ro_rt = NULL;
- return (NULL);
+ return (NULL);
}
- error = flowtable_insert(ft, hash, key, fibnum, ro, flags);
- if (error) {
- RTFREE(rt);
- LLE_FREE(lle);
- ro->ro_rt = NULL;
- ro->ro_lle = NULL;
- }
+ if (rt->rt_flags & RTF_GATEWAY)
+ l3addr = (struct sockaddr_storage *)rt->rt_gateway;
+ else
+ l3addr = (struct sockaddr_storage *)&ro->ro_dst;
+ lle = llentry_alloc(ifp, LLTABLE6(ifp), l3addr);
+ break;
+ }
+#endif
+ }
+
+ if (lle == NULL) {
+ RTFREE(rt);
+ return (NULL);
+ }
+ ro->ro_lle = lle;
+
+ if (flowtable_insert(ft, hash, key, fibnum, ro, flags) != 0) {
+ RTFREE(rt);
+ LLE_FREE(lle);
+ return (NULL);
}
- return ((error) ? NULL : fle);
+success:
+ if (fle != NULL && (m->m_flags & M_FLOWID) == 0) {
+ m->m_flags |= M_FLOWID;
+ m->m_pkthdr.flowid = fle->f_fhash;
+ }
+ return (fle);
}
/*
@@ -1296,37 +1106,24 @@ uncached:
*/
#define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
-struct flowtable *
-flowtable_alloc(char *name, int nentry, int flags)
+static void
+flowtable_alloc(struct flowtable *ft)
{
- struct flowtable *ft, *fttail;
- int i;
-
- if (V_flow_hashjitter == 0)
- V_flow_hashjitter = arc4random();
- KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));
-
- ft = malloc(sizeof(struct flowtable),
- M_RTABLE, M_WAITOK | M_ZERO);
-
- ft->ft_name = name;
- ft->ft_flags = flags;
- ft->ft_size = nentry;
#ifdef RADIX_MPATH
ft->ft_rtalloc = rtalloc_mpath_fib;
#else
ft->ft_rtalloc = rtalloc_ign_wrapper;
#endif
- if (flags & FL_PCPU) {
+ if (ft->ft_flags & FL_PCPU) {
ft->ft_lock = flowtable_pcpu_lock;
ft->ft_unlock = flowtable_pcpu_unlock;
- for (i = 0; i <= mp_maxid; i++) {
+ for (int i = 0; i <= mp_maxid; i++) {
ft->ft_table.pcpu[i] =
- malloc(nentry*sizeof(struct flentry *),
+ malloc(ft->ft_size * sizeof(struct flentry *),
M_RTABLE, M_WAITOK | M_ZERO);
- ft->ft_masks[i] = bit_alloc(nentry);
+ ft->ft_masks[i] = bit_alloc(ft->ft_size);
}
} else {
ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
@@ -1335,23 +1132,24 @@ flowtable_alloc(char *name, int nentry,
ft->ft_lock = flowtable_global_lock;
ft->ft_unlock = flowtable_global_unlock;
ft->ft_table.global =
- malloc(nentry*sizeof(struct flentry *),
+ malloc(ft->ft_size * sizeof(struct flentry *),
M_RTABLE, M_WAITOK | M_ZERO);
ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
M_RTABLE, M_WAITOK | M_ZERO);
- for (i = 0; i < ft->ft_lock_count; i++)
- mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);
+ for (int i = 0; i < ft->ft_lock_count; i++)
+ mtx_init(&ft->ft_locks[i], "flow", NULL,
+ MTX_DEF | MTX_DUPOK);
- ft->ft_masks[0] = bit_alloc(nentry);
+ ft->ft_masks[0] = bit_alloc(ft->ft_size);
}
- ft->ft_tmpmask = bit_alloc(nentry);
+ ft->ft_tmpmask = bit_alloc(ft->ft_size);
/*
* In the local transmit case the table truly is
* just a cache - so everything is eligible for
* replacement after 5s of non-use
*/
- if (flags & FL_HASH_ALL) {
+ if (ft->ft_flags & FL_HASH_ALL) {
ft->ft_udp_idle = V_flowtable_udp_expire;
ft->ft_syn_idle = V_flowtable_syn_expire;
ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
@@ -1361,20 +1159,6 @@ flowtable_alloc(char *name, int nentry,
ft->ft_syn_idle = ft->ft_tcp_idle = 30;
}
-
- /*
- * hook in to the cleaner list
- */
- if (V_flow_list_head == NULL)
- V_flow_list_head = ft;
- else {
- fttail = V_flow_list_head;
- while (fttail->ft_next != NULL)
- fttail = fttail->ft_next;
- fttail->ft_next = ft;
- }
-
- return (ft);
}
/*
@@ -1395,17 +1179,16 @@ fle_free(struct flentry *fle, struct flo
RTFREE(rt);
if (lle != NULL)
LLE_FREE(lle);
- flow_free(fle, ft);
+ uma_zfree(ft->ft_zone, fle);
}
static void
flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
{
- int curbit = 0, count, tmpsize;
+ int curbit = 0, tmpsize;
struct flentry *fle, **flehead, *fleprev;
struct flentry *flefreehead, *flefreetail, *fletmp;
bitstr_t *mask, *tmpmask;
- struct flowtable_stats *fs = &ft->ft_stats[curcpu];
flefreehead = flefreetail = NULL;
mask = flowtable_mask(ft);
@@ -1429,7 +1212,7 @@ flowtable_free_stale(struct flowtable *f
flehead = flowtable_entry(ft, curbit);
fle = fleprev = *flehead;
- fs->ft_free_checks++;
+ FLOWSTAT_INC(ft, ft_free_checks);
#ifdef DIAGNOSTIC
if (fle == NULL && curbit > 0) {
log(LOG_ALERT,
@@ -1484,22 +1267,34 @@ flowtable_free_stale(struct flowtable *f
tmpsize -= (curbit / 8) * 8;
bit_ffs(tmpmask, tmpsize, &curbit);
}
- count = 0;
while ((fle = flefreehead) != NULL) {
flefreehead = fle->f_next;
- count++;
- fs->ft_frees++;
+ FLOWSTAT_INC(ft, ft_frees);
fle_free(fle, ft);
}
- if (V_flowtable_debug && count)
- log(LOG_DEBUG, "freed %d flow entries\n", count);
}
void
-flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
+flowtable_route_flush(sa_family_t sa, struct rtentry *rt)
{
+ struct flowtable *ft;
int i;
+ switch (sa) {
+#ifdef INET
+ case AF_INET:
+ ft = &V_ip4_ft;
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ ft = &V_ip6_ft;
+ break;
+#endif
+ default:
+ panic("%s: sa %d", __func__, sa);
+ }
+
if (ft->ft_flags & FL_PCPU) {
CPU_FOREACH(i) {
if (smp_started == 1) {
@@ -1522,34 +1317,29 @@ flowtable_route_flush(struct flowtable *
}
static void
-flowtable_clean_vnet(void)
+flowtable_clean_vnet(struct flowtable *ft)
{
- struct flowtable *ft;
- int i;
-
- ft = V_flow_list_head;
- while (ft != NULL) {
- if (ft->ft_flags & FL_PCPU) {
- CPU_FOREACH(i) {
- if (smp_started == 1) {
- thread_lock(curthread);
- sched_bind(curthread, i);
- thread_unlock(curthread);
- }
- flowtable_free_stale(ft, NULL);
+ if (ft->ft_flags & FL_PCPU) {
+ int i;
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-all
mailing list