FreeBSD 10G forwarding performance @Intel
Alexander V. Chernikov
melifaro at FreeBSD.org
Mon Jul 16 17:44:49 UTC 2012
On 06.07.2012 10:11, Luigi Rizzo wrote:
> On Thu, Jul 05, 2012 at 05:40:37PM +0400, Alexander V. Chernikov wrote:
>> On 04.07.2012 19:48, Luigi Rizzo wrote:
> the thing discussed a few years ago (at least the one i took out of the
> discussion) was that the counter fields in rules should hold the
> index of a per-cpu counter associated with the rule. So CTR_INC(rule->ctr)
> becomes something like pcpu->ipfw_ctrs[rule->ctr]++
> Once you create a new rule you also grab one free index from ipfw_ctrs[],
> and the same should go for dummynet counters.
Old kernel from my previous e-mails, same setup:
net.inet.ip.fw.enable=0
2.3 MPPS
net.inet.ip.fw.update_counters=0
net.inet.ip.fw.enable=1
1.93 MPPS
net.inet.ip.fw.update_counters=1
1.74 MPPS
Kernel with ipfw pcpu counters:
net.inet.ip.fw.enable=0
2.3 MPPS
net.inet.ip.fw.update_counters=0
net.inet.ip.fw.enable=1
1.93 MPPS
net.inet.ip.fw.update_counters=1
1.93 MPPS
Counters seem to be working without any (significant) overhead.
(Maybe I'm wrong somewhere?)
Additionally, I've got (from my previous pcpu attempt) a small patch
permitting ipfw to re-use the rule map allocation instead of reallocating
it on every rule addition/deletion. This saves a bit of system time:
loading 20k rules with the ipfw binary takes 5.1s of system time before
the patch and 4.1s after. A rough sketch of the idea follows; the actual
patches are attached below.
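The idea, as a rough stand-alone sketch (hypothetical names; the real
patch keeps chain->map/chain->b_map consistent under IPFW_UH_WLOCK and
defers the free until the lock is dropped):

#include <stdlib.h>

#define GRANULARITY	128
#define ROUNDUP(i)	((i) + GRANULARITY - ((i) % GRANULARITY))

struct chain {
	void	**map;		/* current rule map */
	int	n_allocated;	/* slots in map */
	void	**b_map;	/* map displaced by the last swap */
	int	b_allocated;	/* slots in b_map */
};

/* Return a map with room for n entries, reusing the backup if it fits. */
static void **
map_get(struct chain *ch, int n, int *allocated)
{

	if (ROUNDUP(n) == ch->b_allocated) {
		*allocated = ch->b_allocated;	/* no allocation needed */
		return (ch->b_map);
	}
	*allocated = ROUNDUP(n);
	return (calloc(*allocated, sizeof(void *)));
}

/* Install new_map; the displaced current map becomes the next backup. */
static void
map_swap(struct chain *ch, void **new_map, int allocated)
{
	void **old_map = ch->map;
	int old_alloc = ch->n_allocated;

	ch->map = new_map;
	ch->n_allocated = allocated;
	if (new_map != ch->b_map)
		free(ch->b_map);	/* fresh allocation: retire old backup */
	ch->b_map = old_map;
	ch->b_allocated = old_alloc;
}

int
main(void)
{
	struct chain ch = { 0 };
	int allocated;
	void **m;

	m = map_get(&ch, 10, &allocated);	/* fresh allocation */
	map_swap(&ch, m, allocated);
	m = map_get(&ch, 11, &allocated);	/* fresh again, backup empty */
	map_swap(&ch, m, allocated);
	m = map_get(&ch, 12, &allocated);	/* reuses the backup map */
	map_swap(&ch, m, allocated);
	free(ch.map);
	free(ch.b_map);
	return (0);
}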
--
WBR, Alexander
-------------- next part --------------
From 931ac84b88829fe15bb4410bd2b06614f41c99b5 Mon Sep 17 00:00:00 2001
From: "Alexander V. Chernikov" <melifaro at ipfw.ru>
Date: Mon, 9 Jul 2012 23:10:02 +0400
Subject: [PATCH 1/3] Do not allocate data buffers on each rule
addition/deletion
---
sys/netinet/ipfw/ip_fw2.c | 2 +
sys/netinet/ipfw/ip_fw_private.h | 4 ++
sys/netinet/ipfw/ip_fw_sockopt.c | 134 ++++++++++++++++++++++++++++++++++----
3 files changed, 126 insertions(+), 14 deletions(-)
diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c
index 98766f3..bf06c1e 100644
--- a/sys/netinet/ipfw/ip_fw2.c
+++ b/sys/netinet/ipfw/ip_fw2.c
@@ -2701,6 +2701,8 @@ vnet_ipfw_uninit(const void *unused)
}
if (chain->map)
free(chain->map, M_IPFW);
+ if (chain->b_map)
+ free(chain->b_map, M_IPFW);
IPFW_WUNLOCK(chain);
IPFW_UH_WUNLOCK(chain);
if (reap != NULL)
diff --git a/sys/netinet/ipfw/ip_fw_private.h b/sys/netinet/ipfw/ip_fw_private.h
index 2806741..272cef3 100644
--- a/sys/netinet/ipfw/ip_fw_private.h
+++ b/sys/netinet/ipfw/ip_fw_private.h
@@ -232,6 +232,10 @@ struct ip_fw_chain {
#endif
uint32_t id; /* ruleset id */
uint32_t gencnt; /* generation count */
+ int n_allocated; /* Size of primary map in items */
+ struct ip_fw **b_map; /* Backup map to use in swapping */
+ int b_allocated; /* Size of backup map in items */
+
};
struct sockopt; /* used by tcp_var.h */
diff --git a/sys/netinet/ipfw/ip_fw_sockopt.c b/sys/netinet/ipfw/ip_fw_sockopt.c
index a245143..4e71a11 100644
--- a/sys/netinet/ipfw/ip_fw_sockopt.c
+++ b/sys/netinet/ipfw/ip_fw_sockopt.c
@@ -97,29 +97,92 @@ ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id)
return hi;
}
+
+
+/*
+ * In order to minimize map allocations on every rule addition/deletion
+ * we use the following technique:
+ * 1) round up allocations to MAP_GRANULARITY;
+ * 2) save the previously allocated map in a special backup pointer.
+ *
+ * This permits us to do backup <-> primary map swapping
+ * without reallocations in most cases.
+ *
+ */
+
+struct map_ptrs {
+ int backup; /* 1 if backup map used */
+ int items; /* number of items allocated */
+ struct ip_fw **map;
+ struct ip_fw **free_map; /* map pointer to free */
+};
+int
+get_map(struct ip_fw_chain *chain, int extra, int locked, struct map_ptrs *ptrs);
+
+#define MAP_GRANULARITY 128 /* 1k overhead per buffer */
+#define MAP_ROUNDUP(i) ((i) + MAP_GRANULARITY - ((i) % MAP_GRANULARITY))
/*
* allocate a new map, returns the chain locked. extra is the number
* of entries to add or delete.
*/
-static struct ip_fw **
-get_map(struct ip_fw_chain *chain, int extra, int locked)
+int
+get_map(struct ip_fw_chain *chain, int extra, int locked, struct map_ptrs *ptrs)
{
for (;;) {
struct ip_fw **map;
int i;
+ /* Be optimistic by default */
+ if (!locked)
+ IPFW_UH_WLOCK(chain);
+
i = chain->n_rules + extra;
+
+ /*
+ * We have to satisfy the following conditions in order
+ * to use the backup map:
+ *
+ * 1) i <= chain->b_allocated (add case)
+ * 2) roundup(i) == chain->b_allocated (del case) (saves memory)
+ *
+ * Since roundup always adds 1..MAP_GRANULARITY,
+ * rule 2 implies rule 1.
+ */
+ CTR4(KTR_NET, "%s: req: %d roundup: %d, b_allocated: %d",
+ __func__, i, MAP_ROUNDUP(i), chain->b_allocated);
+
+ if (MAP_ROUNDUP(i) == chain->b_allocated) {
+ /* no new allocation. Just return backup map */
+ ptrs->backup = 1;
+ ptrs->items = chain->b_allocated;
+ ptrs->map = chain->b_map;
+ CTR3(KTR_NET, "%s: BACKUP SELECTED: %p items: %d", __func__, ptrs->map, ptrs->items);
+
+ return 1;
+ }
+
+ /* Pessimistic scenario - we need reallocation */
+ i = MAP_ROUNDUP(i);
+
+ if (!locked)
+ IPFW_UH_WUNLOCK(chain);
+
map = malloc(i * sizeof(struct ip_fw *), M_IPFW,
locked ? M_NOWAIT : M_WAITOK);
if (map == NULL) {
printf("%s: cannot allocate map\n", __FUNCTION__);
- return NULL;
+ return 0;
}
if (!locked)
IPFW_UH_WLOCK(chain);
- if (i >= chain->n_rules + extra) /* good */
- return map;
+ if (i >= chain->n_rules + extra) { /* good */
+ ptrs->backup = 0;
+ ptrs->items = i;
+ ptrs->map = map;
+ CTR3(KTR_NET, "%s: NEW MAP: %p items: %d", __func__, ptrs->map, ptrs->items);
+ return 1;
+ }
/* otherwise we lost the race, free and retry */
if (!locked)
IPFW_UH_WUNLOCK(chain);
@@ -128,10 +191,11 @@ get_map(struct ip_fw_chain *chain, int extra, int locked)
}
/*
- * swap the maps. It is supposed to be called with IPFW_UH_WLOCK
+ * swap the maps. It is supposed to be called with IPFW_UH_WLOCK.
+ * Returns old map pointer to free.
*/
static struct ip_fw **
-swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len)
+swap_map(struct ip_fw_chain *chain, struct map_ptrs *ptrs, int new_len)
{
struct ip_fw **old_map;
@@ -139,8 +203,43 @@ swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len)
chain->id++;
chain->n_rules = new_len;
old_map = chain->map;
- chain->map = new_map;
+ chain->map = ptrs->map;
IPFW_WUNLOCK(chain);
+
+ CTR4(KTR_NET, "%s: old: %p new: %p, backup: %d",
+ __func__, old_map, chain->map, ptrs->backup);
+
+ /* Finalize map change */
+ if (ptrs->backup) {
+ /*
+ * Old map has (at least) the same size as current map.
+ * Save it as backup map.
+ */
+ chain->b_allocated = chain->n_allocated;
+ chain->n_allocated = ptrs->items;
+ chain->b_map = old_map;
+ ptrs->free_map = NULL;
+ return old_map;
+ }
+
+ /*
+ * Backup map was not reused: schedule the old backup map
+ * for freeing and keep the displaced map as the new backup.
+ */
+
+ /* Set free pointer to old backup map */
+ ptrs->free_map = chain->b_map;
+ /* Set backup map to be old current map */
+ chain->b_map = old_map;
+
+ /* Save backup map size */
+ chain->b_allocated = chain->n_allocated;
+ chain->n_allocated = ptrs->items;
+
+ CTR6(KTR_NET, "%s: new: %p alloc: %d backup: %p alloc: %d free: %p",
+ __func__, chain->map, chain->n_allocated,
+ chain->b_map, chain->b_allocated, ptrs->free_map);
+
return old_map;
}
@@ -156,6 +255,7 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
{
struct ip_fw *rule;
int i, l, insert_before;
+ struct map_ptrs ptrs;
struct ip_fw **map; /* the new array of pointers */
if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1)
@@ -164,12 +264,13 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
l = RULESIZE(input_rule);
rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO);
/* get_map returns with IPFW_UH_WLOCK if successful */
- map = get_map(chain, 1, 0 /* not locked */);
- if (map == NULL) {
+ if (get_map(chain, 1, 0 /* not locked */, &ptrs) == 0) {
free(rule, M_IPFW);
return ENOSPC;
}
+ map = ptrs.map;
+
bcopy(input_rule, rule, l);
/* clear fields not settable from userland */
rule->x_next = NULL;
@@ -201,7 +302,8 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
}
rule->id = chain->id + 1;
- map = swap_map(chain, map, chain->n_rules + 1);
+ swap_map(chain, &ptrs, chain->n_rules + 1);
+ map = ptrs.free_map;
chain->static_len += l;
IPFW_UH_WUNLOCK(chain);
if (map)
@@ -283,6 +385,7 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
uint32_t num; /* rule number or old_set */
uint8_t cmd, new_set;
int start, end, i, ofs, n;
+ struct map_ptrs ptrs;
struct ip_fw **map = NULL;
int error = 0;
@@ -353,12 +456,13 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
}
/* We have something to delete. Allocate the new map */
- map = get_map(chain, -n, 1 /* locked */);
- if (map == NULL) {
+ if (get_map(chain, -n, 1 /* locked */, &ptrs) == 0) {
error = EINVAL;
break;
}
+ map = ptrs.map;
+
/* 1. bcopy the initial part of the map */
if (start > 0)
bcopy(chain->map, map, start * sizeof(struct ip_fw *));
@@ -372,7 +476,7 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
bcopy(chain->map + end, map + ofs,
(chain->n_rules - end) * sizeof(struct ip_fw *));
/* 4. swap the maps (under BH_LOCK) */
- map = swap_map(chain, map, chain->n_rules - n);
+ map = swap_map(chain, &ptrs, chain->n_rules - n);
/* 5. now remove the rules deleted from the old map */
for (i = start; i < end; i++) {
int l;
@@ -385,6 +489,8 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
rule->x_next = chain->reap;
chain->reap = rule;
}
+ /* Pick up the map to be freed, if any */
+ map = ptrs.free_map;
break;
/*
--
1.7.9.4
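A quick stand-alone check of the round-up arithmetic used above: with
the macro fully parenthesized, MAP_ROUNDUP() always adds between 1 and
MAP_GRANULARITY slots, so MAP_ROUNDUP(i) == b_allocated indeed implies
i <= b_allocated, which is why condition 2 in get_map() subsumes
condition 1.

#include <assert.h>

#define MAP_GRANULARITY 128
#define MAP_ROUNDUP(i)	((i) + MAP_GRANULARITY - ((i) % MAP_GRANULARITY))

int
main(void)
{

	assert(MAP_ROUNDUP(1) == 128);		/* 1 + 128 - 1 */
	assert(MAP_ROUNDUP(200) == 256);	/* 200 + 128 - 72 */
	assert(MAP_ROUNDUP(128) == 256);	/* exact multiple still grows */
	return (0);
}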
-------------- next part --------------
From baad4a88bba898baa3aa5f0dffc8a40182bcfcd6 Mon Sep 17 00:00:00 2001
From: Charlie Root <root at test15.yandex.net>
Date: Thu, 12 Jul 2012 17:19:10 +0000
Subject: [PATCH 1/2] Make accounting in ipfw optional
---
sys/netinet/ipfw/ip_fw2.c | 32 +++++++++++++++++++++-----------
1 files changed, 21 insertions(+), 11 deletions(-)
diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c
index e5a13e4..7bb73b0 100644
--- a/sys/netinet/ipfw/ip_fw2.c
+++ b/sys/netinet/ipfw/ip_fw2.c
@@ -112,6 +112,8 @@ static int default_to_accept = 1;
static int default_to_accept;
#endif
+static int update_counters = 0;
+
VNET_DEFINE(int, autoinc_step);
VNET_DEFINE(unsigned int, fw_tables_max);
@@ -190,6 +192,8 @@ SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6,
"Permit single packet IPv6 fragments");
#endif /* INET6 */
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, update_counters, CTLFLAG_RW,
+ &update_counters, 0, "Update counters on match");
SYSEND
#endif /* SYSCTL_NODE */
@@ -2011,16 +2015,20 @@ do { \
break;
case O_COUNT:
- f->pcnt++; /* update stats */
- f->bcnt += pktlen;
- f->timestamp = time_uptime;
+ if (update_counters) {
+ f->pcnt++; /* update stats */
+ f->bcnt += pktlen;
+ f->timestamp = time_uptime;
+ }
l = 0; /* exit inner loop */
break;
case O_SKIPTO:
- f->pcnt++; /* update stats */
- f->bcnt += pktlen;
- f->timestamp = time_uptime;
+ if (update_counters) {
+ f->pcnt++; /* update stats */
+ f->bcnt += pktlen;
+ f->timestamp = time_uptime;
+ }
/* If possible use cached f_pos (in f->next_rule),
* whose version is written in f->next_rule
* (horrible hacks to avoid changing the ABI).
@@ -2375,11 +2383,13 @@ do { \
} /* end of outer for, scan rules */
if (done) {
- struct ip_fw *rule = chain->map[f_pos];
- /* Update statistics */
- rule->pcnt++;
- rule->bcnt += pktlen;
- rule->timestamp = time_uptime;
+ if (update_counters) {
+ struct ip_fw *rule = chain->map[f_pos];
+ /* Update statistics */
+ rule->pcnt++;
+ rule->bcnt += pktlen;
+ rule->timestamp = time_uptime;
+ }
} else {
retval = IP_FW_DENY;
printf("ipfw: ouch!, skip past end of rules, denying packet\n");
--
1.7.6.1
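Stripped of the ipfw specifics, the change above gates all per-packet
stores behind one global flag, so a matching rule no longer dirties a
cacheline shared between CPUs while accounting is off. A stand-alone
sketch with hypothetical names:

#include <stdint.h>

static int update_counters = 0;	/* mirrors net.inet.ip.fw.update_counters */

struct rule_stats {
	uint64_t	pcnt;		/* packets */
	uint64_t	bcnt;		/* bytes */
	uint32_t	timestamp;	/* last match, seconds */
};

static inline void
account(struct rule_stats *f, uint64_t pktlen, uint32_t now)
{

	if (!update_counters)
		return;			/* fast path: one predictable branch */
	f->pcnt++;
	f->bcnt += pktlen;
	f->timestamp = now;
}

int
main(void)
{
	struct rule_stats rs = { 0, 0, 0 };

	account(&rs, 1500, 12345);	/* no-op while the knob is off */
	update_counters = 1;
	account(&rs, 1500, 12346);	/* now counted */
	return (rs.pcnt == 1 ? 0 : 1);
}

At runtime the knob is flipped with sysctl net.inet.ip.fw.update_counters,
which is exactly what the measurements at the top of this message toggle.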
-------------- next part --------------
diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h
index 589c19c..b1f83bb 100644
--- a/sys/netinet/ip_fw.h
+++ b/sys/netinet/ip_fw.h
@@ -493,9 +493,14 @@ struct ip_fw {
uint32_t id; /* rule id */
/* These fields are present in all rules. */
+#if 0
uint64_t pcnt; /* Packet counter */
uint64_t bcnt; /* Byte counter */
uint32_t timestamp; /* tv_sec of last match */
+#endif
+ uint64_t _pad0;
+ uint64_t _pad1;
+ uint32_t cntr_idx; /* Index in pcpu counters */
ipfw_insn cmd[1]; /* storage for commands */
};
diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c
index 7bb73b0..db93a5c 100644
--- a/sys/netinet/ipfw/ip_fw2.c
+++ b/sys/netinet/ipfw/ip_fw2.c
@@ -2016,18 +2016,20 @@ do { \
case O_COUNT:
if (update_counters) {
- f->pcnt++; /* update stats */
- f->bcnt += pktlen;
- f->timestamp = time_uptime;
+ struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, f->cntr_idx);
+ c->pcnt++; /* update stats */
+ c->bcnt += pktlen;
+ c->timestamp = time_uptime;
}
l = 0; /* exit inner loop */
break;
case O_SKIPTO:
if (update_counters) {
- f->pcnt++; /* update stats */
- f->bcnt += pktlen;
- f->timestamp = time_uptime;
+ struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, f->cntr_idx);
+ c->pcnt++; /* update stats */
+ c->bcnt += pktlen;
+ c->timestamp = time_uptime;
}
/* If possible use cached f_pos (in f->next_rule),
* whose version is written in f->next_rule
@@ -2125,9 +2127,10 @@ do { \
break;
}
- f->pcnt++; /* update stats */
- f->bcnt += pktlen;
- f->timestamp = time_uptime;
+ struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, f->cntr_idx);
+ c->pcnt++; /* update stats */
+ c->bcnt += pktlen;
+ c->timestamp = time_uptime;
stack = (uint16_t *)(mtag + 1);
/*
@@ -2263,9 +2266,10 @@ do { \
case O_SETFIB: {
uint32_t fib;
- f->pcnt++; /* update stats */
- f->bcnt += pktlen;
- f->timestamp = time_uptime;
+ struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, f->cntr_idx);
+ c->pcnt++; /* update stats */
+ c->bcnt += pktlen;
+ c->timestamp = time_uptime;
fib = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg:
cmd->arg1;
if (fib >= rt_numfibs)
@@ -2315,8 +2319,9 @@ do { \
case O_REASS: {
int ip_off;
- f->pcnt++;
- f->bcnt += pktlen;
+ struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, f->cntr_idx);
+ c->pcnt++;
+ c->bcnt += pktlen;
l = 0; /* in any case exit inner loop */
ip_off = ntohs(ip->ip_off);
@@ -2386,9 +2391,10 @@ do { \
if (update_counters) {
struct ip_fw *rule = chain->map[f_pos];
/* Update statistics */
- rule->pcnt++;
- rule->bcnt += pktlen;
- rule->timestamp = time_uptime;
+ struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, rule->cntr_idx);
+ c->pcnt++;
+ c->bcnt += pktlen;
+ c->timestamp = time_uptime;
}
} else {
retval = IP_FW_DENY;
@@ -2439,6 +2445,7 @@ ipfw_init(void)
{
int error = 0;
+ _dpcpu_init();
ipfw_dyn_attach();
/*
* Only print out this stuff the first time around,
@@ -2500,6 +2507,7 @@ ipfw_destroy(void)
ipfw_log_bpf(0); /* uninit */
ipfw_dyn_detach();
+ _dpcpu_uninit();
printf("IP firewall unloaded\n");
}
@@ -2546,6 +2554,9 @@ vnet_ipfw_init(const void *unused)
return (ENOSPC);
}
+ /* Init per-cpu counters */
+ ipfw_init_counters(chain);
+
/* fill and insert the default rule */
rule->act_ofs = 0;
rule->rulenum = IPFW_DEFAULT_RULE;
@@ -2559,6 +2570,9 @@ vnet_ipfw_init(const void *unused)
IPFW_LOCK_INIT(chain);
ipfw_dyn_init();
+ /* Allocate first pcpu counter */
+ rule->cntr_idx = ipfw_alloc_counter(chain);
+
/* First set up some values that are compile time options */
V_ipfw_vnet_ready = 1; /* Open for business */
@@ -2619,10 +2633,11 @@ vnet_ipfw_uninit(const void *unused)
}
if (chain->map)
free(chain->map, M_IPFW);
+ _dpcpu_free(chain->cntrs);
IPFW_WUNLOCK(chain);
IPFW_UH_WUNLOCK(chain);
if (reap != NULL)
- ipfw_reap_rules(reap);
+ ipfw_reap_rules(chain, reap);
IPFW_LOCK_DESTROY(chain);
ipfw_dyn_uninit(1); /* free the remaining parts */
return 0;
diff --git a/sys/netinet/ipfw/ip_fw_private.h b/sys/netinet/ipfw/ip_fw_private.h
index 46d011c..bd4fad3 100644
--- a/sys/netinet/ipfw/ip_fw_private.h
+++ b/sys/netinet/ipfw/ip_fw_private.h
@@ -222,6 +222,7 @@ struct ip_fw_chain {
struct radix_node_head **tables; /* IPv4 tables */
struct radix_node_head **xtables; /* extended tables */
uint8_t *tabletype; /* Array of table types */
+ uintptr_t *cntrs; /* Per-cpu counters */
#if defined( __linux__ ) || defined( _WIN32 )
spinlock_t rwmtx;
spinlock_t uh_lock;
@@ -231,8 +232,25 @@ struct ip_fw_chain {
#endif
uint32_t id; /* ruleset id */
uint32_t gencnt; /* generation count */
+ int cntr_allocated; /* Size of allocated counters map */
+ uint32_t *cntr_idx; /* bitmap of free counter slots */
};
+struct ip_fw_cntr {
+ uint64_t pcnt; /* Packet counter */
+ uint64_t bcnt; /* Byte counter */
+ uint32_t timestamp; /* tv_sec of last match */
+};
+#define IPFW_PCPU_CNTR(chain, idx) ((struct ip_fw_cntr *)(chain)->cntrs[curcpu] + (idx))
+#define IPFW_PCPU_DEFAULT_COUNTERS 128
+void _dpcpu_init(void);
+void _dpcpu_uninit(void);
+uintptr_t *_dpcpu_alloc(size_t size, int flags);
+void _dpcpu_free(uintptr_t *ptr);
+void ipfw_init_counters(struct ip_fw_chain *chain);
+int ipfw_alloc_counter(struct ip_fw_chain *chain);
+void ipfw_free_counter(struct ip_fw_chain *chain, int idx);
+
struct sockopt; /* used by tcp_var.h */
/*
@@ -267,7 +285,10 @@ int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id);
int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule);
int ipfw_ctl(struct sockopt *sopt);
int ipfw_chk(struct ip_fw_args *args);
-void ipfw_reap_rules(struct ip_fw *head);
+void ipfw_reap_rules(struct ip_fw_chain *chain, struct ip_fw *head);
+
+void ipfw_zero_counter(struct ip_fw_chain *chain, int idx);
+void ipfw_export_counter(struct ip_fw_chain *chain, int idx, struct ip_fw_cntr *cntr);
/* In ip_fw_pfil */
int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
diff --git a/sys/netinet/ipfw/ip_fw_sockopt.c b/sys/netinet/ipfw/ip_fw_sockopt.c
index 0aecc2c..db308c2 100644
--- a/sys/netinet/ipfw/ip_fw_sockopt.c
+++ b/sys/netinet/ipfw/ip_fw_sockopt.c
@@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
#include <sys/malloc.h>
#include <sys/mbuf.h> /* struct m_tag used by nested headers */
#include <sys/kernel.h>
+#include <sys/smp.h>
#include <sys/lock.h>
#include <sys/priv.h>
#include <sys/proc.h>
@@ -72,6 +73,212 @@ MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
* static variables followed by global ones (none in this file)
*/
+MALLOC_DEFINE(M_XDPCPU, "XPCPU", "PCPU data");
+
+uma_zone_t _dpcpu_zone;
+
+static void ipfw_extend_counters(struct ip_fw_chain *chain);
+
+void
+_dpcpu_init(void)
+{
+ _dpcpu_zone = uma_zcreate("pcpu ptrs", sizeof(uintptr_t) * MAXCPU,
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+
+void
+_dpcpu_uninit(void)
+{
+ uma_zdestroy(_dpcpu_zone);
+}
+
+#define _DPCPU_ALIGN 512
+uintptr_t *
+_dpcpu_alloc(size_t size, int flags)
+{
+ uintptr_t *ptr;
+ int id, fail = 0;
+
+ if (size % _DPCPU_ALIGN)
+ size += _DPCPU_ALIGN - (size % _DPCPU_ALIGN);
+
+ if ((ptr = uma_zalloc(_dpcpu_zone, flags | M_ZERO)) == NULL)
+ return NULL;
+
+ CPU_FOREACH(id) {
+ /*
+ * We believe in aligned allocation here
+ */
+ if ((ptr[id] = (uintptr_t)malloc(size, M_XDPCPU, flags)) == 0) {
+ fail = 1;
+ break;
+ }
+
+ KASSERT(ptr[id] % _DPCPU_ALIGN == 0, ("malloc(9) returned unaligned data :("));
+ }
+
+ if (fail == 0)
+ return ptr;
+
+ CPU_FOREACH(id) {
+ if (ptr[id] == 0)
+ break;
+
+ free((void *)ptr[id], M_XDPCPU);
+ }
+
+ uma_zfree(_dpcpu_zone, ptr);
+
+ return NULL;
+}
+
+void
+_dpcpu_free(uintptr_t *ptr)
+{
+ int id;
+
+ CPU_FOREACH(id) {
+ if (ptr[id] == 0)
+ continue;
+ free((void *)ptr[id], M_XDPCPU);
+ }
+
+ uma_zfree(_dpcpu_zone, ptr);
+}
+
+void
+ipfw_init_counters(struct ip_fw_chain *chain)
+{
+ chain->cntrs = _dpcpu_alloc(IPFW_PCPU_DEFAULT_COUNTERS *
+ sizeof(struct ip_fw_cntr), M_WAITOK | M_ZERO);
+ chain->cntr_idx = malloc(IPFW_PCPU_DEFAULT_COUNTERS / 8, M_IPFW, M_WAITOK);
+ /* Free positions are marked by 1 */
+ memset(chain->cntr_idx, 0xFF, IPFW_PCPU_DEFAULT_COUNTERS / 8);
+ chain->cntr_allocated = IPFW_PCPU_DEFAULT_COUNTERS;
+}
+
+
+static void
+ipfw_extend_counters(struct ip_fw_chain *chain)
+{
+ uintptr_t *new_ptr, *old_ptr;
+ uint32_t *old_idx, *new_idx;
+ int i, id;
+
+ for (;;) {
+
+ i = 2 * chain->cntr_allocated;
+
+ new_ptr = _dpcpu_alloc(i * sizeof(struct ip_fw_cntr), M_WAITOK | M_ZERO);
+ new_idx = malloc(i / 8, M_IPFW, M_WAITOK);
+ /* Free positions are marked by 1 */
+ memset(new_idx, 0xFF, i / 8);
+
+ IPFW_UH_WLOCK(chain);
+ if (i == 2 * chain->cntr_allocated) {
+ CTR4(KTR_NET, "%s: NEW INDEX: %p cntr_idx: %p count: %d", __func__, new_ptr, new_idx, i);
+ break;
+ }
+ /* otherwise we lost the race, free and retry */
+ IPFW_UH_WUNLOCK(chain);
+
+ free(new_idx, M_IPFW);
+ _dpcpu_free(new_ptr);
+ }
+
+ /* Pointers allocated, let's migrate data */
+
+ IPFW_WLOCK(chain);
+ /* Copy pcpu counters */
+ CPU_FOREACH(id) {
+ memcpy((void *)new_ptr[id], (void *)chain->cntrs[id], chain->cntr_allocated * sizeof(struct ip_fw_cntr));
+ }
+ /* Copy index */
+ memcpy(new_idx, chain->cntr_idx, chain->cntr_allocated / 8);
+ /* Swap old and new pointers */
+ old_ptr = chain->cntrs;
+ chain->cntrs = new_ptr;
+ old_idx = chain->cntr_idx;
+ chain->cntr_idx = new_idx;
+ chain->cntr_allocated = i;
+
+ IPFW_WUNLOCK(chain);
+ IPFW_UH_WUNLOCK(chain);
+
+ /* Free old counters and index */
+ free(old_idx, M_IPFW);
+ _dpcpu_free(old_ptr);
+}
+
+int
+ipfw_alloc_counter(struct ip_fw_chain *chain)
+{
+ int i, x, idx;
+ uint32_t *v;
+
+ for (;;) {
+ IPFW_UH_WLOCK(chain);
+ for (i = 0, v = chain->cntr_idx; i < chain->cntr_allocated / 8 / 4; i++, v++) {
+ if ((x = ffsl(*v)) != 0) {
+ /* Found. Mark as used */
+ *v &= ~ (1 << (x - 1));
+ IPFW_UH_WUNLOCK(chain);
+ idx = i * 32 + x - 1;
+ ipfw_zero_counter(chain, idx);
+ CTR3(KTR_NET, "%s: allocated counter %d, map after: %X", __func__, idx, *v);
+ return idx;
+ }
+ }
+ IPFW_UH_WUNLOCK(chain);
+
+ /* Not found. Let's allocate data and retry */
+ ipfw_extend_counters(chain);
+ }
+}
+
+void
+ipfw_free_counter(struct ip_fw_chain *chain, int idx)
+{
+ uint32_t *v;
+
+ IPFW_UH_WLOCK(chain);
+ v = chain->cntr_idx;
+ v += idx / 32;
+ CTR3(KTR_NET, "%s: returned counter %d, map before: %X", __func__, idx, *v);
+ idx %= 32;
+ *v |= 1 << idx;
+
+ IPFW_UH_WUNLOCK(chain);
+}
+
+void
+ipfw_zero_counter(struct ip_fw_chain *chain, int idx)
+{
+ int id;
+ struct ip_fw_cntr *pcpu;
+
+ CTR2(KTR_NET, "%s: zeroing counter %d", __func__, idx);
+ CPU_FOREACH(id) {
+ pcpu = ((struct ip_fw_cntr *)chain->cntrs[id]) + idx;
+ memset(pcpu, 0, sizeof(struct ip_fw_cntr));
+ }
+}
+
+void
+ipfw_export_counter(struct ip_fw_chain *chain, int idx, struct ip_fw_cntr *cntr)
+{
+ int id;
+ struct ip_fw_cntr *pcpu;
+ memset(cntr, 0, sizeof(*cntr)); /* do not accumulate into stale data */
+ CPU_FOREACH(id) {
+ pcpu = ((struct ip_fw_cntr *)chain->cntrs[id]) + idx;
+ cntr->pcnt += pcpu->pcnt;
+ cntr->bcnt += pcpu->bcnt;
+ if (cntr->timestamp < pcpu->timestamp)
+ cntr->timestamp = pcpu->timestamp;
+ }
+}
+
/*
* Find the smallest rule >= key, id.
* We could use bsearch but it is so simple that we code it directly
@@ -155,7 +362,7 @@ int
ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
{
struct ip_fw *rule;
- int i, l, insert_before;
+ int i, l, cntr_idx, insert_before;
struct ip_fw **map; /* the new array of pointers */
if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1)
@@ -163,12 +370,14 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
l = RULESIZE(input_rule);
rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO);
+ cntr_idx = ipfw_alloc_counter(chain);
if (rule == NULL)
return (ENOSPC);
/* get_map returns with IPFW_UH_WLOCK if successful */
map = get_map(chain, 1, 0 /* not locked */);
if (map == NULL) {
free(rule, M_IPFW);
+ ipfw_free_counter(chain, cntr_idx);
return ENOSPC;
}
@@ -176,10 +385,12 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
/* clear fields not settable from userland */
rule->x_next = NULL;
rule->next_rule = NULL;
+ rule->cntr_idx = cntr_idx;
+#if 0
rule->pcnt = 0;
rule->bcnt = 0;
rule->timestamp = 0;
-
+#endif
if (V_autoinc_step < 1)
V_autoinc_step = 1;
else if (V_autoinc_step > 1000)
@@ -217,12 +428,13 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
* A NULL pointer on input is handled correctly.
*/
void
-ipfw_reap_rules(struct ip_fw *head)
+ipfw_reap_rules(struct ip_fw_chain *chain, struct ip_fw *head)
{
struct ip_fw *rule;
while ((rule = head) != NULL) {
head = head->x_next;
+ ipfw_free_counter(chain, rule->cntr_idx);
free(rule, M_IPFW);
}
}
@@ -424,7 +636,7 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
rule = chain->reap;
chain->reap = NULL;
IPFW_UH_WUNLOCK(chain);
- ipfw_reap_rules(rule);
+ ipfw_reap_rules(chain, rule);
if (map)
free(map, M_IPFW);
return error;
@@ -436,13 +648,17 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
* so we only care that rules do not disappear.
*/
static void
-clear_counters(struct ip_fw *rule, int log_only)
+clear_counters(struct ip_fw_chain *chain, struct ip_fw *rule, int log_only)
{
ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
if (log_only == 0) {
+#if 0
rule->bcnt = rule->pcnt = 0;
rule->timestamp = 0;
+#endif
+ ipfw_zero_counter(chain, rule->cntr_idx);
+
}
if (l->o.opcode == O_LOG)
l->log_left = l->max_log;
@@ -481,7 +697,7 @@ zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
/* Skip rules not in our set. */
if (cmd == 1 && rule->set != set)
continue;
- clear_counters(rule, log_only);
+ clear_counters(chain, rule, log_only);
}
msg = log_only ? "All logging counts reset" :
"Accounting cleared";
@@ -491,7 +707,7 @@ zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
rule = chain->map[i];
if (rule->rulenum == rulenum) {
if (cmd == 0 || rule->set == set)
- clear_counters(rule, log_only);
+ clear_counters(chain, rule, log_only);
cleared = 1;
}
if (rule->rulenum > rulenum)
@@ -854,7 +1070,7 @@ struct ip_fw7 {
ipfw_insn cmd[1]; /* storage for commands */
};
- int convert_rule_to_7(struct ip_fw *rule);
+int convert_rule_to_7(struct ip_fw_chain *chain, struct ip_fw *rule);
int convert_rule_to_8(struct ip_fw *rule);
#ifndef RULESIZE7
@@ -887,7 +1103,7 @@ ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
if (bp + l + sizeof(uint32_t) <= ep) {
int error;
bcopy(rule, bp, l + sizeof(uint32_t));
- error = convert_rule_to_7((struct ip_fw *) bp);
+ error = convert_rule_to_7(chain, (struct ip_fw *) bp);
if (error)
return 0; /*XXX correct? */
/*
@@ -913,14 +1129,19 @@ ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
}
dst = (struct ip_fw *)bp;
bcopy(rule, dst, l);
+ /* copy statistics */
+ struct ip_fw_cntr *c = (struct ip_fw_cntr *)&dst->_pad0;
+ ipfw_export_counter(chain, rule->cntr_idx, c);
+ if (c->timestamp)
+ c->timestamp += boot_seconds;
+
/*
* XXX HACK. Store the disable mask in the "next"
* pointer in a wild attempt to keep the ABI the same.
* Why do we do this on EVERY rule?
*/
bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable));
- if (dst->timestamp)
- dst->timestamp += boot_seconds;
+
bp += l;
}
ipfw_get_dynamic(&bp, ep); /* protected by the dynamic lock */
@@ -1051,7 +1272,7 @@ ipfw_ctl(struct sockopt *sopt)
size = RULESIZE(rule);
if (!error && sopt->sopt_dir == SOPT_GET) {
if (is7) {
- error = convert_rule_to_7(rule);
+ error = convert_rule_to_7(chain, rule);
size = RULESIZE7(rule);
if (error)
return error;
@@ -1329,7 +1550,7 @@ ipfw_ctl(struct sockopt *sopt)
/* Functions to convert rules 7.2 <==> 8.0 */
int
-convert_rule_to_7(struct ip_fw *rule)
+convert_rule_to_7(struct ip_fw_chain *chain, struct ip_fw *rule)
{
/* Used to modify original rule */
struct ip_fw7 *rule7 = (struct ip_fw7 *)rule;
@@ -1355,9 +1576,12 @@ convert_rule_to_7(struct ip_fw *rule)
rule7->next_rule = (struct ip_fw7 *)tmp->next_rule;
rule7->next = (struct ip_fw7 *)tmp->x_next;
rule7->cmd_len = tmp->cmd_len;
+ ipfw_export_counter(chain, rule->cntr_idx, (struct ip_fw_cntr *)&rule7->pcnt);
+#if 0
rule7->pcnt = tmp->pcnt;
rule7->bcnt = tmp->bcnt;
rule7->timestamp = tmp->timestamp;
+#endif
/* Copy commands */
for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ;
@@ -1429,10 +1653,11 @@ convert_rule_to_8(struct ip_fw *rule)
rule->x_next = (struct ip_fw *)tmp->next;
rule->cmd_len = tmp->cmd_len;
rule->id = 0; /* XXX see if is ok = 0 */
+#if 0
rule->pcnt = tmp->pcnt;
rule->bcnt = tmp->bcnt;
rule->timestamp = tmp->timestamp;
-
+#endif
free (tmp, M_TEMP);
return 0;
}
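The free-slot bookkeeping behind ipfw_alloc_counter()/ipfw_free_counter()
is an ordinary bitmap: free slots are 1-bits, ffs(3) picks the lowest one,
allocation clears the bit and freeing sets it again. A stand-alone
illustration with hypothetical names and no locking:

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>	/* ffs() */

#define NSLOTS	128			/* must be a multiple of 32 */
static uint32_t slot_map[NSLOTS / 32];

static void
slots_init(void)
{

	memset(slot_map, 0xFF, sizeof(slot_map));	/* all free */
}

static int
slot_alloc(void)
{
	int i, bit;

	for (i = 0; i < NSLOTS / 32; i++) {
		if ((bit = ffs(slot_map[i])) != 0) {
			slot_map[i] &= ~(1U << (bit - 1));
			return (i * 32 + bit - 1);
		}
	}
	return (-1);		/* the real code grows the map and retries */
}

static void
slot_free(int idx)
{

	slot_map[idx / 32] |= 1U << (idx % 32);
}

int
main(void)
{

	slots_init();
	assert(slot_alloc() == 0);	/* lowest free bit wins */
	assert(slot_alloc() == 1);
	slot_free(0);
	assert(slot_alloc() == 0);	/* freed slot is reused */
	return (0);
}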