parallelizing ipfw table
Gleb Smirnoff
glebius at FreeBSD.org
Mon Nov 28 16:19:40 GMT 2005
Ruslan,
okay, I have implemented a feature that allows to merge your ruleset
into one rule. Look how it works:
root at behemoth:~:|>kldload dummynet
root at behemoth:~:|>ipfw pipe 2 config
root at behemoth:~:|>ipfw pipe 3 config
root at behemoth:~:|>ipfw add 1000 pipe 65535 ip from any to table\(1\)
01000 pipe 65535 ip from any to table(1)
root at behemoth:~:|>ipfw table 1 add 217.72.144.35 2
root at behemoth:~:|>ipfw table 1 add 217.72.144.68 3
root at behemoth:~:|>ipfw table 1 list
217.72.144.35/32 2
217.72.144.68/32 3
root at behemoth:~:|>ping cell.sick.ru
PING cell.sick.ru (217.72.144.68): 56 data bytes
64 bytes from 217.72.144.68: icmp_seq=0 ttl=60 time=2.653 ms
^C
--- cell.sick.ru ping statistics ---
1 packets transmitted, 1 packets received, 0% packet loss
round-trip min/avg/max/stddev = 2.653/2.653/2.653/0.000 ms
root at behemoth:~:|>ping unixfaq.ru
PING unixfaq.ru (217.72.144.35): 56 data bytes
64 bytes from 217.72.144.35: icmp_seq=0 ttl=60 time=2.563 ms
^C
--- unixfaq.ru ping statistics ---
1 packets transmitted, 1 packets received, 0% packet loss
round-trip min/avg/max/stddev = 2.563/2.563/2.563/0.000 ms
root at behemoth:~:|>ipfw pipe 2 show
00002: unlimited 0 ms 50 sl. 1 queues (1 buckets) droptail
mask: 0x00 0x00000000/0x0000 -> 0x00000000/0x0000
BKT Prot ___Source IP/port____ ____Dest. IP/port____ Tot_pkt/bytes Pkt/Byte Drp
0 icmp 81.19.64.118/0 217.72.144.35/0 1 84 0 0 0
root at behemoth:~:|>ipfw pipe 3 show
00003: unlimited 0 ms 50 sl. 1 queues (1 buckets) droptail
mask: 0x00 0x00000000/0x0000 -> 0x00000000/0x0000
BKT Prot ___Source IP/port____ ____Dest. IP/port____ Tot_pkt/bytes Pkt/Byte Drp
0 icmp 81.19.64.118/0 217.72.144.68/0 1 84 0 0 0
The number 65535 is some magic number, which means "take argument from table". I
will make ipfw display some word instead of 65535, for example "tablearg". So, the
rule will be looking like:
pipe tablearg ip from any to table(1)
Implementing this feature I have encountered a problem, that I have encountered
before - the locate_flowset() function in ip_dummynet.c. The problem is that
dummynet and ipfw are so welded together. The rule itself points into a pipe,
and this puts a design limit that a packet from a rule can go only into one
pipe. So, I removed this limitation removing the pointer from rule to pipe. This
change also garbage-collected several XXX in the code. Yes, I've made a regression
here for the time for pipe/queue lookup. But this is a proof-of-concept patch.
Before checking in I will make a hash for pipes/queues in dummynet. This change
also is a step forward in divorcing dummynet and ipfw, and thus a step closer
to ng_dummynet node :).
The proof-of-concept patch attached.
Before committing it will be divided into three parts:
1) unlock the tables and remove caching
2) remove the rule->pipe pointer and implement hash for pipes/queues
3) introduce the tablearg feature
--
Totus tuus, Glebius.
GLEBIUS-RIPN GLEB-RIPE
-------------- next part --------------
Index: ip_dummynet.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/ip_dummynet.c,v
retrieving revision 1.95
diff -u -r1.95 ip_dummynet.c
--- ip_dummynet.c 27 Sep 2005 18:10:42 -0000 1.95
+++ ip_dummynet.c 28 Nov 2005 15:30:31 -0000
@@ -1111,19 +1111,11 @@
struct dn_flow_set *
locate_flowset(int pipe_nr, struct ip_fw *rule)
{
- struct dn_flow_set *fs;
+ struct dn_flow_set *fs = NULL;
ipfw_insn *cmd = ACTION_PTR(rule);
if (cmd->opcode == O_LOG)
cmd += F_LEN(cmd);
-#ifdef __i386__
- fs = ((ipfw_insn_pipe *)cmd)->pipe_ptr;
-#else
- bcopy(& ((ipfw_insn_pipe *)cmd)->pipe_ptr, &fs, sizeof(fs));
-#endif
-
- if (fs != NULL)
- return fs;
if (cmd->opcode == O_QUEUE)
for (fs=all_flow_sets; fs && fs->fs_nr != pipe_nr; fs=fs->next)
@@ -1135,12 +1127,6 @@
if (p1 != NULL)
fs = &(p1->fs) ;
}
- /* record for the future */
-#ifdef __i386__
- ((ipfw_insn_pipe *)cmd)->pipe_ptr = fs;
-#else
- bcopy(&fs, & ((ipfw_insn_pipe *)cmd)->pipe_ptr, sizeof(fs));
-#endif
return fs ;
}
@@ -1407,8 +1393,6 @@
struct dn_flow_set *fs, *curr_fs;
DUMMYNET_LOCK();
- /* remove all references to pipes ...*/
- flush_pipe_ptrs(NULL);
/* prevent future matches... */
p = all_pipes ;
all_pipes = NULL ;
@@ -1811,8 +1795,6 @@
all_pipes = b->next ;
else
a->next = b->next ;
- /* remove references to this pipe from the ip_fw rules. */
- flush_pipe_ptrs(&(b->fs));
/* remove all references to this pipe from flow_sets */
for (fs = all_flow_sets; fs; fs= fs->next )
@@ -1846,8 +1828,6 @@
all_flow_sets = b->next ;
else
a->next = b->next ;
- /* remove references to this flow_set from the ip_fw rules. */
- flush_pipe_ptrs(b);
if (b->pipe != NULL) {
/* Update total weight on parent pipe and cleanup parent heaps */
Index: ip_fw.h
===================================================================
RCS file: /home/ncvs/src/sys/netinet/ip_fw.h,v
retrieving revision 1.101
diff -u -r1.101 ip_fw.h
--- ip_fw.h 13 Aug 2005 11:02:33 -0000 1.101
+++ ip_fw.h 28 Nov 2005 15:37:05 -0000
@@ -271,19 +271,6 @@
} ipfw_insn_if;
/*
- * This is used for pipe and queue actions, which need to store
- * a single pointer (which can have different size on different
- * architectures.
- * Note that, because of previous instructions, pipe_ptr might
- * be unaligned in the overall structure, so it needs to be
- * manipulated with care.
- */
-typedef struct _ipfw_insn_pipe {
- ipfw_insn o;
- void *pipe_ptr; /* XXX */
-} ipfw_insn_pipe;
-
-/*
* This is used for storing an altq queue id number.
*/
typedef struct _ipfw_insn_altq {
@@ -474,6 +461,8 @@
ipfw_table_entry ent[0]; /* entries */
} ipfw_table;
+#define IP_FW_TABLECOOKIE USHRT_MAX
+
/*
* Main firewall chains definitions and global var's definitions.
*/
@@ -546,8 +535,6 @@
int ipfw_init(void);
void ipfw_destroy(void);
-void flush_pipe_ptrs(struct dn_flow_set *match); /* used by dummynet */
-
typedef int ip_fw_ctl_t(struct sockopt *);
extern ip_fw_ctl_t *ip_fw_ctl_ptr;
extern int fw_one_pass;
Index: ip_fw2.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/ip_fw2.c,v
retrieving revision 1.115
diff -u -r1.115 ip_fw2.c
--- ip_fw2.c 10 Nov 2005 22:10:39 -0000 1.115
+++ ip_fw2.c 28 Nov 2005 15:30:05 -0000
@@ -51,6 +51,7 @@
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/jail.h>
+#include <sys/limits.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/socket.h>
@@ -126,9 +127,11 @@
int fw_prid;
};
+#define IPFW_TABLES_MAX 128
struct ip_fw_chain {
struct ip_fw *rules; /* list of rules */
struct ip_fw *reap; /* list of rules to reap */
+ struct radix_node_head *tables[IPFW_TABLES_MAX];
struct mtx mtx; /* lock guarding rule list */
int busy_count; /* busy count for rw locks */
int want_write;
@@ -192,15 +195,6 @@
u_int32_t value;
};
-#define IPFW_TABLES_MAX 128
-static struct ip_fw_table {
- struct radix_node_head *rnh;
- int modified;
- in_addr_t last_addr;
- int last_match;
- u_int32_t last_value;
-} ipfw_tables[IPFW_TABLES_MAX];
-
static int fw_debug = 1;
static int autoinc_step = 100; /* bounded to 1..1000 in add_rule() */
@@ -1703,25 +1697,24 @@
}
static void
-init_tables(void)
+init_tables(struct ip_fw_chain *ch)
{
int i;
- for (i = 0; i < IPFW_TABLES_MAX; i++) {
- rn_inithead((void **)&ipfw_tables[i].rnh, 32);
- ipfw_tables[i].modified = 1;
- }
+ for (i = 0; i < IPFW_TABLES_MAX; i++)
+ rn_inithead((void **)&ch->tables[i], 32);
}
static int
-add_table_entry(u_int16_t tbl, in_addr_t addr, u_int8_t mlen, u_int32_t value)
+add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+ uint8_t mlen, uint32_t value)
{
struct radix_node_head *rnh;
struct table_entry *ent;
if (tbl >= IPFW_TABLES_MAX)
return (EINVAL);
- rnh = ipfw_tables[tbl].rnh;
+ rnh = ch->tables[tbl];
ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO);
if (ent == NULL)
return (ENOMEM);
@@ -1729,20 +1722,20 @@
ent->addr.sin_len = ent->mask.sin_len = 8;
ent->mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr;
- RADIX_NODE_HEAD_LOCK(rnh);
+ IPFW_WLOCK(&layer3_chain);
if (rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent) ==
NULL) {
- RADIX_NODE_HEAD_UNLOCK(rnh);
+ IPFW_WUNLOCK(&layer3_chain);
free(ent, M_IPFW_TBL);
return (EEXIST);
}
- ipfw_tables[tbl].modified = 1;
- RADIX_NODE_HEAD_UNLOCK(rnh);
+ IPFW_WUNLOCK(&layer3_chain);
return (0);
}
static int
-del_table_entry(u_int16_t tbl, in_addr_t addr, u_int8_t mlen)
+del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+ uint8_t mlen)
{
struct radix_node_head *rnh;
struct table_entry *ent;
@@ -1750,18 +1743,17 @@
if (tbl >= IPFW_TABLES_MAX)
return (EINVAL);
- rnh = ipfw_tables[tbl].rnh;
+ rnh = ch->tables[tbl];
sa.sin_len = mask.sin_len = 8;
mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr;
- RADIX_NODE_HEAD_LOCK(rnh);
+ IPFW_WLOCK(ch);
ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh);
if (ent == NULL) {
- RADIX_NODE_HEAD_UNLOCK(rnh);
+ IPFW_WUNLOCK(ch);
return (ESRCH);
}
- ipfw_tables[tbl].modified = 1;
- RADIX_NODE_HEAD_UNLOCK(rnh);
+ IPFW_WUNLOCK(ch);
free(ent, M_IPFW_TBL);
return (0);
}
@@ -1780,63 +1772,48 @@
}
static int
-flush_table(u_int16_t tbl)
+flush_table(struct ip_fw_chain *ch, uint16_t tbl)
{
struct radix_node_head *rnh;
+ IPFW_WLOCK_ASSERT(ch);
+
if (tbl >= IPFW_TABLES_MAX)
return (EINVAL);
- rnh = ipfw_tables[tbl].rnh;
- RADIX_NODE_HEAD_LOCK(rnh);
+ rnh = ch->tables[tbl];
rnh->rnh_walktree(rnh, flush_table_entry, rnh);
- ipfw_tables[tbl].modified = 1;
- RADIX_NODE_HEAD_UNLOCK(rnh);
return (0);
}
static void
-flush_tables(void)
+flush_tables(struct ip_fw_chain *ch)
{
- u_int16_t tbl;
+ uint16_t tbl;
+
+ IPFW_WLOCK_ASSERT(ch);
for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++)
- flush_table(tbl);
+ flush_table(ch, tbl);
}
static int
-lookup_table(u_int16_t tbl, in_addr_t addr, u_int32_t *val)
+lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr,
+ uint32_t *val)
{
struct radix_node_head *rnh;
- struct ip_fw_table *table;
struct table_entry *ent;
struct sockaddr_in sa;
- int last_match;
if (tbl >= IPFW_TABLES_MAX)
return (0);
- table = &ipfw_tables[tbl];
- rnh = table->rnh;
- RADIX_NODE_HEAD_LOCK(rnh);
- if (addr == table->last_addr && !table->modified) {
- last_match = table->last_match;
- if (last_match)
- *val = table->last_value;
- RADIX_NODE_HEAD_UNLOCK(rnh);
- return (last_match);
- }
- table->modified = 0;
+ rnh = ch->tables[tbl];
sa.sin_len = 8;
sa.sin_addr.s_addr = addr;
ent = (struct table_entry *)(rnh->rnh_lookup(&sa, NULL, rnh));
- table->last_addr = addr;
if (ent != NULL) {
- table->last_value = *val = ent->value;
- table->last_match = 1;
- RADIX_NODE_HEAD_UNLOCK(rnh);
+ *val = ent->value;
return (1);
}
- table->last_match = 0;
- RADIX_NODE_HEAD_UNLOCK(rnh);
return (0);
}
@@ -1850,17 +1827,15 @@
}
static int
-count_table(u_int32_t tbl, u_int32_t *cnt)
+count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt)
{
struct radix_node_head *rnh;
if (tbl >= IPFW_TABLES_MAX)
return (EINVAL);
- rnh = ipfw_tables[tbl].rnh;
+ rnh = ch->tables[tbl];
*cnt = 0;
- RADIX_NODE_HEAD_LOCK(rnh);
rnh->rnh_walktree(rnh, count_table_entry, cnt);
- RADIX_NODE_HEAD_UNLOCK(rnh);
return (0);
}
@@ -1886,17 +1861,17 @@
}
static int
-dump_table(ipfw_table *tbl)
+dump_table(struct ip_fw_chain *ch, ipfw_table *tbl)
{
struct radix_node_head *rnh;
+ IPFW_WLOCK_ASSERT(ch);
+
if (tbl->tbl >= IPFW_TABLES_MAX)
return (EINVAL);
- rnh = ipfw_tables[tbl->tbl].rnh;
+ rnh = ch->tables[tbl->tbl];
tbl->cnt = 0;
- RADIX_NODE_HEAD_LOCK(rnh);
rnh->rnh_walktree(rnh, dump_table_entry, tbl);
- RADIX_NODE_HEAD_UNLOCK(rnh);
return (0);
}
@@ -2409,9 +2384,9 @@
* Now scan the rules, and parse microinstructions for each rule.
*/
for (; f; f = f->next) {
- int l, cmdlen;
ipfw_insn *cmd;
- int skip_or; /* skip rest of OR block */
+ uint32_t tablearg = 0;
+ int l, cmdlen, skip_or; /* skip rest of OR block */
again:
if (set_disable & (1 << f->set) )
@@ -2567,12 +2542,15 @@
dst_ip.s_addr : src_ip.s_addr;
uint32_t v;
- match = lookup_table(cmd->arg1, a, &v);
+ match = lookup_table(chain, cmd->arg1, a,
+ &v);
if (!match)
break;
if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
match =
((ipfw_insn_u32 *)cmd)->d[0] == v;
+ else
+ tablearg = v;
}
break;
@@ -3024,7 +3002,10 @@
case O_PIPE:
case O_QUEUE:
args->rule = f; /* report matching rule */
- args->cookie = cmd->arg1;
+ if (cmd->arg1 == IP_FW_TABLECOOKIE)
+ args->cookie = tablearg;
+ else
+ args->cookie = cmd->arg1;
retval = IP_FW_DUMMYNET;
goto done;
@@ -3045,7 +3026,10 @@
}
dt = (struct divert_tag *)(mtag+1);
dt->cookie = f->rulenum;
- dt->info = cmd->arg1;
+ if (cmd->arg1 == IP_FW_TABLECOOKIE)
+ dt->info = tablearg;
+ else
+ dt->info = cmd->arg1;
m_tag_prepend(m, mtag);
retval = (cmd->opcode == O_DIVERT) ?
IP_FW_DIVERT : IP_FW_TEE;
@@ -3110,7 +3094,10 @@
case O_NETGRAPH:
case O_NGTEE:
args->rule = f; /* report matching rule */
- args->cookie = cmd->arg1;
+ if (cmd->arg1 == IP_FW_TABLECOOKIE)
+ args->cookie = tablearg;
+ else
+ args->cookie = cmd->arg1;
retval = (cmd->opcode == O_NETGRAPH) ?
IP_FW_NETGRAPH : IP_FW_NGTEE;
goto done;
@@ -3169,34 +3156,6 @@
}
/*
- * When pipes/queues are deleted, clear the "pipe_ptr" pointer to a given
- * pipe/queue, or to all of them (match == NULL).
- */
-void
-flush_pipe_ptrs(struct dn_flow_set *match)
-{
- struct ip_fw *rule;
-
- IPFW_WLOCK(&layer3_chain);
- for (rule = layer3_chain.rules; rule; rule = rule->next) {
- ipfw_insn_pipe *cmd = (ipfw_insn_pipe *)ACTION_PTR(rule);
-
- if (cmd->o.opcode != O_PIPE && cmd->o.opcode != O_QUEUE)
- continue;
- /*
- * XXX Use bcmp/bzero to handle pipe_ptr to overcome
- * possible alignment problems on 64-bit architectures.
- * This code is seldom used so we do not worry too
- * much about efficiency.
- */
- if (match == NULL ||
- !bcmp(&cmd->pipe_ptr, &match, sizeof(match)) )
- bzero(&cmd->pipe_ptr, sizeof(cmd->pipe_ptr));
- }
- IPFW_WUNLOCK(&layer3_chain);
-}
-
-/*
* Add a new rule to the list. Copy the rule into a malloc'ed area, then
* possibly create a rule number and add the rule to the list.
* Update the rule_number in the input struct so the caller knows it as well.
@@ -3685,7 +3644,7 @@
case O_PIPE:
case O_QUEUE:
- if (cmdlen != F_INSN_SIZE(ipfw_insn_pipe))
+ if (cmdlen != F_INSN_SIZE(ipfw_insn))
goto bad_size;
goto check_action;
@@ -4012,8 +3971,8 @@
sizeof(ent), sizeof(ent));
if (error)
break;
- error = add_table_entry(ent.tbl, ent.addr,
- ent.masklen, ent.value);
+ error = add_table_entry(&layer3_chain, ent.tbl,
+ ent.addr, ent.masklen, ent.value);
}
break;
@@ -4025,7 +3984,8 @@
sizeof(ent), sizeof(ent));
if (error)
break;
- error = del_table_entry(ent.tbl, ent.addr, ent.masklen);
+ error = del_table_entry(&layer3_chain, ent.tbl,
+ ent.addr, ent.masklen);
}
break;
@@ -4037,7 +3997,9 @@
sizeof(tbl), sizeof(tbl));
if (error)
break;
- error = flush_table(tbl);
+ IPFW_WLOCK(&layer3_chain);
+ error = flush_table(&layer3_chain, tbl);
+ IPFW_WUNLOCK(&layer3_chain);
}
break;
@@ -4048,8 +4010,10 @@
if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl),
sizeof(tbl))))
break;
- if ((error = count_table(tbl, &cnt)))
+ IPFW_RLOCK(&layer3_chain);
+ if ((error = count_table(&layer3_chain, tbl, &cnt)))
break;
+ IPFW_RUNLOCK(&layer3_chain);
error = sooptcopyout(sopt, &cnt, sizeof(cnt));
}
break;
@@ -4075,11 +4039,14 @@
}
tbl->size = (size - sizeof(*tbl)) /
sizeof(ipfw_table_entry);
- error = dump_table(tbl);
+ IPFW_WLOCK(&layer3_chain);
+ error = dump_table(&layer3_chain, tbl);
if (error) {
+ IPFW_WUNLOCK(&layer3_chain);
free(tbl, M_TEMP);
break;
}
+ IPFW_WUNLOCK(&layer3_chain);
error = sooptcopyout(sopt, tbl, size);
free(tbl, M_TEMP);
}
@@ -4242,7 +4209,7 @@
printf("limited to %d packets/entry by default\n",
verbose_limit);
- init_tables();
+ init_tables(&layer3_chain);
ip_fw_ctl_ptr = ipfw_ctl;
ip_fw_chk_ptr = ipfw_chk;
callout_reset(&ipfw_timeout, hz, ipfw_tick, NULL);
@@ -4259,13 +4226,13 @@
ip_fw_ctl_ptr = NULL;
callout_drain(&ipfw_timeout);
IPFW_WLOCK(&layer3_chain);
+ flush_tables(&layer3_chain);
layer3_chain.reap = NULL;
free_chain(&layer3_chain, 1 /* kill default rule */);
reap = layer3_chain.reap, layer3_chain.reap = NULL;
IPFW_WUNLOCK(&layer3_chain);
if (reap != NULL)
reap_rules(reap);
- flush_tables();
IPFW_DYN_LOCK_DESTROY();
uma_zdestroy(ipfw_dyn_rule_zone);
IPFW_LOCK_DESTROY(&layer3_chain);
More information about the freebsd-net
mailing list