[PATCH] Using PFIL lock in packet filters

Alexander V. Chernikov melifaro at FreeBSD.org
Wed Sep 5 10:33:20 UTC 2012


Hello list!

Currently we have the following locking strategy in our firewalls:

On every input/output IP packet PFIL acquires read lock while traversing 
list and after that reader (e.g. firewall handler) acquires its own lock 
(rwlock in case of ipfw).

Pfil framework currently uses per-head rmlock (e.g. different lock for 
IPv4 and IPv6 per each VNET instance).

Since most popular firewalls (ipfw and pf) uses per-VNET rwlock(*),
per-AF pfil locks does not give us much benefit.

Proposed idea is to:
1) Use single pfil lock per VNET instance by default.
2) Permit filters to use this lock via pfil_[rw]_[un]lock functions.

Patch for pfil and ipfw changes attached.

I've got several production routers running for a week with previous 
version of this patch without any (ipfw-related) problems.

Performance difference is quite significant, more details here:
http://lists.freebsd.org/pipermail/freebsd-net/2012-July/032842.html

I'm planning to commit these patches (with minor changes) at the 
beginning of next week if nobody objects.

(*) currently pf uses mutex, but hopefully glebius@ is working on much 
more scalable solution

-- 
WBR, Alexander

-------------- next part --------------
>From 347690db1c4ecaf267f3c027e18ea38734d28d42 Mon Sep 17 00:00:00 2001
From: "Alexander V. Chernikov" <melifaro at ipfw.ru>
Date: Wed, 5 Sep 2012 13:09:16 +0400
Subject: [PATCH 1/2] Export pfil lock

---
 share/man/man9/pfil.9 |   55 ++++++++++++++++++++++++++++++++++++++++++++--
 sys/net/pfil.c        |   58 +++++++++++++++++++++++++++++++++++++++++++++++++
 sys/net/pfil.h        |   46 ++++++++++++++++++++++++++++++---------
 3 files changed, 147 insertions(+), 12 deletions(-)

diff --git a/share/man/man9/pfil.9 b/share/man/man9/pfil.9
index 36a4d47..a76168b 100644
--- a/share/man/man9/pfil.9
+++ b/share/man/man9/pfil.9
@@ -28,7 +28,7 @@
 .\"
 .\" $FreeBSD: stable/8/share/man/man9/pfil.9 162404 2006-09-18 15:24:20Z ru $
 .\"
-.Dd September 29, 2004
+.Dd September 5, 2012
 .Dt PFIL 9
 .Os
 .Sh NAME
@@ -39,7 +39,11 @@
 .Nm pfil_hook_get ,
 .Nm pfil_add_hook ,
 .Nm pfil_remove_hook ,
-.Nm pfil_run_hooks
+.Nm pfil_run_hooks ,
+.Nm pfil_rlock
+.Nm pfil_runlock
+.Nm pfil_wlock
+.Nm pfil_wunlock
 .Nd packet filter interface
 .Sh SYNOPSIS
 .In sys/param.h
@@ -62,6 +66,14 @@
 .Fn (*func) "void *arg" "struct mbuf **mp" "struct ifnet *" "int dir" "struct inpcb *"
 .Ft int
 .Fn pfil_run_hooks "struct pfil_head *head" "struct mbuf **mp" "struct ifnet *" "int dir" "struct inpcb *"
+.Ft void
+.Fn pfil_rlock "struct pfil_head *" "struct rm_priotracker *"
+.Ft void
+.Fn pfil_runlock "struct pfil_head *" "struct rm_priotracker *"
+.Ft void
+.Fn pfil_wlock "struct pfil_head *"
+.Ft void
+.Fn pfil_wunlock "struct pfil_head *"
 .Sh DESCRIPTION
 The
 .Nm
@@ -86,6 +98,16 @@ The data link type is a
 .Xr bpf 4
 DLT constant indicating what kind of header is present on the packet
 at the filtering point.
+Each filtering point uses common per-VNET rmlock by default.
+This can be changed by specifying
+.Vt PFIL_FLAG_PRIVATE_LOCK
+as
+.Vt "flags"
+field in the
+.Vt pfil_head
+structure.
+Note that specifying private lock can break filters sharing the same
+ruleset and/or state between different data link types.
 Filtering points may be unregistered with the
 .Fn pfil_head_unregister
 function.
@@ -122,6 +144,31 @@ The filter returns an error (errno) if the packet processing is to stop, or 0
 if the processing is to continue.
 If the packet processing is to stop, it is the responsibility of the
 filter to free the packet.
+.Pp
+Every filter hook is called with
+.Nm
+read lock held.
+All heads uses the same lock within the same VNET instance.
+Packet filter can use this lock instead of own locking model to
+improve performance.
+Since
+.Nm
+uses
+.Xr rmlock 9
+.Fn pfil_rlock
+and
+.Fn pfil_runlock
+require
+.Va struct rm_priotracker
+to be passed as argument.
+Filter can acquire and release writer lock via
+.Fn pfil_wlock
+and
+.Fn pfil_wunlock
+functions.
+See
+.Xr rmlock 9
+for more details.
 .Sh RETURN VALUES
 If successful,
 .Fn pfil_head_get
@@ -146,6 +193,7 @@ might sleep!
 .Sh SEE ALSO
 .Xr bpf 4 ,
 .Xr if_bridge 4
+.Xr rmlock 4
 .Sh HISTORY
 The
 .Nm
@@ -181,6 +229,9 @@ as well as be less IP-centric.
 .Pp
 Fine-grained locking was added in
 .Fx 5.2 .
+.Nm
+lock export was added in
+.Fx 10.0 .
 .Sh BUGS
 The
 .Fn pfil_hook_get
diff --git a/sys/net/pfil.c b/sys/net/pfil.c
index 788ca24..6c48334 100644
--- a/sys/net/pfil.c
+++ b/sys/net/pfil.c
@@ -61,6 +61,8 @@ static int pfil_list_remove(pfil_list_t *,
 LIST_HEAD(pfilheadhead, pfil_head);
 VNET_DEFINE(struct pfilheadhead, pfil_head_list);
 #define	V_pfil_head_list	VNET(pfil_head_list)
+VNET_DEFINE(struct rmlock, pfil_lock);
+#define	V_pfil_lock	VNET(pfil_lock)
 
 /*
  * pfil_run_hooks() runs the specified packet filter hooks.
@@ -91,6 +93,60 @@ pfil_run_hooks(struct pfil_head *ph, struct mbuf **mp, struct ifnet *ifp,
 }
 
 /*
+ * pfil_try_rlock() acquires rm reader lock for specified head
+ * if this is immediately possible,
+ */
+int
+pfil_try_rlock(struct pfil_head *ph, struct rm_priotracker *tracker)
+{
+	return PFIL_TRY_RLOCK(ph, tracker);
+}
+
+/*
+ * pfil_rlock() acquires rm reader lock for specified head.
+ */
+void
+pfil_rlock(struct pfil_head *ph, struct rm_priotracker *tracker)
+{
+	PFIL_RLOCK(ph, tracker);
+}
+
+/*
+ * pfil_runlock() releases reader lock for specified head.
+ */
+void
+pfil_runlock(struct pfil_head *ph, struct rm_priotracker *tracker)
+{
+	PFIL_RUNLOCK(ph, tracker);
+}
+
+/*
+ * pfil_wlock() acquires writer lock for specified head.
+ */
+void
+pfil_wlock(struct pfil_head *ph)
+{
+	PFIL_WLOCK(ph);
+}
+
+/*
+ * pfil_wunlock() releases writer lock for specified head.
+ */
+void
+pfil_wunlock(struct pfil_head *ph)
+{
+	PFIL_WUNLOCK(ph);
+}
+
+/*
+ * pfil_wowned() releases writer lock for specified head.
+ */
+int
+pfil_wowned(struct pfil_head *ph)
+{
+	return PFIL_WOWNED(ph);
+}
+/*
  * pfil_head_register() registers a pfil_head with the packet filter hook
  * mechanism.
  */
@@ -295,6 +351,7 @@ vnet_pfil_init(const void *unused)
 {
 
 	LIST_INIT(&V_pfil_head_list);
+	PFIL_LOCK_INIT_REAL(&V_pfil_lock, "shared");
 	return (0);
 }
 
@@ -306,6 +363,7 @@ vnet_pfil_uninit(const void *unused)
 {
 
 	/*  XXX should panic if list is not empty */
+	PFIL_LOCK_DESTROY_REAL(&V_pfil_lock);
 	return (0);
 }
 
diff --git a/sys/net/pfil.h b/sys/net/pfil.h
index d314a72..7c99148 100644
--- a/sys/net/pfil.h
+++ b/sys/net/pfil.h
@@ -64,6 +64,8 @@ typedef	TAILQ_HEAD(pfil_list, packet_filter_hook) pfil_list_t;
 #define	PFIL_TYPE_AF		1	/* key is AF_* type */
 #define	PFIL_TYPE_IFNET		2	/* key is ifnet pointer */
 
+#define	PFIL_FLAG_PRIVATE_LOCK	0x01	/* Personal lock instead of global */
+
 struct pfil_head {
 	pfil_list_t	ph_in;
 	pfil_list_t	ph_out;
@@ -72,7 +74,9 @@ struct pfil_head {
 #if defined( __linux__ ) || defined( _WIN32 )
 	rwlock_t	ph_mtx;
 #else
-	struct rmlock	ph_lock;
+	struct rmlock	*ph_plock;	/* Pointer to the used lock */
+	struct rmlock	ph_lock;	/* Private lock storage */
+	int		flags;
 #endif
 	union {
 		u_long		phu_val;
@@ -90,21 +94,43 @@ int	pfil_remove_hook(int (*func)(void *, struct mbuf **, struct ifnet *,
 int	pfil_run_hooks(struct pfil_head *, struct mbuf **, struct ifnet *,
 	    int, struct inpcb *inp);
 
+struct rm_priotracker;	/* Do not require including rmlock header */
+int pfil_try_rlock(struct pfil_head *, struct rm_priotracker *);
+void pfil_rlock(struct pfil_head *, struct rm_priotracker *);
+void pfil_runlock(struct pfil_head *, struct rm_priotracker *);
+void pfil_wlock(struct pfil_head *);
+void pfil_wunlock(struct pfil_head *);
+int pfil_wowned(struct pfil_head *ph);
+
 int	pfil_head_register(struct pfil_head *);
 int	pfil_head_unregister(struct pfil_head *);
 
 struct pfil_head *pfil_head_get(int, u_long);
 
 #define	PFIL_HOOKED(p) ((p)->ph_nhooks > 0)
-#define	PFIL_LOCK_INIT(p) \
-    rm_init_flags(&(p)->ph_lock, "PFil hook read/write mutex", RM_RECURSE)
-#define	PFIL_LOCK_DESTROY(p) rm_destroy(&(p)->ph_lock)
-#define PFIL_RLOCK(p, t) rm_rlock(&(p)->ph_lock, (t))
-#define PFIL_WLOCK(p) rm_wlock(&(p)->ph_lock)
-#define PFIL_RUNLOCK(p, t) rm_runlock(&(p)->ph_lock, (t))
-#define PFIL_WUNLOCK(p) rm_wunlock(&(p)->ph_lock)
-#define PFIL_LIST_LOCK() mtx_lock(&pfil_global_lock)
-#define PFIL_LIST_UNLOCK() mtx_unlock(&pfil_global_lock)
+#define	PFIL_LOCK_INIT_REAL(l, t)	\
+	rm_init_flags(l, "PFil " t " rmlock", RM_RECURSE)
+#define	PFIL_LOCK_DESTROY_REAL(l)	\
+	rm_destroy(l)
+#define	PFIL_LOCK_INIT(p)	do {			\
+	if ((p)->flags & PFIL_FLAG_PRIVATE_LOCK) {	\
+		PFIL_LOCK_INIT_REAL(&(p)->ph_lock, "private");	\
+		(p)->ph_plock = &(p)->ph_lock;		\
+	} else						\
+		(p)->ph_plock = &V_pfil_lock;		\
+} while (0)
+#define	PFIL_LOCK_DESTROY(p)	do {			\
+	if ((p)->flags & PFIL_FLAG_PRIVATE_LOCK)	\
+		PFIL_LOCK_DESTROY_REAL((p)->ph_plock);	\
+} while (0)
+#define PFIL_TRY_RLOCK(p, t)	rm_try_rlock((p)->ph_plock, (t))
+#define PFIL_RLOCK(p, t)	rm_rlock((p)->ph_plock, (t))
+#define PFIL_WLOCK(p)		rm_wlock((p)->ph_plock)
+#define PFIL_RUNLOCK(p, t)	rm_runlock((p)->ph_plock, (t))
+#define PFIL_WUNLOCK(p)		rm_wunlock((p)->ph_plock)
+#define PFIL_WOWNED(p)		rm_wowned((p)->ph_plock)
+#define PFIL_LIST_LOCK()	mtx_lock(&pfil_global_lock)
+#define PFIL_LIST_UNLOCK()	mtx_unlock(&pfil_global_lock)
 
 static __inline struct packet_filter_hook *
 pfil_hook_get(int dir, struct pfil_head *ph)
-- 
1.7.9.4

-------------- next part --------------
>From 808f85d5bc8b0ac12bd9b11c6fa1f8b44ad936cd Mon Sep 17 00:00:00 2001
From: "Alexander V. Chernikov" <melifaro at ipfw.ru>
Date: Wed, 5 Sep 2012 13:10:15 +0400
Subject: [PATCH 2/2] Make ipfw use pfil hooks

---
 sys/netinet/ipfw/ip_fw2.c        |   22 ++++++++++++++++++++++
 sys/netinet/ipfw/ip_fw_nat.c     |   18 ++++++++++++++++++
 sys/netinet/ipfw/ip_fw_private.h |   17 +++++++++--------
 sys/netinet/ipfw/ip_fw_sockopt.c |    4 ++++
 sys/netinet/ipfw/ip_fw_table.c   |    1 +
 5 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c
index 352fd4a..bee16a3 100644
--- a/sys/netinet/ipfw/ip_fw2.c
+++ b/sys/netinet/ipfw/ip_fw2.c
@@ -62,6 +62,7 @@ __FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw2.c 240099 2012-09-04 19:43:26Z m
 #include <net/route.h>
 #include <net/pf_mtag.h>
 #include <net/vnet.h>
+#include <net/pfil.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
@@ -785,6 +786,10 @@ set_match(struct ip_fw_args *args, int slot,
  * All arguments are in args so we can modify them and return them
  * back to the caller.
  *
+ * We assume ipfw_chk is ALWAYS called from PFIL hook so
+ * read lock is alredy held. If this is not the case, PFIL
+ * read lock has to be acquired manually before calling ipfw_chk()
+ *
  * Parameters:
  *
  *	args->m	(in/out) The packet; we set to NULL when/if we nuke it.
@@ -1203,9 +1208,13 @@ do {								\
 		args->f_id.dst_port = dst_port = ntohs(dst_port);
 	}
 
+#if defined(__linux__) || defined(_WIN32)
 	IPFW_RLOCK(chain);
+#endif
 	if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */
+#if defined(__linux__) || defined(_WIN32)
 		IPFW_RUNLOCK(chain);
+#endif
 		return (IP_FW_PASS);	/* accept */
 	}
 	if (args->rule.slot) {
@@ -2476,7 +2485,9 @@ do {								\
 		retval = IP_FW_DENY;
 		printf("ipfw: ouch!, skip past end of rules, denying packet\n");
 	}
+#if defined(__linux__) || defined(_WIN32)
 	IPFW_RUNLOCK(chain);
+#endif
 #ifdef __FreeBSD__
 	if (ucred_cache != NULL)
 		crfree(ucred_cache);
@@ -2639,6 +2650,17 @@ vnet_ipfw_init(const void *unused)
 	chain->id = rule->id = 1;
 
 	IPFW_LOCK_INIT(chain);
+#ifdef __FreeBSD__
+#ifdef INET
+	chain->phead = pfil_head_get(PFIL_TYPE_AF, AF_INET);
+#else
+	chain->phead = pfil_head_get(PFIL_TYPE_AF, AF_INET6);
+#endif
+	if (error) {
+		printf("ipfw2: PFIL lock setup failed");
+		return (ENOENT);
+	}
+#endif
 	ipfw_dyn_init();
 
 	/* First set up some values that are compile time options */
diff --git a/sys/netinet/ipfw/ip_fw_nat.c b/sys/netinet/ipfw/ip_fw_nat.c
index 7b3f3a3..1e9b6af 100644
--- a/sys/netinet/ipfw/ip_fw_nat.c
+++ b/sys/netinet/ipfw/ip_fw_nat.c
@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_nat.c 231991 2012-02-22 04:19:33
 #include <netinet/libalias/alias_local.h>
 
 #include <net/if.h>
+#include <net/pfil.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
@@ -201,6 +202,13 @@ add_redir_spool_cfg(char *buf, struct cfg_nat *ptr)
 	}
 }
 
+/*
+ * ipfw_nat - perform mbuf header translation.
+ *
+ * We assume ipfw_nat is ALWAYS called from ipfw_chk so
+ * PFIL read lock is alredy held. If this is not the case,
+ * read lock has to be acquired manually before calling ipfw_nat()
+ */
 static int
 ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m)
 {
@@ -268,7 +276,9 @@ ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m)
 
 		found = 0;
 		chain = &V_layer3_chain;
+#if defined(__linux__) || defined(_WIN32)
 		IPFW_RLOCK(chain);
+#endif
 		/* Check every nat entry... */
 		LIST_FOREACH(t, &chain->nat, _next) {
 			if ((t->mode & PKT_ALIAS_SKIP_GLOBAL) != 0)
@@ -281,7 +291,9 @@ ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m)
 				break;
 			}
 		}
+#if defined(__linux__) || defined(_WIN32)
 		IPFW_RUNLOCK(chain);
+#endif
 		if (found != 1) {
 			/* No instance found, return ignore */
 			args->m = mcl;
@@ -493,6 +505,9 @@ ipfw_nat_get_cfg(struct sockopt *sopt)
 	struct cfg_spool *s;
 	char *data;
 	int gencnt, nat_cnt, len, error;
+#ifdef __FreeBSD__
+	struct rm_priotracker	tracker;
+#endif
 
 	nat_cnt = 0;
 	len = sizeof(nat_cnt);
@@ -551,6 +566,9 @@ ipfw_nat_get_log(struct sockopt *sopt)
 	struct cfg_nat *ptr;
 	int i, size;
 	struct ip_fw_chain *chain;
+#ifdef __FreeBSD__
+	struct rm_priotracker	tracker;
+#endif
 
 	chain = &V_layer3_chain;
 
diff --git a/sys/netinet/ipfw/ip_fw_private.h b/sys/netinet/ipfw/ip_fw_private.h
index 1cb1115..44908d8 100644
--- a/sys/netinet/ipfw/ip_fw_private.h
+++ b/sys/netinet/ipfw/ip_fw_private.h
@@ -212,6 +212,8 @@ VNET_DECLARE(int, autoinc_step);
 VNET_DECLARE(unsigned int, fw_tables_max);
 #define V_fw_tables_max		VNET(fw_tables_max)
 
+struct pfil_head;
+
 struct ip_fw_chain {
 	struct ip_fw	*rules;		/* list of rules */
 	struct ip_fw	*reap;		/* list of rules to reap */
@@ -227,7 +229,7 @@ struct ip_fw_chain {
 	spinlock_t rwmtx;
 	spinlock_t uh_lock;
 #else
-	struct rwlock	rwmtx;
+	struct pfil_head	*phead;	/* PFIL head used for locking */
 	struct rwlock	uh_lock;	/* lock for upper half */
 #endif
 	uint32_t	id;		/* ruleset id */
@@ -241,22 +243,21 @@ struct sockopt;	/* used by tcp_var.h */
  * so the variable and the macros must be here.
  */
 
+/* Additional locking init is done in vnet_ipfw_init() */
 #define	IPFW_LOCK_INIT(_chain) do {			\
-	rw_init(&(_chain)->rwmtx, "IPFW static rules");	\
 	rw_init(&(_chain)->uh_lock, "IPFW UH lock");	\
 	} while (0)
 
 #define	IPFW_LOCK_DESTROY(_chain) do {			\
-	rw_destroy(&(_chain)->rwmtx);			\
 	rw_destroy(&(_chain)->uh_lock);			\
 	} while (0)
 
-#define	IPFW_WLOCK_ASSERT(_chain)	rw_assert(&(_chain)->rwmtx, RA_WLOCKED)
+#define	IPFW_WLOCK_ASSERT(_chain)
 
-#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx)
-#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx)
-#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx)
-#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx)
+#define IPFW_RLOCK(p) pfil_rlock((p)->phead, &tracker)
+#define IPFW_RUNLOCK(p) pfil_runlock((p)->phead, &tracker)
+#define IPFW_WLOCK(p) pfil_wlock((p)->phead)
+#define IPFW_WUNLOCK(p)	pfil_wunlock((p)->phead)
 
 #define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock)
 #define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock)
diff --git a/sys/netinet/ipfw/ip_fw_sockopt.c b/sys/netinet/ipfw/ip_fw_sockopt.c
index a245143..b67b25d 100644
--- a/sys/netinet/ipfw/ip_fw_sockopt.c
+++ b/sys/netinet/ipfw/ip_fw_sockopt.c
@@ -56,6 +56,7 @@ __FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_sockopt.c 233745 2012-03-31 11:2
 #include <net/if.h>
 #include <net/route.h>
 #include <net/vnet.h>
+#include <net/pfil.h>
 
 #include <netinet/in.h>
 #include <netinet/ip_var.h> /* hooks */
@@ -953,6 +954,9 @@ ipfw_ctl(struct sockopt *sopt)
 	u_int32_t rulenum[2];
 	uint32_t opt;
 	char xbuf[128];
+#ifdef __FreeBSD__
+	struct rm_priotracker	tracker;
+#endif
 	ip_fw3_opheader *op3 = NULL;
 
 	error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW);
diff --git a/sys/netinet/ipfw/ip_fw_table.c b/sys/netinet/ipfw/ip_fw_table.c
index d05c916..3eb412d 100644
--- a/sys/netinet/ipfw/ip_fw_table.c
+++ b/sys/netinet/ipfw/ip_fw_table.c
@@ -55,6 +55,7 @@ __FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_table.c 238265 2012-07-08 21:13:
 #include <sys/socket.h>
 #include <net/if.h>	/* ip_fw.h requires IFNAMSIZ */
 #include <net/radix.h>
+#include <net/pfil.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
-- 
1.7.9.4



More information about the freebsd-net mailing list