svn commit: r346213 - in stable/12: sbin/ipfw sys/conf sys/modules/ipfw_nat64 sys/netinet6 sys/netpfil/ipfw/nat64

Andrey V. Elsukov ae at
Sun Apr 14 13:08:20 UTC 2019

Author: ae
Date: Sun Apr 14 13:08:18 2019
New Revision: 346213

  MFC r345293:
      Update NAT64LSN implementation:
      o most of the data structures and their relations were modified to be
        able to support a large number of translation states. Now each
        supported protocol can use the full port range. Port groups now belong
        to IPv4 alias addresses, not hosts. Each port group can keep several
        state chunks. This is controlled with the new `states_chunks` config
        option. State chunks allow several translation states to exist for a
        single alias address and port, but for different destination addresses.
      o by default all hash tables now use jenkins hash.
      o ConcurrencyKit and epoch(9) are used to make NAT64LSN lockless on the fast path.
      o one NAT64LSN instance now can be used to handle several IPv6 prefixes,
        special prefix "::" value should be used for this purpose when instance
        is created.
      o due to modified internal data structures relations, the socket opcode
        that does states listing was changed.
    Obtained from:	Yandex LLC
    Sponsored by:	Yandex LLC
  MFC r345294:
    Remove extra spaces.

Directory Properties:
  stable/12/   (props changed)

Modified: stable/12/sbin/ipfw/ipfw.8
--- stable/12/sbin/ipfw/ipfw.8	Sun Apr 14 12:39:09 2019	(r346212)
+++ stable/12/sbin/ipfw/ipfw.8	Sun Apr 14 13:08:18 2019	(r346213)
@@ -1,7 +1,7 @@
 .\" $FreeBSD$
-.Dd March 18, 2019
+.Dd March 19, 2019
 .Dt IPFW 8
@@ -3300,6 +3300,7 @@ See
 for more info.
+.Ss Stateful translation
 supports in-kernel IPv6/IPv4 network address and protocol translation.
 Stateful NAT64 translation allows IPv6-only clients to contact IPv4 servers
@@ -3317,7 +3318,8 @@ to be able use stateful NAT64 translator.
 Stateful NAT64 uses a bunch of memory for several types of objects.
 When IPv6 client initiates connection, NAT64 translator creates a host entry
 in the states table.
-Each host entry has a number of ports group entries allocated on demand.
+Each host entry uses preallocated IPv4 alias entry.
+Each alias entry has a number of ports group entries allocated on demand.
 Ports group entries contains connection state entries.
 There are several options to control limits and lifetime for these objects.
@@ -3337,6 +3339,11 @@ First time an original packet is handled and consumed 
 and then it is handled again as translated packet.
 This behavior can be changed by sysctl variable 
 .Va net.inet.ip.fw.nat64_direct_output .
+Also translated packet can be tagged using
+.Cm tag
+rule action, and then matched by
+.Cm tagged
+opcode to avoid loops and extra overhead.
 The stateful NAT64 configuration command is the following:
 .Bd -ragged -offset indent
@@ -3364,15 +3371,16 @@ to represent IPv4 addresses. This IPv6 prefix should b
 The translator implementation follows RFC6052, that restricts the length of
 prefixes to one of following: 32, 40, 48, 56, 64, or 96.
 The Well-Known IPv6 Prefix 64:ff9b:: must be 96 bits long.
-.It Cm max_ports Ar number
-Maximum number of ports reserved for upper level protocols to one IPv6 client.
-All reserved ports are divided into chunks between supported protocols.
-The number of connections from one IPv6 client is limited by this option.
-Note that closed TCP connections still remain in the list of connections until
-.Cm tcp_close_age
-interval will not expire.
-Default value is
-.Ar 2048 .
+The special
+.Ar ::/length
+prefix can be used to handle several IPv6 prefixes with one NAT64 instance.
+The NAT64 instance will determine a destination IPv4 address from prefix
+.Ar length .
+.It Cm states_chunks Ar number
+The number of states chunks in single ports group.
+Each ports group by default can keep 64 state entries in single chunk.
+The above value affects the maximum number of states that can be associated with single IPv4 alias address and port.
+The value must be power of 2, and up to 128.
 .It Cm host_del_age Ar seconds
 The number of seconds until the host entry for a IPv6 client will be deleted
 and all its resources will be released due to inactivity.

Modified: stable/12/sbin/ipfw/ipfw2.h
--- stable/12/sbin/ipfw/ipfw2.h	Sun Apr 14 12:39:09 2019	(r346212)
+++ stable/12/sbin/ipfw/ipfw2.h	Sun Apr 14 13:08:18 2019	(r346213)
@@ -278,6 +278,7 @@ enum tokens {

Modified: stable/12/sbin/ipfw/nat64lsn.c
--- stable/12/sbin/ipfw/nat64lsn.c	Sun Apr 14 12:39:09 2019	(r346212)
+++ stable/12/sbin/ipfw/nat64lsn.c	Sun Apr 14 13:08:18 2019	(r346213)
@@ -87,68 +87,70 @@ nat64lsn_print_states(void *buf)
 	char sflags[4], *sf, *proto;
 	ipfw_obj_header *oh;
 	ipfw_obj_data *od;
-	ipfw_nat64lsn_stg *stg;
-	ipfw_nat64lsn_state *ste;
+	ipfw_nat64lsn_stg_v1 *stg;
+	ipfw_nat64lsn_state_v1 *ste;
 	uint64_t next_idx;
 	int i, sz;
 	oh = (ipfw_obj_header *)buf;
 	od = (ipfw_obj_data *)(oh + 1);
-	stg = (ipfw_nat64lsn_stg *)(od + 1);
+	stg = (ipfw_nat64lsn_stg_v1 *)(od + 1);
 	sz = od->head.length - sizeof(*od);
 	next_idx = 0;
 	while (sz > 0 && next_idx != 0xFF) {
-		next_idx = stg->next_idx;
+		next_idx = stg->next.index;
 		sz -= sizeof(*stg);
 		if (stg->count == 0) {
-		switch (stg->proto) {
-		case IPPROTO_TCP:
-			proto = "TCP";
-			break;
-		case IPPROTO_UDP:
-			proto = "UDP";
-			break;
-			proto = "ICMPv6";
-			break;
-		}
-		inet_ntop(AF_INET6, &stg->host6, s, sizeof(s));
+		/*
+		 * NOTE: addresses are in network byte order,
+		 * ports are in host byte order.
+		 */
 		inet_ntop(AF_INET, &stg->alias4, a, sizeof(a));
-		ste = (ipfw_nat64lsn_state *)(stg + 1);
+		ste = (ipfw_nat64lsn_state_v1 *)(stg + 1);
 		for (i = 0; i < stg->count && sz > 0; i++) {
 			sf = sflags;
+			inet_ntop(AF_INET6, &ste->host6, s, sizeof(s));
 			inet_ntop(AF_INET, &ste->daddr, f, sizeof(f));
-			if (stg->proto == IPPROTO_TCP) {
+			switch (ste->proto) {
+			case IPPROTO_TCP:
+				proto = "TCP";
 				if (ste->flags & 0x02)
 					*sf++ = 'S';
 				if (ste->flags & 0x04)
 					*sf++ = 'E';
 				if (ste->flags & 0x01)
 					*sf++ = 'F';
+				break;
+			case IPPROTO_UDP:
+				proto = "UDP";
+				break;
+			case IPPROTO_ICMP:
+				proto = "ICMPv6";
+				break;
 			*sf = '\0';
-			switch (stg->proto) {
+			switch (ste->proto) {
 			case IPPROTO_TCP:
 			case IPPROTO_UDP:
 				    s, ste->sport, a, ste->aport, proto,
 				    sflags, ste->idle, f, ste->dport);
-			case IPPROTO_ICMPV6:
+			case IPPROTO_ICMP:
 				    s, a, proto, ste->idle, f);
-				    s, a, stg->proto, ste->idle, f);
+				    s, a, ste->proto, ste->idle, f);
 			sz -= sizeof(*ste);
-		stg = (ipfw_nat64lsn_stg *)ste;
+		stg = (ipfw_nat64lsn_stg_v1 *)ste;
 	return (next_idx);
@@ -174,6 +176,7 @@ nat64lsn_states_cb(ipfw_nat64lsn_cfg *cfg, const char 
 		err(EX_OSERR, NULL);
 	do {
 		oh = (ipfw_obj_header *)buf;
+		oh->opheader.version = 1; /* Force using ov new API */
 		od = (ipfw_obj_data *)(oh + 1);
 		nat64lsn_fill_ntlv(&oh->ntlv, cfg->name, set);
 		od->head.type = IPFW_TLV_OBJDATA;
@@ -363,12 +366,8 @@ nat64lsn_parse_int(const char *arg, const char *desc)
 static struct _s_x nat64newcmds[] = {
       { "prefix6",	TOK_PREFIX6 },
-      { "agg_len",	TOK_AGG_LEN }, /* not yet */
-      { "agg_count",	TOK_AGG_COUNT }, /* not yet */
-      { "port_range",	TOK_PORT_RANGE }, /* not yet */
       { "jmaxlen",	TOK_JMAXLEN },
       { "prefix4",	TOK_PREFIX4 },
-      { "max_ports",	TOK_MAX_PORTS },
       { "host_del_age",	TOK_HOST_DEL_AGE },
       { "pg_del_age",	TOK_PG_DEL_AGE },
       { "tcp_syn_age",	TOK_TCP_SYN_AGE },
@@ -376,10 +375,13 @@ static struct _s_x nat64newcmds[] = {
       { "tcp_est_age",	TOK_TCP_EST_AGE },
       { "udp_age",	TOK_UDP_AGE },
       { "icmp_age",	TOK_ICMP_AGE },
+      { "states_chunks",TOK_STATES_CHUNKS },
       { "log",		TOK_LOG },
       { "-log",		TOK_LOGOFF },
       { "allow_private", TOK_PRIVATE },
       { "-allow_private", TOK_PRIVATEOFF },
+      /* for compatibility with old configurations */
+      { "max_ports",	TOK_MAX_PORTS },	/* unused */
       { NULL, 0 }
@@ -436,42 +438,17 @@ nat64lsn_create(const char *name, uint8_t set, int ac,
 			nat64lsn_parse_prefix(*av, AF_INET6, &cfg->prefix6,
 			if (ipfw_check_nat64prefix(&cfg->prefix6,
-			    cfg->plen6) != 0)
+			    cfg->plen6) != 0 &&
+			    !IN6_IS_ADDR_UNSPECIFIED(&cfg->prefix6))
 				errx(EX_USAGE, "Bad prefix6 %s", *av);
 			ac--; av++;
-#if 0
-		case TOK_AGG_LEN:
-			NEED1("Aggregation prefix len required");
-			cfg->agg_prefix_len = nat64lsn_parse_int(*av, opt);
-			ac--; av++;
-			break;
-		case TOK_AGG_COUNT:
-			NEED1("Max per-prefix count required");
-			cfg->agg_prefix_max = nat64lsn_parse_int(*av, opt);
-			ac--; av++;
-			break;
-			NEED1("port range x[:y] required");
-			if ((p = strchr(*av, ':')) == NULL)
-				cfg->min_port = (uint16_t)nat64lsn_parse_int(
-				    *av, opt);
-			else {
-				*p++ = '\0';
-				cfg->min_port = (uint16_t)nat64lsn_parse_int(
-				    *av, opt);
-				cfg->max_port = (uint16_t)nat64lsn_parse_int(
-				    p, opt);
-			}
-			ac--; av++;
-			break;
 		case TOK_JMAXLEN:
 			NEED1("job queue length required");
 			cfg->jmaxlen = nat64lsn_parse_int(*av, opt);
 			ac--; av++;
 		case TOK_MAX_PORTS:
 			NEED1("Max per-user ports required");
 			cfg->max_ports = nat64lsn_parse_int(*av, opt);
@@ -519,6 +496,12 @@ nat64lsn_create(const char *name, uint8_t set, int ac,
 			    *av, opt);
 			ac--; av++;
+			NEED1("number of chunks required");
+			cfg->states_chunks = (uint8_t)nat64lsn_parse_int(
+			    *av, opt);
+			ac--; av++;
+			break;
 		case TOK_LOG:
 			cfg->flags |= NAT64_LOG;
@@ -630,6 +613,12 @@ nat64lsn_config(const char *name, uint8_t set, int ac,
 			    *av, opt);
 			ac--; av++;
+			NEED1("number of chunks required");
+			cfg->states_chunks = (uint8_t)nat64lsn_parse_int(
+			    *av, opt);
+			ac--; av++;
+			break;
 		case TOK_LOG:
 			cfg->flags |= NAT64_LOG;
@@ -789,31 +778,24 @@ nat64lsn_show_cb(ipfw_nat64lsn_cfg *cfg, const char *n
 	printf("nat64lsn %s prefix4 %s/%u", cfg->name, abuf, cfg->plen4);
 	inet_ntop(AF_INET6, &cfg->prefix6, abuf, sizeof(abuf));
 	printf(" prefix6 %s/%u", abuf, cfg->plen6);
-#if 0
-	printf("agg_len %u agg_count %u ", cfg->agg_prefix_len,
-	    cfg->agg_prefix_max);
-	if (cfg->min_port != NAT64LSN_PORT_MIN ||
-	    cfg->max_port != NAT64LSN_PORT_MAX)
-		printf(" port_range %u:%u", cfg->min_port, cfg->max_port);
-	if (cfg->jmaxlen != NAT64LSN_JMAXLEN)
-		printf(" jmaxlen %u ", cfg->jmaxlen);
-	if (cfg->max_ports != NAT64LSN_MAX_PORTS)
-		printf(" max_ports %u", cfg->max_ports);
-	if (cfg->nh_delete_delay != NAT64LSN_HOST_AGE)
+	if (co.verbose || cfg->states_chunks > 1)
+		printf(" states_chunks %u", cfg->states_chunks);
+	if (co.verbose || cfg->nh_delete_delay != NAT64LSN_HOST_AGE)
 		printf(" host_del_age %u", cfg->nh_delete_delay);
-	if (cfg->pg_delete_delay != NAT64LSN_PG_AGE)
-		printf(" pg_del_age %u ", cfg->pg_delete_delay);
-	if (cfg->st_syn_ttl != NAT64LSN_TCP_SYN_AGE)
+	if (co.verbose || cfg->pg_delete_delay != NAT64LSN_PG_AGE)
+		printf(" pg_del_age %u", cfg->pg_delete_delay);
+	if (co.verbose || cfg->st_syn_ttl != NAT64LSN_TCP_SYN_AGE)
 		printf(" tcp_syn_age %u", cfg->st_syn_ttl);
-	if (cfg->st_close_ttl != NAT64LSN_TCP_FIN_AGE)
+	if (co.verbose || cfg->st_close_ttl != NAT64LSN_TCP_FIN_AGE)
 		printf(" tcp_close_age %u", cfg->st_close_ttl);
-	if (cfg->st_estab_ttl != NAT64LSN_TCP_EST_AGE)
+	if (co.verbose || cfg->st_estab_ttl != NAT64LSN_TCP_EST_AGE)
 		printf(" tcp_est_age %u", cfg->st_estab_ttl);
-	if (cfg->st_udp_ttl != NAT64LSN_UDP_AGE)
+	if (co.verbose || cfg->st_udp_ttl != NAT64LSN_UDP_AGE)
 		printf(" udp_age %u", cfg->st_udp_ttl);
-	if (cfg->st_icmp_ttl != NAT64LSN_ICMP_AGE)
+	if (co.verbose || cfg->st_icmp_ttl != NAT64LSN_ICMP_AGE)
 		printf(" icmp_age %u", cfg->st_icmp_ttl);
+	if (co.verbose || cfg->jmaxlen != NAT64LSN_JMAXLEN)
+		printf(" jmaxlen %u", cfg->jmaxlen);
 	if (cfg->flags & NAT64_LOG)
 		printf(" log");
 	if (cfg->flags & NAT64_ALLOW_PRIVATE)

Modified: stable/12/sys/conf/files
--- stable/12/sys/conf/files	Sun Apr 14 12:39:09 2019	(r346212)
+++ stable/12/sys/conf/files	Sun Apr 14 13:08:18 2019	(r346213)
@@ -4448,9 +4448,9 @@ netpfil/ipfw/nat64/nat64clat.c	optional inet inet6 ipf
 netpfil/ipfw/nat64/nat64clat_control.c	optional inet inet6 ipfirewall \
 netpfil/ipfw/nat64/nat64lsn.c	optional inet inet6 ipfirewall \
-	ipfirewall_nat64
+	ipfirewall_nat64 compile-with "${NORMAL_C} -I$S/contrib/ck/include"
 netpfil/ipfw/nat64/nat64lsn_control.c	optional inet inet6 ipfirewall \
-	ipfirewall_nat64
+	ipfirewall_nat64 compile-with "${NORMAL_C} -I$S/contrib/ck/include"
 netpfil/ipfw/nat64/nat64stl.c	optional inet inet6 ipfirewall \
 netpfil/ipfw/nat64/nat64stl_control.c	optional inet inet6 ipfirewall \

Modified: stable/12/sys/modules/ipfw_nat64/Makefile
--- stable/12/sys/modules/ipfw_nat64/Makefile	Sun Apr 14 12:39:09 2019	(r346212)
+++ stable/12/sys/modules/ipfw_nat64/Makefile	Sun Apr 14 13:08:18 2019	(r346213)
@@ -8,4 +8,6 @@ SRCS+=	nat64clat.c nat64clat_control.c
 SRCS+=	nat64lsn.c nat64lsn_control.c
 SRCS+=	nat64stl.c nat64stl_control.c
+CFLAGS+= -I${SRCTOP}/sys/contrib/ck/include
 .include <bsd.kmod.mk>

Modified: stable/12/sys/netinet6/ip_fw_nat64.h
--- stable/12/sys/netinet6/ip_fw_nat64.h	Sun Apr 14 12:39:09 2019	(r346212)
+++ stable/12/sys/netinet6/ip_fw_nat64.h	Sun Apr 14 13:08:18 2019	(r346213)
@@ -122,7 +122,7 @@ typedef struct _ipfw_nat64clat_cfg {
  * NAT64LSN default configuration values
-#define	NAT64LSN_MAX_PORTS	2048	/* Max number of ports per host */
+#define	NAT64LSN_MAX_PORTS	2048	/* Unused */
 #define	NAT64LSN_JMAXLEN	2048	/* Max outstanding requests. */
 #define	NAT64LSN_TCP_SYN_AGE	10	/* State's TTL after SYN received. */
 #define	NAT64LSN_TCP_EST_AGE	(2 * 3600) /* TTL for established connection */
@@ -135,16 +135,20 @@ typedef struct _ipfw_nat64clat_cfg {
 typedef struct _ipfw_nat64lsn_cfg {
 	char		name[64];	/* NAT name			*/
 	uint32_t	flags;
-	uint32_t	max_ports;	/* Max ports per client */
-	uint32_t	agg_prefix_len;	/* Prefix length to count */
-	uint32_t	agg_prefix_max;	/* Max hosts per agg prefix */
+	uint32_t	max_ports;      /* Unused */
+	uint32_t	agg_prefix_len; /* Unused */
+	uint32_t	agg_prefix_max; /* Unused */
 	struct in_addr	prefix4;
 	uint16_t	plen4;		/* Prefix length */
 	uint16_t	plen6;		/* Prefix length */
 	struct in6_addr	prefix6;	/* NAT64 prefix */
 	uint32_t	jmaxlen;	/* Max jobqueue length */
-	uint16_t	min_port;	/* Min port group # to use */
-	uint16_t	max_port;	/* Max port group # to use */
+	uint16_t	min_port;	/* Unused */
+	uint16_t	max_port;	/* Unused */
 	uint16_t	nh_delete_delay;/* Stale host delete delay */
 	uint16_t	pg_delete_delay;/* Stale portgroup delete delay */
 	uint16_t	st_syn_ttl;	/* TCP syn expire */
@@ -153,7 +157,7 @@ typedef struct _ipfw_nat64lsn_cfg {
 	uint16_t	st_udp_ttl;	/* UDP expire */
 	uint16_t	st_icmp_ttl;	/* ICMP expire */
 	uint8_t		set;		/* Named instance set [0..31] */
-	uint8_t		spare;
+	uint8_t		states_chunks;	/* Number of states chunks per PG */
 } ipfw_nat64lsn_cfg;
 typedef struct _ipfw_nat64lsn_state {
@@ -177,5 +181,30 @@ typedef struct _ipfw_nat64lsn_stg {
 	uint32_t	spare2;
 } ipfw_nat64lsn_stg;
-#endif /* _NETINET6_IP_FW_NAT64_H_ */
+typedef struct _ipfw_nat64lsn_state_v1 {
+	struct in6_addr	host6;		/* Bound IPv6 host */
+	struct in_addr	daddr;		/* Remote IPv4 address */
+	uint16_t	dport;		/* Remote destination port */
+	uint16_t	aport;		/* Local alias port */
+	uint16_t	sport;		/* Source port */
+	uint16_t	spare;
+	uint16_t	idle;		/* Last used time */
+	uint8_t		flags;		/* State flags */
+	uint8_t		proto;		/* protocol */
+} ipfw_nat64lsn_state_v1;
+typedef struct _ipfw_nat64lsn_stg_v1 {
+	union nat64lsn_pgidx {
+		uint64_t	index;
+		struct {
+			uint8_t		chunk;	/* states chunk */
+			uint8_t		proto;	/* protocol */
+			uint16_t	port;	/* base port */
+			in_addr_t	addr;	/* alias address */
+		};
+	} next;				/* next state index */
+	struct in_addr	alias4;		/* IPv4 alias address */
+	uint32_t	count;		/* Number of states */
+} ipfw_nat64lsn_stg_v1;
+#endif /* _NETINET6_IP_FW_NAT64_H_ */

Modified: stable/12/sys/netpfil/ipfw/nat64/nat64lsn.c
--- stable/12/sys/netpfil/ipfw/nat64/nat64lsn.c	Sun Apr 14 12:39:09 2019	(r346212)
+++ stable/12/sys/netpfil/ipfw/nat64/nat64lsn.c	Sun Apr 14 13:08:18 2019	(r346213)
@@ -33,16 +33,17 @@ __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/counter.h>
+#include <sys/ck.h>
+#include <sys/epoch.h>
 #include <sys/errno.h>
+#include <sys/hash.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/rmlock.h>
-#include <sys/rwlock.h>
 #include <sys/socket.h>
-#include <sys/queue.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
@@ -71,17 +72,22 @@ __FBSDID("$FreeBSD$");
-static void nat64lsn_periodic(void *data);
-static uint8_t nat64lsn_proto_map[256];
-uint8_t nat64lsn_rproto_map[NAT_MAX_PROTO];
+static epoch_t nat64lsn_epoch;
+#define	NAT64LSN_EPOCH_ENTER(et)  epoch_enter_preempt(nat64lsn_epoch, &(et))
+#define	NAT64LSN_EPOCH_EXIT(et)   epoch_exit_preempt(nat64lsn_epoch, &(et))
+#define	NAT64LSN_EPOCH_WAIT()     epoch_wait_preempt(nat64lsn_epoch)
+#define	NAT64LSN_EPOCH_ASSERT()   MPASS(in_epoch(nat64lsn_epoch))
+#define	NAT64LSN_EPOCH_CALL(c, f) epoch_call(nat64lsn_epoch, (c), (f))
-#define	NAT64_FLAG_FIN		0x01	/* FIN was seen */
-#define	NAT64_FLAG_SYN		0x02	/* First syn in->out */
-#define	NAT64_FLAG_ESTAB	0x04	/* Packet with Ack */
+static uma_zone_t nat64lsn_host_zone;
+static uma_zone_t nat64lsn_pgchunk_zone;
+static uma_zone_t nat64lsn_pg_zone;
+static uma_zone_t nat64lsn_aliaslink_zone;
+static uma_zone_t nat64lsn_state_zone;
+static uma_zone_t nat64lsn_job_zone;
-#define	NAT64_FLAG_RDR		0x80	/* Port redirect */
+static void nat64lsn_periodic(void *data);
+#define	PERIODIC_DELAY		4
 #define	NAT64_LOOKUP(chain, cmd)	\
 	(struct nat64lsn_cfg *)SRV_OBJECT((chain), (cmd)->arg1)
@@ -91,25 +97,33 @@ uint8_t nat64lsn_rproto_map[NAT_MAX_PROTO];
 enum nat64lsn_jtype {
 struct nat64lsn_job_item {
-	TAILQ_ENTRY(nat64lsn_job_item)	next;
+	STAILQ_ENTRY(nat64lsn_job_item)	entries;
 	enum nat64lsn_jtype	jtype;
-	struct nat64lsn_host	*nh;
-	struct nat64lsn_portgroup	*pg;
-	void			*spare_idx;
-	struct in6_addr		haddr;
-	uint8_t			nat_proto;
-	uint8_t			done;
-	int			needs_idx;
-	int			delcount;
-	unsigned int		fhash;	/* Flow hash */
-	uint32_t		aaddr;	/* Last used address (net) */
-	struct mbuf		*m;
-	struct ipfw_flow_id	f_id;
-	uint64_t		delmask[NAT64LSN_PGPTRNMASK];
+	union {
+		struct { /* used by JTYPE_NEWHOST, JTYPE_NEWPORTGROUP */
+			struct mbuf		*m;
+			struct nat64lsn_host	*host;
+			struct nat64lsn_state	*state;
+			uint32_t		src6_hval;
+			uint32_t		state_hval;
+			struct ipfw_flow_id	f_id;
+			in_addr_t		faddr;
+			uint16_t		port;
+			uint8_t			proto;
+			uint8_t			done;
+		};
+		struct { /* used by JTYPE_DESTROY */
+			struct nat64lsn_hosts_slist	hosts;
+			struct nat64lsn_pg_slist	portgroups;
+			struct nat64lsn_pgchunk		*pgchunk;
+			struct epoch_context		epoch_ctx;
+		};
+	};
 static struct mtx jmtx;
@@ -118,143 +132,311 @@ static struct mtx jmtx;
 #define	JQUEUE_LOCK()		mtx_lock(&jmtx)
 #define	JQUEUE_UNLOCK()		mtx_unlock(&jmtx)
+static int nat64lsn_alloc_host(struct nat64lsn_cfg *cfg,
+    struct nat64lsn_job_item *ji);
+static int nat64lsn_alloc_pg(struct nat64lsn_cfg *cfg,
+    struct nat64lsn_job_item *ji);
+static struct nat64lsn_job_item *nat64lsn_create_job(
+    struct nat64lsn_cfg *cfg, int jtype);
 static void nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg,
     struct nat64lsn_job_item *ji);
-static void nat64lsn_enqueue_jobs(struct nat64lsn_cfg *cfg,
-    struct nat64lsn_job_head *jhead, int jlen);
+static void nat64lsn_job_destroy(epoch_context_t ctx);
+static void nat64lsn_destroy_host(struct nat64lsn_host *host);
+static void nat64lsn_destroy_pg(struct nat64lsn_pg *pg);
-static struct nat64lsn_job_item *nat64lsn_create_job(struct nat64lsn_cfg *cfg,
-    const struct ipfw_flow_id *f_id, int jtype);
-static int nat64lsn_request_portgroup(struct nat64lsn_cfg *cfg,
-    const struct ipfw_flow_id *f_id, struct mbuf **pm, uint32_t aaddr,
-    int needs_idx);
-static int nat64lsn_request_host(struct nat64lsn_cfg *cfg,
-    const struct ipfw_flow_id *f_id, struct mbuf **pm);
 static int nat64lsn_translate4(struct nat64lsn_cfg *cfg,
-    const struct ipfw_flow_id *f_id, struct mbuf **pm);
+    const struct ipfw_flow_id *f_id, struct mbuf **mp);
 static int nat64lsn_translate6(struct nat64lsn_cfg *cfg,
-    struct ipfw_flow_id *f_id, struct mbuf **pm);
+    struct ipfw_flow_id *f_id, struct mbuf **mp);
+static int nat64lsn_translate6_internal(struct nat64lsn_cfg *cfg,
+    struct mbuf **mp, struct nat64lsn_state *state, uint8_t flags);
-static int alloc_portgroup(struct nat64lsn_job_item *ji);
-static void destroy_portgroup(struct nat64lsn_portgroup *pg);
-static void destroy_host6(struct nat64lsn_host *nh);
-static int alloc_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji);
+#define	NAT64_BIT_TCP_FIN	0	/* FIN was seen */
+#define	NAT64_BIT_TCP_SYN	1	/* First syn in->out */
+#define	NAT64_BIT_TCP_ESTAB	2	/* Packet with Ack */
+#define	NAT64_BIT_READY_IPV4	6	/* state is ready for translate4 */
+#define	NAT64_BIT_STALE		7	/* state is going to be expired */
-static int attach_portgroup(struct nat64lsn_cfg *cfg,
-    struct nat64lsn_job_item *ji);
-static int attach_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji);
+#define	NAT64_FLAG_FIN		(1 << NAT64_BIT_TCP_FIN)
+#define	NAT64_FLAG_SYN		(1 << NAT64_BIT_TCP_SYN)
+#define	NAT64_FLAG_ESTAB	(1 << NAT64_BIT_TCP_ESTAB)
+#define	NAT64_FLAG_READY	(1 << NAT64_BIT_READY_IPV4)
+#define	NAT64_FLAG_STALE	(1 << NAT64_BIT_STALE)
-/* XXX tmp */
-static uma_zone_t nat64lsn_host_zone;
-static uma_zone_t nat64lsn_pg_zone;
-static uma_zone_t nat64lsn_pgidx_zone;
+static inline uint8_t
+convert_tcp_flags(uint8_t flags)
+	uint8_t result;
-static unsigned int nat64lsn_periodic_chkstates(struct nat64lsn_cfg *cfg,
-    struct nat64lsn_host *nh);
+	result = flags & (TH_FIN|TH_SYN);
+	result |= (flags & TH_RST) >> 2; /* Treat RST as FIN */
+	result |= (flags & TH_ACK) >> 2; /* Treat ACK as estab */
-#define	I6_hash(x)		(djb_hash((const unsigned char *)(x), 16))
-#define	I6_first(_ph, h)	(_ph)[h]
-#define	I6_next(x)		(x)->next
-#define	I6_val(x)		(&(x)->addr)
-#define	I6_cmp(a, b)		IN6_ARE_ADDR_EQUAL(a, b)
-#define	I6_lock(a, b)
-#define	I6_unlock(a, b)
+	return (result);
-#define	I6HASH_FIND(_cfg, _res, _a) \
-	CHT_FIND(_cfg->ih, _cfg->ihsize, I6_, _res, _a)
-#define	I6HASH_INSERT(_cfg, _i)	\
-	CHT_INSERT_HEAD(_cfg->ih, _cfg->ihsize, I6_, _i)
-#define	I6HASH_REMOVE(_cfg, _res, _tmp, _a)	\
-	CHT_REMOVE(_cfg->ih, _cfg->ihsize, I6_, _res, _tmp, _a)
+static void
+nat64lsn_log(struct pfloghdr *plog, struct mbuf *m, sa_family_t family,
+    struct nat64lsn_state *state)
-#define	I6HASH_FOREACH_SAFE(_cfg, _x, _tmp, _cb, _arg)	\
-	CHT_FOREACH_SAFE(_cfg->ih, _cfg->ihsize, I6_, _x, _tmp, _cb, _arg)
+	memset(plog, 0, sizeof(*plog));
+	plog->length = PFLOG_REAL_HDRLEN;
+	plog->af = family;
+	plog->action = PF_NAT;
+	plog->dir = PF_IN;
+	plog->rulenr = htonl(state->ip_src);
+	plog->subrulenr = htonl((uint32_t)(state->aport << 16) |
+	    (state->proto << 8) | (state->ip_dst & 0xff));
+	plog->ruleset[0] = '\0';
+	strlcpy(plog->ifname, "NAT64LSN", sizeof(plog->ifname));
+	ipfw_bpf_mtap2(plog, PFLOG_HDRLEN, m);
-#define	HASH_IN4(x)	djb_hash((const unsigned char *)(x), 8)
+#define	HVAL(p, n, s)	jenkins_hash32((const uint32_t *)(p), (n), (s))
+#define	HOST_HVAL(c, a)	HVAL((a),\
+    sizeof(struct in6_addr) / sizeof(uint32_t), (c)->hash_seed)
+#define	HOSTS(c, v)	((c)->hosts_hash[(v) & ((c)->hosts_hashsize - 1)])
-static unsigned
-djb_hash(const unsigned char *h, const int len)
+#define	ALIASLINK_HVAL(c, f)	HVAL(&(f)->dst_ip6,\
+    sizeof(struct in6_addr) * 2 / sizeof(uint32_t), (c)->hash_seed)
+#define	ALIAS_BYHASH(c, v)	\
+    ((c)->aliases[(v) & ((1 << (32 - (c)->plen4)) - 1)])
+static struct nat64lsn_aliaslink*
+nat64lsn_get_aliaslink(struct nat64lsn_cfg *cfg __unused,
+    struct nat64lsn_host *host, const struct ipfw_flow_id *f_id __unused)
-	unsigned int result = 0;
-	int i;
-	for (i = 0; i < len; i++)
-		result = 33 * result ^ h[i];
-	return (result);
+	/*
+	 * We can implement some different algorithms how
+	 * select an alias address.
+	 * XXX: for now we use first available.
+	 */
+	return (CK_SLIST_FIRST(&host->aliases));
-static size_t 
-bitmask_size(size_t num, int *level)
+#define	STATE_HVAL(c, d)	HVAL((d), 2, (c)->hash_seed)
+#define	STATE_HASH(h, v)	\
+    ((h)->states_hash[(v) & ((h)->states_hashsize - 1)])
+#define	STATES_CHUNK(p, v)	\
+    ((p)->chunks_count == 1 ? (p)->states : \
+	((p)->states_chunk[CHUNK_BY_FADDR(p, v)]))
+#ifdef __LP64__
+#define	FREEMASK_FFSLL(pg, faddr)		\
+    ffsll(*FREEMASK_CHUNK((pg), (faddr)))
+#define	FREEMASK_BTR(pg, faddr, bit)	\
+    ck_pr_btr_64(FREEMASK_CHUNK((pg), (faddr)), (bit))
+#define	FREEMASK_BTS(pg, faddr, bit)	\
+    ck_pr_bts_64(FREEMASK_CHUNK((pg), (faddr)), (bit))
+#define	FREEMASK_ISSET(pg, faddr, bit)	\
+    ISSET64(*FREEMASK_CHUNK((pg), (faddr)), (bit))
+#define	FREEMASK_COPY(pg, n, out)	\
+    (out) = ck_pr_load_64(FREEMASK_CHUNK((pg), (n)))
+static inline int
+freemask_ffsll(uint32_t *freemask)
-	size_t x;
-	int c;
+	int i;
-	for (c = 0, x = num; num > 1; num /= 64, c++)
-		;
-	return (x);
+	if ((i = ffsl(freemask[0])) != 0)
+		return (i);
+	if ((i = ffsl(freemask[1])) != 0)
+		return (i + 32);
+	return (0);
+#define	FREEMASK_FFSLL(pg, faddr)		\
+    freemask_ffsll(FREEMASK_CHUNK((pg), (faddr)))
+#define	FREEMASK_BTR(pg, faddr, bit)	\
+    ck_pr_btr_32(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32, (bit) % 32)
+#define	FREEMASK_BTS(pg, faddr, bit)	\
+    ck_pr_bts_32(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32, (bit) % 32)
+#define	FREEMASK_ISSET(pg, faddr, bit)	\
+    ISSET32(*(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32), (bit) % 32)
+#define	FREEMASK_COPY(pg, n, out)	\
+    (out) = ck_pr_load_32(FREEMASK_CHUNK((pg), (n))) | \
+	((uint64_t)ck_pr_load_32(FREEMASK_CHUNK((pg), (n)) + 1) << 32)
+#endif /* !__LP64__ */
-static void
-bitmask_prepare(uint64_t *pmask, size_t bufsize, int level)
+#define	NAT64LSN_TRY_PGCNT	32
+static struct nat64lsn_pg*
+nat64lsn_get_pg(uint32_t *chunkmask, uint32_t *pgmask,
+    struct nat64lsn_pgchunk **chunks, struct nat64lsn_pg **pgptr,
+    uint32_t *pgidx, in_addr_t faddr)
-	size_t x, z;
+	struct nat64lsn_pg *pg, *oldpg;
+	uint32_t idx, oldidx;
+	int cnt;
-	memset(pmask, 0xFF, bufsize);
-	for (x = 0, z = 1; level > 1; x += z, z *= 64, level--)
-		;
-	pmask[x] &= ~0x01;
+	cnt = 0;
+	/* First try last used PG */
+	oldpg = pg = ck_pr_load_ptr(pgptr);
+	idx = oldidx = ck_pr_load_32(pgidx);
+	/* If pgidx is out of range, reset it to the first pgchunk */
+	if (!ISSET32(*chunkmask, idx / 32))
+		idx = 0;
+	do {
+		ck_pr_fence_load();
+		if (pg != NULL && FREEMASK_BITCOUNT(pg, faddr) > 0) {
+			/*
+			 * If last used PG has not free states,
+			 * try to update pointer.
+			 * NOTE: it can be already updated by jobs handler,
+			 *	 thus we use CAS operation.
+			 */
+			if (cnt > 0)
+				ck_pr_cas_ptr(pgptr, oldpg, pg);
+			return (pg);
+		}
+		/* Stop if idx is out of range */
+		if (!ISSET32(*chunkmask, idx / 32))
+			break;
+		if (ISSET32(pgmask[idx / 32], idx % 32))
+			pg = ck_pr_load_ptr(
+			    &chunks[idx / 32]->pgptr[idx % 32]);
+		else
+			pg = NULL;
+		idx++;
+	} while (++cnt < NAT64LSN_TRY_PGCNT);
+	/* If pgidx is out of range, reset it to the first pgchunk */
+	if (!ISSET32(*chunkmask, idx / 32))
+		idx = 0;
+	ck_pr_cas_32(pgidx, oldidx, idx);
+	return (NULL);
-static void
-nat64lsn_log(struct pfloghdr *plog, struct mbuf *m, sa_family_t family,
-    uint32_t n, uint32_t sn)
+static struct nat64lsn_state*
+nat64lsn_get_state6to4(struct nat64lsn_cfg *cfg, struct nat64lsn_host *host,
+    const struct ipfw_flow_id *f_id, uint32_t hval, in_addr_t faddr,
+    uint16_t port, uint8_t proto)
+	struct nat64lsn_aliaslink *link;
+	struct nat64lsn_state *state;
+	struct nat64lsn_pg *pg;
+	int i, offset;
-	memset(plog, 0, sizeof(*plog));
-	plog->length = PFLOG_REAL_HDRLEN;
-	plog->af = family;
-	plog->action = PF_NAT;
-	plog->dir = PF_IN;
-	plog->rulenr = htonl(n);
-	plog->subrulenr = htonl(sn);
-	plog->ruleset[0] = '\0';
-	strlcpy(plog->ifname, "NAT64LSN", sizeof(plog->ifname));
-	ipfw_bpf_mtap2(plog, PFLOG_HDRLEN, m);
+	/* Check that we already have state for given arguments */
+	CK_SLIST_FOREACH(state, &STATE_HASH(host, hval), entries) {
+		if (state->proto == proto && state->ip_dst == faddr &&
+		    state->sport == port && state->dport == f_id->dst_port)
+			return (state);
+	}
+	link = nat64lsn_get_aliaslink(cfg, host, f_id);
+	if (link == NULL)
+		return (NULL);
+	switch (proto) {
+		pg = nat64lsn_get_pg(
+		    &link->alias->tcp_chunkmask, link->alias->tcp_pgmask,
+		    link->alias->tcp, &link->alias->tcp_pg,
+		    &link->alias->tcp_pgidx, faddr);
+		break;
+		pg = nat64lsn_get_pg(
+		    &link->alias->udp_chunkmask, link->alias->udp_pgmask,
+		    link->alias->udp, &link->alias->udp_pg,
+		    &link->alias->udp_pgidx, faddr);
+		break;
+		pg = nat64lsn_get_pg(
+		    &link->alias->icmp_chunkmask, link->alias->icmp_pgmask,
+		    link->alias->icmp, &link->alias->icmp_pg,
+		    &link->alias->icmp_pgidx, faddr);
+		break;
+	default:
+		panic("%s: wrong proto %d", __func__, proto);
+	}
+	if (pg == NULL)
+		return (NULL);
+	/* Check that PG has some free states */
+	state = NULL;
+	i = FREEMASK_BITCOUNT(pg, faddr);
+	while (i-- > 0) {
+		offset = FREEMASK_FFSLL(pg, faddr);
+		if (offset == 0) {
+			/*
+			 * We lost the race.
+			 * No more free states in this PG.
+			 */
+			break;
+		}
+		/* Lets try to atomically grab the state */
+		if (FREEMASK_BTR(pg, faddr, offset - 1)) {
+			state = &STATES_CHUNK(pg, faddr)->state[offset - 1];
+			/* Initialize */
+			state->flags = proto != IPPROTO_TCP ? 0 :
+			    convert_tcp_flags(f_id->_flags);
+			state->proto = proto;
+			state->aport = pg->base_port + offset - 1;
+			state->dport = f_id->dst_port;
+			state->sport = port;
+			state->ip6_dst = f_id->dst_ip6;
+			state->ip_dst = faddr;
+			state->ip_src = link->alias->addr;
+			state->hval = hval;
+			state->host = host;
+			SET_AGE(state->timestamp);
+			/* Insert new state into host's hash table */
+			HOST_LOCK(host);
+			    state, entries);
+			host->states_count++;
+			/*
+			 * XXX: In case if host is going to be expired,
+			 * reset NAT64LSN_DEADHOST flag.
+			 */
+			host->flags &= ~NAT64LSN_DEADHOST;
+			HOST_UNLOCK(host);
+			NAT64STAT_INC(&cfg->base.stats, screated);
+			/* Mark the state as ready for translate4 */
+			ck_pr_fence_store();
+			ck_pr_bts_32(&state->flags, NAT64_BIT_READY_IPV4);
+			break;
+		}
+	}
+	return (state);
  * Inspects icmp packets to see if the message contains different
  * packet header so we need to alter @addr and @port.
 static int
-inspect_icmp_mbuf(struct mbuf **m, uint8_t *nat_proto, uint32_t *addr,
+inspect_icmp_mbuf(struct mbuf **mp, uint8_t *proto, uint32_t *addr,
     uint16_t *port)
+	struct icmp *icmp;
 	struct ip *ip;
-	struct tcphdr *tcp;
-	struct udphdr *udp;
-	struct icmphdr *icmp;
 	int off;
-	uint8_t proto;
+	uint8_t inner_proto;
-	ip = mtod(*m, struct ip *); /* Outer IP header */
+	ip = mtod(*mp, struct ip *); /* Outer IP header */
 	off = (ip->ip_hl << 2) + ICMP_MINLEN;
-	if ((*m)->m_len < off)
-		*m = m_pullup(*m, off);
-	if (*m == NULL)
+	if ((*mp)->m_len < off)
+		*mp = m_pullup(*mp, off);
+	if (*mp == NULL)
 		return (ENOMEM);
-	ip = mtod(*m, struct ip *); /* Outer IP header */
-	icmp = L3HDR(ip, struct icmphdr *);
+	ip = mtod(*mp, struct ip *); /* Outer IP header */
+	icmp = L3HDR(ip, struct icmp *);
 	switch (icmp->icmp_type) {
 	case ICMP_ECHO:
 		/* Use icmp ID as distinguisher */
-		*port = ntohs(*((uint16_t *)(icmp + 1)));
+		*port = ntohs(icmp->icmp_id);
 		return (0);
@@ -266,90 +448,133 @@ inspect_icmp_mbuf(struct mbuf **m, uint8_t *nat_proto,
 	 * ICMP_UNREACH and ICMP_TIMXCEED contains IP header + 64 bits
 	 * of ULP header.
-	if ((*m)->m_pkthdr.len < off + sizeof(struct ip) + ICMP_MINLEN)
+	if ((*mp)->m_pkthdr.len < off + sizeof(struct ip) + ICMP_MINLEN)
 		return (EINVAL);
-	if ((*m)->m_len < off + sizeof(struct ip) + ICMP_MINLEN)
-		*m = m_pullup(*m, off + sizeof(struct ip) + ICMP_MINLEN);
-	if (*m == NULL)
+	if ((*mp)->m_len < off + sizeof(struct ip) + ICMP_MINLEN)
+		*mp = m_pullup(*mp, off + sizeof(struct ip) + ICMP_MINLEN);
+	if (*mp == NULL)
 		return (ENOMEM);
-	ip = mtodo(*m, off); /* Inner IP header */
-	proto = ip->ip_p;
+	ip = mtodo(*mp, off); /* Inner IP header */
+	inner_proto = ip->ip_p;
 	off += ip->ip_hl << 2; /* Skip inner IP header */
 	*addr = ntohl(ip->ip_src.s_addr);
-	if ((*m)->m_len < off + ICMP_MINLEN)
-		*m = m_pullup(*m, off + ICMP_MINLEN);
-	if (*m == NULL)
+	if ((*mp)->m_len < off + ICMP_MINLEN)
+		*mp = m_pullup(*mp, off + ICMP_MINLEN);
+	if (*mp == NULL)
 		return (ENOMEM);
-	switch (proto) {
+	switch (inner_proto) {
-		tcp = mtodo(*m, off);
-		*nat_proto = NAT_PROTO_TCP;
-		*port = ntohs(tcp->th_sport);
-		return (0);
-		udp = mtodo(*m, off);
-		*nat_proto = NAT_PROTO_UDP;
-		*port = ntohs(udp->uh_sport);
+		/* Copy source port from the header */
+		*port = ntohs(*((uint16_t *)mtodo(*mp, off)));
+		*proto = inner_proto;
 		return (0);
 		 * We will translate only ICMP errors for our ICMP
 		 * echo requests.
-		icmp = mtodo(*m, off);
+		icmp = mtodo(*mp, off);
 		if (icmp->icmp_type != ICMP_ECHO)
 			return (EOPNOTSUPP);
-		*port = ntohs(*((uint16_t *)(icmp + 1)));
+		*port = ntohs(icmp->icmp_id);
 		return (0);
 	return (EOPNOTSUPP);
-static inline uint8_t
-convert_tcp_flags(uint8_t flags)
+static struct nat64lsn_state*
+nat64lsn_get_state4to6(struct nat64lsn_cfg *cfg, struct nat64lsn_alias *alias,
+    in_addr_t faddr, uint16_t port, uint8_t proto)
-	uint8_t result;
+	struct nat64lsn_state *state;
+	struct nat64lsn_pg *pg;
+	int chunk_idx, pg_idx, state_idx;
-	result = flags & (TH_FIN|TH_SYN);
-	result |= (flags & TH_RST) >> 2; /* Treat RST as FIN */
-	result |= (flags & TH_ACK) >> 2; /* Treat ACK as estab */


More information about the svn-src-all mailing list