svn commit: r183965 - in user/kmacy/HEAD_ECMP/sys: conf i386/conf
net netinet
Kip Macy
kmacy at FreeBSD.org
Fri Oct 17 03:59:26 UTC 2008
Author: kmacy
Date: Fri Oct 17 03:59:25 2008
New Revision: 183965
URL: http://svn.freebsd.org/changeset/base/183965
Log:
Add experimental flow tracking support to provide stateful ECMP
Added:
user/kmacy/HEAD_ECMP/sys/i386/conf/ECMP_TEST
user/kmacy/HEAD_ECMP/sys/net/flowtable.c (contents, props changed)
Modified:
user/kmacy/HEAD_ECMP/sys/conf/files
user/kmacy/HEAD_ECMP/sys/net/radix_mpath.c
user/kmacy/HEAD_ECMP/sys/net/radix_mpath.h
user/kmacy/HEAD_ECMP/sys/net/route.c
user/kmacy/HEAD_ECMP/sys/net/route.h
user/kmacy/HEAD_ECMP/sys/netinet/ip_input.c
Modified: user/kmacy/HEAD_ECMP/sys/conf/files
==============================================================================
--- user/kmacy/HEAD_ECMP/sys/conf/files Fri Oct 17 03:17:10 2008 (r183964)
+++ user/kmacy/HEAD_ECMP/sys/conf/files Fri Oct 17 03:59:25 2008 (r183965)
@@ -1824,6 +1824,7 @@ net/if_stf.c optional stf
net/if_tun.c optional tun
net/if_tap.c optional tap
net/if_vlan.c optional vlan
+net/flowtable.c optional inet
net/mppcc.c optional netgraph_mppc_compression
net/mppcd.c optional netgraph_mppc_compression
net/netisr.c standard
Added: user/kmacy/HEAD_ECMP/sys/i386/conf/ECMP_TEST
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ user/kmacy/HEAD_ECMP/sys/i386/conf/ECMP_TEST Fri Oct 17 03:59:25 2008 (r183965)
@@ -0,0 +1,241 @@
+#
+# ECMP_TEST -- GENERIC-derived kernel configuration for testing ECMP flow tracking (RADIX_MPATH) on FreeBSD/i386
+#
+# For more information on this file, please read the handbook section on
+# Kernel Configuration Files:
+#
+# http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html
+#
+# The handbook is also available locally in /usr/share/doc/handbook
+# if you've installed the doc distribution, otherwise always see the
+# FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the
+# latest information.
+#
+# An exhaustive list of options and more detailed explanations of the
+# device lines is also present in the ../../conf/NOTES and NOTES files.
+# If you are in doubt as to the purpose or necessity of a line, check first
+# in NOTES.
+#
+# $FreeBSD: user/kmacy/HEAD_ECMP/sys/i386/conf/GENERIC 183735 2008-10-09 21:25:01Z n_hibma $
+
+cpu I486_CPU
+cpu I586_CPU
+cpu I686_CPU
+ident GENERIC
+
+# To statically compile in device wiring instead of /boot/device.hints
+#hints "GENERIC.hints" # Default places to look for devices.
+
+makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols
+
+options SCHED_ULE # ULE scheduler
+options PREEMPTION # Enable kernel thread preemption
+options INET # InterNETworking
+options INET6 # IPv6 communications protocols
+options SCTP # Stream Control Transmission Protocol
+options FFS # Berkeley Fast Filesystem
+options SOFTUPDATES # Enable FFS soft updates support
+options UFS_ACL # Support for access control lists
+options UFS_DIRHASH # Improve performance on big directories
+options UFS_GJOURNAL # Enable gjournal-based UFS journaling
+options MD_ROOT # MD is a potential root device
+options NFSCLIENT # Network Filesystem Client
+options NFSSERVER # Network Filesystem Server
+options NFSLOCKD # Network Lock Manager
+options NFS_ROOT # NFS usable as /, requires NFSCLIENT
+options MSDOSFS # MSDOS Filesystem
+options CD9660 # ISO 9660 Filesystem
+options PROCFS # Process filesystem (requires PSEUDOFS)
+options PSEUDOFS # Pseudo-filesystem framework
+options GEOM_PART_GPT # GUID Partition Tables.
+options GEOM_LABEL # Provides labelization
+options COMPAT_43TTY # BSD 4.3 TTY compat [KEEP THIS!]
+options COMPAT_FREEBSD4 # Compatible with FreeBSD4
+options COMPAT_FREEBSD5 # Compatible with FreeBSD5
+options COMPAT_FREEBSD6 # Compatible with FreeBSD6
+options COMPAT_FREEBSD7 # Compatible with FreeBSD7
+options SCSI_DELAY=5000 # Delay (in ms) before probing SCSI
+options KTRACE # ktrace(1) support
+options STACK # stack(9) support
+options SYSVSHM # SYSV-style shared memory
+options SYSVMSG # SYSV-style message queues
+options SYSVSEM # SYSV-style semaphores
+options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions
+options KBD_INSTALL_CDEV # install a CDEV entry in /dev
+options STOP_NMI # Stop CPUS using NMI instead of IPI
+options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4)
+options AUDIT # Security event auditing
+
+# Debugging for use in -current
+options KDB # Enable kernel debugger support.
+options DDB # Support DDB.
+options GDB # Support remote GDB.
+options INVARIANTS # Enable calls of extra sanity checking
+options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS
+options WITNESS # Enable checks to detect deadlocks and cycles
+options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed
+
+# To make an SMP kernel, the next two lines are needed
+options SMP # Symmetric MultiProcessor Kernel
+device apic # I/O APIC
+
+# CPU frequency control
+device cpufreq
+
+# Bus support.
+device acpi
+device eisa
+device pci
+
+# Floppy drives
+device fdc
+
+# ATA and ATAPI devices
+device ata
+device atadisk # ATA disk drives
+device ataraid # ATA RAID drives
+device atapicd # ATAPI CDROM drives
+device atapifd # ATAPI floppy drives
+device atapist # ATAPI tape drives
+options ATA_STATIC_ID # Static device numbering
+
+# SCSI Controllers
+device ahb # EISA AHA1742 family
+device ahc # AHA2940 and onboard AIC7xxx devices
+options AHC_REG_PRETTY_PRINT # Print register bitfields in debug
+ # output. Adds ~128k to driver.
+device ahd # AHA39320/29320 and onboard AIC79xx devices
+options AHD_REG_PRETTY_PRINT # Print register bitfields in debug
+ # output. Adds ~215k to driver.
+device amd # AMD 53C974 (Tekram DC-390(T))
+device hptiop # Highpoint RocketRaid 3xxx series
+device isp # Qlogic family
+#device ispfw # Firmware for QLogic HBAs- normally a module
+device mpt # LSI-Logic MPT-Fusion
+#device ncr # NCR/Symbios Logic
+device sym # NCR/Symbios Logic (newer chipsets + those of `ncr')
+
+# SCSI peripherals
+device scbus # SCSI bus (required for SCSI)
+device ch # SCSI media changers
+device da # Direct Access (disks)
+device sa # Sequential Access (tape etc)
+device cd # CD
+device pass # Passthrough device (direct SCSI access)
+device ses # SCSI Environmental Services (and SAF-TE)
+
+# atkbdc0 controls both the keyboard and the PS/2 mouse
+device atkbdc # AT keyboard controller
+device atkbd # AT keyboard
+device psm # PS/2 mouse
+
+device kbdmux # keyboard multiplexer
+
+device vga # VGA video card driver
+
+device splash # Splash screen and screen saver support
+
+# syscons is the default console driver, resembling an SCO console
+device sc
+
+device agp # support several AGP chipsets
+
+# Power management support (see NOTES for more options)
+#device apm
+# Add suspend/resume support for the i8254.
+device pmtimer
+
+# PCCARD (PCMCIA) support
+# PCMCIA and cardbus bridge support
+device cbb # cardbus (yenta) bridge
+device pccard # PC Card (16-bit) bus
+device cardbus # CardBus (32-bit) bus
+
+# Serial (COM) ports
+device uart # Generic UART driver
+
+# If you've got a "dumb" serial or parallel PCI card that is
+# supported by the puc(4) glue driver, uncomment the following
+# line to enable it (connects to sio, uart and/or ppc drivers):
+#device puc
+
+# PCI Ethernet NICs.
+device em # Intel PRO/1000 Gigabit Ethernet Family
+device igb # Intel PRO/1000 PCIE Server Gigabit Family
+device ixgb # Intel PRO/10GbE Ethernet Card
+device le # AMD Am7900 LANCE and Am79C9xx PCnet
+
+
+# PCI Ethernet NICs that use the common MII bus controller code.
+# NOTE: Be sure to keep the 'device miibus' line in order to use these NICs!
+device miibus # MII bus support
+device ae # Attansic/Atheros L2 FastEthernet
+device age # Attansic/Atheros L1 Gigabit Ethernet
+device bce # Broadcom BCM5706/BCM5708 Gigabit Ethernet
+device bfe # Broadcom BCM440x 10/100 Ethernet
+device bge # Broadcom BCM570xx Gigabit Ethernet
+device et # Agere ET1310 10/100/Gigabit Ethernet
+device fxp # Intel EtherExpress PRO/100B (82557, 82558)
+device jme # JMicron JMC250 Gigabit/JMC260 Fast Ethernet
+device lge # Level 1 LXT1001 gigabit Ethernet
+device msk # Marvell/SysKonnect Yukon II Gigabit Ethernet
+
+# Pseudo devices.
+device loop # Network loopback
+device random # Entropy device
+device ether # Ethernet support
+device tun # Packet tunnel.
+device pty # BSD-style compatibility pseudo ttys
+device md # Memory "disks"
+device gif # IPv6 and IPv4 tunneling
+device faith # IPv6-to-IPv4 relaying (translation)
+device firmware # firmware assist module
+
+# The `bpf' device enables the Berkeley Packet Filter.
+# Be aware of the administrative consequences of enabling this!
+# Note that 'bpf' is required for DHCP.
+device bpf # Berkeley packet filter
+
+# USB support
+device uhci # UHCI PCI->USB interface
+device ohci # OHCI PCI->USB interface
+device ehci # EHCI PCI->USB interface (USB 2.0)
+device usb # USB Bus (required)
+#device udbp # USB Double Bulk Pipe devices
+device ugen # Generic
+device uhid # "Human Interface Devices"
+device ukbd # Keyboard
+device ulpt # Printer
+device umass # Disks/Mass storage - Requires scbus and da
+device ums # Mouse
+device urio # Diamond Rio 500 MP3 player
+device uscanner # Scanners
+# USB Serial devices
+device ucom # Generic com ttys
+device u3g # USB-based 3G modems (Option, Huawei, Sierra)
+device uark # Technologies ARK3116 based serial adapters
+device ubsa # Belkin F5U103 and compatible serial adapters
+device uftdi # For FTDI usb serial adapters
+device uipaq # Some WinCE based devices
+device uplcom # Prolific PL-2303 serial adapters
+device uslcom # SI Labs CP2101/CP2102 serial adapters
+device uvisor # Visor and Palm devices
+device uvscom # USB serial support for DDI pocket's PHS
+# USB Ethernet, requires miibus
+device aue # ADMtek USB Ethernet
+device axe # ASIX Electronics USB Ethernet
+device cdce # Generic USB over Ethernet
+device cue # CATC USB Ethernet
+device kue # Kawasaki LSI USB Ethernet
+device rue # RealTek RTL8150 USB Ethernet
+device udav # Davicom DM9601E USB
+
+# FireWire support
+device firewire # FireWire bus code
+device sbp # SCSI over FireWire (Requires scbus and da)
+device fwe # Ethernet over FireWire (non-standard!)
+device fwip # IP over FireWire (RFC 2734,3146)
+device dcons # Dumb console driver
+device dcons_crom # Configuration ROM for dcons
+
+options RADIX_MPATH
Added: user/kmacy/HEAD_ECMP/sys/net/flowtable.c
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ user/kmacy/HEAD_ECMP/sys/net/flowtable.c Fri Oct 17 03:59:25 2008 (r183965)
@@ -0,0 +1,604 @@
+#include "opt_mpath.h"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/limits.h>
+#include <sys/kernel.h>
+#include <sys/bitstring.h>
+#include <sys/vimage.h>
+
+
+#include <sys/callout.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/smp.h>
+#include <sys/socket.h>
+
+#include <net/route.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <netinet/sctp.h>
+
+#define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
+
+
+/*
+ * Taken from http://burtleburtle.net/bob/c/lookup3.c
+ */
+
+#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
+
+/*
+-------------------------------------------------------------------------------
+mix -- mix 3 32-bit values reversibly.
+
+This is reversible, so any information in (a,b,c) before mix() is
+still in (a,b,c) after mix().
+
+If four pairs of (a,b,c) inputs are run through mix(), or through
+mix() in reverse, there are at least 32 bits of the output that
+are sometimes the same for one pair and different for another pair.
+This was tested for:
+* pairs that differed by one bit, by two bits, in any combination
+ of top bits of (a,b,c), or in any combination of bottom bits of
+ (a,b,c).
+* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
+ the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+ is commonly produced by subtraction) look like a single 1-bit
+ difference.
+* the base values were pseudorandom, all zero but one bit set, or
+ all zero plus a counter that starts at zero.
+
+Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
+satisfy this are
+ 4 6 8 16 19 4
+ 9 15 3 18 27 15
+ 14 9 3 7 17 3
+Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
+for "differ" defined as + with a one-bit base and a two-bit delta. I
+used http://burtleburtle.net/bob/hash/avalanche.html to choose
+the operations, constants, and arrangements of the variables.
+
+This does not achieve avalanche. There are input bits of (a,b,c)
+that fail to affect some output bits of (a,b,c), especially of a. The
+most thoroughly mixed value is c, but it doesn't really even achieve
+avalanche in c.
+
+This allows some parallelism. Read-after-writes are good at doubling
+the number of bits affected, so the goal of mixing pulls in the opposite
+direction as the goal of parallelism. I did what I could. Rotates
+seem to cost as much as shifts on every machine I could lay my hands
+on, and rotates are much kinder to the top and bottom bits, so I used
+rotates.
+-------------------------------------------------------------------------------
+*/
+#define mix(a,b,c) \
+{ \
+ a -= c; a ^= rot(c, 4); c += b; \
+ b -= a; b ^= rot(a, 6); a += c; \
+ c -= b; c ^= rot(b, 8); b += a; \
+ a -= c; a ^= rot(c,16); c += b; \
+ b -= a; b ^= rot(a,19); a += c; \
+ c -= b; c ^= rot(b, 4); b += a; \
+}
+
+/*
+-------------------------------------------------------------------------------
+final -- final mixing of 3 32-bit values (a,b,c) into c
+
+Pairs of (a,b,c) values differing in only a few bits will usually
+produce values of c that look totally different. This was tested for
+* pairs that differed by one bit, by two bits, in any combination
+ of top bits of (a,b,c), or in any combination of bottom bits of
+ (a,b,c).
+* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
+ the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+ is commonly produced by subtraction) look like a single 1-bit
+ difference.
+* the base values were pseudorandom, all zero but one bit set, or
+ all zero plus a counter that starts at zero.
+
+These constants passed:
+ 14 11 25 16 4 14 24
+ 12 14 25 16 4 14 24
+and these came close:
+ 4 8 15 26 3 22 24
+ 10 8 15 26 3 22 24
+ 11 8 15 26 3 22 24
+-------------------------------------------------------------------------------
+*/
+#define final(a,b,c) \
+{ \
+ c ^= b; c -= rot(b,14); \
+ a ^= c; a -= rot(c,11); \
+ b ^= a; b -= rot(a,25); \
+ c ^= b; c -= rot(b,16); \
+ a ^= c; a -= rot(c,4); \
+ b ^= a; b -= rot(a,14); \
+ c ^= b; c -= rot(b,24); \
+}
+
+/*
+--------------------------------------------------------------------
+ This works on all machines. To be useful, it requires
+ -- that the key be an array of uint32_t's, and
+ -- that the length be the number of uint32_t's in the key
+
+ The function hashword() is identical to hashlittle() on little-endian
+ machines, and identical to hashbig() on big-endian machines,
+ except that the length has to be measured in uint32_ts rather than in
+ bytes. hashlittle() is more complicated than hashword() only because
+ hashlittle() has to dance around fitting the key bytes into registers.
+--------------------------------------------------------------------
+*/
+static uint32_t hashword(
+const uint32_t *k, /* the key, an array of uint32_t values */
+size_t length, /* the length of the key, in uint32_ts */
+uint32_t initval) /* the previous hash, or an arbitrary value */
+{
+ uint32_t a,b,c;
+
+ /* Set up the internal state */
+ a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval;
+
+ /*------------------------------------------------- handle most of the key */
+ while (length > 3)
+ {
+ a += k[0];
+ b += k[1];
+ c += k[2];
+ mix(a,b,c);
+ length -= 3;
+ k += 3;
+ }
+
+ /*------------------------------------------- handle the last 3 uint32_t's */
+ switch(length) /* all the case statements fall through */
+ {
+ case 3 : c+=k[2];
+ case 2 : b+=k[1];
+ case 1 : a+=k[0];
+ final(a,b,c);
+ case 0: /* case 0: nothing left to add */
+ break;
+ }
+ /*------------------------------------------------------ report the result */
+ return c;
+}
+
+
+struct ip_tuple { /* addresses and ports of one flow; aliased by ip_flow.ipf_key */
+ in_addr_t ip_saddr; /* source address */
+ in_addr_t ip_daddr; /* destination address */
+ uint16_t ip_sport; /* source port */
+ uint16_t ip_dport; /* destination port */
+};
+
+union ip_flow { /* the flow key, viewed as a tuple or as words */
+ struct ip_tuple ipf_ipt;
+ uint32_t ipf_key[3]; /* the view ipv4_flow_insert() and ipv4_flow_alloc() copy and compare */
+};
+
+struct flentry_v4 { /* one slot of ipv4_flow_table */
+ uint32_t fl_fhash; /* hash flowing forward; 0 means the slot is free */
+ uint32_t fl_ticks; /* last time this flow was accessed (value of ticks) */
+ uint16_t fl_flags; /* flow flags: TCP th_flags seen so far, plus FL_STALE */
+ uint8_t fl_pad; /* not referenced anywhere in this file */
+ uint8_t fl_proto; /* protocol */
+ union ip_flow fl_flow; /* key the slot was inserted with */
+ struct rtentry *fl_rt; /* rtentry for flow; insert bumps its rt_refcnt */
+ uint32_t fl_refcnt; /* bumped on each ipv4_flow_alloc() hit, dropped by ipv4_flow_free() */
+ uint32_t fl_hash_next; /* hash of the next flow on fl_rt's list, 0 = none; needed for GC */
+ uint32_t fl_hash_prev; /* hash of the previous flow on fl_rt's list, 0 = none */
+};
+
+#define TICKS_PER_MINUTE (60*hz)
+#define TICKS_PER_HOUR (60*TICKS_PER_MINUTE)
+#define TICKS_PER_DAY (24*TICKS_PER_HOUR)
+
+
+#define SYN_IDLE (5*TICKS_PER_MINUTE)
+#define UDP_IDLE (5*TICKS_PER_MINUTE)
+#define FIN_WAIT_IDLE (10*TICKS_PER_MINUTE)
+#define TCP_IDLE TICKS_PER_DAY
+
+
+static struct flentry_v4 *ipv4_flow_table; /* slot array, allocated by flowtable_init() */
+static int ipv4_flow_table_size; /* number of slots; divisor in FL_ENTRY_INDEX() */
+static bitstr_t *ipv4_flow_bitstring; /* one bit per slot, set while the slot holds a flow */
+static int ipv4_flow_allocated; /* number of flows currently in the table */
+struct mtx *ipv4_flow_locks; /* mutex array indexed by FL_ENTRY_LOCK()/FL_ENTRY_UNLOCK() */
+static int ipv4_flow_lock_count; /* size of ipv4_flow_locks; used as a mask, so a power of two */
+extern uint32_t hashjitter; /* defined in radix_mpath.c; mixed into every flow hash */
+static uint32_t ipv4_flow_route_lookup_fail; /* inserts abandoned because no route was found */
+static uint32_t ipv4_flow_collisions; /* lookups that found the slot taken by another flow */
+struct callout ipv4_flow_callout; /* re-armed by ipv4_flow_timeout() every hz ticks */
+static int ipv4_flow_max_count; /* table size read by flowtable_init(); NOTE(review): never assigned in this file */
+
+
+/*
+ * Slot and lock selection for a flow hash.  The slot is the hash modulo the
+ * table size; the mutex is picked by masking the hash with
+ * (ipv4_flow_lock_count - 1), which flowtable_init() sets to a power of two.
+ * FL_ENTRY_UNLOCK must release the mutex FL_ENTRY_LOCK acquired.
+ */
+#define FL_ENTRY_INDEX(hash)	((hash) % ipv4_flow_table_size)
+#define FL_ENTRY(hash)		(&ipv4_flow_table[FL_ENTRY_INDEX((hash))])
+#define FL_ENTRY_LOCK(hash)	mtx_lock(&ipv4_flow_locks[(hash)&(ipv4_flow_lock_count - 1)])
+#define FL_ENTRY_UNLOCK(hash)	mtx_unlock(&ipv4_flow_locks[(hash)&(ipv4_flow_lock_count - 1)])
+
+#define FL_STALE (1<<8)
+
+/*
+ * Build the flow key for the IPv4 packet in m and return its hash.
+ *
+ * key[0] and key[1] receive the source and destination addresses; key[2]
+ * receives the source and destination ports, which stay zero for anything
+ * other than TCP, UDP and SCTP.  ro->ro_dst is filled in with the packet's
+ * destination address.  For TCP, *flags is set to the segment's th_flags,
+ * with FL_STALE added when RST is set; *flags is not written for any other
+ * protocol.  *protop receives the IP protocol number.
+ *
+ * NOTE(review): the transport header is read directly behind the IP header
+ * in the first mbuf with no length check -- presumably the caller guarantees
+ * it is contiguous; confirm.
+ */
+static uint32_t
+ipv4_flow_lookup_hash_internal(struct mbuf *m, struct route *ro,
+    uint32_t *key, uint16_t *flags, uint8_t *protop)
+{
+	uint16_t sport = 0, dport = 0;
+	struct ip *ip = mtod(m, struct ip *);
+	uint8_t proto = ip->ip_p;
+	int iphlen = ip->ip_hl << 2;
+	struct sockaddr_in *sin;
+	struct tcphdr *th;
+	struct udphdr *uh;
+	struct sctphdr *sh;
+
+	key[0] = ip->ip_src.s_addr;
+	key[1] = ip->ip_dst.s_addr;
+
+	sin = (struct sockaddr_in *)&ro->ro_dst;
+	sin->sin_family = AF_INET;
+	sin->sin_len = sizeof(*sin);
+	sin->sin_addr = ip->ip_dst;
+
+	switch (proto) {
+	case IPPROTO_TCP:
+		th = (struct tcphdr *)((caddr_t)ip + iphlen);
+		sport = th->th_sport;
+		dport = th->th_dport;
+		*flags = th->th_flags;
+		/* A reset ends the connection, so the flow can go right away. */
+		if (*flags & TH_RST)
+			*flags |= FL_STALE;
+		break;
+	case IPPROTO_UDP:
+		uh = (struct udphdr *)((caddr_t)ip + iphlen);
+		sport = uh->uh_sport;
+		dport = uh->uh_dport;
+		break;
+	case IPPROTO_SCTP:
+		sh = (struct sctphdr *)((caddr_t)ip + iphlen);
+		sport = sh->src_port;
+		dport = sh->dest_port;
+		break;
+	default:
+		/* no port - hence not a protocol we care about */
+		break;
+	}
+	/* Both 16-bit ports share the third key word. */
+	((uint16_t *)key)[4] = sport;
+	((uint16_t *)key)[5] = dport;
+
+	*protop = proto;
+	return (hashword(key, 3, hashjitter + proto));
+}
+
+/*
+ * Return the flow hash of the IPv4 packet in m without touching the flow
+ * table.  The route, key, flag and protocol outputs of the internal helper
+ * are collected in scratch storage and thrown away.
+ */
+uint32_t
+ipv4_flow_lookup_hash(struct mbuf *m)
+{
+	uint8_t ipproto;
+	uint16_t tcp_flags;
+	uint32_t tuple[3];
+	struct route scratch;
+	uint32_t flow_hash;
+
+	bzero(&scratch, sizeof(scratch));
+	flow_hash = ipv4_flow_lookup_hash_internal(m, &scratch, tuple,
+	    &tcp_flags, &ipproto);
+
+	return (flow_hash);
+}
+
+/*
+ * Fill the table slot selected by hash with a new flow and push it onto the
+ * head of rt's flow list.
+ *
+ * hash  - flow hash; selects the slot and is stored as the list link value
+ * key   - three-word flow key (addresses and ports)
+ * proto - IP protocol of the flow
+ * rt    - route the flow is bound to; its rt_refcnt is bumped
+ * flags - initial flow flags (TCP th_flags and possibly FL_STALE)
+ *
+ * NOTE(review): the only caller, ipv4_flow_alloc(), takes the entry lock for
+ * hash before calling and unlocks rt afterwards; this function takes no
+ * locks of its own.
+ */
+static void
+ipv4_flow_insert(uint32_t hash, uint32_t *key, uint8_t proto,
+    struct rtentry *rt, uint16_t flags)
+{
+	struct flentry_v4 *fle, *head;
+	uint32_t *hashkey;
+
+	fle = FL_ENTRY(hash);
+	hashkey = fle->fl_flow.ipf_key;
+
+	hashkey[0] = key[0];
+	hashkey[1] = key[1];
+	hashkey[2] = key[2];
+
+	bit_set(ipv4_flow_bitstring, FL_ENTRY_INDEX(hash));
+
+	/* The new flow becomes the list head, so it has no predecessor. */
+	fle->fl_hash_prev = 0;
+	fle->fl_hash_next = rt->rt_flow_head;	/* 0 when the list was empty */
+	if (rt->rt_flow_head != 0) {
+		head = FL_ENTRY(rt->rt_flow_head);
+		head->fl_hash_prev = hash;
+	}
+	rt->rt_flow_head = hash;
+
+	/*
+	 * Record the flags of the packet that created the flow; later hits
+	 * only OR further flags in, so without this a SYN or RST seen on
+	 * the first packet would be lost.
+	 */
+	fle->fl_flags = flags;
+	fle->fl_proto = proto;
+	fle->fl_rt = rt;
+	fle->fl_fhash = hash;
+	fle->fl_ticks = ticks;
+	rt->rt_refcnt++;
+	ipv4_flow_allocated++;
+}
+
+/*
+ * Look up, or create, the flow table entry for the IPv4 packet in m.
+ *
+ * ro->ro_dst is always filled in from the packet.  On a table hit ro->ro_rt
+ * is pointed at the cached route and the flow's reference count is bumped.
+ * On an empty slot the route is resolved with rtalloc_mpath_fib() and, if
+ * that succeeds, a new flow is inserted.  When the slot is held by a
+ * different flow only the collision counter is bumped.
+ *
+ * Returns the flow hash, which is the value ipv4_flow_free() takes.
+ */
+uint32_t
+ipv4_flow_alloc(struct mbuf *m, struct route *ro)
+{
+	uint32_t key[3], hash, *hashkey;
+	struct flentry_v4 *fle;
+	uint16_t flags = 0;
+	uint8_t proto;
+
+	/*
+	 * Only handle IPv4 for now
+	 */
+	hash = ipv4_flow_lookup_hash_internal(m, ro, key, &flags, &proto);
+
+	/*
+	 * Ports are zero - thus not a protocol for which
+	 * we need to keep state.  The ports are the third (last) key word.
+	 */
+	if (key[2] == 0)
+		return (hash);
+
+	FL_ENTRY_LOCK(hash);
+	fle = FL_ENTRY(hash);
+
+	hashkey = fle->fl_flow.ipf_key;
+
+	if (fle->fl_fhash == 0) {
+		/* Do not hold the entry lock across the route lookup. */
+		FL_ENTRY_UNLOCK(hash);
+		rtalloc_mpath_fib(ro, hash, M_GETFIB(m));
+		if (ro->ro_rt == NULL) {
+			/* The entry lock is not held here; return directly. */
+			ipv4_flow_route_lookup_fail++;
+			return (hash);
+		}
+		FL_ENTRY_LOCK(hash);
+		/*
+		 * The slot was unlocked during the lookup, so make sure
+		 * nobody else claimed it before overwriting it.
+		 */
+		if (fle->fl_fhash == 0)
+			ipv4_flow_insert(hash, key, proto, ro->ro_rt, flags);
+		else
+			ipv4_flow_collisions++;
+		/*
+		 * NOTE(review): assumes rtalloc_mpath_fib() hands back
+		 * ro->ro_rt locked -- confirm against radix_mpath.c.
+		 */
+		RT_UNLOCK(ro->ro_rt);
+	} else if (fle->fl_fhash == hash
+	    && key[0] == hashkey[0]
+	    && key[1] == hashkey[1]
+	    && key[2] == hashkey[2]
+	    && proto == fle->fl_proto) {
+		fle->fl_ticks = ticks;
+		fle->fl_flags |= flags;
+		fle->fl_refcnt++;
+		ro->ro_rt = fle->fl_rt;
+	} else
+		ipv4_flow_collisions++;
+
+	FL_ENTRY_UNLOCK(hash);
+
+	return (hash);
+}
+
+/*
+ * Internal helper routine: unlink a flow from its route's flow list and
+ * release the slot if nothing references it any more.
+ *
+ * hash      - the hash of the entry to free
+ * staleonly - when non-zero, only act on entries already marked FL_STALE
+ *
+ * A flow that is still referenced (fl_refcnt != 0) is unlinked but kept;
+ * unless staleonly is set it is also marked FL_STALE so that the final
+ * ipv4_flow_free() releases it.
+ *
+ * Returns the hash of the flow that followed this one on the list (0 at the
+ * end), so callers can keep walking.
+ */
+
+static uint32_t
+ipv4_flow_free_internal(uint32_t hash, int staleonly)
+{
+	struct flentry_v4 *fle, *fleprev, *flenext;
+	uint32_t hash_next;
+
+	fle = FL_ENTRY(hash);
+	hash_next = fle->fl_hash_next;
+
+	/* An already released slot has no route; there is nothing to undo. */
+	if (fle->fl_rt == NULL)
+		return (hash_next);
+
+	if (staleonly && ((fle->fl_flags & FL_STALE) == 0))
+		return (hash_next);
+
+	/*
+	 * If this flow is the head of the route's list, advance the head;
+	 * otherwise it would keep naming a slot that is about to be cleared.
+	 */
+	if (fle->fl_rt->rt_flow_head == hash)
+		fle->fl_rt->rt_flow_head = hash_next;
+
+	if (fle->fl_hash_next) {
+		flenext = FL_ENTRY(fle->fl_hash_next);
+		flenext->fl_hash_prev = fle->fl_hash_prev;
+	}
+	if (fle->fl_hash_prev) {
+		fleprev = FL_ENTRY(fle->fl_hash_prev);
+		fleprev->fl_hash_next = fle->fl_hash_next;
+	}
+	fle->fl_hash_next = fle->fl_hash_prev = 0;
+
+	if (fle->fl_refcnt == 0) {
+		fle->fl_rt->rt_refcnt--;
+		ipv4_flow_allocated--;
+		bit_clear(ipv4_flow_bitstring, FL_ENTRY_INDEX(hash));
+		bzero(fle, sizeof(struct flentry_v4));
+	} else if (!staleonly)
+		fle->fl_flags |= FL_STALE;
+
+	return (hash_next);
+}
+
+/*
+ * Release the flow reference taken by a hit in ipv4_flow_alloc().
+ *
+ * When the flow is marked FL_STALE and this is its last reference, the flow
+ * is also torn down and its route released.  That decision is made from an
+ * unlocked look at the entry, because the route lock is taken before the
+ * entry lock.
+ */
+void
+ipv4_flow_free(uint32_t hash)
+{
+	struct flentry_v4 *entry;
+	struct rtentry *route;
+	int last_stale_ref;
+
+	entry = FL_ENTRY(hash);
+	KASSERT(entry->fl_refcnt > 0,
+	    ("route referenced with flow refcount set to zero"));
+
+	last_stale_ref = ((entry->fl_flags & FL_STALE) &&
+	    (entry->fl_refcnt == 1));
+	route = entry->fl_rt;
+
+	if (!last_stale_ref) {
+		/* Common case: just drop the reference. */
+		FL_ENTRY_LOCK(hash);
+		entry->fl_refcnt--;
+		FL_ENTRY_UNLOCK(hash);
+		return;
+	}
+
+	/* Last reference to a stale flow: drop it and free the flow. */
+	RT_LOCK(route);
+	FL_ENTRY_LOCK(hash);
+	entry->fl_refcnt--;
+	ipv4_flow_free_internal(hash, 0);
+	RTFREE_LOCKED(route);
+	FL_ENTRY_UNLOCK(hash);
+}
+
+/*
+ * Frees all flows that are linked to this rtentry.
+ *
+ * The caller must hold rt's lock (asserted).  Flows that are still
+ * referenced are unlinked and marked stale rather than released; see
+ * ipv4_flow_free_internal().  On return the route's flow list is empty.
+ */
+void
+ipv4_flow_free_all(struct rtentry *rt)
+{
+	uint32_t hash_next = rt->rt_flow_head;
+
+	RT_LOCK_ASSERT(rt);
+	while (hash_next)
+		hash_next = ipv4_flow_free_internal(hash_next, 0);
+
+	/* Every flow has been unlinked; don't leave a dangling list head. */
+	rt->rt_flow_head = 0;
+}
+
+/*
+ * Tree-walk callback (passed to rnh_walktree by ipv4_flow_timeout): release
+ * every flow on this route's list that has been marked stale.  Routes with
+ * no flows are skipped without taking the route lock.  Always returns 0.
+ */
+static int
+ipv4_flow_free_stale(struct radix_node *rn, void *unused)
+{
+	struct rtentry *route = (struct rtentry *)rn;
+	uint32_t cursor;
+
+	if (route->rt_flow_head != 0) {
+		RT_LOCK(route);
+		for (cursor = route->rt_flow_head; cursor != 0; )
+			cursor = ipv4_flow_free_internal(cursor, 1);
+		RT_UNLOCK(route);
+	}
+
+	return (0);
+}
+
+struct radix_node_head *ipv4_flow_rnh_list[100];
+static void
+ipv4_flow_check_stale(struct flentry_v4 *fle,
+ struct radix_node_head **rnh_list, int *rnh_count)
+{
+ int count = *rnh_count;
+ uint32_t idle_ticks;
+ struct radix_node_head *rnh;
+ struct rtentry *rt;
+ int i, stale = 0, found = 0;
+
+ if (ticks > fle->fl_ticks)
+ idle_ticks = ticks - fle->fl_ticks;
+ else
+ idle_ticks = (INT_MAX - fle->fl_ticks) + ticks ;
+
+ if ((fle->fl_flags & FL_STALE) ||
+ ((fle->fl_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
+ && (idle_ticks > UDP_IDLE)) ||
+ ((fle->fl_flags & TH_FIN)
+ && (idle_ticks > FIN_WAIT_IDLE)) ||
+ ((fle->fl_flags & (TH_SYN|TH_ACK)) == TH_SYN
+ && (idle_ticks > SYN_IDLE)) ||
+ ((fle->fl_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
+ && (idle_ticks > TCP_IDLE)))
+ stale = 1;
+
+ if (stale == 0)
+ return;
+
+ fle->fl_flags |= FL_STALE;
+ rt = fle->fl_rt;
+ rnh = V_rt_tables[rt->rt_fibnum][rt_key(rt)->sa_family];
+
+ for (i = 0; i < count; i++)
+ if (rnh_list[i] == rnh) {
+ found = 1;
+ break;
+ }
+ if (found == 0) {
+ rnh_list[count] = rnh;
+ count++;
+ *rnh_count = count;
+ }
+}
+
+
+static __inline int
+bit_fns(bitstr_t *name, int nbits, int lastbit)
+{
+ int lastbit_start = lastbit & ~0x7;
+ bitstr_t *bitstr_start = &name[lastbit_start];
+ int value = 0;
+
+ while (value <= lastbit && value != 1)
+ bit_ffs(bitstr_start, nbits, &value);
+
+ return (value);
+}
+
+
+static int ipv4_flow_last_index;
+static void
+ipv4_flow_timeout(void *arg)
+{
+ int i, idx, rnh_count = 0;
+ struct radix_node_head *rnh;
+
+ /*
+ * scan 1/4th of the table once a second
+ */
+ for (i = 0; i < (ipv4_flow_allocated >> 2); i++) {
+ idx = bit_fns(ipv4_flow_bitstring, ipv4_flow_table_size,
+ ipv4_flow_last_index);
+ if (idx == -1) {
+ ipv4_flow_last_index = 0;
+ break;
+ }
+
+ FL_ENTRY_LOCK(idx);
+ ipv4_flow_check_stale(FL_ENTRY(idx), ipv4_flow_rnh_list, &rnh_count);
+ FL_ENTRY_UNLOCK(idx);
+ }
+ for (i = 0; i < rnh_count; i++) {
+ rnh = ipv4_flow_rnh_list[i];
+ RADIX_NODE_HEAD_LOCK(rnh);
+ rnh->rnh_walktree(rnh, ipv4_flow_free_stale, NULL);
+ RADIX_NODE_HEAD_UNLOCK(rnh);
+ }
+
+ callout_reset(&ipv4_flow_callout, hz, ipv4_flow_timeout, NULL);
+}
+
+static void
+flowtable_init(void *unused)
+{
+ int i, nentry;
+
+ nentry = ipv4_flow_max_count;
+ /*
+ * round mp_ncpus up to the next power of 2 and double
+ * to determine the number of locks
+ */
+ ipv4_flow_lock_count = (1 << fls(mp_ncpus)) << 1;
+
+ ipv4_flow_table_size = nentry;
+ ipv4_flow_table = malloc(nentry*sizeof(struct flentry_v4),
+ M_RTABLE, M_WAITOK | M_ZERO);
+ ipv4_flow_bitstring = bit_alloc(nentry);
+ ipv4_flow_locks = malloc(ipv4_flow_lock_count*sizeof(struct mtx),
+ M_RTABLE, M_WAITOK | M_ZERO);
+ for (i = 0; i < ipv4_flow_lock_count; i++)
+ mtx_init(&ipv4_flow_locks[i], "ipv4_flow", NULL, MTX_DEF);
+
+}
+SYSINIT(flowtable, SI_SUB_INIT_IF, SI_ORDER_ANY, flowtable_init, NULL);
Modified: user/kmacy/HEAD_ECMP/sys/net/radix_mpath.c
==============================================================================
--- user/kmacy/HEAD_ECMP/sys/net/radix_mpath.c Fri Oct 17 03:17:10 2008 (r183964)
+++ user/kmacy/HEAD_ECMP/sys/net/radix_mpath.c Fri Oct 17 03:59:25 2008 (r183965)
@@ -53,7 +53,7 @@ __FBSDID("$FreeBSD$");
/*
* give some jitter to hash, to avoid synchronization between routers
*/
-static u_int32_t hashjitter;
+uint32_t hashjitter;
int
rn_mpath_capable(struct radix_node_head *rnh)
@@ -298,7 +298,7 @@ rtalloc_mpath_fib(struct route *ro, u_in
return;
}
- rtfree(ro->ro_rt);
+ RTFREE(ro->ro_rt);
ro->ro_rt = (struct rtentry *)rn;
RT_LOCK(ro->ro_rt);
RT_ADDREF(ro->ro_rt);
Modified: user/kmacy/HEAD_ECMP/sys/net/radix_mpath.h
==============================================================================
--- user/kmacy/HEAD_ECMP/sys/net/radix_mpath.h Fri Oct 17 03:17:10 2008 (r183964)
+++ user/kmacy/HEAD_ECMP/sys/net/radix_mpath.h Fri Oct 17 03:59:25 2008 (r183965)
@@ -58,6 +58,11 @@ int rt_mpath_deldup(struct rtentry *, st
int rn4_mpath_inithead(void **, int);
int rn6_mpath_inithead(void **, int);
+uint32_t ipv4_flow_alloc(struct mbuf *m, struct route *ro);
+void ipv4_flow_free(uint32_t hash);
+
+uint32_t ipv4_flow_lookup_hash(struct mbuf *m);
+void ipv4_flow_free_all(struct rtentry *rt);
#endif
#endif /* _NET_RADIX_MPATH_H_ */
Modified: user/kmacy/HEAD_ECMP/sys/net/route.c
==============================================================================
--- user/kmacy/HEAD_ECMP/sys/net/route.c Fri Oct 17 03:17:10 2008 (r183964)
+++ user/kmacy/HEAD_ECMP/sys/net/route.c Fri Oct 17 03:59:25 2008 (r183965)
@@ -808,8 +808,10 @@ rtexpunge(struct rtentry *rt)
("unexpected flags 0x%x", rn->rn_flags));
KASSERT(rt == RNTORT(rn),
("lookup mismatch, rt %p rn %p", rt, rn));
-
rt->rt_flags &= ~RTF_UP;
+#ifdef RADIX_MPATH
+ ipv4_flow_free_all(rt);
+#endif
/*
* Now search what's left of the subtree for any cloned
@@ -948,6 +950,9 @@ rtrequest1_fib(int req, struct rt_addrin
RT_LOCK(rt);
RT_ADDREF(rt);
rt->rt_flags &= ~RTF_UP;
+#ifdef RADIX_MPATH
+ ipv4_flow_free_all(rt);
+#endif
goto deldone; /* done with the RTM_DELETE command */
}
@@ -966,7 +971,9 @@ normal_rtdel:
RT_LOCK(rt);
RT_ADDREF(rt);
rt->rt_flags &= ~RTF_UP;
-
+#ifdef RADIX_MPATH
+ ipv4_flow_free_all(rt);
+#endif
/*
* Now search what's left of the subtree for any cloned
* routes which might have been formed from this node.
Modified: user/kmacy/HEAD_ECMP/sys/net/route.h
==============================================================================
--- user/kmacy/HEAD_ECMP/sys/net/route.h Fri Oct 17 03:17:10 2008 (r183964)
+++ user/kmacy/HEAD_ECMP/sys/net/route.h Fri Oct 17 03:59:25 2008 (r183965)
@@ -148,6 +148,9 @@ struct rtentry {
#ifdef _KERNEL
/* XXX ugly, user apps use this definition but don't have a mtx def */
struct mtx rt_mtx; /* mutex for routing entry */
+#ifdef RADIX_MPATH
+ uint32_t rt_flow_head;
+#endif
#endif
};
Modified: user/kmacy/HEAD_ECMP/sys/netinet/ip_input.c
==============================================================================
--- user/kmacy/HEAD_ECMP/sys/netinet/ip_input.c Fri Oct 17 03:17:10 2008 (r183964)
+++ user/kmacy/HEAD_ECMP/sys/netinet/ip_input.c Fri Oct 17 03:59:25 2008 (r183965)
@@ -1286,7 +1286,7 @@ ip_forward(struct mbuf *m, int srcrt)
struct mbuf *mcopy;
struct in_addr dest;
struct route ro;
- int error, type = 0, code = 0, mtu = 0;
+ int error, type = 0, code = 0, mtu = 0, cached = 0;
if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
V_ipstat.ips_cantforward++;
@@ -1305,7 +1305,24 @@ ip_forward(struct mbuf *m, int srcrt)
}
#endif
+ bzero(&ro, sizeof(ro));
+#ifdef RADIX_MPATH
+	/*
+	 * Use the route cached in the flow table when there is one, so all
+	 * packets of a flow keep taking the same path; otherwise fall back
+	 * to a regular multipath lookup.
+	 * NOTE(review): no declaration of hash is visible in these hunks --
+	 * confirm ip_forward() declares it.
+	 */
+	hash = ipv4_flow_alloc(m, &ro);
+
+	if (ro.ro_rt == NULL)
+		rtalloc_mpath_fib(&ro, hash, M_GETFIB(m));
+	else
+		cached = 1;
+
+	/* ro is a struct route, not a pointer to one. */
+	if (ro.ro_rt != NULL)
+		ia = ifatoia(ro.ro_rt->rt_ifa);
+#else
+	/*
+	 * I love how we go to all the trouble to look up the
+	 * route and then throw it away KMM
+	 */
ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m));
+#endif
if (!srcrt && ia == NULL) {
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
return;
@@ -1365,7 +1382,6 @@ ip_forward(struct mbuf *m, int srcrt)
struct sockaddr_in *sin;
struct rtentry *rt;
- bzero(&ro, sizeof(ro));
sin = (struct sockaddr_in *)&ro.ro_dst;
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
@@ -1390,7 +1406,7 @@ ip_forward(struct mbuf *m, int srcrt)
code = ICMP_REDIRECT_HOST;
}
}
- if (rt)
+ if (rt && (cached == 0))
RTFREE(rt);
}
@@ -1398,13 +1414,15 @@ ip_forward(struct mbuf *m, int srcrt)
* Try to cache the route MTU from ip_output so we can consider it for
* the ICMP_UNREACH_NEEDFRAG "Next-Hop MTU" field described in RFC1191.
*/
- bzero(&ro, sizeof(ro));
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-user
mailing list