Routing enhancement - reduce routing table locking
Ingo Flaschberger
if at freebsd.org
Tue Apr 5 01:47:32 UTC 2011
Hi,
I have written a patch to:
*) reduce locking of the routing table to achieve the same speed as with
flowtables, which do not scale with many routes:
- use a copy of the route
- use rmlock(9)
(idea from Andre Oppermann)
*) implement some multipath changes to use a directly attached
interface route and a real route; this uses some OpenBSD code
*) ICMP rate limiting in forwarding (old code from FreeBSD page)
The patch applies to FreeBSD 8.2-STABLE.
Comments are welcome, especially on whether I can reuse "dst" (which seems to
work):
rtl.rt_gateway = (struct sockaddr *)dst;
but I'm not sure whether I should take the same caution with bcopy regarding
sa_len:
rtlookup_fib(struct sockaddr *dst, u_int fibnum, struct rtlookup *rtl,
Kind regards,
Ingo Flaschberger
Geschaeftsleitung
____________________________________
crossip communications gmbh
A-1020 Wien, Sebastian Kneipp Gasse 1/3
Sitz der Gesellschaft: 1020 Wien, Oesterreich
Firmenbuchgericht: Handelsgericht Wien, FN 269698 s,
Umsatzsteueridentifikationsnummer (UID): ATU62080367
Haftungsausschluss / Disclaimer <http://www.xip.at/content/view/278/>
-------------- next part --------------
diff -u -r ../src_org_8.2_20110329/contrib/ipfilter/radix.c ./contrib/ipfilter/radix.c
--- ../src_org_8.2_20110329/contrib/ipfilter/radix.c 2009-08-03 08:13:06.000000000 +0000
+++ ./contrib/ipfilter/radix.c 2011-04-03 16:08:28.000000000 +0000
@@ -759,9 +759,10 @@
}
struct radix_node *
-rn_delete(v_arg, netmask_arg, head)
+rn_delete(v_arg, netmask_arg, head, rn)
void *v_arg, *netmask_arg;
struct radix_node_head *head;
+ struct radix_node *rn;
{
struct radix_node *t, *p, *x, *tt;
struct radix_mask *m, *saved_m, **mp;
@@ -1069,7 +1070,7 @@
struct radix_node_head *rnh = p;
struct radix_node *d;
- d = rnh->rnh_deladdr(n->rn_key, NULL, rnh);
+ d = rnh->rnh_deladdr(n->rn_key, NULL, rnh, NULL);
if (d != NULL) {
FreeS(d, max_keylen + 2 * sizeof (*d));
}
diff -u -r ../src_org_8.2_20110329/contrib/ipfilter/radix_ipf.h ./contrib/ipfilter/radix_ipf.h
--- ../src_org_8.2_20110329/contrib/ipfilter/radix_ipf.h 2009-08-03 08:13:06.000000000 +0000
+++ ./contrib/ipfilter/radix_ipf.h 2011-04-03 16:08:28.000000000 +0000
@@ -130,7 +130,7 @@
__P((void *v, void *mask,
struct radix_node_head *head, struct radix_node nodes[]));
struct radix_node *(*rnh_deladdr) /* remove based on sockaddr */
- __P((void *v, void *mask, struct radix_node_head *head));
+ __P((void *v, void *mask, struct radix_node_head *head, struct radix_node *rn));
struct radix_node *(*rnh_delpkt) /* remove based on packet hdr */
__P((void *v, void *mask, struct radix_node_head *head));
struct radix_node *(*rnh_matchaddr) /* locate based on sockaddr */
@@ -202,7 +202,7 @@
*rn_addmask __P((void *, int, int)),
*rn_addroute __P((void *, void *, struct radix_node_head *,
struct radix_node [2])),
- *rn_delete __P((void *, void *, struct radix_node_head *)),
+ *rn_delete __P((void *, void *, struct radix_node_head *, struct radix_node *)),
*rn_insert __P((void *, struct radix_node_head *, int *,
struct radix_node [2])),
*rn_lookup __P((void *, void *, struct radix_node_head *)),
diff -u -r ../src_org_8.2_20110329/sbin/routed/radix.c ./sbin/routed/radix.c
--- ../src_org_8.2_20110329/sbin/routed/radix.c 2009-08-03 08:13:06.000000000 +0000
+++ ./sbin/routed/radix.c 2011-04-03 16:08:07.000000000 +0000
@@ -662,7 +662,8 @@
static struct radix_node *
rn_delete(void *v_arg,
void *netmask_arg,
- struct radix_node_head *head)
+ struct radix_node_head *head,
+ struct radix_node *rn)
{
struct radix_node *t, *p, *x, *tt;
struct radix_mask *m, *saved_m, **mp;
@@ -670,6 +671,8 @@
caddr_t v, netmask;
int b, head_off, vlen;
+ rn = NULL; /* XXX make compiler happy */
+
v = v_arg;
netmask = netmask_arg;
x = head->rnh_treetop;
diff -u -r ../src_org_8.2_20110329/sbin/routed/radix.h ./sbin/routed/radix.h
--- ../src_org_8.2_20110329/sbin/routed/radix.h 2009-08-03 08:13:06.000000000 +0000
+++ ./sbin/routed/radix.h 2011-04-03 16:08:07.000000000 +0000
@@ -115,7 +115,7 @@
(void *v, void *mask,
struct radix_node_head *head, struct radix_node nodes[]);
struct radix_node *(*rnh_deladdr) /* remove based on sockaddr */
- (void *v, void *mask, struct radix_node_head *head);
+ (void *v, void *mask, struct radix_node_head *head, struct radix_node *rn);
struct radix_node *(*rnh_delpkt) /* remove based on packet hdr */
(void *v, void *mask, struct radix_node_head *head);
struct radix_node *(*rnh_matchaddr) /* locate based on sockaddr */
diff -u -r ../src_org_8.2_20110329/sbin/routed/table.c ./sbin/routed/table.c
--- ../src_org_8.2_20110329/sbin/routed/table.c 2009-08-03 08:13:06.000000000 +0000
+++ ./sbin/routed/table.c 2011-04-03 16:08:07.000000000 +0000
@@ -1865,7 +1865,7 @@
mask_sock.sin_addr.s_addr = htonl(rt->rt_mask);
masktrim(&mask_sock);
if (rt != (struct rt_entry *)rhead->rnh_deladdr(&dst_sock, &mask_sock,
- rhead)) {
+ rhead, NULL)) {
msglog("rnh_deladdr() failed");
} else {
free(rt);
diff -u -r ../src_org_8.2_20110329/sys/contrib/ipfilter/netinet/ip_pool.c ./sys/contrib/ipfilter/netinet/ip_pool.c
--- ../src_org_8.2_20110329/sys/contrib/ipfilter/netinet/ip_pool.c 2007-10-18 21:42:38.000000000 +0000
+++ ./sys/contrib/ipfilter/netinet/ip_pool.c 2011-04-03 16:07:46.000000000 +0000
@@ -67,6 +67,7 @@
#include "netinet/ip_compat.h"
#include "netinet/ip_fil.h"
#include "netinet/ip_pool.h"
+#include <sys/rmlock.h>
#if defined(IPFILTER_LOOKUP) && defined(_KERNEL) && \
((BSD >= 198911) && !defined(__osf__) && \
@@ -620,7 +621,7 @@
RADIX_NODE_HEAD_LOCK(ipo->ipo_head);
ipo->ipo_head->rnh_deladdr(&ipe->ipn_addr, &ipe->ipn_mask,
- ipo->ipo_head);
+ ipo->ipo_head, NULL);
RADIX_NODE_HEAD_UNLOCK(ipo->ipo_head);
ip_pool_node_deref(ipe);
@@ -751,7 +752,7 @@
RADIX_NODE_HEAD_LOCK(ipo->ipo_head);
while ((n = ipo->ipo_list) != NULL) {
ipo->ipo_head->rnh_deladdr(&n->ipn_addr, &n->ipn_mask,
- ipo->ipo_head);
+ ipo->ipo_head, NULL);
*n->ipn_pnext = n->ipn_next;
if (n->ipn_next)
@@ -963,7 +964,7 @@
struct radix_node_head *rnh = p;
struct radix_node *d;
- d = rnh->rnh_deladdr(n->rn_key, NULL, rnh);
+ d = rnh->rnh_deladdr(n->rn_key, NULL, rnh, NULL);
if (d != NULL) {
FreeS(d, max_keylen + 2 * sizeof (*d));
}
diff -u -r ../src_org_8.2_20110329/sys/contrib/pf/net/pf.c ./sys/contrib/pf/net/pf.c
--- ../src_org_8.2_20110329/sys/contrib/pf/net/pf.c 2010-09-20 17:03:10.000000000 +0000
+++ ./sys/contrib/pf/net/pf.c 2011-04-03 16:07:46.000000000 +0000
@@ -99,9 +99,7 @@
#include <net/if_types.h>
#include <net/bpf.h>
#include <net/route.h>
-#ifndef __FreeBSD__
#include <net/radix_mpath.h>
-#endif
#include <netinet/in.h>
#include <netinet/in_var.h>
@@ -6166,9 +6164,9 @@
if (kif->pfik_ifp == ifp)
ret = 1;
#ifdef __FreeBSD__ /* MULTIPATH_ROUTING */
- rn = NULL;
-#else
rn = rn_mpath_next(rn);
+#else
+ rn = rn_mpath_next(rn, 0);
#endif
} while (check_mpath == 1 && rn != NULL && ret == 0);
} else
diff -u -r ../src_org_8.2_20110329/sys/contrib/pf/net/pf_table.c ./sys/contrib/pf/net/pf_table.c
--- ../src_org_8.2_20110329/sys/contrib/pf/net/pf_table.c 2009-08-03 08:13:06.000000000 +0000
+++ ./sys/contrib/pf/net/pf_table.c 2011-04-03 16:07:46.000000000 +0000
@@ -44,7 +44,7 @@
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/lock.h>
-#include <sys/rwlock.h>
+#include <sys/rmlock.h>
#ifdef __FreeBSD__
#include <sys/malloc.h>
#endif
@@ -1114,17 +1114,9 @@
#endif
if (KENTRY_NETWORK(ke)) {
pfr_prepare_network(&mask, ke->pfrke_af, ke->pfrke_net);
-#ifdef __FreeBSD__
- rn = rn_delete(&ke->pfrke_sa, &mask, head);
-#else
rn = rn_delete(&ke->pfrke_sa, &mask, head, NULL);
-#endif
} else
-#ifdef __FreeBSD__
- rn = rn_delete(&ke->pfrke_sa, NULL, head);
-#else
rn = rn_delete(&ke->pfrke_sa, NULL, head, NULL);
-#endif
splx(s);
if (rn == NULL) {
diff -u -r ../src_org_8.2_20110329/sys/kern/subr_witness.c ./sys/kern/subr_witness.c
--- ../src_org_8.2_20110329/sys/kern/subr_witness.c 2011-03-28 15:26:48.000000000 +0000
+++ ./sys/kern/subr_witness.c 2011-04-03 16:07:54.000000000 +0000
@@ -508,7 +508,7 @@
* Routing
*/
{ "so_rcv", &lock_class_mtx_sleep },
- { "radix node head", &lock_class_rw },
+ { "radix node head", &lock_class_rm },
{ "rtentry", &lock_class_mtx_sleep },
{ "ifaddr", &lock_class_mtx_sleep },
{ NULL, NULL },
diff -u -r ../src_org_8.2_20110329/sys/kern/vfs_export.c ./sys/kern/vfs_export.c
--- ../src_org_8.2_20110329/sys/kern/vfs_export.c 2009-10-01 13:11:45.000000000 +0000
+++ ./sys/kern/vfs_export.c 2011-04-03 16:07:54.000000000 +0000
@@ -43,6 +43,7 @@
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
+#include <sys/rmlock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mount.h>
@@ -228,7 +229,7 @@
struct radix_node_head *rnh = (struct radix_node_head *) w;
struct ucred *cred;
- (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
+ (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh, NULL);
cred = ((struct netcred *)rn)->netc_anon;
if (cred != NULL)
crfree(cred);
@@ -427,6 +428,7 @@
register struct netcred *np;
register struct radix_node_head *rnh;
struct sockaddr *saddr;
+ struct rm_priotracker tracker;
nep = mp->mnt_export;
if (nep == NULL)
@@ -440,10 +442,10 @@
saddr = nam;
rnh = nep->ne_rtable[saddr->sa_family];
if (rnh != NULL) {
- RADIX_NODE_HEAD_RLOCK(rnh);
+ RADIX_NODE_HEAD_RLOCK(rnh, &tracker);
np = (struct netcred *)
(*rnh->rnh_matchaddr)(saddr, rnh);
- RADIX_NODE_HEAD_RUNLOCK(rnh);
+ RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
np = NULL;
}
diff -u -r ../src_org_8.2_20110329/sys/net/if.c ./sys/net/if.c
--- ../src_org_8.2_20110329/sys/net/if.c 2011-03-28 15:26:51.000000000 +0000
+++ ./sys/net/if.c 2011-04-03 16:07:57.000000000 +0000
@@ -49,6 +49,7 @@
#include <sys/protosw.h>
#include <sys/kernel.h>
#include <sys/lock.h>
+#include <sys/rmlock.h>
#include <sys/refcount.h>
#include <sys/module.h>
#include <sys/rwlock.h>
diff -u -r ../src_org_8.2_20110329/sys/net/pfil.c ./sys/net/pfil.c
--- ../src_org_8.2_20110329/sys/net/pfil.c 2010-02-07 09:00:22.000000000 +0000
+++ ./sys/net/pfil.c 2011-04-03 16:07:57.000000000 +0000
@@ -39,7 +39,6 @@
#include <sys/socketvar.h>
#include <sys/systm.h>
#include <sys/condvar.h>
-#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
diff -u -r ../src_org_8.2_20110329/sys/net/radix.c ./sys/net/radix.c
--- ../src_org_8.2_20110329/sys/net/radix.c 2010-04-02 05:02:50.000000000 +0000
+++ ./sys/net/radix.c 2011-04-03 16:07:57.000000000 +0000
@@ -41,6 +41,7 @@
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/syslog.h>
+#include <sys/rmlock.h>
#include <net/radix.h>
#include "opt_mpath.h"
#ifdef RADIX_MPATH
@@ -614,7 +615,7 @@
struct radix_node treenodes[2];
{
caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg;
- register struct radix_node *t, *x = 0, *tt;
+ register struct radix_node *t, *x = 0, *xx = 0, *tt;
struct radix_node *saved_tt, *top = head->rnh_treetop;
short b = 0, b_leaf = 0;
int keyduplicated;
@@ -723,12 +724,19 @@
x = t->rn_right;
/* Promote general routes from below */
if (x->rn_bit < 0) {
- for (mp = &t->rn_mklist; x; x = x->rn_dupedkey)
+ for (mp = &t->rn_mklist; x; xx = x, x = x->rn_dupedkey) {
+ if (xx && xx->rn_mklist && xx->rn_mask == x->rn_mask &&
+ x->rn_mklist == 0) {
+ /* multipath route, bump refcount on first mklist */
+ x->rn_mklist = xx->rn_mklist;
+ x->rn_mklist->rm_refs++;
+ }
if (x->rn_mask && (x->rn_bit >= b_leaf) && x->rn_mklist == 0) {
*mp = m = rn_new_radix_mask(x, 0);
if (m)
mp = &m->rm_mklist;
}
+ }
} else if (x->rn_mklist) {
/*
* Skip over masks whose index is > that of new node
@@ -760,11 +768,30 @@
break;
if (m->rm_flags & RNF_NORMAL) {
mmask = m->rm_leaf->rn_mask;
- if (tt->rn_flags & RNF_NORMAL) {
-#if !defined(RADIX_MPATH)
- log(LOG_ERR,
- "Non-unique normal route, mask not entered\n");
+ if (keyduplicated) {
+ if (m->rm_leaf->rn_parent == tt)
+ /* new route is bettter */
+ m->rm_leaf = tt;
+#ifdef DIAGNOSTIC
+ else {
+ for (t = m->rm_leaf; t;
+ t = t->rn_dupedkey) {
+ break;
+ }
+ if (t == NULL) {
+ log(LOG_ERR, "Non-unique "
+ "normal route on dupedkey, "
+ "mask not entered\n");
+ return tt;
+ }
+ }
#endif
+ m->rm_refs++;
+ tt->rn_mklist = m;
+ return tt;
+ } else if (tt->rn_flags & RNF_NORMAL) {
+ log(LOG_ERR, "Non-unique normal route,"
+ " mask not entered\n");
return tt;
}
} else
@@ -783,9 +810,10 @@
}
struct radix_node *
-rn_delete(v_arg, netmask_arg, head)
+rn_delete(v_arg, netmask_arg, head, rn)
void *v_arg, *netmask_arg;
struct radix_node_head *head;
+ struct radix_node *rn;
{
register struct radix_node *t, *p, *x, *tt;
struct radix_mask *m, *saved_m, **mp;
@@ -815,13 +843,37 @@
if ((tt = tt->rn_dupedkey) == 0)
return (0);
}
+#ifdef RADIX_MPATH
+ if (rn) {
+ while (tt != rn)
+ if ((tt = tt->rn_dupedkey) == 0)
+ return (0);
+ }
+#endif
if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == 0)
goto on1;
if (tt->rn_flags & RNF_NORMAL) {
- if (m->rm_leaf != tt || m->rm_refs > 0) {
+ if (m->rm_leaf != tt && m->rm_refs == 0) {
log(LOG_ERR, "rn_delete: inconsistent annotation\n");
return 0; /* dangling ref could cause disaster */
}
+ if (m->rm_leaf != tt) {
+ if (--m->rm_refs >= 0)
+ goto on1;
+ }
+ /* tt is currently the head of the possible multipath chain */
+ if (m->rm_refs > 0) {
+ if (tt->rn_dupedkey == NULL ||
+ tt->rn_dupedkey->rn_mklist != m) {
+ log(LOG_ERR, "rn_delete: inconsistent "
+ "dupedkey list\n");
+ return (0);
+ }
+ m->rm_leaf = tt->rn_dupedkey;
+ --m->rm_refs;
+ goto on1;
+ }
+ /* else tt is last and only route */
} else {
if (m->rm_mask != tt->rn_mask) {
log(LOG_ERR, "rn_delete: inconsistent annotation\n");
@@ -875,15 +927,10 @@
else
t->rn_right = x;
} else {
- /* find node in front of tt on the chain */
- for (x = p = saved_tt; p && p->rn_dupedkey != tt;)
- p = p->rn_dupedkey;
- if (p) {
- p->rn_dupedkey = tt->rn_dupedkey;
- if (tt->rn_dupedkey) /* parent */
- tt->rn_dupedkey->rn_parent = p;
- /* parent */
- } else log(LOG_ERR, "rn_delete: couldn't find us\n");
+ x = saved_tt;
+ t->rn_dupedkey = tt->rn_dupedkey;
+ if (tt->rn_dupedkey)
+ tt->rn_dupedkey->rn_parent = t;
}
t = tt + 1;
if (t->rn_flags & RNF_ACTIVE) {
@@ -931,8 +978,16 @@
if (m == x->rn_mklist) {
struct radix_mask *mm = m->rm_mklist;
x->rn_mklist = 0;
- if (--(m->rm_refs) < 0)
+ if (--(m->rm_refs) < 0) {
MKFree(m);
+ } else if (m->rm_flags & RNF_NORMAL) {
+ /*
+ * don't progress because this
+ * a multipath route. Next
+ * route will use the same m.
+ */
+ mm = m;
+ }
m = mm;
}
if (m)
diff -u -r ../src_org_8.2_20110329/sys/net/radix.h ./sys/net/radix.h
--- ../src_org_8.2_20110329/sys/net/radix.h 2010-03-23 09:58:59.000000000 +0000
+++ ./sys/net/radix.h 2011-04-03 16:07:57.000000000 +0000
@@ -36,7 +36,7 @@
#ifdef _KERNEL
#include <sys/_lock.h>
#include <sys/_mutex.h>
-#include <sys/_rwlock.h>
+#include <sys/_rmlock.h>
#endif
#ifdef MALLOC_DECLARE
@@ -114,7 +114,7 @@
(void *v, void *mask,
struct radix_node_head *head, struct radix_node nodes[]);
struct radix_node *(*rnh_deladdr) /* remove based on sockaddr */
- (void *v, void *mask, struct radix_node_head *head);
+ (void *v, void *mask, struct radix_node_head *head, struct radix_node *rn);
struct radix_node *(*rnh_delpkt) /* remove based on packet hdr */
(void *v, void *mask, struct radix_node_head *head);
struct radix_node *(*rnh_matchaddr) /* locate based on sockaddr */
@@ -133,7 +133,7 @@
struct radix_node rnh_nodes[3]; /* empty tree for common case */
int rnh_multipath; /* multipath capable ? */
#ifdef _KERNEL
- struct rwlock rnh_lock; /* locks entire radix tree */
+ struct rmlock rnh_lock; /* locks entire radix tree */
#endif
};
@@ -147,17 +147,15 @@
#define Free(p) free((caddr_t)p, M_RTABLE);
#define RADIX_NODE_HEAD_LOCK_INIT(rnh) \
- rw_init_flags(&(rnh)->rnh_lock, "radix node head", 0)
-#define RADIX_NODE_HEAD_LOCK(rnh) rw_wlock(&(rnh)->rnh_lock)
-#define RADIX_NODE_HEAD_UNLOCK(rnh) rw_wunlock(&(rnh)->rnh_lock)
-#define RADIX_NODE_HEAD_RLOCK(rnh) rw_rlock(&(rnh)->rnh_lock)
-#define RADIX_NODE_HEAD_RUNLOCK(rnh) rw_runlock(&(rnh)->rnh_lock)
-#define RADIX_NODE_HEAD_LOCK_TRY_UPGRADE(rnh) rw_try_upgrade(&(rnh)->rnh_lock)
-
-
-#define RADIX_NODE_HEAD_DESTROY(rnh) rw_destroy(&(rnh)->rnh_lock)
-#define RADIX_NODE_HEAD_LOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_LOCKED)
-#define RADIX_NODE_HEAD_WLOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_WLOCKED)
+ rm_init_flags(&(rnh)->rnh_lock, "radix node head", 0)
+#define RADIX_NODE_HEAD_LOCK(rnh) rm_wlock(&(rnh)->rnh_lock)
+#define RADIX_NODE_HEAD_UNLOCK(rnh) rm_wunlock(&(rnh)->rnh_lock)
+#define RADIX_NODE_HEAD_RLOCK(rnh, tracker) rm_rlock(&(rnh)->rnh_lock, (tracker))
+#define RADIX_NODE_HEAD_RUNLOCK(rnh, tracker) rm_runlock(&(rnh)->rnh_lock, (tracker))
+
+#define RADIX_NODE_HEAD_DESTROY(rnh) rm_destroy(&(rnh)->rnh_lock)
+#define RADIX_NODE_HEAD_LOCK_ASSERT(rnh) rm_wowned(&(rnh)->rnh_lock)
+#define RADIX_NODE_HEAD_WLOCK_ASSERT(rnh) rm_wowned(&(rnh)->rnh_lock)
#endif /* _KERNEL */
void rn_init(int);
@@ -168,7 +166,7 @@
*rn_addmask(void *, int, int),
*rn_addroute (void *, void *, struct radix_node_head *,
struct radix_node [2]),
- *rn_delete(void *, void *, struct radix_node_head *),
+ *rn_delete(void *, void *, struct radix_node_head *, struct radix_node *),
*rn_lookup (void *v_arg, void *m_arg,
struct radix_node_head *head),
*rn_match(void *, struct radix_node_head *);
diff -u -r ../src_org_8.2_20110329/sys/net/radix_mpath.c ./sys/net/radix_mpath.c
--- ../src_org_8.2_20110329/sys/net/radix_mpath.c 2010-04-02 05:02:50.000000000 +0000
+++ ./sys/net/radix_mpath.c 2011-04-04 19:33:16.000000000 +0000
@@ -45,6 +45,8 @@
#include <sys/socket.h>
#include <sys/domain.h>
#include <sys/syslog.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
#include <net/radix.h>
#include <net/radix_mpath.h>
#include <net/route.h>
@@ -54,7 +56,7 @@
/*
* give some jitter to hash, to avoid synchronization between routers
*/
-static uint32_t hashjitter;
+uint32_t hashjitter;
int
rn_mpath_capable(struct radix_node_head *rnh)
@@ -77,10 +79,11 @@
return NULL;
}
-uint32_t
+//uint32_t
+int64_t
rn_mpath_count(struct radix_node *rn)
{
- uint32_t i = 0;
+ int64_t i = 0;
struct rtentry *rt;
while (rn != NULL) {
@@ -112,46 +115,14 @@
* we need to compare the interface address because
* rt_gateway is a special sockadd_dl structure
*/
- if (rt->rt_gateway->sa_family == AF_LINK) {
- if (!memcmp(rt->rt_ifa->ifa_addr, gate, gate->sa_len))
+ if (rt->rt_gateway->sa_len == gate->sa_len &&
+ !memcmp(rt->rt_gateway, gate, gate->sa_len))
break;
- } else {
- if (rt->rt_gateway->sa_len == gate->sa_len &&
- !memcmp(rt->rt_gateway, gate, gate->sa_len))
- break;
- }
} while ((rn = rn_mpath_next(rn)) != NULL);
return (struct rtentry *)rn;
}
-/*
- * go through the chain and unlink "rt" from the list
- * the caller will free "rt"
- */
-int
-rt_mpath_deldup(struct rtentry *headrt, struct rtentry *rt)
-{
- struct radix_node *t, *tt;
-
- if (!headrt || !rt)
- return (0);
- t = (struct radix_node *)headrt;
- tt = rn_mpath_next(t);
- while (tt) {
- if (tt == (struct radix_node *)rt) {
- t->rn_dupedkey = tt->rn_dupedkey;
- tt->rn_dupedkey = NULL;
- tt->rn_flags &= ~RNF_ACTIVE;
- tt[1].rn_flags &= ~RNF_ACTIVE;
- return (1);
- }
- t = tt;
- tt = rn_mpath_next((struct radix_node *)t);
- }
- return (0);
-}
-
/*
* check if we have the same key/mask/gateway on the table already.
*/
@@ -262,9 +233,10 @@
rtalloc_mpath_fib(struct route *ro, uint32_t hash, u_int fibnum)
{
struct radix_node *rn0, *rn;
- u_int32_t n;
+ u_int32_t n = 0;
struct rtentry *rt;
int64_t weight;
+ int64_t lowest_weight;
/*
* XXX we don't attempt to lookup cached route again; what should
@@ -285,20 +257,32 @@
/* beyond here, we use rn as the master copy */
rn0 = rn = (struct radix_node *)ro->ro_rt;
- n = rn_mpath_count(rn0);
+
+ /* find lowest weight route */
+ for ( rt = (struct rtentry *)rn, weight = rt->rt_rmx.rmx_weight; rn != NULL; rn = rn_mpath_next( rn)) {
+ rt = (struct rtentry *)rn;
+ if(rt->rt_flags & RTF_UP) {
+ if (weight > rt->rt_rmx.rmx_weight) {
+ weight = rt->rt_rmx.rmx_weight;
+ n = 1;
+ } else if (weight == rt->rt_rmx.rmx_weight)
+ n++;
+ }
+ }
+ lowest_weight = weight;
/* gw selection by Modulo-N Hash (RFC2991) XXX need improvement? */
hash += hashjitter;
hash %= n;
- for (weight = abs((int32_t)hash), rt = ro->ro_rt;
- weight >= rt->rt_rmx.rmx_weight && rn;
- weight -= rt->rt_rmx.rmx_weight) {
-
- /* stay within the multipath routes */
- if (rn->rn_dupedkey && rn->rn_mask != rn->rn_dupedkey->rn_mask)
- break;
- rn = rn->rn_dupedkey;
+ for ( rn = rn0, n = 0; rn != NULL; rn = rn_mpath_next( rn)) {
rt = (struct rtentry *)rn;
+ if(rt->rt_flags & RTF_UP) {
+ if ( rt->rt_rmx.rmx_weight == lowest_weight) {
+ if (n == hash)
+ break;
+ n++;
+ }
+ }
}
/* XXX try filling rt_gwroute and avoid unreachable gw */
diff -u -r ../src_org_8.2_20110329/sys/net/radix_mpath.h ./sys/net/radix_mpath.h
--- ../src_org_8.2_20110329/sys/net/radix_mpath.h 2009-08-03 08:13:06.000000000 +0000
+++ ./sys/net/radix_mpath.h 2011-04-04 19:48:09.000000000 +0000
@@ -44,9 +44,10 @@
struct route;
struct rtentry;
struct sockaddr;
+extern uint32_t hashjitter;
int rn_mpath_capable(struct radix_node_head *);
struct radix_node *rn_mpath_next(struct radix_node *);
-u_int32_t rn_mpath_count(struct radix_node *);
+int64_t rn_mpath_count(struct radix_node *);
struct rtentry *rt_mpath_matchgate(struct rtentry *, struct sockaddr *);
int rt_mpath_conflict(struct radix_node_head *, struct rtentry *,
struct sockaddr *);
diff -u -r ../src_org_8.2_20110329/sys/net/route.c ./sys/net/route.c
--- ../src_org_8.2_20110329/sys/net/route.c 2011-03-28 15:26:51.000000000 +0000
+++ ./sys/net/route.c 2011-04-04 23:01:17.000000000 +0000
@@ -51,6 +51,8 @@
#include <sys/proc.h>
#include <sys/domain.h>
#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
#include <net/if.h>
#include <net/if_dl.h>
@@ -342,6 +344,7 @@
struct radix_node *rn;
struct rtentry *newrt;
struct rt_addrinfo info;
+ struct rm_priotracker tracker;
int err = 0, msgtype = RTM_MISS;
int needlock;
@@ -358,24 +361,26 @@
goto miss;
}
needlock = !(ignflags & RTF_RNH_LOCKED);
- if (needlock)
- RADIX_NODE_HEAD_RLOCK(rnh);
-#ifdef INVARIANTS
+ if (needlock) /* XXX we always need the lock for now! */
+ RADIX_NODE_HEAD_LOCK(rnh);
else
- RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
-#endif
+ RADIX_NODE_HEAD_RLOCK(rnh, &tracker);
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
newrt = rt = RNTORT(rn);
RT_LOCK(newrt);
RT_ADDREF(newrt);
- if (needlock)
- RADIX_NODE_HEAD_RUNLOCK(rnh);
+ if (needlock) /* XXX we always need the lock for now! */
+ RADIX_NODE_HEAD_UNLOCK(rnh);
+ else
+ RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
goto done;
+ }
+ if (needlock) /* XXX we always need the lock for now! */
+ RADIX_NODE_HEAD_UNLOCK(rnh);
+ else
+ RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
- } else if (needlock)
- RADIX_NODE_HEAD_RUNLOCK(rnh);
-
/*
* Either we hit the root or couldn't find any match,
* Which basically means
@@ -400,6 +405,157 @@
}
/*
+ * Lookup a destination in the routing table and
+ * report the next hop, interface and interface address
+ * in a new structure.
+ * Only read lock access on the routing table is required,
+ * individual routes are not locked.
+ * Returns 1 for entry found, 0 for not found.
+ */
+int
+rtlookup_fib(struct sockaddr *dst, u_int fibnum, struct rtlookup *rtl,
+ int flags)
+{
+ struct radix_node_head *rnh;
+ struct radix_node *rn;
+ struct rtentry *rt;
+ int ret = 0;
+ struct rm_priotracker tracker;
+
+ KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum"));
+ if (dst->sa_family != AF_INET) /* Only INET supports > 1 fib now */
+ fibnum = 0;
+ rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
+
+ /* Look up the address in the table for that Address Family. */
+ if (rnh == NULL) {
+ V_rtstat.rts_unreach++;
+ return (0);
+ }
+
+ RADIX_NODE_HEAD_RLOCK(rnh, &tracker);
+ rn = rnh->rnh_matchaddr(dst, rnh);
+ if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
+ rt = RNTORT(rn);
+
+ int rt_len = SA_SIZE( rt->rt_gateway);
+ int rtl_len = SA_SIZE( rtl->rt_gateway);
+ if( rt_len > rtl_len) {
+ bcopy( &rt->rt_gateway, &rtl->rt_gateway, rtl_len);
+ } else {
+ bcopy( &rt->rt_gateway, &rtl->rt_gateway, rt_len);
+ }
+ rtl->rt_ifp = rt->rt_ifp;
+ rtl->rt_ifa = rt->rt_ifa;
+ rtl->rt_rmx.rmx_mtu = rt->rt_rmx.rmx_mtu;
+ rtl->rt_rmx.rmx_expire = rt->rt_rmx.rmx_expire;
+ rtl->rt_flags = rt->rt_flags;
+ if (flags & RTL_PKSENT)
+ rt->rt_rmx.rmx_pksent++; /* racy but ok - XXX WHY?*/
+ ret = 1;
+ }
+ RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
+ return (ret);
+}
+
+#ifdef RADIX_MPATH
+/*
+ * Lookup a mpath destination in the routing table and
+ * report the next hop, interface and interface address
+ * in a new structure.
+ * Only read lock access on the routing table is required,
+ * individual routes are not locked.
+ * Returns 1 for entry found, 0 for not found.
+ */
+int
+rtlookup_mpath_fib(struct sockaddr *dst, u_int32_t hash, u_int fibnum,
+ struct rtlookup *rtl, int flags)
+{
+ struct radix_node_head *rnh;
+ struct radix_node *rn, *rn0;
+ struct rtentry *rt;
+ int ret = 0;
+ struct rm_priotracker tracker;
+ int64_t weight;
+ int64_t lowest_weight;
+ u_int32_t n = 0;
+
+ KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum"));
+ if (dst->sa_family != AF_INET) /* Only INET supports > 1 fib now */
+ fibnum = 0;
+ rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
+
+ /* Look up the address in the table for that Address Family. */
+ if (rnh == NULL) {
+ V_rtstat.rts_unreach++;
+ return (0);
+ }
+
+ RADIX_NODE_HEAD_RLOCK(rnh, &tracker);
+ rn = rnh->rnh_matchaddr(dst, rnh);
+ if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
+ /* we have a route - now do the mpath selection */
+ if (rn_mpath_next( rn) != NULL) { /* multipath */
+ rn0 = rn;
+
+ /* find lowest weight route */
+ for ( rt = (struct rtentry *)rn, weight = rt->rt_rmx.rmx_weight;
+ rn != NULL; rn = rn_mpath_next( rn)) {
+ rt = (struct rtentry *)rn;
+ if(rt->rt_flags & RTF_UP) {
+ if (weight > rt->rt_rmx.rmx_weight) {
+ weight = rt->rt_rmx.rmx_weight;
+ n = 1;
+ } else if (weight == rt->rt_rmx.rmx_weight)
+ n++;
+ }
+ }
+ lowest_weight = weight;
+
+ /* select now one of the lowest weight routes */
+ /* gw selection by Modulo-N Hash (RFC2991) XXX need improvement? */
+ hash += hashjitter;
+ hash %= n;
+ for ( rn = rn0, n = 0; rn != NULL; rn = rn_mpath_next( rn)) {
+ rt = (struct rtentry *)rn;
+ if(rt->rt_flags & RTF_UP) {
+ if ( rt->rt_rmx.rmx_weight == lowest_weight) {
+ if (n == hash)
+ break;
+ n++;
+ }
+ }
+ }
+
+ /* gw selection has failed - there must be only zero weight routes */
+ if (!rn)
+ goto end;
+ } else
+ rt = (struct rtentry *)rn;
+
+ int rt_len = SA_SIZE( rt->rt_gateway);
+ int rtl_len = SA_SIZE( rtl->rt_gateway);
+ if( rt_len > rtl_len) {
+ bcopy( &rt->rt_gateway, &rtl->rt_gateway, rtl_len);
+ } else {
+ bcopy( &rt->rt_gateway, &rtl->rt_gateway, rt_len);
+ }
+ rtl->rt_ifp = rt->rt_ifp;
+ rtl->rt_ifa = rt->rt_ifa;
+ rtl->rt_rmx.rmx_mtu = rt->rt_rmx.rmx_mtu;
+ rtl->rt_rmx.rmx_expire = rt->rt_rmx.rmx_expire;
+ rtl->rt_flags = rt->rt_flags;
+ if (flags & RTL_PKSENT)
+ rt->rt_rmx.rmx_pksent++; /* racy but ok - XXX WHY?*/
+ ret = 1;
+ }
+end:
+ RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
+ return (ret);
+}
+#endif
+
+/*
* Remove a reference count from an rtentry.
* If the count gets low enough, take it out of the routing table
*/
@@ -875,7 +1031,7 @@
* Remove the item from the tree; it should be there,
* but when callers invoke us blindly it may not (sigh).
*/
- rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh);
+ rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh, NULL);
if (rn == NULL) {
error = ESRCH;
goto bad;
@@ -913,112 +1069,6 @@
return (error);
}
-#ifdef RADIX_MPATH
-static int
-rn_mpath_update(int req, struct rt_addrinfo *info,
- struct radix_node_head *rnh, struct rtentry **ret_nrt)
-{
- /*
- * if we got multipath routes, we require users to specify
- * a matching RTAX_GATEWAY.
- */
- struct rtentry *rt, *rto = NULL;
- register struct radix_node *rn;
- int error = 0;
-
- rn = rnh->rnh_matchaddr(dst, rnh);
- if (rn == NULL)
- return (ESRCH);
- rto = rt = RNTORT(rn);
- rt = rt_mpath_matchgate(rt, gateway);
- if (rt == NULL)
- return (ESRCH);
- /*
- * this is the first entry in the chain
- */
- if (rto == rt) {
- rn = rn_mpath_next((struct radix_node *)rt);
- /*
- * there is another entry, now it's active
- */
- if (rn) {
- rto = RNTORT(rn);
- RT_LOCK(rto);
- rto->rt_flags |= RTF_UP;
- RT_UNLOCK(rto);
- } else if (rt->rt_flags & RTF_GATEWAY) {
- /*
- * For gateway routes, we need to
- * make sure that we we are deleting
- * the correct gateway.
- * rt_mpath_matchgate() does not
- * check the case when there is only
- * one route in the chain.
- */
- if (gateway &&
- (rt->rt_gateway->sa_len != gateway->sa_len ||
- memcmp(rt->rt_gateway, gateway, gateway->sa_len)))
- error = ESRCH;
- else {
- /*
- * remove from tree before returning it
- * to the caller
- */
- rn = rnh->rnh_deladdr(dst, netmask, rnh);
- KASSERT(rt == RNTORT(rn), ("radix node disappeared"));
- goto gwdelete;
- }
-
- }
- /*
- * use the normal delete code to remove
- * the first entry
- */
- if (req != RTM_DELETE)
- goto nondelete;
-
- error = ENOENT;
- goto done;
- }
-
- /*
- * if the entry is 2nd and on up
- */
- if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt))
- panic ("rtrequest1: rt_mpath_deldup");
-gwdelete:
- RT_LOCK(rt);
- RT_ADDREF(rt);
- if (req == RTM_DELETE) {
- rt->rt_flags &= ~RTF_UP;
- /*
- * One more rtentry floating around that is not
- * linked to the routing table. rttrash will be decremented
- * when RTFREE(rt) is eventually called.
- */
- V_rttrash++;
- }
-
-nondelete:
- if (req != RTM_DELETE)
- panic("unrecognized request %d", req);
-
-
- /*
- * If the caller wants it, then it can have it,
- * but it's up to it to free the rtentry as we won't be
- * doing it.
- */
- if (ret_nrt) {
- *ret_nrt = rt;
- RT_UNLOCK(rt);
- } else
- RTFREE_LOCKED(rt);
-done:
- return (error);
-}
-#endif
-
int
rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
u_int fibnum)
@@ -1032,6 +1082,7 @@
register struct radix_node_head *rnh;
struct ifaddr *ifa;
struct sockaddr *ndst;
+ struct rm_priotracker tracker;
#define senderr(x) { error = x ; goto bad; }
KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum"));
@@ -1048,7 +1099,7 @@
if (needlock)
RADIX_NODE_HEAD_LOCK(rnh);
else
- RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
+ RADIX_NODE_HEAD_RLOCK(rnh, &tracker);
/*
* If we are adding a host route then we don't want to put
* a netmask in the tree, nor do we want to clone it.
@@ -1058,28 +1109,30 @@
switch (req) {
case RTM_DELETE:
+ if ((rn = rnh->rnh_lookup(dst, netmask, rnh)) == NULL)
+ senderr(ESRCH);
+ rt = RNTORT(rn);
#ifdef RADIX_MPATH
+ /*
+ * if we got multipath routes, we require users to specify
+ * a matching RTAX_GATEWAY.
+ */
if (rn_mpath_capable(rnh)) {
- error = rn_mpath_update(req, info, rnh, ret_nrt);
- /*
- * "bad" holds true for the success case
- * as well
- */
- if (error != ENOENT)
- goto bad;
- error = 0;
+ rt = rt_mpath_matchgate( rt, gateway);
+ rn = (struct radix_node *)rt;
+ if (!rt)
+ senderr(ESRCH);
}
#endif
/*
* Remove the item from the tree and return it.
* Complain if it is not there and do no more processing.
*/
- rn = rnh->rnh_deladdr(dst, netmask, rnh);
+ rn = rnh->rnh_deladdr(dst, netmask, rnh, rn);
if (rn == NULL)
senderr(ESRCH);
if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
panic ("rtrequest delete");
- rt = RNTORT(rn);
RT_LOCK(rt);
RT_ADDREF(rt);
rt->rt_flags &= ~RTF_UP;
@@ -1285,6 +1338,8 @@
bad:
if (needlock)
RADIX_NODE_HEAD_UNLOCK(rnh);
+ else
+ RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
return (error);
#undef senderr
}
@@ -1308,7 +1363,9 @@
#endif
RT_LOCK_ASSERT(rt);
+#ifdef INVARIANTS
RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
+#endif
/*
* Prepare to store the gateway in rt->rt_gateway.
diff -u -r ../src_org_8.2_20110329/sys/net/route.h ./sys/net/route.h
--- ../src_org_8.2_20110329/sys/net/route.h 2010-04-02 05:12:46.000000000 +0000
+++ ./sys/net/route.h 2011-04-03 16:07:57.000000000 +0000
@@ -79,6 +79,39 @@
};
/*
+ * Pointers to structures on the stack for pure routing
+ * table lookups / fast mtu access.
+ * Fakes struct rt_metrics_lite
+ */
+struct rtlookup_metrics {
+ u_long rmx_mtu; /* MTU for this path */
+ u_long rmx_expire; /* XXX rearrange rt_metrics_lite */
+ u_long rmx_pksent; /* XXX faster than extra if? - remove? */
+};
+
+/*
+ * Pointers to structures on the stack for pure routing
+ * table lookups.
+ * Fakes struct rtentry
+ */
+#ifndef RNF_NORMAL
+#include <net/radix.h>
+#ifdef RADIX_MPATH
+#include <net/radix_mpath.h>
+#endif
+#endif
+struct rtlookup {
+ struct radix_node rt_nodes[2]; /* XXX rearrange rtentry and remove */
+ struct sockaddr *rt_gateway;
+ int rt_flags;
+ int rt_refcnt; /* XXX rearrange rtentry and remove */
+ struct ifnet *rt_ifp;
+ struct ifaddr *rt_ifa;
+ struct rtlookup_metrics rt_rmx;
+};
+#define RTL_PKSENT 0x0001 /* increment packet sent counter */
+
+/*
* rmx_rtt and rmx_rttvar are stored as microseconds;
* RTTTOPRHZ(rtt) converts to a value suitable for use
* by a protocol slowtimo counter.
@@ -123,12 +156,6 @@
* gateways are marked so that the output routines know to address the
* gateway rather than the ultimate destination.
*/
-#ifndef RNF_NORMAL
-#include <net/radix.h>
-#ifdef RADIX_MPATH
-#include <net/radix_mpath.h>
-#endif
-#endif
struct rtentry {
struct radix_node rt_nodes[2]; /* tree glue, and other values */
/*
@@ -430,6 +457,10 @@
void rtalloc_fib(struct route *ro, u_int fibnum);
struct rtentry *rtalloc1_fib(struct sockaddr *, int, u_long, u_int);
int rtioctl_fib(u_long, caddr_t, u_int);
+int rtlookup_fib(struct sockaddr *, u_int, struct rtlookup *, int);
+#ifdef RADIX_MPATH
+int rtlookup_mpath_fib(struct sockaddr *, u_int32_t, u_int, struct rtlookup *, int);
+#endif
void rtredirect_fib(struct sockaddr *, struct sockaddr *,
struct sockaddr *, int, struct sockaddr *, u_int);
int rtrequest_fib(int, struct sockaddr *,
diff -u -r ../src_org_8.2_20110329/sys/net/rtsock.c ./sys/net/rtsock.c
--- ../src_org_8.2_20110329/sys/net/rtsock.c 2010-10-30 11:54:55.000000000 +0000
+++ ./sys/net/rtsock.c 2011-04-03 16:07:57.000000000 +0000
@@ -51,6 +51,7 @@
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
+#include <sys/rmlock.h>
#include <net/if.h>
#include <net/if_dl.h>
@@ -513,6 +514,7 @@
int len, error = 0;
struct ifnet *ifp = NULL;
union sockaddr_union saun;
+ struct rm_priotracker tracker;
#define senderr(e) { error = e; goto flush;}
if (m == NULL || ((m->m_len < sizeof(long)) &&
@@ -643,11 +645,11 @@
info.rti_info[RTAX_DST]->sa_family);
if (rnh == NULL)
senderr(EAFNOSUPPORT);
- RADIX_NODE_HEAD_RLOCK(rnh);
+ RADIX_NODE_HEAD_RLOCK(rnh, &tracker);
rt = (struct rtentry *) rnh->rnh_lookup(info.rti_info[RTAX_DST],
info.rti_info[RTAX_NETMASK], rnh);
if (rt == NULL) { /* XXX looks bogus */
- RADIX_NODE_HEAD_RUNLOCK(rnh);
+ RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
senderr(ESRCH);
}
#ifdef RADIX_MPATH
@@ -663,7 +665,7 @@
(rtm->rtm_type != RTM_GET || info.rti_info[RTAX_GATEWAY])) {
rt = rt_mpath_matchgate(rt, info.rti_info[RTAX_GATEWAY]);
if (!rt) {
- RADIX_NODE_HEAD_RUNLOCK(rnh);
+ RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
senderr(ESRCH);
}
}
@@ -695,13 +697,13 @@
*/
rt = (struct rtentry *)rnh->rnh_matchaddr(&laddr, rnh);
if (rt == NULL) {
- RADIX_NODE_HEAD_RUNLOCK(rnh);
+ RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
senderr(ESRCH);
}
}
RT_LOCK(rt);
RT_ADDREF(rt);
- RADIX_NODE_HEAD_RUNLOCK(rnh);
+ RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
/*
* Fix for PR: 82974
diff -u -r ../src_org_8.2_20110329/sys/netinet/icmp_var.h ./sys/netinet/icmp_var.h
--- ../src_org_8.2_20110329/sys/netinet/icmp_var.h 2009-08-03 08:13:06.000000000 +0000
+++ ./sys/netinet/icmp_var.h 2011-04-03 16:07:57.000000000 +0000
@@ -102,7 +102,11 @@
#define BANDLIM_RST_CLOSEDPORT 3 /* No connection, and no listeners */
#define BANDLIM_RST_OPENPORT 4 /* No connection, listener */
#define BANDLIM_ICMP6_UNREACH 5
-#define BANDLIM_MAX 5
+#define BANDLIM_ICMP_FWD_UNREACH 6 /* forwarding: limit unreachable */
+#define BANDLIM_ICMP_FWD_TIMXCEED 7 /* forwarding: limit time-exceeded */
+#define BANDLIM_ICMP_FWD_NEEDFRAG 8 /* forwarding: limit need-frag */
+#define BANDLIM_ICMP_FWD_FILTER 9 /* forwarding: limit admin-prohib */
+#define BANDLIM_MAX 9
#endif
#endif
diff -u -r ../src_org_8.2_20110329/sys/netinet/in.c ./sys/netinet/in.c
--- ../src_org_8.2_20110329/sys/netinet/in.c 2011-01-12 20:44:11.000000000 +0000
+++ ./sys/netinet/in.c 2011-04-03 16:07:57.000000000 +0000
@@ -1392,12 +1392,42 @@
in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr)
{
struct rtentry *rt;
+#ifdef RADIX_MPATH
+ int64_t weight;
+ struct rtentry *rt0;
+ int32_t found = 0;
+#endif
KASSERT(l3addr->sa_family == AF_INET,
("sin_family %d", l3addr->sa_family));
/* XXX rtalloc1 should take a const param */
rt = rtalloc1(__DECONST(struct sockaddr *, l3addr), 0, 0);
+#ifdef RADIX_MPATH
+ rt0 = rt;
+ if ((rt != NULL) && ( rn_mpath_next((struct radix_node *)rt) != NULL)) {
+ /* check if there are other, matching routes */
+ /* find lowest weight route */
+ for ( weight = rt->rt_rmx.rmx_weight; rt != NULL; rt = (struct rtentry *)rn_mpath_next( (struct radix_node *)rt)) {
+ if(rt->rt_flags & RTF_UP) {
+ if (weight > rt->rt_rmx.rmx_weight)
+ weight = rt->rt_rmx.rmx_weight;
+ }
+ }
+
+ /* find now one non gateway route with lowest weight */
+ for ( rt = rt0; rt != NULL; rt = (struct rtentry *)rn_mpath_next( (struct radix_node *)rt)) {
+ if(rt->rt_flags & RTF_UP) {
+ if ((weight == rt->rt_rmx.rmx_weight) && !(rt->rt_flags & RTF_GATEWAY)) {
+ found = 1;
+ break;
+ }
+ }
+ }
+ if (found == 0)
+ rt = NULL;
+ }
+#endif
if (rt == NULL || (!(flags & LLE_PUB) &&
((rt->rt_flags & RTF_GATEWAY) ||
(rt->rt_ifp != ifp)))) {
@@ -1405,11 +1435,20 @@
log(LOG_INFO, "IPv4 address: \"%s\" is not on the network\n",
inet_ntoa(((const struct sockaddr_in *)l3addr)->sin_addr));
#endif
+#ifdef RADIX_MPATH
+ if (rt0 != NULL)
+ RTFREE_LOCKED(rt0);
+#else
if (rt != NULL)
RTFREE_LOCKED(rt);
+#endif
return (EINVAL);
}
+#ifdef RADIX_MPATH
+ RTFREE_LOCKED(rt0);
+#else
RTFREE_LOCKED(rt);
+#endif
return 0;
}
diff -u -r ../src_org_8.2_20110329/sys/netinet/in_rmx.c ./sys/netinet/in_rmx.c
--- ../src_org_8.2_20110329/sys/netinet/in_rmx.c 2010-10-11 11:25:37.000000000 +0000
+++ ./sys/netinet/in_rmx.c 2011-04-03 16:07:57.000000000 +0000
@@ -51,6 +51,8 @@
#include <sys/mbuf.h>
#include <sys/syslog.h>
#include <sys/callout.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
#include <net/if.h>
#include <net/route.h>
diff -u -r ../src_org_8.2_20110329/sys/netinet/ip_fastfwd.c ./sys/netinet/ip_fastfwd.c
--- ../src_org_8.2_20110329/sys/netinet/ip_fastfwd.c 2010-12-10 14:06:50.000000000 +0000
+++ ./sys/netinet/ip_fastfwd.c 2011-04-05 01:13:41.000000000 +0000
@@ -94,6 +94,9 @@
#include <net/if_var.h>
#include <net/if_dl.h>
#include <net/route.h>
+#ifdef RADIX_MPATH
+#include <net/radix_mpath.h>
+#endif
#include <net/vnet.h>
#include <netinet/in.h>
@@ -102,6 +105,7 @@
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
+#include <netinet/icmp_var.h>
#include <netinet/ip_options.h>
#include <machine/in_cksum.h>
@@ -113,7 +117,11 @@
&VNET_NAME(ipfastforward_active), 0, "Enable fast IP forwarding");
static struct sockaddr_in *
-ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m)
+#ifdef RADIX_MPATH
+ip_findroute(struct route *ro, uint32_t hash, struct in_addr dest, struct mbuf *m, struct rtlookup *rtl)
+#else
+ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m, struct rtlookup *rtl)
+#endif
{
struct sockaddr_in *dst;
struct rtentry *rt;
@@ -126,7 +134,17 @@
dst->sin_family = AF_INET;
dst->sin_len = sizeof(*dst);
dst->sin_addr.s_addr = dest.s_addr;
- in_rtalloc_ign(ro, 0, M_GETFIB(m));
+
+ rtl->rt_gateway = (struct sockaddr *)dst;
+#ifdef RADIX_MPATH
+ if (!rtlookup_mpath_fib((struct sockaddr *)dst,
+ hash, M_GETFIB(m), rtl, RTL_PKSENT))
+#else
+ if (!rtlookup_fib( (struct sockaddr *)dst, M_GETFIB(m), rtl, RTL_PKSENT))
+#endif
+ ro->ro_rt = NULL;
+ else
+ ro->ro_rt = (struct rtentry *)rtl;
/*
* Route there and interface still up?
@@ -140,9 +158,10 @@
} else {
IPSTAT_INC(ips_noroute);
IPSTAT_INC(ips_cantforward);
- if (rt)
- RTFREE(rt);
- icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
+ if (badport_bandlim(BANDLIM_ICMP_FWD_UNREACH) < 0)
+ m_freem(m);
+ else
+ icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
return NULL;
}
return dst;
@@ -167,6 +186,7 @@
u_short sum, ip_len;
int error = 0;
int hlen, mtu;
+ struct rtlookup rtl;
#ifdef IPFIREWALL_FORWARD
struct m_tag *fwd_tag;
#endif
@@ -299,8 +319,11 @@
if (ip_doopts == 1)
return m;
else if (ip_doopts == 2) {
- icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_FILTER_PROHIB,
- 0, 0);
+ if (badport_bandlim(BANDLIM_ICMP_FWD_FILTER) < 0)
+ m_freem(m);
+ else
+ icmp_error(m, ICMP_UNREACH,
+ ICMP_UNREACH_FILTER_PROHIB, 0, 0);
return NULL; /* mbuf already free'd */
}
/* else ignore IP options and continue */
@@ -399,7 +422,11 @@
if (!V_ipstealth) {
#endif
if (ip->ip_ttl <= IPTTLDEC) {
- icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0);
+ if (badport_bandlim(BANDLIM_ICMP_FWD_TIMXCEED) < 0)
+ m_freem(m);
+ else
+ icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
+ 0, 0);
return NULL; /* mbuf already free'd */
}
@@ -420,7 +447,13 @@
/*
* Find route to destination.
*/
- if ((dst = ip_findroute(&ro, dest, m)) == NULL)
+#ifdef RADIX_MPATH
+ if ((dst = ip_findroute(&ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
+ dest, m, &rtl)) == NULL)
+#else
+ if ((dst = ip_findroute(&ro,
+ dest, m, &rtl)) == NULL)
+#endif
return NULL; /* icmp unreach already sent */
ifp = ro.ro_rt->rt_ifp;
@@ -476,8 +509,6 @@
* "ours"-label.
*/
m->m_flags |= M_FASTFWD_OURS;
- if (ro.ro_rt)
- RTFREE(ro.ro_rt);
return m;
}
/*
@@ -490,8 +521,7 @@
m_tag_delete(m, fwd_tag);
}
#endif /* IPFIREWALL_FORWARD */
- RTFREE(ro.ro_rt);
- if ((dst = ip_findroute(&ro, dest, m)) == NULL)
+ if ((dst = ip_findroute(&ro, dest, m, &rtl)) == NULL)
return NULL; /* icmp unreach already sent */
ifp = ro.ro_rt->rt_ifp;
}
@@ -507,6 +537,8 @@
if ((ro.ro_rt->rt_flags & RTF_REJECT) &&
(ro.ro_rt->rt_rmx.rmx_expire == 0 ||
time_uptime < ro.ro_rt->rt_rmx.rmx_expire)) {
+ if (badport_bandlim(BANDLIM_ICMP_FWD_UNREACH) < 0)
+ goto drop;
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
goto consumed;
}
@@ -527,6 +559,8 @@
* Check if media link state of interface is not down
*/
if (ifp->if_link_state == LINK_STATE_DOWN) {
+ if (badport_bandlim(BANDLIM_ICMP_FWD_UNREACH) < 0)
+ goto drop;
icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
goto consumed;
}
@@ -557,8 +591,9 @@
*/
if (ip->ip_off & IP_DF) {
IPSTAT_INC(ips_cantfrag);
- icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
- 0, mtu);
+ if (badport_bandlim(BANDLIM_ICMP_FWD_NEEDFRAG) < 0)
+ goto drop;
+ icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, mtu);
goto consumed;
} else {
/*
@@ -606,12 +641,9 @@
IPSTAT_INC(ips_fastforward);
}
consumed:
- RTFREE(ro.ro_rt);
return NULL;
drop:
if (m)
m_freem(m);
- if (ro.ro_rt)
- RTFREE(ro.ro_rt);
return NULL;
}
diff -u -r ../src_org_8.2_20110329/sys/netinet/ip_icmp.c ./sys/netinet/ip_icmp.c
--- ../src_org_8.2_20110329/sys/netinet/ip_icmp.c 2010-09-21 22:33:30.000000000 +0000
+++ ./sys/netinet/ip_icmp.c 2011-04-04 23:01:57.000000000 +0000
@@ -958,7 +958,11 @@
{ "icmp tstamp response" },
{ "closed port RST response" },
{ "open port RST response" },
- { "icmp6 unreach response" }
+ { "icmp6 unreach response" },
+ { "forwarding: limit unreachable" },
+ { "forwarding: limit time-exceeded" },
+ { "forwarding: limit need-frag" },
+ { "forwarding: limit admin-prohib" }
};
/*
diff -u -r ../src_org_8.2_20110329/sys/netinet/ip_input.c ./sys/netinet/ip_input.c
--- ../src_org_8.2_20110329/sys/netinet/ip_input.c 2011-03-28 15:26:52.000000000 +0000
+++ ./sys/netinet/ip_input.c 2011-04-05 01:14:00.000000000 +0000
@@ -71,6 +71,7 @@
#include <netinet/ip_var.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_icmp.h>
+#include <netinet/icmp_var.h>
#include <netinet/ip_options.h>
#include <machine/in_cksum.h>
#include <netinet/ip_carp.h>
@@ -1348,20 +1349,22 @@
struct route sro;
struct sockaddr_in *sin;
struct in_ifaddr *ia;
+ struct sockaddr_in *lu_dst;
+ struct rtlookup rtl;
bzero(&sro, sizeof(sro));
sin = (struct sockaddr_in *)&sro.ro_dst;
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
sin->sin_addr = dst;
- in_rtalloc_ign(&sro, 0, fibnum);
-
- if (sro.ro_rt == NULL)
+ lu_dst = (struct sockaddr_in *)&sro.ro_dst;
+ rtl.rt_gateway = (struct sockaddr *)lu_dst;
+ if (!rtlookup_fib( (struct sockaddr *)lu_dst,
+ fibnum, &rtl, 0))
return (NULL);
- ia = ifatoia(sro.ro_rt->rt_ifa);
+ ia = ifatoia(rtl.rt_ifa);
ifa_ref(&ia->ia_ifa);
- RTFREE(sro.ro_rt);
return (ia);
}
@@ -1397,6 +1400,9 @@
struct in_addr dest;
struct route ro;
int error, type = 0, code = 0, mtu = 0;
+ struct rtlookup rtl;
+ struct sockaddr_in *dst;
+ int icmp_send = 0;
if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
IPSTAT_INC(ips_cantforward);
@@ -1407,8 +1413,11 @@
if (!V_ipstealth) {
#endif
if (ip->ip_ttl <= IPTTLDEC) {
- icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
- 0, 0);
+ if (badport_bandlim(BANDLIM_ICMP_FWD_TIMXCEED) < 0)
+ m_freem(m);
+ else
+ icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
+ 0, 0);
return;
}
#ifdef IPSTEALTH
@@ -1423,7 +1432,10 @@
* ip_output in case of outgoing IPsec policy.
*/
if (!srcrt && ia == NULL) {
- icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
+ if (badport_bandlim(BANDLIM_ICMP_FWD_UNREACH) < 0)
+ m_freem(m);
+ else
+ icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
return;
}
#endif
@@ -1488,7 +1500,13 @@
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
sin->sin_addr = ip->ip_dst;
- in_rtalloc_ign(&ro, 0, M_GETFIB(m));
+ dst = (struct sockaddr_in *)&ro.ro_dst;
+ rtl.rt_gateway = (struct sockaddr *)dst;
+ if (!rtlookup_fib( (struct sockaddr *)dst,
+ M_GETFIB(m), &rtl, 0))
+ ro.ro_rt = NULL;
+ else
+ ro.ro_rt = (struct rtentry *)&rtl;
rt = ro.ro_rt;
@@ -1508,8 +1526,6 @@
code = ICMP_REDIRECT_HOST;
}
}
- if (rt)
- RTFREE(rt);
}
/*
@@ -1522,8 +1538,6 @@
if (error == EMSGSIZE && ro.ro_rt)
mtu = ro.ro_rt->rt_rmx.rmx_mtu;
- if (ro.ro_rt)
- RTFREE(ro.ro_rt);
if (error)
IPSTAT_INC(ips_cantforward);
@@ -1558,11 +1572,13 @@
default:
type = ICMP_UNREACH;
code = ICMP_UNREACH_HOST;
+ icmp_send = badport_bandlim( BANDLIM_ICMP_FWD_UNREACH);
break;
case EMSGSIZE:
type = ICMP_UNREACH;
code = ICMP_UNREACH_NEEDFRAG;
+ icmp_send = badport_bandlim( BANDLIM_ICMP_FWD_NEEDFRAG);
#ifdef IPSEC
/*
@@ -1618,7 +1634,10 @@
}
if (ia != NULL)
ifa_free(&ia->ia_ifa);
- icmp_error(mcopy, type, code, dest.s_addr, mtu);
+ if (icmp_send < 0)
+ m_freem(mcopy); /* rate-limited: free the copy icmp_error() would have consumed */
+ else
+ icmp_error(mcopy, type, code, dest.s_addr, mtu);
}
void
diff -u -r ../src_org_8.2_20110329/sys/netinet/ip_output.c ./sys/netinet/ip_output.c
--- ../src_org_8.2_20110329/sys/netinet/ip_output.c 2010-10-25 13:16:11.000000000 +0000
+++ ./sys/netinet/ip_output.c 2011-04-05 01:15:32.000000000 +0000
@@ -128,6 +128,7 @@
struct in_ifaddr *ia = NULL;
int isbroadcast, sw_csum;
struct route iproute;
+ struct rtlookup rtl;
struct rtentry *rte; /* cache for ro->ro_rt */
struct in_addr odst;
#ifdef IPFIREWALL_FORWARD
@@ -271,16 +272,24 @@
* operation (as it is for ARP).
*/
if (rte == NULL) {
+ rtl.rt_gateway = (struct sockaddr *)dst;
#ifdef RADIX_MPATH
- rtalloc_mpath_fib(ro,
- ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
- inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
+ if (!rtlookup_mpath_fib((struct sockaddr *)dst,
+ ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
+ inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m),
+ &rtl, RTL_PKSENT))
#else
- in_rtalloc_ign(ro, 0,
- inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
+ if (!rtlookup_fib( (struct sockaddr *)dst,
+ inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m),
+ &rtl, RTL_PKSENT))
#endif
- rte = ro->ro_rt;
+ ro->ro_rt = NULL;
+ else {
+ nortfree = 1;
+ ro->ro_rt = (struct rtentry *)&rtl;
+ }
}
+ rte = ro->ro_rt;
if (rte == NULL ||
rte->rt_ifp == NULL ||
!RT_LINK_IS_UP(rte->rt_ifp)) {
diff -u -r ../src_org_8.2_20110329/sys/netinet/ipfw/ip_fw_table.c ./sys/netinet/ipfw/ip_fw_table.c
--- ../src_org_8.2_20110329/sys/netinet/ipfw/ip_fw_table.c 2010-03-23 09:58:59.000000000 +0000
+++ ./sys/netinet/ipfw/ip_fw_table.c 2011-04-03 16:07:57.000000000 +0000
@@ -137,7 +137,7 @@
mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr;
IPFW_WLOCK(ch);
- ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh);
+ ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh, NULL);
if (ent == NULL) {
IPFW_WUNLOCK(ch);
return (ESRCH);
@@ -154,7 +154,7 @@
struct table_entry *ent;
ent = (struct table_entry *)
- rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
+ rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh, NULL);
if (ent != NULL)
free(ent, M_IPFW_TBL);
return (0);
diff -u -r ../src_org_8.2_20110329/sys/netinet/raw_ip.c ./sys/netinet/raw_ip.c
--- ../src_org_8.2_20110329/sys/netinet/raw_ip.c 2011-04-02 14:45:13.000000000 +0000
+++ ./sys/netinet/raw_ip.c 2011-04-03 16:07:57.000000000 +0000
@@ -755,6 +755,8 @@
if (err == 0)
ia->ia_flags |= IFA_ROUTE;
err = ifa_add_loopback_route((struct ifaddr *)ia, sa);
+ if (err == 0)
+ ia->ia_flags |= IFA_RTSELF;
ifa_free(&ia->ia_ifa);
break;
}
diff -u -r ../src_org_8.2_20110329/sys/netinet6/in6_ifattach.c ./sys/netinet6/in6_ifattach.c
--- ../src_org_8.2_20110329/sys/netinet6/in6_ifattach.c 2010-05-06 06:44:19.000000000 +0000
+++ ./sys/netinet6/in6_ifattach.c 2011-04-03 16:07:57.000000000 +0000
@@ -42,6 +42,8 @@
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/md5.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
#include <net/if.h>
#include <net/if_dl.h>
diff -u -r ../src_org_8.2_20110329/sys/netinet6/in6_rmx.c ./sys/netinet6/in6_rmx.c
--- ../src_org_8.2_20110329/sys/netinet6/in6_rmx.c 2010-10-11 11:25:37.000000000 +0000
+++ ./sys/netinet6/in6_rmx.c 2011-04-03 16:07:57.000000000 +0000
@@ -87,6 +87,7 @@
#include <sys/rwlock.h>
#include <sys/syslog.h>
#include <sys/callout.h>
+#include <sys/rmlock.h>
#include <net/if.h>
#include <net/route.h>
diff -u -r ../src_org_8.2_20110329/sys/netinet6/in6_src.c ./sys/netinet6/in6_src.c
--- ../src_org_8.2_20110329/sys/netinet6/in6_src.c 2010-05-06 06:44:19.000000000 +0000
+++ ./sys/netinet6/in6_src.c 2011-04-05 01:14:24.000000000 +0000
@@ -796,15 +796,253 @@
/*
* clone - meaningful only for bsdi and freebsd
+ * XXX remove and do lookup direct in ip6_output
*/
int
in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
- struct ip6_moptions *mopts, struct route_in6 *ro,
+ struct ip6_moptions *mopts, struct route_in6 *ro, struct rtlookup *rtl,
struct ifnet **retifp, struct rtentry **retrt)
{
+ int error = 0;
+ struct ifnet *ifp = NULL;
+ struct rtentry *rt = NULL;
+ struct sockaddr_in6 *sin6_next;
+ struct in6_pktinfo *pi = NULL;
+ struct in6_addr *dst = &dstsock->sin6_addr;
+ struct sockaddr_in6 *lu_dst;
+ int norouteok = 0;
+#if 0
+ char ip6buf[INET6_ADDRSTRLEN];
+
+ if (dstsock->sin6_addr.s6_addr32[0] == 0 &&
+ dstsock->sin6_addr.s6_addr32[1] == 0 &&
+ !IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) {
+ printf("in6_selectroute: strange destination %s\n",
+ ip6_sprintf(ip6buf, &dstsock->sin6_addr));
+ } else {
+ printf("in6_selectroute: destination = %s%%%d\n",
+ ip6_sprintf(ip6buf, &dstsock->sin6_addr),
+ dstsock->sin6_scope_id); /* for debug */
+ }
+#endif
+
+ /* If the caller specify the outgoing interface explicitly, use it. */
+ if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) {
+ /* XXX boundary check is assumed to be already done. */
+ ifp = ifnet_byindex(pi->ipi6_ifindex);
+ if (ifp != NULL &&
+ (norouteok || retrt == NULL ||
+ IN6_IS_ADDR_MULTICAST(dst))) {
+ /*
+ * we do not have to check or get the route for
+ * multicast.
+ */
+ goto done;
+ } else
+ goto getroute;
+ }
+
+ /*
+ * If the destination address is a multicast address and the outgoing
+ * interface for the address is specified by the caller, use it.
+ */
+ if (IN6_IS_ADDR_MULTICAST(dst) &&
+ mopts != NULL && (ifp = mopts->im6o_multicast_ifp) != NULL) {
+ goto done; /* we do not need a route for multicast. */
+ }
+
+ getroute:
+ /*
+ * If the next hop address for the packet is specified by the caller,
+ * use it as the gateway.
+ */
+ if (opts && opts->ip6po_nexthop) {
+ struct route_in6 *ron;
+ struct llentry *la;
+
+ sin6_next = satosin6(opts->ip6po_nexthop);
+
+ /* at this moment, we only support AF_INET6 next hops */
+ if (sin6_next->sin6_family != AF_INET6) {
+ error = EAFNOSUPPORT; /* or should we proceed? */
+ goto done;
+ }
+
+ /*
+ * If the next hop is an IPv6 address, then the node identified
+ * by that address must be a neighbor of the sending host.
+ */
+ ron = &opts->ip6po_nextroute;
+ /*
+ * XXX what do we do here?
+ * PLZ to be fixing
+ */
+
+ if (ron->ro_rt == NULL) {
+ lu_dst = (struct sockaddr_in6 *)&ron->ro_dst;
+ rtl->rt_gateway = (struct sockaddr *)lu_dst;
+ if (!rtlookup_fib( (struct sockaddr *)lu_dst, 0U,
+ rtl, RTL_PKSENT)) {
+ ron->ro_rt = NULL;
+ error = EHOSTUNREACH;
+ goto done;
+ } else
+ ron->ro_rt = (struct rtentry *) rtl;
+ }
+
+ rt = ron->ro_rt;
+ ifp = rt->rt_ifp;
+ IF_AFDATA_LOCK(ifp);
+ la = lla_lookup(LLTABLE6(ifp), 0, (struct sockaddr *)&sin6_next->sin6_addr);
+ IF_AFDATA_UNLOCK(ifp);
+ if (la != NULL)
+ LLE_RUNLOCK(la);
+ else {
+ error = EHOSTUNREACH;
+ goto done;
+ }
+#if 0
+ if ((ron->ro_rt &&
+ (ron->ro_rt->rt_flags & (RTF_UP | RTF_LLINFO)) !=
+ (RTF_UP | RTF_LLINFO)) ||
+ !IN6_ARE_ADDR_EQUAL(&satosin6(&ron->ro_dst)->sin6_addr,
+ &sin6_next->sin6_addr)) {
+ if (ron->ro_rt)
+ ron->ro_rt = NULL;
+ *satosin6(&ron->ro_dst) = *sin6_next;
+ }
+ if (ron->ro_rt == NULL) {
+ lu_dst = (struct sockaddr_in6 *)&ron->ro_dst;
+ rtl->rt_gateway = (struct sockaddr *)lu_dst;
+ if (!rtlookup_fib( (struct sockaddr *)lu_dst, 0U,
+ rtl, RTL_PKSENT)) {
+ ron->ro_rt = NULL;
+ error = EHOSTUNREACH;
+ goto done;
+ } else {
+ ron->ro_rt = (struct rtentry *) rtl;
+ if (!(ron->ro_rt->rt_flags & RTF_LLINFO)) {
+ ron->ro_rt = NULL;
+ error = EHOSTUNREACH;
+ goto done;
+ }
+ }
+ }
+#endif
+
+ /*
+ * When cloning is required, try to allocate a route to the
+ * destination so that the caller can store path MTU
+ * information.
+ */
+ goto done;
+ }
+
+ /*
+ * Use a cached route if it exists and is valid, else try to allocate
+ * a new one. Note that we should check the address family of the
+ * cached destination, in case of sharing the cache with IPv4.
+ */
+ if (ro) {
+ if (ro->ro_rt &&
+ (!(ro->ro_rt->rt_flags & RTF_UP) ||
+ ((struct sockaddr *)(&ro->ro_dst))->sa_family != AF_INET6 ||
+ !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr,
+ dst)))
+ ro->ro_rt = (struct rtentry *)NULL;
+ if (ro->ro_rt == (struct rtentry *)NULL) {
+ struct sockaddr_in6 *sa6;
+
+ /* No route yet, so try to acquire one */
+ bzero(&ro->ro_dst, sizeof(struct sockaddr_in6));
+ sa6 = (struct sockaddr_in6 *)&ro->ro_dst;
+ *sa6 = *dstsock;
+ sa6->sin6_scope_id = 0;
+
+ lu_dst = (struct sockaddr_in6 *)&ro->ro_dst;
+ rtl->rt_gateway = (struct sockaddr *)lu_dst;
+#ifdef RADIX_MPATH
+ if (!rtlookup_mpath_fib((struct sockaddr *)lu_dst,
+ ntohl(sa6->sin6_addr.s6_addr32[3]),
+ 0U, rtl, RTL_PKSENT))
+#else
+ if (!rtlookup_fib((struct sockaddr *)lu_dst, 0U,
+ rtl, RTL_PKSENT))
+#endif
+ ro->ro_rt = NULL;
+ else
+ ro->ro_rt = (struct rtentry *) rtl;
+ }
+
+ /*
+ * do not care about the result if we have the nexthop
+ * explicitly specified.
+ */
+ if (opts && opts->ip6po_nexthop)
+ goto done;
+
+ if (ro->ro_rt) {
+ ifp = ro->ro_rt->rt_ifp;
+
+ if (ifp == NULL) { /* can this really happen? */
+ ro->ro_rt = NULL;
+ }
+ }
+ if (ro->ro_rt == NULL)
+ error = EHOSTUNREACH;
+ rt = ro->ro_rt;
+
+ /*
+ * Check if the outgoing interface conflicts with
+ * the interface specified by ipi6_ifindex (if specified).
+ * Note that loopback interface is always okay.
+ * (this may happen when we are sending a packet to one of
+ * our own addresses.)
+ */
+ if (ifp && opts && opts->ip6po_pktinfo &&
+ opts->ip6po_pktinfo->ipi6_ifindex) {
+ if (!(ifp->if_flags & IFF_LOOPBACK) &&
+ ifp->if_index !=
+ opts->ip6po_pktinfo->ipi6_ifindex) {
+ error = EHOSTUNREACH;
+ goto done;
+ }
+ }
+ }
+
+ done:
+ if (ifp == NULL && rt == NULL) {
+ /*
+ * This can happen if the caller did not pass a cached route
+ * nor any other hints. We treat this case an error.
+ */
+ error = EHOSTUNREACH;
+ }
+ if (error == EHOSTUNREACH)
+ V_ip6stat.ip6s_noroute++;
+
+ if (retifp != NULL) {
+ *retifp = ifp;
+
+ /*
+ * Adjust the "outgoing" interface. If we're going to loop
+ * the packet back to ourselves, the ifp would be the loopback
+ * interface. However, we'd rather know the interface associated
+ * to the destination address (which should probably be one of
+ * our own addresses.)
+ */
+ if (rt) {
+ if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) &&
+ (rt->rt_gateway->sa_family == AF_LINK))
+ *retifp =
+ ifnet_byindex(((struct sockaddr_dl *)
+ rt->rt_gateway)->sdl_index);
+ }
+ }
+ if (retrt != NULL)
+ *retrt = rt; /* rt may be NULL */
- return (selectroute(dstsock, opts, mopts, ro, retifp,
- retrt, 0));
+ return (error);
}
/*
diff -u -r ../src_org_8.2_20110329/sys/netinet6/ip6_forward.c ./sys/netinet6/ip6_forward.c
--- ../src_org_8.2_20110329/sys/netinet6/ip6_forward.c 2010-02-07 09:00:22.000000000 +0000
+++ ./sys/netinet6/ip6_forward.c 2011-04-05 01:14:58.000000000 +0000
@@ -99,6 +99,7 @@
struct ifnet *origifp; /* maybe unnecessary */
u_int32_t inzone, outzone;
struct in6_addr src_in6, dst_in6;
+ struct rtlookup rtl;
#ifdef IPSEC
struct secpolicy *sp = NULL;
int ipsecrt = 0;
@@ -352,18 +353,27 @@
dst->sin6_family = AF_INET6;
dst->sin6_addr = ip6->ip6_dst;
- rin6.ro_rt = rtalloc1((struct sockaddr *)dst, 0, 0);
- if (rin6.ro_rt != NULL)
- RT_UNLOCK(rin6.ro_rt);
- else {
+ rtl.rt_gateway = (struct sockaddr *)dst;
+#ifdef RADIX_MPATH
+ src_in6 = ip6->ip6_src;
+ dst_in6 = ip6->ip6_dst;
+ if (!rtlookup_mpath_fib((struct sockaddr *)dst,
+ ntohl(src_in6.s6_addr32[3] ^ dst_in6.s6_addr32[3]), /* both are struct in6_addr values */
+ 0U, &rtl, RTL_PKSENT)) {
+#else
+ if (!rtlookup_fib( (struct sockaddr *)dst, 0U, &rtl,
+ RTL_PKSENT)) {
+#endif
+ rin6.ro_rt = NULL;
V_ip6stat.ip6s_noroute++;
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute);
if (mcopy) {
icmp6_error(mcopy, ICMP6_DST_UNREACH,
- ICMP6_DST_UNREACH_NOROUTE, 0);
+ ICMP6_DST_UNREACH_NOROUTE, 0);
}
goto bad;
- }
+ } else
+ rin6.ro_rt = (struct rtentry *) &rtl;
rt = rin6.ro_rt;
#ifdef IPSEC
skip_routing:
@@ -580,12 +590,12 @@
senderr:
if (mcopy == NULL)
- goto out;
+ return;
switch (error) {
case 0:
if (type == ND_REDIRECT) {
icmp6_redirect_output(mcopy, rt);
- goto out;
+ return;
}
goto freecopy;
diff -u -r ../src_org_8.2_20110329/sys/netinet6/ip6_output.c ./sys/netinet6/ip6_output.c
--- ../src_org_8.2_20110329/sys/netinet6/ip6_output.c 2010-10-25 13:16:11.000000000 +0000
+++ ./sys/netinet6/ip6_output.c 2011-04-03 16:07:57.000000000 +0000
@@ -200,7 +200,7 @@
int hlen, tlen, len, off;
struct route_in6 ip6route;
struct rtentry *rt = NULL;
- struct sockaddr_in6 *dst, src_sa, dst_sa;
+ struct sockaddr_in6 *dst, src_sa, dst_sa, dst_lookup;
struct in6_addr odst;
int error = 0;
struct in6_ifaddr *ia = NULL;
@@ -213,6 +213,7 @@
struct route_in6 *ro_pmtu = NULL;
int hdrsplit = 0;
int needipsec = 0;
+ struct rtlookup rtl;
#ifdef SCTP
int sw_csum;
#endif
@@ -572,11 +573,11 @@
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
- bzero(&dst_sa, sizeof(dst_sa));
- dst_sa.sin6_family = AF_INET6;
- dst_sa.sin6_len = sizeof(dst_sa);
- dst_sa.sin6_addr = ip6->ip6_dst;
- if ((error = in6_selectroute(&dst_sa, opt, im6o, ro,
+ bzero(&dst_lookup, sizeof(dst_lookup));
+ dst_lookup.sin6_family = AF_INET6;
+ dst_lookup.sin6_len = sizeof(dst_lookup);
+ dst_lookup.sin6_addr = ip6->ip6_dst;
+ if ((error = in6_selectroute(&dst_lookup, opt, im6o, ro, &rtl,
&ifp, &rt)) != 0) {
switch (error) {
case EHOSTUNREACH:
@@ -595,7 +596,7 @@
* If in6_selectroute() does not return a route entry,
* dst may not have been updated.
*/
- *dst = dst_sa; /* XXX */
+ *dst = dst_lookup; /* XXX */
}
/*
@@ -1071,11 +1072,6 @@
V_ip6stat.ip6s_fragmented++;
done:
- if (ro == &ip6route && ro->ro_rt) { /* brace necessary for RTFREE */
- RTFREE(ro->ro_rt);
- } else if (ro_pmtu == &ip6route && ro_pmtu->ro_rt) {
- RTFREE(ro_pmtu->ro_rt);
- }
#ifdef IPSEC
if (sp != NULL)
KEY_FREESP(&sp);
diff -u -r ../src_org_8.2_20110329/sys/netinet6/ip6_var.h ./sys/netinet6/ip6_var.h
--- ../src_org_8.2_20110329/sys/netinet6/ip6_var.h 2010-09-09 06:43:18.000000000 +0000
+++ ./sys/netinet6/ip6_var.h 2011-04-03 16:07:57.000000000 +0000
@@ -431,12 +431,13 @@
int dest6_input __P((struct mbuf **, int *, int));
int none_input __P((struct mbuf **, int *, int));
+#include <net/route.h>
int in6_selectsrc(struct sockaddr_in6 *, struct ip6_pktopts *,
struct inpcb *inp, struct route_in6 *, struct ucred *cred,
struct ifnet **, struct in6_addr *);
int in6_selectroute __P((struct sockaddr_in6 *, struct ip6_pktopts *,
- struct ip6_moptions *, struct route_in6 *, struct ifnet **,
- struct rtentry **));
+ struct ip6_moptions *, struct route_in6 *, struct rtlookup *,
+ struct ifnet **, struct rtentry **));
u_int32_t ip6_randomid __P((void));
u_int32_t ip6_randomflowlabel __P((void));
#endif /* _KERNEL */
diff -u -r ../src_org_8.2_20110329/sys/netinet6/nd6_rtr.c ./sys/netinet6/nd6_rtr.c
--- ../src_org_8.2_20110329/sys/netinet6/nd6_rtr.c 2010-05-06 06:44:19.000000000 +0000
+++ ./sys/netinet6/nd6_rtr.c 2011-04-03 16:07:57.000000000 +0000
@@ -48,6 +48,8 @@
#include <sys/rwlock.h>
#include <sys/syslog.h>
#include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
#include <net/if.h>
#include <net/if_types.h>
More information about the freebsd-net
mailing list