svn commit: r287045 - in projects/routing/sys: net netinet
Alexander V. Chernikov
melifaro at FreeBSD.org
Sun Aug 23 18:13:31 UTC 2015
Author: melifaro
Date: Sun Aug 23 18:13:27 2015
New Revision: 287045
URL: https://svnweb.freebsd.org/changeset/base/287045
Log:
Switch IPv4 output path to use new routing api.
The goal of the new API is to provide consumers with the minimal
needed information, but as fast as possible. So we provide
full nexthop info copied into an aligned on-cache structure
instead of rte/ia pointers, their refcounts and locks.
This does not provide a solution for protecting from egress
ifp destruction, but does not make it any worse.
Current changes:
nhops:
Add fib4_lookup_prepend() function which stores either full
L2+L3 prepend info (e.g. MAC header in case of plain IPv4) or
L3 info with NH_FLAGS_L2_INCOMPLETE flag indicating that no valid L2
info exists and we have to take "slow" path.
ip_output:
Currently ip[ 46]_output consumers use 'struct route' for
the following purposes:
1) double lookup avoidance(route caching)
2) plain route caching
3) get path MTU to be able to notify source.
The first pattern is mostly used by various tunnels
(gif, gre, stf). (Actually, gre is the only one remaining,
the others were already converted. Their locking model did
not scale well enough to benefit from such caching, so
we have (temporarily) removed it without any performance
loss).
Plain route caching used by SCTP is simply wrong and should be removed.
Temporarily break it for now just to be able to compile.
Optimize path MTU reporting by providing it in the new 'route_info' structure.
Minimize games with @ia locking/refcounting for route lookup:
add a special nhop[46]_extended structure to store more route attributes.
A pointer to the given structure can be passed to fib4_lookup_prepend() to indicate
we want this info (we actually need it for UDP and raw IP).
ether_output:
Provide light-weight ether_output2() call to deal with
transmitting L2 frame (e.g. properly handle broadcast/simloop/bridge/
other L2 hooks before actually transmitting frame by if_transmit()).
Add a hack based on the new RT_NHOP ro_flag to distinguish which version we
should call. A better way is probably to add a new "if_output_frame" driver
callback.
Next steps:
* Convert ip_fastfwd part
* Implement auto-growing array for per-radix nexthops
* Implement LLE tracking for nexthop calculations to be able to
immediately provide all necessary info in single route lookup
for gateway routes
* Switch radix locking scheme to runtime/cfg lock
* Implement multipath support for rtsock
* Implement "tracked nexthops" for tunnels (e.g. _proper_
nexthop caching)
* Add IPv6 support for remaining parts (postponed not to
interfere with user/ae/inet6 branch)
* Consider adding "if_output_frame" driver call to
ease logical frame pushing.
Modified:
projects/routing/sys/net/if_ethersubr.c
projects/routing/sys/net/route.h
projects/routing/sys/net/rt_nhops.c
projects/routing/sys/net/rt_nhops.h
projects/routing/sys/netinet/if_ether.c
projects/routing/sys/netinet/if_ether.h
projects/routing/sys/netinet/ip_input.c
projects/routing/sys/netinet/ip_output.c
projects/routing/sys/netinet/ip_var.h
projects/routing/sys/netinet/sctp_os_bsd.h
projects/routing/sys/netinet/tcp_output.c
Modified: projects/routing/sys/net/if_ethersubr.c
==============================================================================
--- projects/routing/sys/net/if_ethersubr.c Sun Aug 23 18:12:11 2015 (r287044)
+++ projects/routing/sys/net/if_ethersubr.c Sun Aug 23 18:13:27 2015 (r287045)
@@ -78,6 +78,7 @@
#ifdef INET6
#include <netinet6/nd6.h>
#endif
+#include <net/rt_nhops.h>
#include <security/mac/mac_framework.h>
#ifdef CTASSERT
@@ -114,6 +115,14 @@ static int ether_resolvemulti(struct ifn
static void ether_reassign(struct ifnet *, struct vnet *, char *);
#endif
+int ether_output_full(struct ifnet *ifp, struct mbuf *m,
+ const struct sockaddr *dst, struct route *ro);
+int ether_output2(struct ifnet *ifp, struct mbuf *m, struct nhop_data *nh,
+ int af);
+
+static int loopback_frame(struct ifnet *ifp, struct mbuf *m, int family,
+ int hlen);
+
#define ETHER_IS_BROADCAST(addr) \
(bcmp(etherbroadcastaddr, (addr), ETHER_ADDR_LEN) == 0)
@@ -135,6 +144,17 @@ update_mbuf_csumflags(struct mbuf *src,
dst->m_pkthdr.csum_data = 0xffff;
}
+int
+ether_output(struct ifnet *ifp, struct mbuf *m,
+ const struct sockaddr *dst, struct route *ro)
+{
+ if (ro != NULL && (ro->ro_flags & RT_NHOP))
+ return (ether_output2(ifp, m, (struct nhop_data *)ro->ro_lle,
+ (ro->ro_flags >> 8) & 0xFF));
+
+ return (ether_output_full(ifp, m, dst, ro));
+}
+
/*
* Ethernet output routine.
* Encapsulate a packet of type family for the local net.
@@ -142,7 +162,7 @@ update_mbuf_csumflags(struct mbuf *src,
* packet leaves a multiple of 512 bytes of data in remainder.
*/
int
-ether_output(struct ifnet *ifp, struct mbuf *m,
+ether_output_full(struct ifnet *ifp, struct mbuf *m,
const struct sockaddr *dst, struct route *ro)
{
short type;
@@ -281,31 +301,11 @@ ether_output(struct ifnet *ifp, struct m
*/
if ((ifp->if_flags & IFF_SIMPLEX) && loop_copy &&
((t = pf_find_mtag(m)) == NULL || !t->routed)) {
- if (m->m_flags & M_BCAST) {
- struct mbuf *n;
-
- /*
- * Because if_simloop() modifies the packet, we need a
- * writable copy through m_dup() instead of a readonly
- * one as m_copy[m] would give us. The alternative would
- * be to modify if_simloop() to handle the readonly mbuf,
- * but performancewise it is mostly equivalent (trading
- * extra data copying vs. extra locking).
- *
- * XXX This is a local workaround. A number of less
- * often used kernel parts suffer from the same bug.
- * See PR kern/105943 for a proposed general solution.
- */
- if ((n = m_dup(m, M_NOWAIT)) != NULL) {
- update_mbuf_csumflags(m, n);
- (void)if_simloop(ifp, n, dst->sa_family, hlen);
- } else
- if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
- } else if (bcmp(eh->ether_dhost, eh->ether_shost,
- ETHER_ADDR_LEN) == 0) {
- update_mbuf_csumflags(m, m);
- (void) if_simloop(ifp, m, dst->sa_family, hlen);
- return (0); /* XXX */
+ if ((m->m_flags & M_BCAST) || (bcmp(eh->ether_dhost,
+ eh->ether_shost, ETHER_ADDR_LEN) == 0)) {
+ /* Either broadcast or to-us L2 header */
+ if (loopback_frame(ifp, m, dst->sa_family, hlen) == 1)
+ return (0);
}
}
@@ -341,6 +341,112 @@ bad: if (m != NULL)
}
/*
+ * We assume this function to be called for
+ * ip[6]_output(), with already pre-compiled L2 header.
+ *
+ * Function assumes all loopback routing is already done on L3,
+ * so the only reason to push packet (copy) to host is M_BCAST flag.
+ */
+int
+ether_output2(struct ifnet *ifp, struct mbuf *m, struct nhop_data *nh, int af)
+{
+ int error;
+
+#ifdef MAC
+ error = mac_ifnet_check_transmit(ifp, m);
+ if (error)
+ senderr(error);
+#endif
+
+ M_PROFILE(m);
+ if (ifp->if_flags & IFF_MONITOR)
+ senderr(ENETDOWN);
+ if (!((ifp->if_flags & IFF_UP) &&
+ (ifp->if_drv_flags & IFF_DRV_RUNNING)))
+ senderr(ENETDOWN);
+
+ if ((ifp->if_flags & IFF_SIMPLEX) && (m->m_flags & M_BCAST)) {
+ /* We have to copy frame to-us */
+ if (loopback_frame(NH_LIFP(nh), m, af, nh->nh_count) != 0)
+ return (0);
+ }
+
+ /*
+ * Bridges require special output handling.
+ */
+ if (ifp->if_bridge) {
+ BRIDGE_OUTPUT(ifp, m, error);
+ return (error);
+ }
+
+#if defined(INET) || defined(INET6)
+ if (ifp->if_carp) {
+ struct sockaddr_in dst;
+ memset(&dst, 0, sizeof(dst));
+ //dst.sin_addr =
+ error = (*carp_output_p)(ifp, m,
+ (const struct sockaddr *)&dst);
+ if (error != 0)
+ goto bad;
+ }
+#endif
+
+ /* Handle ng_ether(4) processing, if any */
+ if (ifp->if_l2com != NULL) {
+ KASSERT(ng_ether_output_p != NULL,
+ ("ng_ether_output_p is NULL"));
+ if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) {
+bad: if (m != NULL)
+ m_freem(m);
+ return (error);
+ }
+ if (m == NULL)
+ return (0);
+ }
+
+ /* Continue with link-layer output */
+ return (ether_output_frame(ifp, m));
+}
+
+static int
+loopback_frame(struct ifnet *ifp, struct mbuf *m, int family, int hlen)
+{
+ struct ether_header *eh;
+
+ if (m->m_flags & M_BCAST) {
+ struct mbuf *n;
+
+ /*
+ * Because if_simloop() modifies the packet, we need a
+ * writable copy through m_dup() instead of a readonly
+ * one as m_copy[m] would give us. The alternative would
+ * be to modify if_simloop() to handle the readonly mbuf,
+ * but performancewise it is mostly equivalent (trading
+ * extra data copying vs. extra locking).
+ *
+ * XXX This is a local workaround. A number of less
+ * often used kernel parts suffer from the same bug.
+ * See PR kern/105943 for a proposed general solution.
+ */
+ if ((n = m_dup(m, M_NOWAIT)) != NULL) {
+ update_mbuf_csumflags(m, n);
+ if_simloop(ifp, n, family, hlen);
+ } else
+ if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
+ } else {
+ eh = mtod(m, struct ether_header *);
+ if (bcmp(eh->ether_dhost, eh->ether_shost,
+ ETHER_ADDR_LEN) == 0) {
+ update_mbuf_csumflags(m, m);
+ if_simloop(ifp, m, family, hlen);
+ return (1);
+ }
+ }
+
+ return (0);
+}
+
+/*
* Ethernet link layer output routine to send a raw frame to the device.
*
* This assumes that the 14 byte Ethernet header is present and contiguous
Modified: projects/routing/sys/net/route.h
==============================================================================
--- projects/routing/sys/net/route.h Sun Aug 23 18:12:11 2015 (r287044)
+++ projects/routing/sys/net/route.h Sun Aug 23 18:13:27 2015 (r287045)
@@ -59,6 +59,7 @@ struct route {
#define RT_CACHING_CONTEXT 0x1 /* XXX: not used anywhere */
#define RT_NORTREF 0x2 /* doesn't hold reference on ro_rt */
+#define RT_NHOP 0x4
struct rt_metrics {
u_long rmx_locks; /* Kernel must leave these values alone */
Modified: projects/routing/sys/net/rt_nhops.c
==============================================================================
--- projects/routing/sys/net/rt_nhops.c Sun Aug 23 18:12:11 2015 (r287044)
+++ projects/routing/sys/net/rt_nhops.c Sun Aug 23 18:13:27 2015 (r287045)
@@ -62,9 +62,13 @@
#endif
#include <netinet/in.h>
+#include <netinet/in_var.h>
#include <netinet/ip_mroute.h>
#include <netinet/ip6.h>
+#include <net/if_types.h>
+#include <netinet/if_ether.h>
+#include <net/ethernet.h>
#include <net/rt_nhops.h>
#include <vm/uma.h>
@@ -104,6 +108,18 @@ static struct rwlock fwd_lock;
int fwd_attach_fib(struct fwd_module *fm, u_int fib);
int fwd_destroy_fib(struct fwd_module *fm, u_int fib);
#endif
+
+#ifdef INET
+static void fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst,
+ struct nhop4_extended *pnh4);
+static void fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst,
+ struct nhop4_basic *pnh4);
+#endif
+#ifdef INET
+static void fib6_rte_to_nh_basic(struct rtentry *rte, struct in6_addr dst,
+ struct nhop6_basic *pnh6);
+#endif
+
MALLOC_DEFINE(M_RTFIB, "rtfib", "routing fwd");
@@ -132,14 +148,243 @@ MALLOC_DEFINE(M_RTFIB, "rtfib", "routing
#define NHOP_FLAGS_MASK (RTF_REJECT|RTF_BLACKHOLE)
//#define NHOP_DIRECT
#define RNTORT(p) ((struct rtentry *)(p))
+
+
+/*
+ * Copies proper nexthop data based on @nh_src nexthop.
+ *
+ * For non-ECMP nexthop function simply copies @nh_src.
+ * For ECMP nexthops flowid is used to select proper
+ * nexthop.
+ *
+ */
+static inline void
+fib_choose_prepend(uint32_t fibnum, struct nhop_data *nh_src,
+ uint32_t flowid, struct nhop_data *nh, int af)
+{
+ struct nhop_multi *nh_multi;
+ int idx;
+
+ if ((nh_src->nh_flags & NH_FLAGS_RECURSE) != 0) {
+
+ /*
+ * Recursive nexthop. Choose direct nexthop
+ * based on flowid.
+ */
+ nh_multi = (struct nhop_multi *)nh_src;
+ idx = nh_multi->nh_nhops[flowid % nh_multi->nh_count];
+#if 0
+ KASSERT((fibnum < rt_numfibs), ("fib4_lookup_prepend§: bad fibnum"));
+ rnh = rt_tables_get_rnh(fibnum, AF_INET);
+ //nh_src = &rnh->nhops[i];
+#endif
+ }
+
+ *nh = *nh_src;
+ /* TODO: Do some light-weight refcounting on egress ifp's */
+}
+
+static inline void
+fib_free_nh(uint32_t fibnum, struct nhop_data *nh, int af)
+{
+
+ /* TODO: Do some light-weight refcounting on egress ifp's */
+}
+
#ifdef INET
+void
+fib4_free_nh(uint32_t fibnum, struct nhop_data *nh)
+{
+
+ fib_free_nh(fibnum, nh, AF_INET);
+}
+
+void
+fib4_choose_prepend(uint32_t fibnum, struct nhop_data *nh_src,
+ uint32_t flowid, struct nhop_data *nh, struct nhop4_extended *nh_ext)
+{
+
+ fib_choose_prepend(fibnum, nh_src, flowid, nh, AF_INET);
+ if (nh_ext == NULL)
+ return;
+
+ nh_ext->nh_ifp = NH_LIFP(nh);
+ nh_ext->nh_mtu = nh->nh_mtu;
+ nh_ext->nh_flags = nh->nh_flags;
+#if 0
+ /* TODO: copy source/gw address from extended nexthop data */
+ nh_ext->nh_addr = ;
+ nh_ext->nh_src= ;
+#endif
+}
+
+/*
+ * Function performs lookup in IPv4 table fib @fibnum.
+ *
+ * In case of successful lookup @nh header is filled with
+ * appropriate interface info and full L2 header to prepend.
+ *
+ * If no valid ARP record is present, NH_FLAGS_L2_INCOMPLETE flag
+ * is set and gateway address is stored into nh->d.gw4
+ *
+ * If @nh_ext is not NULL, additional nexthop data is stored there.
+ *
+ * Returns 0 on success.
+ *
+ */
+int
+fib4_lookup_prepend(uint32_t fibnum, struct in_addr dst, struct mbuf *m,
+ struct nhop_data *nh, struct nhop4_extended *nh_ext)
+{
+ struct radix_node_head *rnh;
+ struct radix_node *rn;
+ struct sockaddr_in *gw_sa, sin;
+ struct ifnet *lifp;
+ struct in_addr gw;
+ struct ether_header *eh;
+ int error, flags;
+ //uint32_t flowid;
+ struct rtentry *rte;
+
+ KASSERT((fibnum < rt_numfibs), ("fib4_lookup_prepend: bad fibnum"));
+ rnh = rt_tables_get_rnh(fibnum, AF_INET);
+ if (rnh == NULL)
+ return (EHOSTUNREACH);
+
+ /* Prepare lookup key */
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_len = sizeof(struct sockaddr_in);
+ sin.sin_addr = dst;
+
+ RADIX_NODE_HEAD_RLOCK(rnh);
+ rn = rnh->rnh_matchaddr((void *)&sin, rnh);
+ rte = RNTORT(rn);
+ if (rn == NULL || ((rn->rn_flags & RNF_ROOT) != 0) ||
+ RT_LINK_IS_UP(rte->rt_ifp) == 0) {
+ RADIX_NODE_HEAD_RUNLOCK(rnh);
+ return (EHOSTUNREACH);
+ }
+
+ /*
+ * Currently we fill in @nh ourselves.
+ * In near future rte will have nhop index to copy from.
+ */
+
+ /* Calculate L3 info */
+ flags = 0;
+ nh->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
+ if (rte->rt_flags & RTF_GATEWAY) {
+ gw_sa = (struct sockaddr_in *)rte->rt_gateway;
+ gw = gw_sa->sin_addr;
+ } else
+ gw = dst;
+ /* Set flags */
+ flags = rte->rt_flags & NHOP_FLAGS_MASK;
+ gw_sa = (struct sockaddr_in *)rt_key(rte);
+ if (gw_sa->sin_addr.s_addr == 0)
+ flags |= NHOP_DEFAULT;
+
+ /*
+ * TODO: nh L2/L3 resolve.
+ * Currently all we have is rte ifp.
+ * Simply use it.
+ */
+ lifp = rte->rt_ifp;
+ /* Save both logical and transmit interface indexes */
+ nh->lifp_idx = lifp->if_index;
+ nh->i.ifp_idx = nh->lifp_idx;
+
+ if (nh_ext != NULL) {
+ /* Fill in extended info */
+ fib4_rte_to_nh_extended(rte, dst, nh_ext);
+ }
+
+ RADIX_NODE_HEAD_RUNLOCK(rnh);
+
+ nh->nh_flags = flags;
+ /*
+ * Try to lookup L2 info.
+ * Do this using separate LLE locks.
+ * TODO: move this under radix lock.
+ */
+ if (lifp->if_type == IFT_ETHER) {
+ eh = (struct ether_header *)nh->d.data;
+
+ /*
+ * Fill in ethernet header.
+ * It should be already presented if we're
+ * sending data via known gateway.
+ */
+ error = arpresolve_fast(lifp, gw, m->m_flags, eh->ether_dhost);
+ if (error == 0) {
+ memcpy(&eh->ether_shost, IF_LLADDR(lifp), ETHER_ADDR_LEN);
+ eh->ether_type = htons(ETHERTYPE_IP);
+ nh->nh_count = ETHER_HDR_LEN;
+ return (0);
+ }
+ }
+
+ /* Notify caller that no L2 info is linked */
+ nh->nh_count = 0;
+ nh->nh_flags |= NH_FLAGS_L2_INCOMPLETE;
+ /* ..And save gateway address */
+ nh->d.gw4 = gw;
+ return (0);
+}
+
+static void
+fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst,
+ struct nhop4_extended *pnh4)
+{
+ struct sockaddr_in *gw;
+ struct in_ifaddr *ia;
+
+ pnh4->nh_ifp = rte->rt_ifa->ifa_ifp;
+ pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
+ if (rte->rt_flags & RTF_GATEWAY) {
+ gw = (struct sockaddr_in *)rte->rt_gateway;
+ pnh4->nh_addr = gw->sin_addr;
+ } else
+ pnh4->nh_addr = dst;
+
+ ia = ifatoia(rte->rt_ifa);
+ pnh4->nh_src = IA_SIN(ia)->sin_addr;
+
+ /* Set flags */
+ pnh4->nh_flags = rte->rt_flags & NHOP_FLAGS_MASK;
+ gw = (struct sockaddr_in *)rt_key(rte);
+ if (gw->sin_addr.s_addr == 0)
+ pnh4->nh_flags |= NHOP_DEFAULT;
+}
+
+
+static void
+fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst,
+ struct nhop4_basic *pnh4)
+{
+ struct sockaddr_in *gw;
+
+ pnh4->nh_ifp = rte->rt_ifa->ifa_ifp;
+ pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
+ if (rte->rt_flags & RTF_GATEWAY) {
+ gw = (struct sockaddr_in *)rte->rt_gateway;
+ pnh4->nh_addr = gw->sin_addr;
+ } else
+ pnh4->nh_addr = dst;
+ /* Set flags */
+ pnh4->nh_flags = rte->rt_flags & NHOP_FLAGS_MASK;
+ gw = (struct sockaddr_in *)rt_key(rte);
+ if (gw->sin_addr.s_addr == 0)
+ pnh4->nh_flags |= NHOP_DEFAULT;
+}
+
int
fib4_lookup_nh_basic(uint32_t fibnum, struct in_addr dst, uint32_t flowid,
struct nhop4_basic *pnh4)
{
struct radix_node_head *rnh;
struct radix_node *rn;
- struct sockaddr_in *gw, sin;
+ struct sockaddr_in sin;
struct rtentry *rte;
KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_basic: bad fibnum"));
@@ -157,18 +402,7 @@ fib4_lookup_nh_basic(uint32_t fibnum, st
rte = RNTORT(rn);
/* Ensure route & ifp is UP */
if (RT_LINK_IS_UP(rte->rt_ifp)) {
- pnh4->nh_ifp = rte->rt_ifa->ifa_ifp;
- pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
- if (rte->rt_flags & RTF_GATEWAY) {
- gw = (struct sockaddr_in *)rte->rt_gateway;
- pnh4->nh_addr = gw->sin_addr;
- } else
- pnh4->nh_addr = dst;
- /* Set flags */
- pnh4->nh_flags = rte->rt_flags & NHOP_FLAGS_MASK;
- gw = (struct sockaddr_in *)rt_key(rte);
- if (gw->sin_addr.s_addr == 0)
- pnh4->nh_flags |= NHOP_DEFAULT;
+ fib4_rte_to_nh_basic(rte, dst, pnh4);
RADIX_NODE_HEAD_RUNLOCK(rnh);
return (0);
@@ -181,13 +415,59 @@ fib4_lookup_nh_basic(uint32_t fibnum, st
#endif
#ifdef INET6
+void
+fib6_free_nh(uint32_t fibnum, struct nhop_data *nh)
+{
+
+ fib_free_nh(fibnum, nh, AF_INET6);
+}
+
+void
+fib6_choose_prepend(uint32_t fibnum, struct nhop_data *nh_src,
+ uint32_t flowid, struct nhop_data *nh, struct nhop6_extended *nh_ext)
+{
+
+ fib_choose_prepend(fibnum, nh_src, flowid, nh, AF_INET6);
+ if (nh_ext == NULL)
+ return;
+
+ nh_ext->nh_ifp = NH_LIFP(nh);
+ nh_ext->nh_mtu = nh->nh_mtu;
+ nh_ext->nh_flags = nh->nh_flags;
+/*
+ nh_ext->nh_addr = ;
+ nh_ext->nh_src= ;
+*/
+}
+
+
+static void
+fib6_rte_to_nh_basic(struct rtentry *rte, struct in6_addr dst,
+ struct nhop6_basic *pnh6)
+{
+ struct sockaddr_in6 *gw;
+
+ pnh6->nh_ifp = rte->rt_ifa->ifa_ifp;
+ pnh6->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
+ if (rte->rt_flags & RTF_GATEWAY) {
+ gw = (struct sockaddr_in6 *)rte->rt_gateway;
+ pnh6->nh_addr = gw->sin6_addr;
+ } else
+ pnh6->nh_addr = dst;
+ /* Set flags */
+ pnh6->nh_flags = rte->rt_flags & NHOP_FLAGS_MASK;
+ gw = (struct sockaddr_in6 *)rt_key(rte);
+ if (IN6_IS_ADDR_UNSPECIFIED(&gw->sin6_addr))
+ pnh6->nh_flags |= NHOP_DEFAULT;
+}
+
int
fib6_lookup_nh_basic(uint32_t fibnum, struct in6_addr dst, uint32_t flowid,
struct nhop6_basic *pnh6)
{
struct radix_node_head *rnh;
struct radix_node *rn;
- struct sockaddr_in6 *gw, sin6;
+ struct sockaddr_in6 sin6;
struct rtentry *rte;
KASSERT((fibnum < rt_numfibs), ("fib6_lookup_nh_basic: bad fibnum"));
@@ -205,18 +485,7 @@ fib6_lookup_nh_basic(uint32_t fibnum, st
rte = RNTORT(rn);
/* Ensure route & ifp is UP */
if (RT_LINK_IS_UP(rte->rt_ifp)) {
- pnh6->nh_ifp = rte->rt_ifa->ifa_ifp;
- pnh6->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
- if (rte->rt_flags & RTF_GATEWAY) {
- gw = (struct sockaddr_in6 *)rte->rt_gateway;
- pnh6->nh_addr = gw->sin6_addr;
- } else
- pnh6->nh_addr = dst;
- /* Set flags */
- pnh6->nh_flags = rte->rt_flags & NHOP_FLAGS_MASK;
- gw = (struct sockaddr_in6 *)rt_key(rte);
- if (IN6_IS_ADDR_UNSPECIFIED(&gw->sin6_addr))
- pnh6->nh_flags |= NHOP_DEFAULT;
+ fib6_rte_to_nh_basic(rte, dst, pnh6);
RADIX_NODE_HEAD_RUNLOCK(rnh);
return (0);
}
@@ -228,8 +497,45 @@ fib6_lookup_nh_basic(uint32_t fibnum, st
#endif
+#if 0
+typedef void nhop_change_cb_t(void *state);
+struct nhop_tracker {
+ TAILQ_ENTRY(nhop_tracker) next;
+ nhop_change_cb_t *f;
+ void *state;
+ uint32_t fibnum;
+ struct sockaddr_storage ss;
+};
+
+struct nhop_tracker *
+nhop_alloc_tracked(uint32_t fibnum, struct sockaddr *sa, nhop_change_cb_t *f,
+ void *state)
+{
+ struct nhop_tracker *nt;
+
+ nt = malloc(sizeof(struct nhop_tracker), M_RTFIB, M_WAITOK | M_ZERO);
+
+ nt->f = f;
+ nt-state = state;
+ nt->fibnum = fibnum;
+ memcpy(&nt->ss, sa, sa->sa_len);
+
+ return (nt);
+}
+
+
+int
+nhop_bind(struct nhop_tracker *nt)
+{
+ NHOP_LOCK(nnh);
+
+ NHOP_UNLOCK(nnh);
+
+ return (0);
+}
+#endif
Modified: projects/routing/sys/net/rt_nhops.h
==============================================================================
--- projects/routing/sys/net/rt_nhops.h Sun Aug 23 18:12:11 2015 (r287044)
+++ projects/routing/sys/net/rt_nhops.h Sun Aug 23 18:13:27 2015 (r287045)
@@ -30,7 +30,6 @@
#ifndef _NET_RT_NHOPS_H_
#define _NET_RT_NHOPS_H_
-#define MAX_PREPEND_LEN 64 /* Max data that can be prepended */
#define NH_TYPE_DIRECT 1 /* Directly reachable, no data */
@@ -40,7 +39,7 @@
#define NH_TYPE_MUTATOR 5 /* NH+callback function */
#define NH_TYPE_MULTIPATH 6 /* Multipath route */
-struct nhop_info {
+struct nhop_ctl_info {
uint64_t refcnt; /* Use references */
uint64_t flags; /* Options */
@@ -61,19 +60,49 @@ struct nhop_mutator_info {
char data[];
};
-/* Structure used for forwarding purposes */
+/* Structures used for forwarding purposes */
+#define MAX_PREPEND_LEN 56 /* Max data that can be prepended */
+
+/* Non-recursive nexthop */
struct nhop_data {
- uint8_t flags; /* NH flags */
- uint8_t count; /* Number of nexthops or data length */
- uint16_t mtu;
+ uint8_t nh_flags; /* NH flags */
+ uint8_t nh_count; /* Number of nexthops or data length */
+ uint16_t nh_mtu; /* given nhop MTU */
uint16_t lifp_idx; /* Logical interface index */
- uint16_t ifp_idx; /* Transmit interface index */
union {
- struct nhop_mpath_info mp[32]; /* Multipath info */
- struct nhop_mutator_info mm; /* mutator info */
- char data[MAX_PREPEND_LEN - 8]; /* data to prepend */
+ uint16_t ifp_idx; /* Transmit interface index */
+ uint16_t nhop_idx; /* L2 multipath nhop index */
+ } i;
+ union {
+ char data[MAX_PREPEND_LEN]; /* data to prepend */
+#ifdef INET
+ struct in_addr gw4; /* IPv4 gw address */
+#endif
+#ifdef INET6
+ struct in6_addr gw6; /* IPv4 gw address */
+#endif
} d;
};
+/* Internal flags */
+#define NH_FLAGS_RECURSE 0x01 /* Nexthop structure is recursive */
+#define NH_FLAGS_L2_NHOP 0x02 /* L2 interface has to be selected */
+#define NH_FLAGS_L2_ME 0x04 /* dst L2 address is our address */
+#define NH_FLAGS_L2_INCOMPLETE 0x08 /* L2 header not prepended */
+
+#define NH_LIFP(nh) ifnet_byindex_locked((nh)->lifp_idx)
+#define NH_TIFP(nh) ifnet_byindex_locked((nh)->i.ifp_idx)
+
+/* L2/L3 recursive nexthop */
+struct nhop_multi {
+ uint8_t nh_flags; /* NH flags */
+ uint8_t nh_count; /* Number of nexthops or data length */
+ uint8_t spare[2];
+ uint16_t nh_nhops[30]; /* Nexthop indexes */
+};
+
+/* Control plane nexthop data */
+struct nhop_info {
+};
/* Per-AF per-fib nhop table */
struct nhops_descr {
@@ -105,6 +134,7 @@ struct nhop6_basic {
struct ifnet *nh_ifp; /* Logical egress interface */
uint16_t nh_mtu; /* nexthop mtu */
uint16_t nh_flags; /* nhop flags */
+ uint8_t spare[4];
struct in6_addr nh_addr; /* GW/DST IPv4 address */
};
@@ -115,11 +145,63 @@ struct nhop64_basic {
} u;
};
+/* Extended nexthop info used for control protocols */
+struct nhop4_extended {
+ struct ifnet *nh_ifp; /* Logical egress interface */
+ uint16_t nh_mtu; /* nexthop mtu */
+ uint16_t nh_flags; /* nhop flags */
+ uint8_t spare[4];
+ struct in_addr nh_addr; /* GW/DST IPv4 address */
+ struct in_addr nh_src; /* default source IPv4 address */
+ uint64_t spare2[2];
+};
+
+struct nhop6_extended {
+ struct ifnet *nh_ifp; /* Logical egress interface */
+ uint16_t nh_mtu; /* nexthop mtu */
+ uint16_t nh_flags; /* nhop flags */
+ uint8_t spare[4];
+ struct in6_addr nh_addr; /* GW/DST IPv6 address */
+ struct in6_addr nh_src; /* default source IPv6 address */
+ uint64_t spare2[2];
+};
+
+struct nhop64_extended {
+ union {
+ struct nhop4_extended nh4;
+ struct nhop6_extended nh6;
+ } u;
+};
+
+struct route_info {
+ struct nhop_data *ri_nh; /* Desired nexthop to use */
+ struct nhop64_basic *ri_nh_info; /* Get selected route info */
+ uint16_t ri_mtu;
+ uint16_t spare[3];
+};
+
+struct route_compat {
+ struct nhop_data *ro_nh;
+ void *spare0;
+ void *spare1;
+ int ro_flags;
+};
+
int fib4_lookup_nh_basic(uint32_t fibnum, struct in_addr dst, uint32_t flowid,
struct nhop4_basic *pnh4);
int fib6_lookup_nh_basic(uint32_t fibnum, struct in6_addr dst, uint32_t flowid,
struct nhop6_basic *pnh6);
+void fib4_free_nh(uint32_t fibnum, struct nhop_data *nh);
+void fib4_choose_prepend(uint32_t fibnum, struct nhop_data *nh_src,
+ uint32_t flowid, struct nhop_data *nh, struct nhop4_extended *nh_ext);
+int fib4_lookup_prepend(uint32_t fibnum, struct in_addr dst, struct mbuf *m,
+ struct nhop_data *nh, struct nhop4_extended *nh_ext);
+
+void fib6_free_nh(uint32_t fibnum, struct nhop_data *nh);
+void fib6_choose_prepend(uint32_t fibnum, struct nhop_data *nh_src,
+ uint32_t flowid, struct nhop_data *nh, struct nhop6_extended *nh_ext);
+
#define NHOP_REJECT RTF_REJECT
#define NHOP_BLACKHOLE RTF_BLACKHOLE
#define NHOP_DEFAULT 0x80 /* Default route */
Modified: projects/routing/sys/netinet/if_ether.c
==============================================================================
--- projects/routing/sys/netinet/if_ether.c Sun Aug 23 18:12:11 2015 (r287044)
+++ projects/routing/sys/netinet/if_ether.c Sun Aug 23 18:13:27 2015 (r287045)
@@ -303,6 +303,72 @@ arprequest(struct ifnet *ifp, const stru
}
/*
+ *
+ * Saves lle address for @dst in @dst_addr.
+ * Returns 0 if address was found&valid.
+ */
+int
+arpresolve_fast(struct ifnet *ifp, struct in_addr dst, u_int mflags,
+ u_char *dst_addr)
+{
+ int do_arp, error;
+ struct llentry *la;
+ struct sockaddr_in sin;
+
+ if (mflags & M_BCAST) {
+ memcpy(dst_addr, ifp->if_broadcastaddr, ifp->if_addrlen);
+ return (0);
+ }
+ if (mflags & M_MCAST) {
+ ETHER_MAP_IP_MULTICAST(&dst, dst_addr);
+ return (0);
+ }
+
+ do_arp = 0;
+ error = EAGAIN;
+
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_addr = dst;
+ sin.sin_family = AF_INET;
+ sin.sin_len = sizeof(sin);
+
+ IF_AFDATA_RLOCK(ifp);
+ la = lla_lookup(LLTABLE(ifp), 0, (const struct sockaddr *)&sin);
+
+ /*
+ * XXX: We need to convert all these checks to single one
+ */
+ if (la != NULL && (la->la_flags & LLE_VALID) &&
+ ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) {
+ bcopy(&la->ll_addr, dst_addr, ifp->if_addrlen);
+ /*
+ * If entry has an expiry time and it is approaching,
+ * see if we need to send an ARP request within this
+ * arpt_down interval.
+ */
+ if (!(la->la_flags & LLE_STATIC) &&
+ time_uptime + la->la_preempt > la->la_expire) {
+ do_arp = 1;
+ la->la_preempt--;
+ }
+ error = 0;
+ }
+ if (la != NULL)
+ LLE_RUNLOCK(la);
+ IF_AFDATA_RUNLOCK(ifp);
+
+ /*
+ * XXX: For compat reasons only.
+ * We should delay the job to slowpath queue.
+ */
+ if (do_arp != 0)
+ arprequest(ifp, NULL, &dst, NULL);
+
+ return (error);
+}
+
+
+/*
* Resolve an IP address into an ethernet address.
* On input:
* ifp is the interface we use
Modified: projects/routing/sys/netinet/if_ether.h
==============================================================================
--- projects/routing/sys/netinet/if_ether.h Sun Aug 23 18:12:11 2015 (r287044)
+++ projects/routing/sys/netinet/if_ether.h Sun Aug 23 18:13:27 2015 (r287045)
@@ -116,6 +116,8 @@ struct ifaddr;
int arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m,
const struct sockaddr *dst, u_char *desten, uint32_t *pflags);
+int arpresolve_fast(struct ifnet *ifp, struct in_addr dst, u_int mflags,
+ u_char *dst_addr);
void arprequest(struct ifnet *, const struct in_addr *,
const struct in_addr *, u_char *);
void arp_ifinit(struct ifnet *, struct ifaddr *);
Modified: projects/routing/sys/netinet/ip_input.c
==============================================================================
--- projects/routing/sys/netinet/ip_input.c Sun Aug 23 18:12:11 2015 (r287044)
+++ projects/routing/sys/netinet/ip_input.c Sun Aug 23 18:13:27 2015 (r287045)
@@ -82,6 +82,8 @@ __FBSDID("$FreeBSD$");
#endif /* IPSEC */
#include <netinet/in_rss.h>
+#include <net/rt_nhops.h>
+
#include <sys/socketvar.h>
#include <security/mac/mac_framework.h>
@@ -901,6 +903,7 @@ ip_forward(struct mbuf *m, int srcrt)
struct sockaddr_in *sin;
struct in_addr dest;
struct route ro;
+ struct route_info ri;
int error, type = 0, code = 0, mtu = 0;
if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
@@ -1031,11 +1034,12 @@ ip_forward(struct mbuf *m, int srcrt)
}
}
- error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL);
+ bzero(&ri, sizeof(ri));
+
+ error = ip_output(m, NULL, &ri, IP_FORWARDING, NULL, NULL);
- if (error == EMSGSIZE && ro.ro_rt)
- mtu = ro.ro_rt->rt_mtu;
- RO_RTFREE(&ro);
+ if (error == EMSGSIZE)
+ mtu = ri.ri_mtu;
if (error)
IPSTAT_INC(ips_cantforward);
Modified: projects/routing/sys/netinet/ip_output.c
==============================================================================
--- projects/routing/sys/netinet/ip_output.c Sun Aug 23 18:12:11 2015 (r287044)
+++ projects/routing/sys/netinet/ip_output.c Sun Aug 23 18:13:27 2015 (r287045)
@@ -84,6 +84,8 @@ __FBSDID("$FreeBSD$");
#include <netinet/sctp_crc32.h>
#endif
+#include <net/rt_nhops.h>
+
#ifdef IPSEC
#include <netinet/ip_ipsec.h>
#include <netipsec/ipsec.h>
@@ -99,8 +101,9 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_
&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
#endif
-static void ip_mloopback
- (struct ifnet *, struct mbuf *, struct sockaddr_in *, int);
+static void ip_mloopback (struct ifnet *, struct mbuf *, int);
+static inline int ip_sendmbuf(struct ifnet *ifp, struct mbuf *m,
+ struct nhop_data *nh, struct in_addr dst);
extern int in_mcast_loop;
@@ -108,11 +111,12 @@ extern struct protosw inetsw[];
static inline int
ip_output_pfil(struct mbuf *m, struct ifnet *ifp, struct inpcb *inp,
- struct sockaddr_in *dst, int *fibnum, int *error)
+ struct in_addr *dst, int *fibnum, int *error)
{
struct m_tag *fwd_tag = NULL;
struct in_addr odst;
struct ip *ip;
+ struct sockaddr_in *dst_sa;
ip = mtod(m, struct ip *);
@@ -147,11 +151,7 @@ ip_output_pfil(struct mbuf *m, struct if
return 1; /* Finished */
}
- bzero(dst, sizeof(*dst));
- dst->sin_family = AF_INET;
- dst->sin_len = sizeof(*dst);
- dst->sin_addr = ip->ip_dst;
-
+ *dst = ip->ip_dst;
return -1; /* Reloop */
}
/* See if fib was changed by packet filter. */
@@ -183,7 +183,11 @@ ip_output_pfil(struct mbuf *m, struct if
/* Or forward to some other address? */
if ((m->m_flags & M_IP_NEXTHOP) &&
((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) {
- bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
+ dst_sa = (struct sockaddr_in *)(fwd_tag + 1);
+ bzero(dst_sa, sizeof(*dst_sa));
+ dst_sa->sin_family = AF_INET;
+ dst_sa->sin_len = sizeof(*dst_sa);
+ dst_sa->sin_addr = *dst;
m->m_flags |= M_SKIP_FIREWALL;
m->m_flags &= ~M_IP_NEXTHOP;
m_tag_delete(m, fwd_tag);
@@ -207,7 +211,7 @@ ip_output_pfil(struct mbuf *m, struct if
* inserted, so must have a NULL opt pointer.
*/
int
-ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
+ip_output(struct mbuf *m, struct mbuf *opt, struct route_info *ri, int flags,
struct ip_moptions *imo, struct inpcb *inp)
{
struct rm_priotracker in_ifa_tracker;
@@ -217,15 +221,14 @@ ip_output(struct mbuf *m, struct mbuf *o
int hlen = sizeof (struct ip);
int mtu;
int error = 0;
- struct sockaddr_in *dst;
- const struct sockaddr_in *gw;
+ struct in_addr dst, local_addr;
+ struct sockaddr_in gw_out;
struct in_ifaddr *ia;
int isbroadcast;
uint16_t ip_len, ip_off;
- struct route iproute;
- struct rtentry *rte; /* cache for ro->ro_rt */
+ struct nhop_data local_nh, *nh;
+ struct nhop4_extended nhe, *pnhe;
uint32_t fibnum;
- int have_ia_ref;
#ifdef IPSEC
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-projects
mailing list