svn commit: r292978 - in head/sys: dev/cxgb/ulp/tom dev/cxgbe/tom net netinet netinet6 ofed/drivers/infiniband/ulp/ipoib

Alexander V. Chernikov melifaro at FreeBSD.org
Thu Dec 31 05:03:30 UTC 2015


Author: melifaro
Date: Thu Dec 31 05:03:27 2015
New Revision: 292978
URL: https://svnweb.freebsd.org/changeset/base/292978

Log:
  Implement interface link header precomputation API.
  
  Add if_requestencap() interface method which is capable of calculating
    various link headers for given interface. Right now there is support
    for INET/INET6/ARP llheader calculation (IFENCAP_LL type request).
    Other types are planned to support more complex calculation
    (L2 multipath lagg nexthops, tunnel encap nexthops, etc..).
  
  Reshape 'struct route' to be able to pass additional data (with its length)
    to prepend to mbuf.
  
  These two changes permit routing code to pass pre-calculated nexthop data
    (like L2 header for route w/gateway) down to the stack eliminating the
    need for other lookups. It also brings us closer to more complex scenarios
    like transparently handling MPLS nexthops and tunnel interfaces.
    Last, but not least, it removes layering violation introduced by flowtable
    code (ro_lle) and simplifies handling of existing if_output consumers.
  
  ARP/ND changes:
  Make arp/ndp stack pre-calculate link header upon installing/updating lle
    record. Interface link address changes are handled by re-calculating
    headers for all lles based on if_lladdr event. After these changes,
    arpresolve()/nd6_resolve() returns full pre-calculated header for
    supported interfaces thus simplifying if_output().
  Move these lookups to separate ether_resolve_addr() function which either
    returns an error or a fully-prepared link header. Add <arp|nd6_>resolve_addr()
    compat versions to return link addresses instead of pre-calculated data.
  
  BPF changes:
  Raw bpf writes occupied _two_ cases: AF_UNSPEC and pseudo_AF_HDRCMPLT.
  Despite the naming, both of them have their header "complete". The only
    difference is that interface source mac has to be filled by OS for
    AF_UNSPEC (controlled via BIOCGHDRCMPLT). This logic has to stay inside
    BPF and not pollute if_output() routines. Convert BPF to pass prepend data
    via new 'struct route' mechanism. Note that it does not change
    non-optimized if_output(): ro_prepend handling is purely optional.
  Side note: hackish pseudo_AF_HDRCMPLT is supported for ethernet and FDDI.
    It is not needed for ethernet anymore. The only remaining FDDI user is
    dev/pdq mostly untouched since 2007. FDDI support was eliminated from
    OpenBSD in 2013 (sys/net/if_fddisubr.c rev 1.65).
  
  Flowtable changes:
    Flowtable violates layering by saving (and not correctly managing)
    rtes/lles. Instead of passing lle pointer, pass pointer to pre-calculated
    header data from that lle.
  
  Differential Revision:	https://reviews.freebsd.org/D4102

Modified:
  head/sys/dev/cxgb/ulp/tom/cxgb_l2t.c
  head/sys/dev/cxgbe/tom/t4_tom_l2t.c
  head/sys/net/bpf.c
  head/sys/net/flowtable.c
  head/sys/net/if.c
  head/sys/net/if_ethersubr.c
  head/sys/net/if_llatbl.c
  head/sys/net/if_llatbl.h
  head/sys/net/if_var.h
  head/sys/net/route.h
  head/sys/netinet/if_ether.c
  head/sys/netinet/if_ether.h
  head/sys/netinet/in.c
  head/sys/netinet/ip_output.c
  head/sys/netinet/toecore.c
  head/sys/netinet6/icmp6.c
  head/sys/netinet6/in6.c
  head/sys/netinet6/in6.h
  head/sys/netinet6/nd6.c
  head/sys/netinet6/nd6.h
  head/sys/netinet6/nd6_nbr.c
  head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c

Modified: head/sys/dev/cxgb/ulp/tom/cxgb_l2t.c
==============================================================================
--- head/sys/dev/cxgb/ulp/tom/cxgb_l2t.c	Thu Dec 31 04:14:05 2015	(r292977)
+++ head/sys/dev/cxgb/ulp/tom/cxgb_l2t.c	Thu Dec 31 05:03:27 2015	(r292978)
@@ -215,7 +215,7 @@ resolve_entry(struct adapter *sc, struct
 	struct tom_data *td = sc->tom_softc;
 	struct toedev *tod = &td->tod;
 	struct sockaddr_in sin = {0};
-	uint8_t dmac[ETHER_ADDR_LEN];
+	uint8_t dmac[ETHER_HDR_LEN];
 	uint16_t vtag = EVL_VLID_MASK;
 	int rc;
 

Modified: head/sys/dev/cxgbe/tom/t4_tom_l2t.c
==============================================================================
--- head/sys/dev/cxgbe/tom/t4_tom_l2t.c	Thu Dec 31 04:14:05 2015	(r292977)
+++ head/sys/dev/cxgbe/tom/t4_tom_l2t.c	Thu Dec 31 05:03:27 2015	(r292978)
@@ -233,7 +233,7 @@ resolve_entry(struct adapter *sc, struct
 	struct sockaddr_in sin = {0};
 	struct sockaddr_in6 sin6 = {0};
 	struct sockaddr *sa;
-	uint8_t dmac[ETHER_ADDR_LEN];
+	uint8_t dmac[ETHER_HDR_LEN];
 	uint16_t vtag = VLAN_NONE;
 	int rc;
 

Modified: head/sys/net/bpf.c
==============================================================================
--- head/sys/net/bpf.c	Thu Dec 31 04:14:05 2015	(r292977)
+++ head/sys/net/bpf.c	Thu Dec 31 05:03:27 2015	(r292978)
@@ -69,6 +69,7 @@ __FBSDID("$FreeBSD$");
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_dl.h>
 #include <net/bpf.h>
 #include <net/bpf_buffer.h>
 #ifdef BPF_JITTER
@@ -76,6 +77,7 @@ __FBSDID("$FreeBSD$");
 #endif
 #include <net/bpf_zerocopy.h>
 #include <net/bpfdesc.h>
+#include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
@@ -164,7 +166,7 @@ static void	bpf_detachd(struct bpf_d *);
 static void	bpf_detachd_locked(struct bpf_d *);
 static void	bpf_freed(struct bpf_d *);
 static int	bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **,
-		    struct sockaddr *, int *, struct bpf_insn *);
+		    struct sockaddr *, int *, struct bpf_d *);
 static int	bpf_setif(struct bpf_d *, struct ifreq *);
 static void	bpf_timed_out(void *);
 static __inline void
@@ -454,7 +456,7 @@ bpf_ioctl_setzbuf(struct thread *td, str
  */
 static int
 bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp,
-    struct sockaddr *sockp, int *hdrlen, struct bpf_insn *wfilter)
+    struct sockaddr *sockp, int *hdrlen, struct bpf_d *d)
 {
 	const struct ieee80211_bpf_params *p;
 	struct ether_header *eh;
@@ -549,7 +551,7 @@ bpf_movein(struct uio *uio, int linktype
 	if (error)
 		goto bad;
 
-	slen = bpf_filter(wfilter, mtod(m, u_char *), len, len);
+	slen = bpf_filter(d->bd_wfilter, mtod(m, u_char *), len, len);
 	if (slen == 0) {
 		error = EPERM;
 		goto bad;
@@ -566,6 +568,10 @@ bpf_movein(struct uio *uio, int linktype
 			else
 				m->m_flags |= M_MCAST;
 		}
+		if (d->bd_hdrcmplt == 0) {
+			memcpy(eh->ether_shost, IF_LLADDR(ifp),
+			    sizeof(eh->ether_shost));
+		}
 		break;
 	}
 
@@ -1088,6 +1094,7 @@ bpfwrite(struct cdev *dev, struct uio *u
 	struct ifnet *ifp;
 	struct mbuf *m, *mc;
 	struct sockaddr dst;
+	struct route ro;
 	int error, hlen;
 
 	error = devfs_get_cdevpriv((void **)&d);
@@ -1119,7 +1126,7 @@ bpfwrite(struct cdev *dev, struct uio *u
 	hlen = 0;
 	/* XXX: bpf_movein() can sleep */
 	error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp,
-	    &m, &dst, &hlen, d->bd_wfilter);
+	    &m, &dst, &hlen, d);
 	if (error) {
 		d->bd_wdcount++;
 		return (error);
@@ -1151,7 +1158,14 @@ bpfwrite(struct cdev *dev, struct uio *u
 	BPFD_UNLOCK(d);
 #endif
 
-	error = (*ifp->if_output)(ifp, m, &dst, NULL);
+	bzero(&ro, sizeof(ro));
+	if (hlen != 0) {
+		ro.ro_prepend = (u_char *)&dst.sa_data;
+		ro.ro_plen = hlen;
+		ro.ro_flags = RT_HAS_HEADER;
+	}
+
+	error = (*ifp->if_output)(ifp, m, &dst, &ro);
 	if (error)
 		d->bd_wdcount++;
 

Modified: head/sys/net/flowtable.c
==============================================================================
--- head/sys/net/flowtable.c	Thu Dec 31 04:14:05 2015	(r292977)
+++ head/sys/net/flowtable.c	Thu Dec 31 05:03:27 2015	(r292978)
@@ -665,6 +665,7 @@ int
 flowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro)
 {
 	struct flentry *fle;
+	struct llentry *lle;
 
 	if (V_flowtable_enable == 0)
 		return (ENXIO);
@@ -693,8 +694,15 @@ flowtable_lookup(sa_family_t sa, struct 
 	}
 
 	ro->ro_rt = fle->f_rt;
-	ro->ro_lle = fle->f_lle;
 	ro->ro_flags |= RT_NORTREF;
+	lle = fle->f_lle;
+	if (lle != NULL && (lle->la_flags & LLE_VALID)) {
+		ro->ro_prepend = lle->r_linkdata;
+		ro->ro_plen = lle->r_hdrlen;
+		ro->ro_flags |= RT_MAY_LOOP;
+		if (lle->la_flags & LLE_IFADDR)
+			ro->ro_flags |= RT_L2_ME;
+	}
 
 	return (0);
 }

Modified: head/sys/net/if.c
==============================================================================
--- head/sys/net/if.c	Thu Dec 31 04:14:05 2015	(r292977)
+++ head/sys/net/if.c	Thu Dec 31 05:03:27 2015	(r292978)
@@ -161,6 +161,7 @@ static int	ifconf(u_long, caddr_t);
 static void	if_freemulti(struct ifmultiaddr *);
 static void	if_grow(void);
 static void	if_input_default(struct ifnet *, struct mbuf *);
+static int	if_requestencap_default(struct ifnet *, struct if_encap_req *);
 static void	if_route(struct ifnet *, int flag, int fam);
 static int	if_setflag(struct ifnet *, int, int, int *, int);
 static int	if_transmit(struct ifnet *ifp, struct mbuf *m);
@@ -673,6 +674,9 @@ if_attach_internal(struct ifnet *ifp, in
 	if (ifp->if_input == NULL)
 		ifp->if_input = if_input_default;
 
+	if (ifp->if_requestencap == NULL)
+		ifp->if_requestencap = if_requestencap_default;
+
 	if (!vmove) {
 #ifdef MAC
 		mac_ifnet_create(ifp);
@@ -3398,6 +3402,43 @@ if_setlladdr(struct ifnet *ifp, const u_
 }
 
 /*
+ * Compat function for handling basic encapsulation requests.
+ * Not converted stacks (FDDI, IB, ..) supports traditional
+ * output model: ARP (and other similar L2 protocols) are handled
+ * inside output routine, arpresolve/nd6_resolve() returns MAC
+ * address instead of full prepend.
+ *
+ * This function creates calculated header==MAC for IPv4/IPv6 and
+ * returns EAFNOSUPPORT (which is then handled in ARP code) for other
+ * address families.
+ */
+static int
+if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req)
+{
+
+	if (req->rtype != IFENCAP_LL)
+		return (EOPNOTSUPP);
+
+	if (req->bufsize < req->lladdr_len)
+		return (ENOMEM);
+
+	switch (req->family) {
+	case AF_INET:
+	case AF_INET6:
+		break;
+	default:
+		return (EAFNOSUPPORT);
+	}
+
+	/* Copy lladdr to storage as is */
+	memmove(req->buf, req->lladdr, req->lladdr_len);
+	req->bufsize = req->lladdr_len;
+	req->lladdr_off = 0;
+
+	return (0);
+}
+
+/*
  * The name argument must be a pointer to storage which will last as
  * long as the interface does.  For physical devices, the result of
  * device_get_name(dev) is a good choice and for pseudo-devices a

Modified: head/sys/net/if_ethersubr.c
==============================================================================
--- head/sys/net/if_ethersubr.c	Thu Dec 31 04:14:05 2015	(r292977)
+++ head/sys/net/if_ethersubr.c	Thu Dec 31 05:03:27 2015	(r292978)
@@ -113,6 +113,7 @@ static	int ether_resolvemulti(struct ifn
 #ifdef VIMAGE
 static	void ether_reassign(struct ifnet *, struct vnet *, char *);
 #endif
+static	int ether_requestencap(struct ifnet *, struct if_encap_req *);
 
 #define	ETHER_IS_BROADCAST(addr) \
 	(bcmp(etherbroadcastaddr, (addr), ETHER_ADDR_LEN) == 0)
@@ -136,6 +137,138 @@ update_mbuf_csumflags(struct mbuf *src, 
 }
 
 /*
+ * Handle link-layer encapsulation requests.
+ */
+static int
+ether_requestencap(struct ifnet *ifp, struct if_encap_req *req)
+{
+	struct ether_header *eh;
+	struct arphdr *ah;
+	uint16_t etype;
+	const u_char *lladdr;
+
+	if (req->rtype != IFENCAP_LL)
+		return (EOPNOTSUPP);
+
+	if (req->bufsize < ETHER_HDR_LEN)
+		return (ENOMEM);
+
+	eh = (struct ether_header *)req->buf;
+	lladdr = req->lladdr;
+	req->lladdr_off = 0;
+
+	switch (req->family) {
+	case AF_INET:
+		etype = htons(ETHERTYPE_IP);
+		break;
+	case AF_INET6:
+		etype = htons(ETHERTYPE_IPV6);
+		break;
+	case AF_ARP:
+		ah = (struct arphdr *)req->hdata;
+		ah->ar_hrd = htons(ARPHRD_ETHER);
+
+		switch(ntohs(ah->ar_op)) {
+		case ARPOP_REVREQUEST:
+		case ARPOP_REVREPLY:
+			etype = htons(ETHERTYPE_REVARP);
+			break;
+		case ARPOP_REQUEST:
+		case ARPOP_REPLY:
+		default:
+			etype = htons(ETHERTYPE_ARP);
+			break;
+		}
+
+		if (req->flags & IFENCAP_FLAG_BROADCAST)
+			lladdr = ifp->if_broadcastaddr;
+		break;
+	default:
+		return (EAFNOSUPPORT);
+	}
+
+	memcpy(&eh->ether_type, &etype, sizeof(eh->ether_type));
+	memcpy(eh->ether_dhost, lladdr, ETHER_ADDR_LEN);
+	memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
+	req->bufsize = sizeof(struct ether_header);
+
+	return (0);
+}
+
+
+static int
+ether_resolve_addr(struct ifnet *ifp, struct mbuf *m,
+	const struct sockaddr *dst, struct route *ro, u_char *phdr,
+	uint32_t *pflags)
+{
+	struct ether_header *eh;
+	struct rtentry *rt;
+	uint32_t lleflags = 0;
+	int error = 0;
+#if defined(INET) || defined(INET6)
+	uint16_t etype;
+#endif
+
+	eh = (struct ether_header *)phdr;
+
+	switch (dst->sa_family) {
+#ifdef INET
+	case AF_INET:
+		if ((m->m_flags & (M_BCAST | M_MCAST)) == 0)
+			error = arpresolve(ifp, 0, m, dst, phdr, &lleflags);
+		else {
+			if (m->m_flags & M_BCAST)
+				memcpy(eh->ether_dhost, ifp->if_broadcastaddr,
+				    ETHER_ADDR_LEN);
+			else {
+				const struct in_addr *a;
+				a = &(((const struct sockaddr_in *)dst)->sin_addr);
+				ETHER_MAP_IP_MULTICAST(a, eh->ether_dhost);
+			}
+			etype = htons(ETHERTYPE_IP);
+			memcpy(&eh->ether_type, &etype, sizeof(etype));
+			memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
+		}
+		break;
+#endif
+#ifdef INET6
+	case AF_INET6:
+		if ((m->m_flags & M_MCAST) == 0)
+			error = nd6_resolve(ifp, 0, m, dst, phdr, &lleflags);
+		else {
+			const struct in6_addr *a6;
+			a6 = &(((const struct sockaddr_in6 *)dst)->sin6_addr);
+			ETHER_MAP_IPV6_MULTICAST(a6, eh->ether_dhost);
+			etype = htons(ETHERTYPE_IPV6);
+			memcpy(&eh->ether_type, &etype, sizeof(etype));
+			memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN);
+		}
+		break;
+#endif
+	default:
+		if_printf(ifp, "can't handle af%d\n", dst->sa_family);
+		if (m != NULL)
+			m_freem(m);
+		return (EAFNOSUPPORT);
+	}
+
+	if (error == EHOSTDOWN) {
+		rt = (ro != NULL) ? ro->ro_rt : NULL;
+		if (rt != NULL && (rt->rt_flags & RTF_GATEWAY) != 0)
+			error = EHOSTUNREACH;
+	}
+
+	if (error != 0)
+		return (error);
+
+	*pflags = RT_MAY_LOOP;
+	if (lleflags & LLE_IFADDR)
+		*pflags |= RT_L2_ME;
+
+	return (0);
+}
+
+/*
  * Ethernet output routine.
  * Encapsulate a packet of type family for the local net.
  * Use trailer local net encapsulation if enough data in first
@@ -145,27 +278,20 @@ int
 ether_output(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro)
 {
-	short type;
-	int error = 0, hdrcmplt = 0;
-	u_char edst[ETHER_ADDR_LEN];
-	struct llentry *lle = NULL;
-	struct rtentry *rt0 = NULL;
+	int error = 0;
+	char linkhdr[ETHER_HDR_LEN], *phdr;
 	struct ether_header *eh;
 	struct pf_mtag *t;
 	int loop_copy = 1;
 	int hlen;	/* link layer header length */
-	int is_gw = 0;
-	uint32_t pflags = 0;
+	uint32_t pflags;
 
+	phdr = NULL;
+	pflags = 0;
 	if (ro != NULL) {
-		if (!(m->m_flags & (M_BCAST | M_MCAST))) {
-			lle = ro->ro_lle;
-			if (lle != NULL)
-				pflags = lle->la_flags;
-		}
-		rt0 = ro->ro_rt;
-		if (rt0 != NULL && (rt0->rt_flags & RTF_GATEWAY) != 0)
-			is_gw = 1;
+		phdr = ro->ro_prepend;
+		hlen = ro->ro_plen;
+		pflags = ro->ro_flags;
 	}
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
@@ -180,94 +306,31 @@ ether_output(struct ifnet *ifp, struct m
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING)))
 		senderr(ENETDOWN);
 
-	hlen = ETHER_HDR_LEN;
-	switch (dst->sa_family) {
-#ifdef INET
-	case AF_INET:
-		if (lle != NULL && (pflags & LLE_VALID) != 0)
-			memcpy(edst, &lle->ll_addr.mac16, sizeof(edst));
-		else
-			error = arpresolve(ifp, is_gw, m, dst, edst, &pflags);
-		if (error)
+	if (phdr == NULL) {
+		/* No prepend data supplied. Try to calculate ourselves. */
+		phdr = linkhdr;
+		hlen = ETHER_HDR_LEN;
+		error = ether_resolve_addr(ifp, m, dst, ro, phdr, &pflags);
+		if (error != 0)
 			return (error == EWOULDBLOCK ? 0 : error);
-		type = htons(ETHERTYPE_IP);
-		break;
-	case AF_ARP:
-	{
-		struct arphdr *ah;
-		ah = mtod(m, struct arphdr *);
-		ah->ar_hrd = htons(ARPHRD_ETHER);
-
-		loop_copy = 0; /* if this is for us, don't do it */
-
-		switch(ntohs(ah->ar_op)) {
-		case ARPOP_REVREQUEST:
-		case ARPOP_REVREPLY:
-			type = htons(ETHERTYPE_REVARP);
-			break;
-		case ARPOP_REQUEST:
-		case ARPOP_REPLY:
-		default:
-			type = htons(ETHERTYPE_ARP);
-			break;
-		}
-
-		if (m->m_flags & M_BCAST)
-			bcopy(ifp->if_broadcastaddr, edst, ETHER_ADDR_LEN);
-		else
-			bcopy(ar_tha(ah), edst, ETHER_ADDR_LEN);
-
-	}
-	break;
-#endif
-#ifdef INET6
-	case AF_INET6:
-		if (lle != NULL && (pflags & LLE_VALID))
-			memcpy(edst, &lle->ll_addr.mac16, sizeof(edst));
-		else
-			error = nd6_resolve(ifp, is_gw, m, dst, (u_char *)edst,
-			    &pflags);
-		if (error)
-			return (error == EWOULDBLOCK ? 0 : error);
-		type = htons(ETHERTYPE_IPV6);
-		break;
-#endif
-	case pseudo_AF_HDRCMPLT:
-	    {
-		const struct ether_header *eh;
-
-		hdrcmplt = 1;
-		/* FALLTHROUGH */
-
-	case AF_UNSPEC:
-		loop_copy = 0; /* if this is for us, don't do it */
-		eh = (const struct ether_header *)dst->sa_data;
-		(void)memcpy(edst, eh->ether_dhost, sizeof (edst));
-		type = eh->ether_type;
-		break;
-            }
-	default:
-		if_printf(ifp, "can't handle af%d\n", dst->sa_family);
-		senderr(EAFNOSUPPORT);
 	}
 
-	if ((pflags & LLE_IFADDR) != 0) {
+	if ((pflags & RT_L2_ME) != 0) {
 		update_mbuf_csumflags(m, m);
 		return (if_simloop(ifp, m, dst->sa_family, 0));
 	}
+	loop_copy = pflags & RT_MAY_LOOP;
 
 	/*
 	 * Add local net header.  If no space in first mbuf,
 	 * allocate another.
 	 */
-	M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT);
+	M_PREPEND(m, hlen, M_NOWAIT);
 	if (m == NULL)
 		senderr(ENOBUFS);
-	eh = mtod(m, struct ether_header *);
-	if (hdrcmplt == 0) {
-		memcpy(&eh->ether_type, &type, sizeof(eh->ether_type));
-		memcpy(eh->ether_dhost, edst, sizeof (edst));
-		memcpy(eh->ether_shost, IF_LLADDR(ifp),sizeof(eh->ether_shost));
+	if ((pflags & RT_HAS_HEADER) == 0) {
+		eh = mtod(m, struct ether_header *);
+		memcpy(eh, phdr, hlen);
 	}
 
 	/*
@@ -279,34 +342,27 @@ ether_output(struct ifnet *ifp, struct m
 	 * on the wire). However, we don't do that here for security
 	 * reasons and compatibility with the original behavior.
 	 */
-	if ((ifp->if_flags & IFF_SIMPLEX) && loop_copy &&
+	if ((m->m_flags & M_BCAST) && loop_copy && (ifp->if_flags & IFF_SIMPLEX) &&
 	    ((t = pf_find_mtag(m)) == NULL || !t->routed)) {
-		if (m->m_flags & M_BCAST) {
-			struct mbuf *n;
+		struct mbuf *n;
 
-			/*
-			 * Because if_simloop() modifies the packet, we need a
-			 * writable copy through m_dup() instead of a readonly
-			 * one as m_copy[m] would give us. The alternative would
-			 * be to modify if_simloop() to handle the readonly mbuf,
-			 * but performancewise it is mostly equivalent (trading
-			 * extra data copying vs. extra locking).
-			 *
-			 * XXX This is a local workaround.  A number of less
-			 * often used kernel parts suffer from the same bug.
-			 * See PR kern/105943 for a proposed general solution.
-			 */
-			if ((n = m_dup(m, M_NOWAIT)) != NULL) {
-				update_mbuf_csumflags(m, n);
-				(void)if_simloop(ifp, n, dst->sa_family, hlen);
-			} else
-				if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
-		} else if (bcmp(eh->ether_dhost, eh->ether_shost,
-				ETHER_ADDR_LEN) == 0) {
-			update_mbuf_csumflags(m, m);
-			(void) if_simloop(ifp, m, dst->sa_family, hlen);
-			return (0);	/* XXX */
-		}
+		/*
+		 * Because if_simloop() modifies the packet, we need a
+		 * writable copy through m_dup() instead of a readonly
+		 * one as m_copy[m] would give us. The alternative would
+		 * be to modify if_simloop() to handle the readonly mbuf,
+		 * but performancewise it is mostly equivalent (trading
+		 * extra data copying vs. extra locking).
+		 *
+		 * XXX This is a local workaround.  A number of less
+		 * often used kernel parts suffer from the same bug.
+		 * See PR kern/105943 for a proposed general solution.
+		 */
+		if ((n = m_dup(m, M_NOWAIT)) != NULL) {
+			update_mbuf_csumflags(m, n);
+			(void)if_simloop(ifp, n, dst->sa_family, hlen);
+		} else
+			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
 	}
 
        /*
@@ -798,6 +854,7 @@ ether_ifattach(struct ifnet *ifp, const 
 	ifp->if_output = ether_output;
 	ifp->if_input = ether_input;
 	ifp->if_resolvemulti = ether_resolvemulti;
+	ifp->if_requestencap = ether_requestencap;
 #ifdef VIMAGE
 	ifp->if_reassign = ether_reassign;
 #endif

Modified: head/sys/net/if_llatbl.c
==============================================================================
--- head/sys/net/if_llatbl.c	Thu Dec 31 04:14:05 2015	(r292977)
+++ head/sys/net/if_llatbl.c	Thu Dec 31 05:03:27 2015	(r292978)
@@ -278,10 +278,12 @@ lltable_drop_entry_queue(struct llentry 
 
 void
 lltable_set_entry_addr(struct ifnet *ifp, struct llentry *lle,
-    const char *lladdr)
+    const char *linkhdr, size_t linkhdrsize, int lladdr_off)
 {
 
-	bcopy(lladdr, &lle->ll_addr, ifp->if_addrlen);
+	memcpy(lle->r_linkdata, linkhdr, linkhdrsize);
+	lle->r_hdrlen = linkhdrsize;
+	lle->ll_addr = &lle->r_linkdata[lladdr_off];
 	lle->la_flags |= LLE_VALID;
 	lle->r_flags |= RLLE_VALID;
 }
@@ -296,7 +298,7 @@ lltable_set_entry_addr(struct ifnet *ifp
  */
 int
 lltable_try_set_entry_addr(struct ifnet *ifp, struct llentry *lle,
-    const char *lladdr)
+    const char *linkhdr, size_t linkhdrsize, int lladdr_off)
 {
 
 	/* Perform real LLE update */
@@ -318,7 +320,7 @@ lltable_try_set_entry_addr(struct ifnet 
 	}
 
 	/* Update data */
-	lltable_set_entry_addr(ifp, lle, lladdr);
+	lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off);
 
 	IF_AFDATA_WUNLOCK(ifp);
 
@@ -327,6 +329,84 @@ lltable_try_set_entry_addr(struct ifnet 
 	return (1);
 }
 
+ /*
+ * Helper function used to pre-compute full/partial link-layer
+ * header data suitable for feeding into if_output().
+ */
+int
+lltable_calc_llheader(struct ifnet *ifp, int family, char *lladdr,
+    char *buf, size_t *bufsize, int *lladdr_off)
+{
+	struct if_encap_req ereq;
+	int error;
+
+	bzero(buf, *bufsize);
+	bzero(&ereq, sizeof(ereq));
+	ereq.buf = buf;
+	ereq.bufsize = *bufsize;
+	ereq.rtype = IFENCAP_LL;
+	ereq.family = family;
+	ereq.lladdr = lladdr;
+	ereq.lladdr_len = ifp->if_addrlen;
+	error = ifp->if_requestencap(ifp, &ereq);
+	if (error == 0) {
+		*bufsize = ereq.bufsize;
+		*lladdr_off = ereq.lladdr_off;
+	}
+
+	return (error);
+}
+
+/*
+ * Update link-layer header for given @lle after
+ * interface lladdr was changed.
+ */
+static int
+llentry_update_ifaddr(struct lltable *llt, struct llentry *lle, void *farg)
+{
+	struct ifnet *ifp;
+	u_char linkhdr[LLE_MAX_LINKHDR];
+	size_t linkhdrsize;
+	u_char *lladdr;
+	int lladdr_off;
+
+	ifp = (struct ifnet *)farg;
+
+	lladdr = lle->ll_addr;
+
+	LLE_WLOCK(lle);
+	if ((lle->la_flags & LLE_VALID) == 0) {
+		LLE_WUNLOCK(lle);
+		return (0);
+	}
+
+	if ((lle->la_flags & LLE_IFADDR) != 0)
+		lladdr = IF_LLADDR(ifp);
+
+	linkhdrsize = sizeof(linkhdr);
+	lltable_calc_llheader(ifp, llt->llt_af, lladdr, linkhdr, &linkhdrsize,
+	    &lladdr_off);
+	memcpy(lle->r_linkdata, linkhdr, linkhdrsize);
+	LLE_WUNLOCK(lle);
+
+	return (0);
+}
+
+/*
+ * Update all calculated headers for given @llt
+ */
+void
+lltable_update_ifaddr(struct lltable *llt)
+{
+
+	if (llt->llt_ifp->if_flags & IFF_LOOPBACK)
+		return;
+
+	IF_AFDATA_WLOCK(llt->llt_ifp);
+	lltable_foreach_lle(llt, llentry_update_ifaddr, llt->llt_ifp);
+	IF_AFDATA_WUNLOCK(llt->llt_ifp);
+}
+
 /*
  *
  * Performes generic cleanup routines and frees lle.
@@ -642,6 +722,9 @@ lla_rt_output(struct rt_msghdr *rtm, str
 	struct ifnet *ifp;
 	struct lltable *llt;
 	struct llentry *lle, *lle_tmp;
+	uint8_t linkhdr[LLE_MAX_LINKHDR];
+	size_t linkhdrsize;
+	int lladdr_off;
 	u_int laflags = 0;
 	int error;
 
@@ -677,11 +760,14 @@ lla_rt_output(struct rt_msghdr *rtm, str
 		if (lle == NULL)
 			return (ENOMEM);
 
-		bcopy(LLADDR(dl), &lle->ll_addr, ifp->if_addrlen);
+		linkhdrsize = sizeof(linkhdr);
+		if (lltable_calc_llheader(ifp, dst->sa_family, LLADDR(dl),
+		    linkhdr, &linkhdrsize, &lladdr_off) != 0)
+			return (EINVAL);
+		lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize,
+		    lladdr_off);
 		if ((rtm->rtm_flags & RTF_ANNOUNCE))
 			lle->la_flags |= LLE_PUB;
-		lle->la_flags |= LLE_VALID;
-		lle->r_flags |= RLLE_VALID;
 		lle->la_expire = rtm->rtm_rmx.rmx_expire;
 
 		laflags = lle->la_flags;
@@ -767,7 +853,7 @@ llatbl_lle_show(struct llentry_sa *la)
 	db_printf(" ln_router=%u\n", lle->ln_router);
 	db_printf(" ln_ntick=%ju\n", (uintmax_t)lle->ln_ntick);
 	db_printf(" lle_refcnt=%d\n", lle->lle_refcnt);
-	bcopy(&lle->ll_addr.mac16, octet, sizeof(octet));
+	bcopy(lle->ll_addr, octet, sizeof(octet));
 	db_printf(" ll_addr=%02x:%02x:%02x:%02x:%02x:%02x\n",
 	    octet[0], octet[1], octet[2], octet[3], octet[4], octet[5]);
 	db_printf(" lle_timer=%p\n", &lle->lle_timer);

Modified: head/sys/net/if_llatbl.h
==============================================================================
--- head/sys/net/if_llatbl.h	Thu Dec 31 04:14:05 2015	(r292977)
+++ head/sys/net/if_llatbl.h	Thu Dec 31 05:03:27 2015	(r292978)
@@ -48,6 +48,7 @@ extern struct rwlock lltable_rwlock;
 #define	LLTABLE_WUNLOCK()	rw_wunlock(&lltable_rwlock)
 #define	LLTABLE_LOCK_ASSERT()	rw_assert(&lltable_rwlock, RA_LOCKED)
 
+#define	LLE_MAX_LINKHDR		24	/* Full IB header */
 /*
  * Code referencing llentry must at least hold
  * a shared lock
@@ -58,14 +59,11 @@ struct llentry {
 		struct in_addr	addr4;
 		struct in6_addr	addr6;
 	} r_l3addr;
-	union {
-		uint64_t	mac_aligned;
-		uint16_t	mac16[3];
-		uint8_t		mac8[20];	/* IB needs 20 bytes. */
-	} ll_addr;
+	char			r_linkdata[LLE_MAX_LINKHDR]; /* L2 data */
+	uint8_t			r_hdrlen;	/* length for LL header */
+	uint8_t			spare0[3];
 	uint16_t		r_flags;	/* LLE runtime flags */
 	uint16_t		r_skip_req;	/* feedback from fast path */
-	uint64_t		spare1;
 
 	struct lltable		 *lle_tbl;
 	struct llentries	 *lle_head;
@@ -82,6 +80,7 @@ struct llentry {
 	time_t			lle_remtime;	/* Real time remaining */
 	time_t			lle_hittime;	/* Time when r_skip_req was unset */
 	int			 lle_refcnt;
+	char			*ll_addr;	/* link-layer address */
 
 	LIST_ENTRY(llentry)	lle_chain;	/* chain of deleted items */
 	struct callout		lle_timer;
@@ -198,6 +197,8 @@ MALLOC_DECLARE(M_LLTABLE);
 /* LLE request flags */
 #define	LLE_EXCLUSIVE	0x2000	/* return lle xlocked  */
 #define	LLE_UNLOCKED	0x4000	/* return lle unlocked */
+#define	LLE_ADDRONLY	0x4000	/* return lladdr instead of full header */
+#define	LLE_CREATE	0x8000	/* hint to avoid lle lookup */
 
 /* LLE flags used by fastpath code */
 #define	RLLE_VALID	0x0001		/* entry is valid */
@@ -223,10 +224,13 @@ struct llentry  *llentry_alloc(struct if
 /* helper functions */
 size_t lltable_drop_entry_queue(struct llentry *);
 void lltable_set_entry_addr(struct ifnet *ifp, struct llentry *lle,
-    const char *lladdr);
+    const char *linkhdr, size_t linkhdrsize, int lladdr_off);
 int lltable_try_set_entry_addr(struct ifnet *ifp, struct llentry *lle,
-    const char *lladdr);
+    const char *linkhdr, size_t linkhdrsize, int lladdr_off);
 
+int lltable_calc_llheader(struct ifnet *ifp, int family, char *lladdr,
+    char *buf, size_t *bufsize, int *lladdr_off);
+void lltable_update_ifaddr(struct lltable *llt);
 struct llentry *lltable_alloc_entry(struct lltable *llt, u_int flags,
     const struct sockaddr *l4addr);
 void lltable_free_entry(struct lltable *llt, struct llentry *lle);

Modified: head/sys/net/if_var.h
==============================================================================
--- head/sys/net/if_var.h	Thu Dec 31 04:14:05 2015	(r292977)
+++ head/sys/net/if_var.h	Thu Dec 31 05:03:27 2015	(r292978)
@@ -134,6 +134,48 @@ struct ifnet_hw_tsomax {
 	u_int	tsomaxsegsize;	/* TSO maximum segment size in bytes */
 };
 
+/* Interface encap request types */
+typedef enum {
+	IFENCAP_LL = 1			/* pre-calculate link-layer header */
+} ife_type;
+
+/*
+ * The structure below allows to request various pre-calculated L2/L3 headers
+ * for different media. Requests varies by type (rtype field).
+ *
+ * IFENCAP_LL type: pre-calculates link header based on address family
+ *   and destination lladdr.
+ *
+ *   Input data fields:
+ *     buf: pointer to destination buffer
+ *     bufsize: buffer size
+ *     flags: IFENCAP_FLAG_BROADCAST if destination is broadcast
+ *     family: address family defined by AF_ constant.
+ *     lladdr: pointer to link-layer address
+ *     lladdr_len: length of link-layer address
+ *     hdata: pointer to L3 header (optional, used for ARP requests).
+ *   Output data fields:
+ *     buf: encap data is stored here
+ *     bufsize: resulting encap length is stored here
+ *     lladdr_off: offset of link-layer address from encap hdr start
+ *     hdata: L3 header may be altered if necessary
+ */
+
+struct if_encap_req {
+	u_char		*buf;		/* Destination buffer (w) */
+	size_t		bufsize;	/* size of provided buffer (r) */
+	ife_type	rtype;		/* request type (r) */
+	uint32_t	flags;		/* Request flags (r) */
+	int		family;		/* Address family AF_* (r) */
+	int		lladdr_off;	/* offset from header start (w) */
+	int		lladdr_len;	/* lladdr length (r) */
+	char		*lladdr;	/* link-level address pointer (r) */
+	char		*hdata;		/* Upper layer header data (rw) */
+};
+
+#define	IFENCAP_FLAG_BROADCAST	0x02	/* Destination is broadcast */
+
+
 /*
  * Structure defining a network interface.
  *
@@ -235,6 +277,8 @@ struct ifnet {
 	void	(*if_reassign)		/* reassign to vnet routine */
 		(struct ifnet *, struct vnet *, char *);
 	if_get_counter_t if_get_counter; /* get counter values */
+	int	(*if_requestencap)	/* make link header from request */
+		(struct ifnet *, struct if_encap_req *);
 
 	/* Statistics. */
 	counter_u64_t	if_counters[IFCOUNTERS];

Modified: head/sys/net/route.h
==============================================================================
--- head/sys/net/route.h	Thu Dec 31 04:14:05 2015	(r292977)
+++ head/sys/net/route.h	Thu Dec 31 05:03:27 2015	(r292978)
@@ -51,14 +51,21 @@
  */
 struct route {
 	struct	rtentry *ro_rt;
-	struct	llentry *ro_lle;
-	struct	in_ifaddr *ro_ia;
-	int		ro_flags;
+	char		*ro_prepend;
+	uint16_t	ro_plen;
+	uint16_t	ro_flags;
 	struct	sockaddr ro_dst;
 };
 
+#define	RT_L2_ME_BIT		2	/* dst L2 addr is our address */
+#define	RT_MAY_LOOP_BIT		3	/* dst may require loop copy */
+#define	RT_HAS_HEADER_BIT	4	/* mbuf already have its header prepended */
+
 #define	RT_CACHING_CONTEXT	0x1	/* XXX: not used anywhere */
 #define	RT_NORTREF		0x2	/* doesn't hold reference on ro_rt */
+#define	RT_L2_ME		(1 << RT_L2_ME_BIT)
+#define	RT_MAY_LOOP		(1 << RT_MAY_LOOP_BIT)
+#define	RT_HAS_HEADER		(1 << RT_HAS_HEADER_BIT)
 
 struct rt_metrics {
 	u_long	rmx_locks;	/* Kernel must leave these values alone */

Modified: head/sys/netinet/if_ether.c
==============================================================================
--- head/sys/netinet/if_ether.c	Thu Dec 31 04:14:05 2015	(r292977)
+++ head/sys/netinet/if_ether.c	Thu Dec 31 05:03:27 2015	(r292978)
@@ -282,6 +282,37 @@ arptimer(void *arg)
 }
 
 /*
+ * Stores link-layer header for @ifp in format suitable for if_output()
+ * into buffer @buf. Resulting header length is stored in @bufsize.
+ *
+ * Returns 0 on success.
+ */
+static int
+arp_fillheader(struct ifnet *ifp, struct arphdr *ah, int bcast, u_char *buf,
+    size_t *bufsize)
+{
+	struct if_encap_req ereq;
+	int error;
+
+	bzero(buf, *bufsize);
+	bzero(&ereq, sizeof(ereq));
+	ereq.buf = buf;
+	ereq.bufsize = *bufsize;
+	ereq.rtype = IFENCAP_LL;
+	ereq.family = AF_ARP;
+	ereq.lladdr = ar_tha(ah);
+	ereq.hdata = (u_char *)ah;
+	if (bcast)
+		ereq.flags = IFENCAP_FLAG_BROADCAST;
+	error = ifp->if_requestencap(ifp, &ereq);
+	if (error == 0)
+		*bufsize = ereq.bufsize;
+
+	return (error);
+}
+
+
+/*
  * Broadcast an ARP request. Caller specifies:
  *	- arp header source ip address
  *	- arp header target ip address
@@ -295,6 +326,10 @@ arprequest(struct ifnet *ifp, const stru
 	struct arphdr *ah;
 	struct sockaddr sa;
 	u_char *carpaddr = NULL;
+	uint8_t linkhdr[LLE_MAX_LINKHDR];
+	size_t linkhdrsize;
+	struct route ro;
+	int error;
 
 	if (sip == NULL) {
 		/*
@@ -350,12 +385,28 @@ arprequest(struct ifnet *ifp, const stru
 	bcopy(tip, ar_tpa(ah), ah->ar_pln);
 	sa.sa_family = AF_ARP;
 	sa.sa_len = 2;
+
+	/* Calculate link header for sending frame */
+	bzero(&ro, sizeof(ro));
+	linkhdrsize = sizeof(linkhdr);
+	error = arp_fillheader(ifp, ah, 1, linkhdr, &linkhdrsize);
+	if (error != 0 && error != EAFNOSUPPORT) {
+		ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n",
+		    if_name(ifp), error);
+		return;
+	}
+
+	ro.ro_prepend = linkhdr;
+	ro.ro_plen = linkhdrsize;
+	ro.ro_flags = 0;
+
 	m->m_flags |= M_BCAST;
 	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
-	(*ifp->if_output)(ifp, m, &sa, NULL);
+	(*ifp->if_output)(ifp, m, &sa, &ro);
 	ARPSTAT_INC(txrequests);
 }
 
+
 /*
  * Resolve an IP address into an ethernet address - heavy version.
  * Used internally by arpresolve().
@@ -368,18 +419,20 @@ arprequest(struct ifnet *ifp, const stru
  * Note that m_freem() handles NULL.
  */
 static int
-arpresolve_full(struct ifnet *ifp, int is_gw, int create, struct mbuf *m,
+arpresolve_full(struct ifnet *ifp, int is_gw, int flags, struct mbuf *m,
 	const struct sockaddr *dst, u_char *desten, uint32_t *pflags)
 {
 	struct llentry *la = NULL, *la_tmp;
 	struct mbuf *curr = NULL;
 	struct mbuf *next = NULL;
 	int error, renew;
+	char *lladdr;
+	int ll_len;
 
 	if (pflags != NULL)
 		*pflags = 0;
 
-	if (create == 0) {
+	if ((flags & LLE_CREATE) == 0) {
 		IF_AFDATA_RLOCK(ifp);
 		la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 		IF_AFDATA_RUNLOCK(ifp);
@@ -413,7 +466,14 @@ arpresolve_full(struct ifnet *ifp, int i
 
 	if ((la->la_flags & LLE_VALID) &&
 	    ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) {
-		bcopy(&la->ll_addr, desten, ifp->if_addrlen);
+		if (flags & LLE_ADDRONLY) {
+			lladdr = la->ll_addr;
+			ll_len = ifp->if_addrlen;
+		} else {
+			lladdr = la->r_linkdata;
+			ll_len = la->r_hdrlen;
+		}
+		bcopy(lladdr, desten, ll_len);
 
 		/* Check if we have feedback request from arptimer() */
 		if (la->r_skip_req != 0) {
@@ -485,15 +545,31 @@ arpresolve_full(struct ifnet *ifp, int i
 
 /*
  * Resolve an IP address into an ethernet address.
+ */
+int
+arpresolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst,
+    char *desten, uint32_t *pflags)
+{
+	int error;
+
+	flags |= LLE_ADDRONLY;
+	error = arpresolve_full(ifp, 0, flags, NULL, dst, desten, pflags);
+	return (error);
+}
+
+
+/*
+ * Lookups link header based on an IP address.

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-head mailing list