svn commit: r312379 - in head: lib/libc/sys sbin/ifconfig sys/conf sys/kern sys/modules/if_lagg sys/modules/if_vlan sys/net sys/netinet sys/netinet6 sys/sys
Hans Petter Selasky
hselasky at FreeBSD.org
Wed Jan 18 13:31:20 UTC 2017
Author: hselasky
Date: Wed Jan 18 13:31:17 2017
New Revision: 312379
URL: https://svnweb.freebsd.org/changeset/base/312379
Log:
Implement kernel support for hardware rate limited sockets.
- Add RATELIMIT kernel configuration keyword which must be set to
enable the new functionality.
- Add support for hardware driven, Receive Side Scaling, RSS aware, rate
limited sendqueues and expose the functionality through the already
established SO_MAX_PACING_RATE setsockopt(). The API supports rates in
the range from 1 to 4Gbytes/s which are suitable for regular TCP and
UDP streams. The setsockopt(2) manual page has been updated.
- Add rate limit function callback API to "struct ifnet" which supports
the following operations: if_snd_tag_alloc(), if_snd_tag_modify(),
if_snd_tag_query() and if_snd_tag_free().
- Add support to ifconfig to view, set and clear the IFCAP_TXRTLMT
flag, which tells if a network driver supports rate limiting or not.
- This patch also adds support for rate limiting through VLAN and LAGG
intermediate network devices.
- How rate limiting works:
1) The userspace application calls setsockopt() after accepting or
making a new connection to set the rate which is then stored in the
socket structure in the kernel. Later on when packets are transmitted
a check is made in the transmit path for rate changes. A rate change
implies a non-blocking ifp->if_snd_tag_alloc() call will be made to the
destination network interface, which then sets up a custom sendqueue
with the given rate limitation parameter. A "struct m_snd_tag" pointer is
returned which serves as a "snd_tag" hint in the m_pkthdr for the
subsequently transmitted mbufs.
2) When the network driver sees the "m->m_pkthdr.snd_tag" different
from NULL, it will move the packets into a designated rate limited sendqueue
given by the snd_tag pointer. It is up to the individual drivers how the rate
limited traffic will be rate limited.
3) Route changes are detected by the NIC drivers in the ifp->if_transmit()
routine when the ifnet pointer in the incoming snd_tag mismatches the
one of the network interface. The network adapter frees the mbuf and
returns EAGAIN which causes the ip_output() to release and clear the send
tag. Upon the next ip_output() call, allocation of a new "snd_tag" will be attempted.
4) When the PCB is detached the custom sendqueue will be released by a
non-blocking ifp->if_snd_tag_free() call to the currently bound network
interface.
Reviewed by: wblock (manpages), adrian, gallatin, scottl (network)
Differential Revision: https://reviews.freebsd.org/D3687
Sponsored by: Mellanox Technologies
MFC after: 3 months
Modified:
head/lib/libc/sys/getsockopt.2
head/sbin/ifconfig/ifconfig.8
head/sbin/ifconfig/ifconfig.c
head/sys/conf/NOTES
head/sys/conf/config.mk
head/sys/conf/kern.opts.mk
head/sys/conf/options
head/sys/kern/uipc_socket.c
head/sys/modules/if_lagg/Makefile
head/sys/modules/if_vlan/Makefile
head/sys/net/ieee8023ad_lacp.c
head/sys/net/ieee8023ad_lacp.h
head/sys/net/if.h
head/sys/net/if_dead.c
head/sys/net/if_lagg.c
head/sys/net/if_var.h
head/sys/net/if_vlan.c
head/sys/netinet/in_pcb.c
head/sys/netinet/in_pcb.h
head/sys/netinet/ip_output.c
head/sys/netinet6/ip6_output.c
head/sys/sys/mbuf.h
head/sys/sys/socket.h
head/sys/sys/socketvar.h
Modified: head/lib/libc/sys/getsockopt.2
==============================================================================
--- head/lib/libc/sys/getsockopt.2 Wed Jan 18 13:27:24 2017 (r312378)
+++ head/lib/libc/sys/getsockopt.2 Wed Jan 18 13:31:17 2017 (r312379)
@@ -28,7 +28,7 @@
.\" @(#)getsockopt.2 8.4 (Berkeley) 5/2/95
.\" $FreeBSD$
.\"
-.Dd April 5, 2013
+.Dd January 18, 2017
.Dt GETSOCKOPT 2
.Os
.Sh NAME
@@ -188,6 +188,7 @@ The following options are recognized in
.It Dv SO_LISTENINCQLEN Ta "get incomplete queue length of the socket (get only)"
.It Dv SO_USER_COOKIE Ta "set the 'so_user_cookie' value for the socket (uint32_t, set only)"
.It Dv SO_TS_CLOCK Ta "set specific format of timestamp returned by SO_TIMESTAMP"
+.It Dv SO_MAX_PACING_RATE Ta "set the maximum transmit rate in bytes per second for the socket"
.El
.Pp
.Dv SO_DEBUG
@@ -515,6 +516,10 @@ returns the maximal number of queued con
returns the number of unaccepted complete connections.
.Dv SO_LISTENINCQLEN
returns the number of unaccepted incomplete connections.
+.Pp
+.Dv SO_MAX_PACING_RATE
+instructs the socket and underlying network adapter layers to limit the
+transfer rate to the given unsigned 32-bit value in bytes per second.
.Sh RETURN VALUES
.Rv -std
.Sh ERRORS
Modified: head/sbin/ifconfig/ifconfig.8
==============================================================================
--- head/sbin/ifconfig/ifconfig.8 Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sbin/ifconfig/ifconfig.8 Wed Jan 18 13:31:17 2017 (r312379)
@@ -28,7 +28,7 @@
.\" From: @(#)ifconfig.8 8.3 (Berkeley) 1/5/94
.\" $FreeBSD$
.\"
-.Dd September 17, 2016
+.Dd January 18, 2017
.Dt IFCONFIG 8
.Os
.Sh NAME
@@ -460,6 +460,8 @@ this directive is used to select between
and 802.11g
.Pq Cm 11g
operating modes.
+.It Cm txrtlmt
+Set if the driver supports TX rate limiting.
.It Cm inst Ar minst , Cm instance Ar minst
Set the media instance to
.Ar minst .
Modified: head/sbin/ifconfig/ifconfig.c
==============================================================================
--- head/sbin/ifconfig/ifconfig.c Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sbin/ifconfig/ifconfig.c Wed Jan 18 13:31:17 2017 (r312379)
@@ -1145,7 +1145,7 @@ unsetifdescr(const char *val, int value,
"\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \
"\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \
"\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \
-"\26RXCSUM_IPV6\27TXCSUM_IPV6"
+"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT"
/*
* Print the status of the interface. If an address family was
@@ -1453,6 +1453,8 @@ static struct cmd basic_cmds[] = {
DEF_CMD("-wol_mcast", -IFCAP_WOL_MCAST, setifcap),
DEF_CMD("wol_magic", IFCAP_WOL_MAGIC, setifcap),
DEF_CMD("-wol_magic", -IFCAP_WOL_MAGIC, setifcap),
+ DEF_CMD("txrtlmt", IFCAP_TXRTLMT, setifcap),
+ DEF_CMD("-txrtlmt", -IFCAP_TXRTLMT, setifcap),
DEF_CMD("normal", -IFF_LINK0, setifflags),
DEF_CMD("compress", IFF_LINK0, setifflags),
DEF_CMD("noicmp", IFF_LINK1, setifflags),
Modified: head/sys/conf/NOTES
==============================================================================
--- head/sys/conf/NOTES Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sys/conf/NOTES Wed Jan 18 13:31:17 2017 (r312379)
@@ -619,6 +619,8 @@ options HWPMC_HOOKS # Other necessary
options INET #Internet communications protocols
options INET6 #IPv6 communications protocols
+options RATELIMIT # TX rate limiting support
+
options ROUTETABLES=2 # allocated fibs up to 65536. default is 1.
# but that would be a bad idea as they are large.
Modified: head/sys/conf/config.mk
==============================================================================
--- head/sys/conf/config.mk Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sys/conf/config.mk Wed Jan 18 13:31:17 2017 (r312379)
@@ -19,6 +19,10 @@ opt_inet.h:
opt_inet6.h:
@echo "#define INET6 1" > ${.TARGET}
.endif
+.if ${MK_RATELIMIT} != "no"
+opt_ratelimit.h:
+ @echo "#define RATELIMIT 1" > ${.TARGET}
+.endif
.if ${MK_EISA} != "no"
opt_eisa.h:
@echo "#define DEV_EISA 1" > ${.TARGET}
Modified: head/sys/conf/kern.opts.mk
==============================================================================
--- head/sys/conf/kern.opts.mk Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sys/conf/kern.opts.mk Wed Jan 18 13:31:17 2017 (r312379)
@@ -48,6 +48,7 @@ __DEFAULT_NO_OPTIONS = \
EXTRA_TCP_STACKS \
NAND \
OFED \
+ RATELIMIT \
REPRODUCIBLE_BUILD
# Some options are totally broken on some architectures. We disable
Modified: head/sys/conf/options
==============================================================================
--- head/sys/conf/options Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sys/conf/options Wed Jan 18 13:31:17 2017 (r312379)
@@ -412,6 +412,7 @@ BOOTP_NFSV3 opt_bootp.h
BOOTP_WIRED_TO opt_bootp.h
DEVICE_POLLING
DUMMYNET opt_ipdn.h
+RATELIMIT opt_ratelimit.h
INET opt_inet.h
INET6 opt_inet6.h
IPDIVERT
Modified: head/sys/kern/uipc_socket.c
==============================================================================
--- head/sys/kern/uipc_socket.c Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sys/kern/uipc_socket.c Wed Jan 18 13:31:17 2017 (r312379)
@@ -2699,6 +2699,14 @@ sosetopt(struct socket *so, struct socko
so->so_ts_clock = optval;
break;
+ case SO_MAX_PACING_RATE:
+ error = sooptcopyin(sopt, &val32, sizeof(val32),
+ sizeof(val32));
+ if (error)
+ goto bad;
+ so->so_max_pacing_rate = val32;
+ break;
+
default:
if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
error = hhook_run_socket(so, sopt,
@@ -2890,6 +2898,10 @@ integer:
optval = so->so_ts_clock;
goto integer;
+ case SO_MAX_PACING_RATE:
+ optval = so->so_max_pacing_rate;
+ goto integer;
+
default:
if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
error = hhook_run_socket(so, sopt,
Modified: head/sys/modules/if_lagg/Makefile
==============================================================================
--- head/sys/modules/if_lagg/Makefile Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sys/modules/if_lagg/Makefile Wed Jan 18 13:31:17 2017 (r312379)
@@ -2,6 +2,6 @@
.PATH: ${.CURDIR}/../../net
KMOD= if_lagg
-SRCS= if_lagg.c ieee8023ad_lacp.c opt_inet.h opt_inet6.h
+SRCS= if_lagg.c ieee8023ad_lacp.c opt_inet.h opt_inet6.h opt_ratelimit.h
.include <bsd.kmod.mk>
Modified: head/sys/modules/if_vlan/Makefile
==============================================================================
--- head/sys/modules/if_vlan/Makefile Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sys/modules/if_vlan/Makefile Wed Jan 18 13:31:17 2017 (r312379)
@@ -4,6 +4,6 @@
KMOD= if_vlan
SRCS= if_vlan.c
-SRCS+= opt_inet.h opt_vlan.h
+SRCS+= opt_inet.h opt_vlan.h opt_ratelimit.h
.include <bsd.kmod.mk>
Modified: head/sys/net/ieee8023ad_lacp.c
==============================================================================
--- head/sys/net/ieee8023ad_lacp.c Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sys/net/ieee8023ad_lacp.c Wed Jan 18 13:31:17 2017 (r312379)
@@ -30,6 +30,8 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include "opt_ratelimit.h"
+
#include <sys/param.h>
#include <sys/callout.h>
#include <sys/eventhandler.h>
@@ -853,6 +855,35 @@ lacp_select_tx_port(struct lagg_softc *s
return (lp->lp_lagg);
}
+
+#ifdef RATELIMIT
+/*
+ * lacp_select_tx_port_by_hash: choose the transmit port for a flow
+ * using only its flow ID, without needing an mbuf.
+ *
+ * Returns NULL while distribution is suppressed or when the active
+ * port map is empty; otherwise returns the lagg port that the shifted
+ * flow ID maps to in the active port map.
+ */
+struct lagg_port *
+lacp_select_tx_port_by_hash(struct lagg_softc *sc, uint32_t flowid)
+{
+	struct lacp_softc *lacp;
+	struct lacp_portmap *map;
+	uint32_t slot;
+
+	lacp = LACP_SOFTC(sc);
+	if (__predict_false(lacp->lsc_suppress_distributing)) {
+		LACP_DPRINTF((NULL, "%s: waiting transit\n", __func__));
+		return (NULL);
+	}
+
+	map = &lacp->lsc_pmap[lacp->lsc_activemap];
+	if (map->pm_count == 0) {
+		LACP_DPRINTF((NULL, "%s: no active aggregator\n", __func__));
+		return (NULL);
+	}
+
+	/* reduce the shifted flow ID to an index into the port map */
+	slot = flowid >> sc->flowid_shift;
+	slot %= map->pm_count;
+
+	return (map->pm_map[slot]->lp_lagg);
+}
+#endif
+
/*
* lacp_suppress_distributing: drop transmit packets for a while
* to preserve packet ordering.
Modified: head/sys/net/ieee8023ad_lacp.h
==============================================================================
--- head/sys/net/ieee8023ad_lacp.h Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sys/net/ieee8023ad_lacp.h Wed Jan 18 13:31:17 2017 (r312379)
@@ -284,6 +284,9 @@ struct lacp_softc {
struct mbuf *lacp_input(struct lagg_port *, struct mbuf *);
struct lagg_port *lacp_select_tx_port(struct lagg_softc *, struct mbuf *);
+#ifdef RATELIMIT
+struct lagg_port *lacp_select_tx_port_by_hash(struct lagg_softc *, uint32_t);
+#endif
void lacp_attach(struct lagg_softc *);
void lacp_detach(void *);
void lacp_init(struct lagg_softc *);
Modified: head/sys/net/if.h
==============================================================================
--- head/sys/net/if.h Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sys/net/if.h Wed Jan 18 13:31:17 2017 (r312379)
@@ -239,6 +239,7 @@ struct if_data {
#define IFCAP_RXCSUM_IPV6 0x200000 /* can offload checksum on IPv6 RX */
#define IFCAP_TXCSUM_IPV6 0x400000 /* can offload checksum on IPv6 TX */
#define IFCAP_HWSTATS 0x800000 /* manages counters internally */
+#define IFCAP_TXRTLMT 0x1000000 /* hardware supports TX rate limiting */
#define IFCAP_HWCSUM_IPV6 (IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6)
Modified: head/sys/net/if_dead.c
==============================================================================
--- head/sys/net/if_dead.c Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sys/net/if_dead.c Wed Jan 18 13:31:17 2017 (r312379)
@@ -100,6 +100,30 @@ ifdead_get_counter(struct ifnet *ifp, if
return (0);
}
+/*
+ * Send tag methods installed by if_dead(): allocation, modification
+ * and query all fail with EOPNOTSUPP, and freeing a tag does nothing.
+ */
+static int
+ifdead_snd_tag_alloc(struct ifnet *ifp,
+    union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt)
+{
+
+	return (EOPNOTSUPP);
+}
+
+static int
+ifdead_snd_tag_modify(struct m_snd_tag *pmt,
+    union if_snd_tag_modify_params *params)
+{
+
+	return (EOPNOTSUPP);
+}
+
+static int
+ifdead_snd_tag_query(struct m_snd_tag *pmt,
+    union if_snd_tag_query_params *params)
+{
+
+	return (EOPNOTSUPP);
+}
+
+static void
+ifdead_snd_tag_free(struct m_snd_tag *pmt)
+{
+
+	/* nothing to release */
+}
+
void
if_dead(struct ifnet *ifp)
{
@@ -112,4 +136,8 @@ if_dead(struct ifnet *ifp)
ifp->if_qflush = ifdead_qflush;
ifp->if_transmit = ifdead_transmit;
ifp->if_get_counter = ifdead_get_counter;
+ ifp->if_snd_tag_alloc = ifdead_snd_tag_alloc;
+ ifp->if_snd_tag_modify = ifdead_snd_tag_modify;
+ ifp->if_snd_tag_query = ifdead_snd_tag_query;
+ ifp->if_snd_tag_free = ifdead_snd_tag_free;
}
Modified: head/sys/net/if_lagg.c
==============================================================================
--- head/sys/net/if_lagg.c Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sys/net/if_lagg.c Wed Jan 18 13:31:17 2017 (r312379)
@@ -23,6 +23,7 @@ __FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
+#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/kernel.h>
@@ -118,6 +119,11 @@ static void lagg_port2req(struct lagg_po
static void lagg_init(void *);
static void lagg_stop(struct lagg_softc *);
static int lagg_ioctl(struct ifnet *, u_long, caddr_t);
+#ifdef RATELIMIT
+static int lagg_snd_tag_alloc(struct ifnet *,
+ union if_snd_tag_alloc_params *,
+ struct m_snd_tag **);
+#endif
static int lagg_ether_setmulti(struct lagg_softc *);
static int lagg_ether_cmdmulti(struct lagg_port *, int);
static int lagg_setflag(struct lagg_port *, int, int,
@@ -503,7 +509,12 @@ lagg_clone_create(struct if_clone *ifc,
ifp->if_ioctl = lagg_ioctl;
ifp->if_get_counter = lagg_get_counter;
ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST;
+#ifdef RATELIMIT
+ ifp->if_snd_tag_alloc = lagg_snd_tag_alloc;
+ ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS | IFCAP_TXRTLMT;
+#else
ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
+#endif
/*
* Attach as an ordinary ethernet device, children will be attached
@@ -1549,6 +1560,52 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd
return (error);
}
+#ifdef RATELIMIT
+/*
+ * Forward a send tag allocation request to the lagg member port that
+ * would carry the flow described by "params".
+ *
+ * The member port is chosen according to the lagg protocol:
+ *  - FAILOVER:    the active link, starting from the primary port.
+ *  - LOADBALANCE: the port indexed by the shifted flow ID.
+ *  - LACP:        the port picked by lacp_select_tx_port_by_hash().
+ * The flow-ID based protocols require LAGG_OPT_USE_FLOWID and a valid
+ * flow type in the request.
+ *
+ * Returns EOPNOTSUPP when no suitable port exists or the selected port
+ * does not have TX rate limiting enabled, otherwise the result of the
+ * port's if_snd_tag_alloc() method.
+ *
+ * NOTE(review): the softc and its port table are read here without any
+ * lagg lock being taken in this function - confirm that callers
+ * serialize against port add/remove.
+ */
+static int
+lagg_snd_tag_alloc(struct ifnet *ifp,
+    union if_snd_tag_alloc_params *params,
+    struct m_snd_tag **ppmt)
+{
+	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
+	struct lagg_port *lp;
+	struct lagg_lb *lb;
+	uint32_t p;
+
+	switch (sc->sc_proto) {
+	case LAGG_PROTO_FAILOVER:
+		lp = lagg_link_active(sc, sc->sc_primary);
+		break;
+	case LAGG_PROTO_LOADBALANCE:
+		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
+		    params->hdr.flowtype == M_HASHTYPE_NONE)
+			return (EOPNOTSUPP);
+		/* a lagg without member ports would divide by zero below */
+		if (sc->sc_count == 0)
+			return (EOPNOTSUPP);
+		p = params->hdr.flowid >> sc->flowid_shift;
+		p %= sc->sc_count;
+		lb = (struct lagg_lb *)sc->sc_psc;
+		lp = lb->lb_ports[p];
+		lp = lagg_link_active(sc, lp);
+		break;
+	case LAGG_PROTO_LACP:
+		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
+		    params->hdr.flowtype == M_HASHTYPE_NONE)
+			return (EOPNOTSUPP);
+		lp = lacp_select_tx_port_by_hash(sc, params->hdr.flowid);
+		break;
+	default:
+		return (EOPNOTSUPP);
+	}
+	if (lp == NULL)
+		return (EOPNOTSUPP);
+	ifp = lp->lp_ifp;
+	if (ifp == NULL || ifp->if_snd_tag_alloc == NULL ||
+	    (ifp->if_capenable & IFCAP_TXRTLMT) == 0)
+		return (EOPNOTSUPP);
+
+	/* forward allocation request */
+	return (ifp->if_snd_tag_alloc(ifp, params, ppmt));
+}
+#endif
+
static int
lagg_ether_setmulti(struct lagg_softc *sc)
{
Modified: head/sys/net/if_var.h
==============================================================================
--- head/sys/net/if_var.h Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sys/net/if_var.h Wed Jan 18 13:31:17 2017 (r312379)
@@ -175,6 +175,49 @@ struct if_encap_req {
#define IFENCAP_FLAG_BROADCAST 0x02 /* Destination is broadcast */
+/*
+ * Network interface send tag support. The storage of "struct
+ * m_snd_tag" comes from the network driver and it is free to allocate
+ * as much additional space as it wants for its own use.
+ *
+ * A send tag is requested with a "union if_snd_tag_alloc_params".
+ * Every member of that union begins with the common
+ * "struct if_snd_tag_alloc_header", which carries the tag type and the
+ * flow ID and flow type of the requesting flow. An existing tag is
+ * changed or inspected through the modify and query parameter unions,
+ * and released through the free method.
+ */
+struct m_snd_tag;
+
+#define IF_SND_TAG_TYPE_RATE_LIMIT 0 /* rate limited send tag */
+#define IF_SND_TAG_TYPE_MAX 1 /* presumably the number of tag types - TODO confirm */
+
+struct if_snd_tag_alloc_header {
+ uint32_t type; /* send tag type, see IF_SND_TAG_XXX */
+ uint32_t flowid; /* mbuf hash value */
+ uint32_t flowtype; /* mbuf hash type */
+};
+
+struct if_snd_tag_alloc_rate_limit {
+ struct if_snd_tag_alloc_header hdr; /* first member, overlays the union's "hdr" */
+ uint64_t max_rate; /* in bytes/s */
+};
+
+struct if_snd_tag_rate_limit_params {
+ uint64_t max_rate; /* in bytes/s */
+};
+
+union if_snd_tag_alloc_params {
+ struct if_snd_tag_alloc_header hdr; /* common view, valid for every tag type */
+ struct if_snd_tag_alloc_rate_limit rate_limit; /* IF_SND_TAG_TYPE_RATE_LIMIT request */
+};
+
+union if_snd_tag_modify_params {
+ struct if_snd_tag_rate_limit_params rate_limit; /* new rate to apply */
+};
+
+union if_snd_tag_query_params {
+ struct if_snd_tag_rate_limit_params rate_limit; /* filled in with the current rate */
+};
+
+typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *,
+ struct m_snd_tag **); /* allocate a tag; the new tag is returned via the last argument */
+typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *); /* change an existing tag */
+typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *); /* read back a tag's parameters */
+typedef void (if_snd_tag_free_t)(struct m_snd_tag *); /* release a tag */
/*
* Structure defining a network interface.
@@ -304,12 +347,19 @@ struct ifnet {
u_int if_hw_tsomaxsegsize; /* TSO maximum segment size in bytes */
/*
+ * Network adapter send tag support:
+ */
+ if_snd_tag_alloc_t *if_snd_tag_alloc;
+ if_snd_tag_modify_t *if_snd_tag_modify;
+ if_snd_tag_query_t *if_snd_tag_query;
+ if_snd_tag_free_t *if_snd_tag_free;
+
+ /*
* Spare fields to be added before branching a stable branch, so
* that structure can be enhanced without changing the kernel
* binary interface.
*/
- void *if_pspare[4]; /* packet pacing / general use */
- int if_ispare[4]; /* packet pacing / general use */
+ int if_ispare[4]; /* general use */
};
/* for compatibility with other BSDs */
Modified: head/sys/net/if_vlan.c
==============================================================================
--- head/sys/net/if_vlan.c Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sys/net/if_vlan.c Wed Jan 18 13:31:17 2017 (r312379)
@@ -46,6 +46,7 @@ __FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_vlan.h"
+#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/eventhandler.h>
@@ -212,6 +213,10 @@ static void trunk_destroy(struct ifvlant
static void vlan_init(void *foo);
static void vlan_input(struct ifnet *ifp, struct mbuf *m);
static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr);
+#ifdef RATELIMIT
+static int vlan_snd_tag_alloc(struct ifnet *,
+ union if_snd_tag_alloc_params *, struct m_snd_tag **);
+#endif
static void vlan_qflush(struct ifnet *ifp);
static int vlan_setflag(struct ifnet *ifp, int flag, int status,
int (*func)(struct ifnet *, int));
@@ -971,6 +976,9 @@ vlan_clone_create(struct if_clone *ifc,
ifp->if_transmit = vlan_transmit;
ifp->if_qflush = vlan_qflush;
ifp->if_ioctl = vlan_ioctl;
+#ifdef RATELIMIT
+ ifp->if_snd_tag_alloc = vlan_snd_tag_alloc;
+#endif
ifp->if_flags = VLAN_IFFLAGS;
ether_ifattach(ifp, eaddr);
/* Now undo some of the damage... */
@@ -1591,6 +1599,15 @@ vlan_capabilities(struct ifvlan *ifv)
TOEDEV(ifp) = TOEDEV(p);
ifp->if_capenable |= p->if_capenable & IFCAP_TOE;
}
+
+#ifdef RATELIMIT
+ /*
+ * If the parent interface supports ratelimiting, so does the
+ * VLAN interface.
+ */
+ ifp->if_capabilities |= (p->if_capabilities & IFCAP_TXRTLMT);
+ ifp->if_capenable |= (p->if_capenable & IFCAP_TXRTLMT);
+#endif
}
static void
@@ -1801,3 +1818,19 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd
return (error);
}
+
+#ifdef RATELIMIT
+/*
+ * Forward a send tag allocation request from a VLAN interface to its
+ * trunk (parent) device.
+ *
+ * Returns EOPNOTSUPP when the VLAN has no trunk device, or when the
+ * trunk device has no if_snd_tag_alloc() method or does not have
+ * IFCAP_TXRTLMT enabled; otherwise returns the result of the trunk
+ * device's if_snd_tag_alloc() method.
+ */
+static int
+vlan_snd_tag_alloc(struct ifnet *ifp,
+    union if_snd_tag_alloc_params *params,
+    struct m_snd_tag **ppmt)
+{
+
+	/* get trunk device */
+	ifp = vlan_trunkdev(ifp);
+	/*
+	 * Check the method pointer as well as the capability bit, the
+	 * same way lagg_snd_tag_alloc() does, so a trunk that has the
+	 * capability enabled but no method is not called through NULL.
+	 */
+	if (ifp == NULL || ifp->if_snd_tag_alloc == NULL ||
+	    (ifp->if_capenable & IFCAP_TXRTLMT) == 0)
+		return (EOPNOTSUPP);
+	/* forward allocation request */
+	return (ifp->if_snd_tag_alloc(ifp, params, ppmt));
+}
+#endif
Modified: head/sys/netinet/in_pcb.c
==============================================================================
--- head/sys/netinet/in_pcb.c Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sys/netinet/in_pcb.c Wed Jan 18 13:31:17 2017 (r312379)
@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
#include "opt_ipsec.h"
#include "opt_inet.h"
#include "opt_inet6.h"
+#include "opt_ratelimit.h"
#include "opt_pcbgroup.h"
#include "opt_rss.h"
@@ -57,6 +58,7 @@ __FBSDID("$FreeBSD$");
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
+#include <sys/sockio.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
@@ -1140,6 +1142,10 @@ in_pcbdetach(struct inpcb *inp)
KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
+#ifdef RATELIMIT
+ if (inp->inp_snd_tag != NULL)
+ in_pcbdetach_txrtlmt(inp);
+#endif
inp->inp_socket->so_pcb = NULL;
inp->inp_socket = NULL;
}
@@ -2677,3 +2683,253 @@ DB_SHOW_COMMAND(inpcb, db_show_inpcb)
db_print_inpcb(inp, "inpcb", 0);
}
#endif /* DDB */
+
+#ifdef RATELIMIT
+/*
+ * Modify TX rate limit based on the existing "inp->inp_snd_tag",
+ * if any.
+ *
+ * Returns EINVAL when the PCB has no send tag or the tag has no
+ * network interface, EOPNOTSUPP when the interface has no
+ * if_snd_tag_modify() method, otherwise the result of that method.
+ */
+int
+in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
+{
+	union if_snd_tag_modify_params params = {
+		.rate_limit.max_rate = max_pacing_rate,
+	};
+	struct m_snd_tag *mst;
+	struct ifnet *ifp;
+	int error;
+
+	mst = inp->inp_snd_tag;
+	if (mst == NULL)
+		return (EINVAL);
+
+	ifp = mst->ifp;
+	if (ifp == NULL)
+		return (EINVAL);
+
+	if (ifp->if_snd_tag_modify == NULL) {
+		error = EOPNOTSUPP;
+	} else {
+		error = ifp->if_snd_tag_modify(mst, &params);
+	}
+	return (error);
+}
+
+/*
+ * Query existing TX rate limit based on the existing
+ * "inp->inp_snd_tag", if any.
+ *
+ * On success the current rate is stored through "p_max_pacing_rate"
+ * when that pointer is not NULL. The interface reports the rate as a
+ * 64-bit value; it is narrowed to 32 bits when stored.
+ *
+ * Returns EINVAL when the PCB has no send tag or the tag has no
+ * network interface, EOPNOTSUPP when the interface has no
+ * if_snd_tag_query() method, otherwise the result of that method.
+ */
+int
+in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
+{
+	union if_snd_tag_query_params params = { };
+	struct m_snd_tag *mst;
+	struct ifnet *ifp;
+	int error;
+
+	mst = inp->inp_snd_tag;
+	if (mst == NULL)
+		return (EINVAL);
+
+	ifp = mst->ifp;
+	if (ifp == NULL)
+		return (EINVAL);
+
+	if (ifp->if_snd_tag_query == NULL) {
+		error = EOPNOTSUPP;
+	} else {
+		error = ifp->if_snd_tag_query(mst, &params);
+		if (error == 0 && p_max_pacing_rate != NULL)
+			*p_max_pacing_rate = params.rate_limit.max_rate;
+	}
+	return (error);
+}
+
+/*
+ * Allocate a new TX rate limit send tag from the network interface
+ * given by the "ifp" argument and save it in "inp->inp_snd_tag":
+ *
+ * The PCB must be write locked. The flow type and flow ID are passed
+ * to the interface in the allocation request header together with the
+ * requested rate.
+ *
+ * Returns EINVAL when the PCB already has a send tag, EOPNOTSUPP when
+ * the interface has no if_snd_tag_alloc() method, otherwise the result
+ * of that method.
+ */
+int
+in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
+    uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate)
+{
+	union if_snd_tag_alloc_params params = {
+		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
+		.rate_limit.hdr.flowid = flowid,
+		.rate_limit.hdr.flowtype = flowtype,
+		.rate_limit.max_rate = max_pacing_rate,
+	};
+	int error;
+
+	INP_WLOCK_ASSERT(inp);
+
+	if (inp->inp_snd_tag != NULL)
+		return (EINVAL);
+
+	if (ifp->if_snd_tag_alloc == NULL) {
+		error = EOPNOTSUPP;
+	} else {
+		error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag);
+
+		/*
+		 * At success increment the refcount on
+		 * the send tag's network interface:
+		 */
+		if (error == 0)
+			if_ref(inp->inp_snd_tag->ifp);
+	}
+	return (error);
+}
+
+/*
+ * Release the TX rate limit send tag held in "inp->inp_snd_tag", if
+ * there is one. The PCB must be write locked. The PCB's tag pointer
+ * is always cleared, even when the tag has no network interface.
+ */
+void
+in_pcbdetach_txrtlmt(struct inpcb *inp)
+{
+	struct m_snd_tag *tag;
+	struct ifnet *owner;
+
+	INP_WLOCK_ASSERT(inp);
+
+	/* take the tag away from the PCB before looking at it */
+	tag = inp->inp_snd_tag;
+	inp->inp_snd_tag = NULL;
+	if (tag == NULL)
+		return;
+
+	owner = tag->ifp;
+	if (owner != NULL) {
+		/*
+		 * If the device was detached while we still had
+		 * reference(s) on the ifp, we assume if_snd_tag_free()
+		 * was replaced with stubs.
+		 */
+		owner->if_snd_tag_free(tag);
+
+		/* drop the interface reference held for the tag */
+		if_rele(owner);
+	}
+}
+
+/*
+ * Bring the PCB's TX rate limit send tag in line with the socket's
+ * so_max_pacing_rate value. Intended to be called from the transmit
+ * fast path when INP_RATE_LIMIT_CHANGED is set; depending on the
+ * current state the send tag is attached, detached or modified.
+ *
+ * The flag is cleared when the operation succeeded or is unsupported;
+ * on any other error it stays set so that a later transmit retries.
+ */
+void
+in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
+{
+	struct socket *so;
+	uint32_t rate;
+	bool upgraded;
+	int error;
+
+	if (inp == NULL)
+		return;
+
+	so = inp->inp_socket;
+	if (so == NULL)
+		return;
+
+	/*
+	 * NOTE: If the write locking fails, we need to bail out and
+	 * use the non-ratelimited ring for the transmit until there
+	 * is a new chance to get the write lock.
+	 */
+	upgraded = !INP_WLOCKED(inp);
+	if (upgraded && !INP_TRY_UPGRADE(inp))
+		return;
+
+	/*
+	 * NOTE: The so_max_pacing_rate value is read unlocked,
+	 * because atomic updates are not required since the variable
+	 * is checked at every mbuf we send. It is assumed that the
+	 * variable read itself will be atomic.
+	 */
+	rate = so->so_max_pacing_rate;
+
+	/*
+	 * NOTE: When attaching to a network interface a reference is
+	 * made to ensure the network interface doesn't go away until
+	 * all ratelimit connections are gone. The network interface
+	 * pointers compared below represent valid network interfaces,
+	 * except when comparing towards NULL.
+	 */
+	error = 0;
+	if (rate == 0 && inp->inp_snd_tag == NULL) {
+		/* no limit requested and none installed: nothing to do */
+	} else if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0) {
+		/* interface cannot rate limit: drop any existing tag */
+		if (inp->inp_snd_tag != NULL)
+			in_pcbdetach_txrtlmt(inp);
+	} else if (inp->inp_snd_tag != NULL) {
+		error = in_pcbmodify_txrtlmt(inp, rate);
+	} else if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
+		/*
+		 * In order to utilize packet pacing with RSS, we need
+		 * to wait until there is a valid RSS hash before we
+		 * can proceed:
+		 */
+		error = EAGAIN;
+	} else {
+		error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
+		    mb->m_pkthdr.flowid, rate);
+	}
+
+	switch (error) {
+	case 0:
+	case EOPNOTSUPP:
+		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
+		break;
+	default:
+		break;
+	}
+
+	if (upgraded)
+		INP_DOWNGRADE(inp);
+}
+
+/*
+ * Track route changes for TX rate limiting: drop the PCB's current
+ * send tag and set INP_RATE_LIMIT_CHANGED so that a new tag is
+ * requested on a later transmit. Does nothing when the PCB is NULL,
+ * has no socket or holds no send tag.
+ */
+void
+in_pcboutput_eagain(struct inpcb *inp)
+{
+	bool upgraded;
+
+	if (inp == NULL || inp->inp_socket == NULL ||
+	    inp->inp_snd_tag == NULL)
+		return;
+
+	/*
+	 * NOTE: If the write locking fails, we need to bail out and
+	 * use the non-ratelimited ring for the transmit until there
+	 * is a new chance to get the write lock.
+	 */
+	upgraded = !INP_WLOCKED(inp);
+	if (upgraded && !INP_TRY_UPGRADE(inp))
+		return;
+
+	/* detach rate limiting */
+	in_pcbdetach_txrtlmt(inp);
+
+	/* make sure new mbuf send tag allocation is made */
+	inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
+
+	if (upgraded)
+		INP_DOWNGRADE(inp);
+}
+#endif /* RATELIMIT */
Modified: head/sys/netinet/in_pcb.h
==============================================================================
--- head/sys/netinet/in_pcb.h Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sys/netinet/in_pcb.h Wed Jan 18 13:31:17 2017 (r312379)
@@ -181,6 +181,7 @@ struct icmp6_filter;
* read-lock usage during modification, this model can be applied to other
* protocols (especially SCTP).
*/
+struct m_snd_tag;
struct inpcb {
LIST_ENTRY(inpcb) inp_hash; /* (h/i) hash list */
LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */
@@ -202,11 +203,11 @@ struct inpcb {
u_char inp_ip_minttl; /* (i) minimum TTL or drop */
uint32_t inp_flowid; /* (x) flow id / queue id */
u_int inp_refcount; /* (i) refcount */
- void *inp_pspare[5]; /* (x) packet pacing / general use */
+ struct m_snd_tag *inp_snd_tag; /* (i) send tag for outgoing mbufs */
+ void *inp_pspare[4]; /* (x) general use */
uint32_t inp_flowtype; /* (x) M_HASHTYPE value */
uint32_t inp_rss_listen_bucket; /* (x) overridden RSS listen bucket */
- u_int inp_ispare[4]; /* (x) packet pacing / user cookie /
- * general use */
+ u_int inp_ispare[4]; /* (x) user cookie / general use */
/* Local and foreign ports, local and foreign addr. */
struct in_conninfo inp_inc; /* (i) list for PCB's local port */
@@ -616,6 +617,7 @@ short inp_so_options(const struct inpcb
#define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
#define INP_RECVFLOWID 0x00000100 /* populate recv datagram with flow info */
#define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */
+#define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */
/*
* Flags passed to in_pcblookup*() functions.
@@ -736,6 +738,14 @@ int in_getsockaddr(struct socket *so, st
struct sockaddr *
in_sockaddr(in_port_t port, struct in_addr *addr);
void in_pcbsosetlabel(struct socket *so);
+#ifdef RATELIMIT
+int in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t, uint32_t);
+void in_pcbdetach_txrtlmt(struct inpcb *);
+int in_pcbmodify_txrtlmt(struct inpcb *, uint32_t);
+int in_pcbquery_txrtlmt(struct inpcb *, uint32_t *);
+void in_pcboutput_txrtlmt(struct inpcb *, struct ifnet *, struct mbuf *);
+void in_pcboutput_eagain(struct inpcb *);
+#endif
#endif /* _KERNEL */
#endif /* !_NETINET_IN_PCB_H_ */
Modified: head/sys/netinet/ip_output.c
==============================================================================
--- head/sys/netinet/ip_output.c Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sys/netinet/ip_output.c Wed Jan 18 13:31:17 2017 (r312379)
@@ -33,6 +33,7 @@
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
+#include "opt_ratelimit.h"
#include "opt_ipsec.h"
#include "opt_mbuf_stress_test.h"
#include "opt_mpath.h"
@@ -661,8 +662,23 @@ sendit:
*/
m_clrprotoflags(m);
IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
+#ifdef RATELIMIT
+ if (inp != NULL) {
+ if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
+ in_pcboutput_txrtlmt(inp, ifp, m);
+ /* stamp send tag on mbuf */
+ m->m_pkthdr.snd_tag = inp->inp_snd_tag;
+ } else {
+ m->m_pkthdr.snd_tag = NULL;
+ }
+#endif
error = (*ifp->if_output)(ifp, m,
(const struct sockaddr *)gw, ro);
+#ifdef RATELIMIT
+ /* check for route change */
+ if (error == EAGAIN)
+ in_pcboutput_eagain(inp);
+#endif
goto done;
}
@@ -698,8 +714,23 @@ sendit:
IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp,
mtod(m, struct ip *), NULL);
+#ifdef RATELIMIT
+ if (inp != NULL) {
+ if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
+ in_pcboutput_txrtlmt(inp, ifp, m);
+ /* stamp send tag on mbuf */
+ m->m_pkthdr.snd_tag = inp->inp_snd_tag;
+ } else {
+ m->m_pkthdr.snd_tag = NULL;
+ }
+#endif
error = (*ifp->if_output)(ifp, m,
(const struct sockaddr *)gw, ro);
+#ifdef RATELIMIT
+ /* check for route change */
+ if (error == EAGAIN)
+ in_pcboutput_eagain(inp);
+#endif
} else
m_freem(m);
}
@@ -974,6 +1005,16 @@ ip_ctloutput(struct socket *so, struct s
INP_WUNLOCK(inp);
error = 0;
break;
+ case SO_MAX_PACING_RATE:
+#ifdef RATELIMIT
+ INP_WLOCK(inp);
+ inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
+ INP_WUNLOCK(inp);
+ error = 0;
+#else
+ error = EOPNOTSUPP;
+#endif
+ break;
default:
break;
}
Modified: head/sys/netinet6/ip6_output.c
==============================================================================
--- head/sys/netinet6/ip6_output.c Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sys/netinet6/ip6_output.c Wed Jan 18 13:31:17 2017 (r312379)
@@ -65,6 +65,7 @@ __FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
+#include "opt_ratelimit.h"
#include "opt_ipsec.h"
#include "opt_sctp.h"
#include "opt_route.h"
@@ -954,8 +955,23 @@ passout:
m->m_pkthdr.len);
ifa_free(&ia6->ia_ifa);
}
+#ifdef RATELIMIT
+ if (inp != NULL) {
+ if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
+ in_pcboutput_txrtlmt(inp, ifp, m);
+ /* stamp send tag on mbuf */
+ m->m_pkthdr.snd_tag = inp->inp_snd_tag;
+ } else {
+ m->m_pkthdr.snd_tag = NULL;
+ }
+#endif
error = nd6_output_ifp(ifp, origifp, m, dst,
(struct route *)ro);
+#ifdef RATELIMIT
+ /* check for route change */
+ if (error == EAGAIN)
+ in_pcboutput_eagain(inp);
+#endif
goto done;
}
@@ -1054,8 +1070,23 @@ sendorfree:
counter_u64_add(ia->ia_ifa.ifa_obytes,
m->m_pkthdr.len);
}
+#ifdef RATELIMIT
+ if (inp != NULL) {
+ if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
+ in_pcboutput_txrtlmt(inp, ifp, m);
+ /* stamp send tag on mbuf */
+ m->m_pkthdr.snd_tag = inp->inp_snd_tag;
+ } else {
+ m->m_pkthdr.snd_tag = NULL;
+ }
+#endif
error = nd6_output_ifp(ifp, origifp, m, dst,
(struct route *)ro);
+#ifdef RATELIMIT
+ /* check for route change */
+ if (error == EAGAIN)
+ in_pcboutput_eagain(inp);
+#endif
} else
m_freem(m);
}
@@ -1441,6 +1472,16 @@ ip6_ctloutput(struct socket *so, struct
INP_WUNLOCK(in6p);
error = 0;
break;
+ case SO_MAX_PACING_RATE:
+#ifdef RATELIMIT
+ INP_WLOCK(in6p);
+ in6p->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
+ INP_WUNLOCK(in6p);
+ error = 0;
+#else
+ error = EOPNOTSUPP;
+#endif
+ break;
default:
break;
}
Modified: head/sys/sys/mbuf.h
==============================================================================
--- head/sys/sys/mbuf.h Wed Jan 18 13:27:24 2017 (r312378)
+++ head/sys/sys/mbuf.h Wed Jan 18 13:31:17 2017 (r312379)
@@ -130,6 +130,14 @@ struct m_tag {
};
/*
+ * Static network interface owned tag.
+ * Allocated through ifp->if_snd_tag_alloc().
+ */
+struct m_snd_tag { /* storage is provided by the network driver that allocated the tag */
+ struct ifnet *ifp; /* network interface tag belongs to */
+};
+
+/*
* Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set.
* Size ILP32: 48
* LP64: 56
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-all
mailing list