git: 051e7d78b039 - main - Kernel-side infrastructure to implement nvlist-based set/get ifcaps

From: Konstantin Belousov <kib_at_FreeBSD.org>
Date: Tue, 24 May 2022 20:59:43 UTC
The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=051e7d78b03944d5910d4f7ad2f1fd6f2cfac382

commit 051e7d78b03944d5910d4f7ad2f1fd6f2cfac382
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2021-10-17 15:00:34 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2022-05-24 20:59:32 +0000

    Kernel-side infrastructure to implement nvlist-based set/get ifcaps
    
    Reviewed by:    hselasky, jhb, kp (previous version)
    Sponsored by:   NVIDIA Networking
    MFC after:      3 weeks
    Differential revision:  https://reviews.freebsd.org/D32551
---
 sys/net/if.c     | 178 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 sys/net/if.h     |  59 +++++++++++++++++-
 sys/sys/sockio.h |   3 +
 3 files changed, 236 insertions(+), 4 deletions(-)

diff --git a/sys/net/if.c b/sys/net/if.c
index bc0240035ea3..c50cc2d291e2 100644
--- a/sys/net/if.c
+++ b/sys/net/if.c
@@ -58,6 +58,7 @@
 #include <sys/lock.h>
 #include <sys/refcount.h>
 #include <sys/module.h>
+#include <sys/nv.h>
 #include <sys/rwlock.h>
 #include <sys/sockio.h>
 #include <sys/syslog.h>
@@ -2391,6 +2392,88 @@ ifr_data_get_ptr(void *ifrp)
 		return (ifrup->ifr.ifr_ifru.ifru_data);
 }
 
+struct ifcap_nv_bit_name {	/* one capability bit and its nvlist key */
+	int cap_bit;		/* IFCAP_* or IFCAP2_* bit value */
+	const char *cap_name;	/* nvlist key; NULL in a table's last entry */
+};
+#define CAPNV(x) {.cap_bit = IFCAP_##x, \
+    .cap_name = __CONCAT(IFCAP_, __CONCAT(x, _NAME)) } /* IFCAP_x with IFCAP_x_NAME */
+const struct ifcap_nv_bit_name ifcap_nv_bit_names[] = {	/* if_capabilities/if_capenable bits */
+	CAPNV(RXCSUM),
+	CAPNV(TXCSUM),
+	CAPNV(NETCONS),
+	CAPNV(VLAN_MTU),
+	CAPNV(VLAN_HWTAGGING),
+	CAPNV(JUMBO_MTU),
+	CAPNV(POLLING),
+	CAPNV(VLAN_HWCSUM),
+	CAPNV(TSO4),
+	CAPNV(TSO6),
+	CAPNV(LRO),
+	CAPNV(WOL_UCAST),
+	CAPNV(WOL_MCAST),
+	CAPNV(WOL_MAGIC),
+	CAPNV(TOE4),
+	CAPNV(TOE6),
+	CAPNV(VLAN_HWFILTER),
+	CAPNV(VLAN_HWTSO),
+	CAPNV(LINKSTATE),
+	CAPNV(NETMAP),
+	CAPNV(RXCSUM_IPV6),
+	CAPNV(TXCSUM_IPV6),
+	CAPNV(HWSTATS),
+	CAPNV(TXRTLMT),
+	CAPNV(HWRXTSTMP),
+	CAPNV(MEXTPG),
+	CAPNV(TXTLS4),
+	CAPNV(TXTLS6),
+	CAPNV(VXLAN_HWCSUM),
+	CAPNV(VXLAN_HWTSO),
+	CAPNV(TXTLS_RTLMT),
+	{0, NULL}	/* terminator: table walks stop at cap_name == NULL */
+};
+#define CAP2NV(x) {.cap_bit = IFCAP2_##x, \
+    .cap_name = __CONCAT(IFCAP2_, __CONCAT(x, _NAME)) } /* IFCAP2_x with IFCAP2_x_NAME */
+const struct ifcap_nv_bit_name ifcap2_nv_bit_names[] = {	/* if_capabilities2/if_capenable2 bits */
+	CAP2NV(RXTLS4),
+	CAP2NV(RXTLS6),
+	{0, NULL}	/* terminator: table walks stop at cap_name == NULL */
+};
+#undef CAPNV
+#undef CAP2NV
+/* Turn the bool keys of nv into a capability mask; bits of nn without a key come from *old_cap. */
+int
+if_capnv_to_capint(const nvlist_t *nv, int *old_cap,
+    const struct ifcap_nv_bit_name *nn, bool all)
+{
+	int i, res;
+
+	res = 0;
+	for (i = 0; nn[i].cap_name != NULL; i++) {
+		if (nvlist_exists_bool(nv, nn[i].cap_name)) {
+			if (all || nvlist_get_bool(nv, nn[i].cap_name))	/* all: key presence is enough */
+				res |= nn[i].cap_bit;
+		} else {
+			res |= *old_cap & nn[i].cap_bit;	/* no key: keep the old state */
+		}
+	}
+	return (res);	/* only bits listed in nn can be set */
+}
+/* For each nn bit set in ifr_cap, add a bool to nv: true iff the bit is also set in ifr_req. */
+void
+if_capint_to_capnv(nvlist_t *nv, const struct ifcap_nv_bit_name *nn,
+    int ifr_cap, int ifr_req)
+{
+	int i;
+
+	for (i = 0; nn[i].cap_name != NULL; i++) {
+		if ((nn[i].cap_bit & ifr_cap) != 0) {	/* bits absent from ifr_cap get no key */
+			nvlist_add_bool(nv, nn[i].cap_name,
+			    (nn[i].cap_bit & ifr_req) != 0);
+		}
+	}
+}
+
 /*
  * Hardware specific interface ioctls.
  */
@@ -2401,12 +2484,15 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
 	int error = 0, do_ifup = 0;
 	int new_flags, temp_flags;
 	size_t namelen, onamelen;
-	size_t descrlen;
+	size_t descrlen, nvbuflen;
 	char *descrbuf, *odescrbuf;
 	char new_name[IFNAMSIZ];
 	char old_name[IFNAMSIZ], strbuf[IFNAMSIZ + 8];
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
+	void *buf;
+	nvlist_t *nvcap;
+	struct siocsifcapnv_driver_data drv_ioctl_data;
 
 	ifr = (struct ifreq *)data;
 	switch (cmd) {
@@ -2425,6 +2511,47 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
 		ifr->ifr_curcap = ifp->if_capenable;
 		break;
 
+	case SIOCGIFCAPNV:
+		if ((ifp->if_capabilities & IFCAP_NV) == 0) {
+			error = EINVAL;
+			break;
+		}
+		buf = NULL;
+		nvcap = nvlist_create(0);
+		for (;;) {
+			if_capint_to_capnv(nvcap, ifcap_nv_bit_names,
+			    ifp->if_capabilities, ifp->if_capenable);
+			if_capint_to_capnv(nvcap, ifcap2_nv_bit_names,
+			    ifp->if_capabilities2, ifp->if_capenable2);
+			error = (*ifp->if_ioctl)(ifp, SIOCGIFCAPNV,
+			    __DECONST(caddr_t, nvcap));
+			if (error != 0) {
+				if_printf(ifp,
+			    "SIOCGIFCAPNV driver mistake: nvlist error %d\n",
+				    error);
+				break;
+			}
+			buf = nvlist_pack(nvcap, &nvbuflen);
+			if (buf == NULL) {
+				error = nvlist_error(nvcap);
+				if (error == 0)
+					error = EDOOFUS;
+				break;
+			}
+			if (nvbuflen > ifr->ifr_cap_nv.buf_length) {
+				ifr->ifr_cap_nv.length = nvbuflen;
+				ifr->ifr_cap_nv.buffer = NULL;
+				error = EFBIG;
+				break;
+			}
+			ifr->ifr_cap_nv.length = nvbuflen;
+			error = copyout(buf, ifr->ifr_cap_nv.buffer, nvbuflen);
+			break;
+		}
+		free(buf, M_NVLIST);
+		nvlist_destroy(nvcap);
+		break;
+
 	case SIOCGIFDATA:
 	{
 		struct if_data ifd;
@@ -2563,7 +2690,7 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
 
 	case SIOCSIFCAP:
 		error = priv_check(td, PRIV_NET_SETIFCAP);
-		if (error)
+		if (error != 0)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
@@ -2574,6 +2701,53 @@ ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
+	case SIOCSIFCAPNV:
+		error = priv_check(td, PRIV_NET_SETIFCAP);
+		if (error != 0)
+			return (error);
+		if (ifp->if_ioctl == NULL)
+			return (EOPNOTSUPP);
+		if ((ifp->if_capabilities & IFCAP_NV) == 0)
+			return (EINVAL);
+		if (ifr->ifr_cap_nv.length > IFR_CAP_NV_MAXBUFSIZE)
+			return (EINVAL);
+		nvcap = NULL;
+		buf = malloc(ifr->ifr_cap_nv.length, M_TEMP, M_WAITOK);
+		for (;;) {
+			error = copyin(ifr->ifr_cap_nv.buffer, buf,
+			    ifr->ifr_cap_nv.length);
+			if (error != 0)
+				break;
+			nvcap = nvlist_unpack(buf, ifr->ifr_cap_nv.length, 0);
+			if (nvcap == NULL) {
+				error = EINVAL;
+				break;
+			}
+			drv_ioctl_data.reqcap = if_capnv_to_capint(nvcap,
+			    &ifp->if_capenable, ifcap_nv_bit_names, false);
+			if ((drv_ioctl_data.reqcap &
+			    ~ifp->if_capabilities) != 0) {
+				error = EINVAL;
+				break;
+			}
+			drv_ioctl_data.reqcap2 = if_capnv_to_capint(nvcap,
+			    &ifp->if_capenable2, ifcap2_nv_bit_names, false);
+			if ((drv_ioctl_data.reqcap2 &
+			    ~ifp->if_capabilities2) != 0) {
+				error = EINVAL;
+				break;
+			}
+			drv_ioctl_data.nvcap = nvcap;
+			error = (*ifp->if_ioctl)(ifp, SIOCSIFCAPNV,
+			    (caddr_t)&drv_ioctl_data);
+			break;
+		}
+		nvlist_destroy(nvcap);
+		free(buf, M_TEMP);
+		if (error == 0)
+			getmicrotime(&ifp->if_lastchange);
+		break;
+
 #ifdef MAC
 	case SIOCSIFMAC:
 		error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp);
diff --git a/sys/net/if.h b/sys/net/if.h
index 782e792cf87c..4bf29193e7ce 100644
--- a/sys/net/if.h
+++ b/sys/net/if.h
@@ -236,7 +236,7 @@ struct if_data {
 #define	IFCAP_TOE4		0x04000	/* interface can offload TCP */
 #define	IFCAP_TOE6		0x08000	/* interface can offload TCP6 */
 #define	IFCAP_VLAN_HWFILTER	0x10000 /* interface hw can filter vlan tag */
-/* 	available		0x20000 */
+#define	IFCAP_NV		0x20000 /* can do SIOCGIFCAPNV/SIOCSIFCAPNV */
 #define	IFCAP_VLAN_HWTSO	0x40000 /* can do IFCAP_TSO on VLANs */
 #define	IFCAP_LINKSTATE		0x80000 /* the runtime link state is dynamic */
 #define	IFCAP_NETMAP		0x100000 /* netmap mode supported/enabled */
@@ -260,7 +260,40 @@ struct if_data {
 #define	IFCAP_TOE	(IFCAP_TOE4 | IFCAP_TOE6)
 #define	IFCAP_TXTLS	(IFCAP_TXTLS4 | IFCAP_TXTLS6)
 
-#define	IFCAP_CANTCHANGE	(IFCAP_NETMAP)
+#define	IFCAP_CANTCHANGE	(IFCAP_NETMAP | IFCAP_NV)
+#define	IFCAP_ALLCAPS		0xffffffff
+/* String names of the IFCAP_* capability bits, used as nvlist keys */
+#define	IFCAP_RXCSUM_NAME	"RXCSUM"
+#define	IFCAP_TXCSUM_NAME	"TXCSUM"
+#define	IFCAP_NETCONS_NAME	"NETCONS"
+#define	IFCAP_VLAN_MTU_NAME	"VLAN_MTU"
+#define	IFCAP_VLAN_HWTAGGING_NAME "VLAN_HWTAGGING"
+#define	IFCAP_JUMBO_MTU_NAME	"JUMBO_MTU"
+#define	IFCAP_POLLING_NAME	"POLLING"
+#define	IFCAP_VLAN_HWCSUM_NAME	"VLAN_HWCSUM"
+#define	IFCAP_TSO4_NAME		"TSO4"
+#define	IFCAP_TSO6_NAME		"TSO6"
+#define	IFCAP_LRO_NAME		"LRO"
+#define	IFCAP_WOL_UCAST_NAME	"WOL_UCAST"
+#define	IFCAP_WOL_MCAST_NAME	"WOL_MCAST"
+#define	IFCAP_WOL_MAGIC_NAME	"WOL_MAGIC"
+#define	IFCAP_TOE4_NAME		"TOE4"
+#define	IFCAP_TOE6_NAME		"TOE6"
+#define	IFCAP_VLAN_HWFILTER_NAME "VLAN_HWFILTER"
+#define	IFCAP_VLAN_HWTSO_NAME	"VLAN_HWTSO"
+#define	IFCAP_LINKSTATE_NAME	"LINKSTATE"
+#define	IFCAP_NETMAP_NAME	"NETMAP"
+#define	IFCAP_RXCSUM_IPV6_NAME	"RXCSUM_IPV6"
+#define	IFCAP_TXCSUM_IPV6_NAME	"TXCSUM_IPV6"
+#define	IFCAP_HWSTATS_NAME	"HWSTATS"
+#define	IFCAP_TXRTLMT_NAME	"TXRTLMT"
+#define	IFCAP_HWRXTSTMP_NAME	"HWRXTSTMP"
+#define	IFCAP_MEXTPG_NAME	"MEXTPG"
+#define	IFCAP_TXTLS4_NAME	"TXTLS4"
+#define	IFCAP_TXTLS6_NAME	"TXTLS6"
+#define	IFCAP_VXLAN_HWCSUM_NAME	"VXLAN_HWCSUM"
+#define	IFCAP_VXLAN_HWTSO_NAME	"VXLAN_HWTSO"
+#define	IFCAP_TXTLS_RTLMT_NAME	"TXTLS_RTLMT"
 
 #define	IFQ_MAXLEN	50
 #define	IFNET_SLOWHZ	1		/* granularity is 1 second */
@@ -387,6 +420,15 @@ struct ifreq_buffer {
 	void	*buffer;
 };
 
+struct ifreq_nv_req {	/* SIOCGIFCAPNV/SIOCSIFCAPNV argument (ifr_cap_nv) */
+	u_int	buf_length;	/* Total size of buffer,
+				   u_int for ABI struct ifreq */
+	u_int	length;		/* Length of the filled part: packed nv size */
+	void	*buffer;	/* Buffer itself, containing packed nv */
+};
+/* Largest ifr_cap_nv.length that SIOCSIFCAPNV accepts (2 MiB) */
+#define	IFR_CAP_NV_MAXBUFSIZE	(2 * 1024 * 1024)
+
 /*
  * Interface request structure used for socket
  * ioctl's.  All interface ioctl's must have parameter
@@ -411,6 +453,7 @@ struct ifreq {
 		int	ifru_cap[2];
 		u_int	ifru_fib;
 		u_char	ifru_vlan_pcp;
+		struct	ifreq_nv_req ifru_nv;
 	} ifr_ifru;
 #define	ifr_addr	ifr_ifru.ifru_addr	/* address */
 #define	ifr_dstaddr	ifr_ifru.ifru_dstaddr	/* other end of p-to-p link */
@@ -434,6 +477,7 @@ struct ifreq {
 #define	ifr_fib		ifr_ifru.ifru_fib	/* interface fib */
 #define	ifr_vlan_pcp	ifr_ifru.ifru_vlan_pcp	/* VLAN priority */
 #define	ifr_lan_pcp	ifr_ifru.ifru_vlan_pcp	/* VLAN priority */
+#define	ifr_cap_nv	ifr_ifru.ifru_nv	/* nv-based cap interface */
 };
 
 #define	_SIZEOF_ADDR_IFREQ(ifr) \
@@ -605,6 +649,17 @@ MALLOC_DECLARE(M_IFMADDR);
 
 extern struct sx ifnet_detach_sxlock;
 
+struct nvlist;
+struct ifcap_nv_bit_name;
+int if_capnv_to_capint(const struct nvlist *nv, int *old_cap,
+    const struct ifcap_nv_bit_name *nn, bool all);	/* nvlist bools -> capability mask */
+void if_capint_to_capnv(struct nvlist *nv,
+    const struct ifcap_nv_bit_name *nn, int ifr_cap, int ifr_req);	/* capability mask -> nvlist bools */
+struct siocsifcapnv_driver_data {	/* argument handed to if_ioctl for SIOCSIFCAPNV */
+	int reqcap;		/* requested if_capenable value */
+	int reqcap2;		/* requested if_capenable2 value */
+	struct nvlist *nvcap;	/* unpacked nvlist the request came from */
+};
 #endif
 
 #ifndef _KERNEL
diff --git a/sys/sys/sockio.h b/sys/sys/sockio.h
index 93b8af28e171..b9ed4a439995 100644
--- a/sys/sys/sockio.h
+++ b/sys/sys/sockio.h
@@ -147,4 +147,7 @@
 
 #define	SIOCGIFDOWNREASON	_IOWR('i', 154, struct ifdownreason)
 
+#define	SIOCSIFCAPNV	_IOW('i', 155, struct ifreq)	/* set IF features */
+#define	SIOCGIFCAPNV	_IOWR('i', 156, struct ifreq)	/* get IF features */
+
 #endif /* !_SYS_SOCKIO_H_ */