git: 04f75b980293 - main - netlink: allow netlink sockets in non-vnet jails.

From: Alexander V. Chernikov <melifaro_at_FreeBSD.org>
Date: Sun, 26 Mar 2023 08:53:30 UTC
The branch main has been updated by melifaro:

URL: https://cgit.FreeBSD.org/src/commit/?id=04f75b980293d517558990a7fda6900445edcac6

commit 04f75b980293d517558990a7fda6900445edcac6
Author:     Alexander V. Chernikov <melifaro@FreeBSD.org>
AuthorDate: 2023-03-26 08:42:51 +0000
Commit:     Alexander V. Chernikov <melifaro@FreeBSD.org>
CommitDate: 2023-03-26 08:44:09 +0000

    netlink: allow netlink sockets in non-vnet jails.
    
    This change allows opening Netlink sockets in non-vnet jails, even for
     unprivileged processes.
    The security model largely follows the existing one. To be more specific:
    * by default, every `NETLINK_ROUTE` command is **NOT** allowed in non-VNET
     jail UNLESS `RTNL_F_ALLOW_NONVNET_JAIL` flag is specified in the command
     handler.
    * All notifications are **disabled** for non-vnet jails (requests to
     subscribe to notifications are ignored). This will change to a more
     fine-grained model once the first netlink provider requiring this gets
     committed.
    * Listing interfaces (RTM_GETLINK) is **allowed** w/o limits (**including**
     interfaces w/o any addresses attached to the jail). The value of this is
     questionable, but it follows the existing approach.
    * Listing ARP/NDP neighbours is **forbidden**. This is a **change** from the
     current approach - currently we list static ARP/ND entries belonging to the
     addresses attached to the jail.
    * Listing interface addresses is **allowed**, but the addresses are filtered
     to match only ones attached to the jail.
    * Listing routes is **allowed**, but the routes are filtered to provide only
     host routes matching the addresses attached to the jail.
    * By default, every `NETLINK_GENERIC` command is **allowed** in non-VNET jail
     (as sub-families may be unrelated to network at all).
     It is the responsibility of the family author to implement restrictions
     if necessary.
    
    Differential Revision: https://reviews.freebsd.org/D39206
    MFC after:      1 month
---
 sys/kern/kern_jail.c          |  1 +
 sys/netlink/netlink_ctl.h     |  1 +
 sys/netlink/netlink_domain.c  | 12 ++++++++++++
 sys/netlink/netlink_generic.c |  1 +
 sys/netlink/netlink_route.c   |  5 +++++
 sys/netlink/netlink_var.h     |  1 +
 sys/netlink/route/iface.c     |  8 +++++++-
 sys/netlink/route/route_var.h |  3 ++-
 sys/netlink/route/rt.c        |  6 ++++++
 9 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
index 7b57e5bb9d61..0558c7d9b7fe 100644
--- a/sys/kern/kern_jail.c
+++ b/sys/kern/kern_jail.c
@@ -3440,6 +3440,7 @@ prison_check_af(struct ucred *cred, int af)
 #endif
 	case AF_LOCAL:
 	case AF_ROUTE:
+	case AF_NETLINK:
 		break;
 	default:
 		if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
diff --git a/sys/netlink/netlink_ctl.h b/sys/netlink/netlink_ctl.h
index 8cd29cf56d10..9369194151af 100644
--- a/sys/netlink/netlink_ctl.h
+++ b/sys/netlink/netlink_ctl.h
@@ -81,6 +81,7 @@ bool netlink_unregister_proto(int proto);
 bool nl_has_listeners(int netlink_family, uint32_t groups_mask);
 bool nlp_has_priv(struct nlpcb *nlp, int priv);
 struct ucred *nlp_get_cred(struct nlpcb *nlp);
+bool nlp_unconstrained_vnet(const struct nlpcb *nlp);
 
 /* netlink_generic.c */
 struct genl_cmd {
diff --git a/sys/netlink/netlink_domain.c b/sys/netlink/netlink_domain.c
index 2704974173b4..24ca9de877f0 100644
--- a/sys/netlink/netlink_domain.c
+++ b/sys/netlink/netlink_domain.c
@@ -36,6 +36,7 @@
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/domain.h>
+#include <sys/jail.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/proc.h>
@@ -111,6 +112,10 @@ nl_add_group_locked(struct nlpcb *nlp, unsigned int group_id)
 	MPASS(group_id <= NLP_MAX_GROUPS);
 	--group_id;
 
+	/* TODO: add family handler callback */
+	if (!nlp_unconstrained_vnet(nlp))
+		return;
+
 	nlp->nl_groups[group_id / 64] |= (uint64_t)1 << (group_id % 64);
 }
 
@@ -212,6 +217,12 @@ nlp_has_priv(struct nlpcb *nlp, int priv)
 	return (priv_check_cred(nlp->nl_cred, priv) == 0);
 }
 
+bool
+nlp_unconstrained_vnet(const struct nlpcb *nlp)
+{
+	return (nlp->nl_unconstrained_vnet);
+}
+
 struct ucred *
 nlp_get_cred(struct nlpcb *nlp)
 {
@@ -308,6 +319,7 @@ nl_pru_attach(struct socket *so, int proto, struct thread *td)
 	nlp->nl_process_id = curproc->p_pid;
 	nlp->nl_linux = is_linux;
 	nlp->nl_active = true;
+	nlp->nl_unconstrained_vnet = !jailed_without_vnet(so->so_cred);
 	NLP_LOCK_INIT(nlp);
 	refcount_init(&nlp->nl_refcount, 1);
 	nl_init_io(nlp);
diff --git a/sys/netlink/netlink_generic.c b/sys/netlink/netlink_generic.c
index d4022c2c2a75..a2bd624f99d9 100644
--- a/sys/netlink/netlink_generic.c
+++ b/sys/netlink/netlink_generic.c
@@ -31,6 +31,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/ck.h>
 #include <sys/epoch.h>
 #include <sys/kernel.h>
+#include <sys/jail.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/priv.h>
diff --git a/sys/netlink/netlink_route.c b/sys/netlink/netlink_route.c
index 037fd2170c66..ce0c0eb36dbc 100644
--- a/sys/netlink/netlink_route.c
+++ b/sys/netlink/netlink_route.c
@@ -93,6 +93,11 @@ rtnl_handle_message(struct nlmsghdr *hdr, struct nl_pstate *npt)
 	} else if (cmd->priv != 0)
 		NLP_LOG(LOG_DEBUG3, nlp, "priv %d check passed for msg %s", cmd->priv, cmd->name);
 
+	if (!nlp_unconstrained_vnet(nlp) && (cmd->flags & RTNL_F_ALLOW_NONVNET_JAIL) == 0) {
+		NLP_LOG(LOG_DEBUG2, nlp, "jail check failed for msg %s", cmd->name);
+		return (EPERM);
+	}
+
 	bool need_epoch = !(cmd->flags & RTNL_F_NOEPOCH);
 
 	if (need_epoch)
diff --git a/sys/netlink/netlink_var.h b/sys/netlink/netlink_var.h
index ed19008248e9..0114306885cf 100644
--- a/sys/netlink/netlink_var.h
+++ b/sys/netlink/netlink_var.h
@@ -61,6 +61,7 @@ struct nlpcb {
         bool			nl_task_pending;
 	bool			nl_tx_blocked; /* No new requests accepted */
 	bool			nl_linux; /* true if running under compat */
+	bool			nl_unconstrained_vnet; /* true if running under VNET jail (or without jail) */
 	struct nl_io_queue	rx_queue;
 	struct nl_io_queue	tx_queue;
 	struct taskqueue	*nl_taskqueue;
diff --git a/sys/netlink/route/iface.c b/sys/netlink/route/iface.c
index 6704acd1624f..18eab05576b7 100644
--- a/sys/netlink/route/iface.c
+++ b/sys/netlink/route/iface.c
@@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/types.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
+#include <sys/jail.h>
 #include <sys/malloc.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
@@ -61,6 +62,7 @@ struct netlink_walkargs {
 	struct nl_writer *nw;
 	struct nlmsghdr hdr;
 	struct nlpcb *so;
+	struct ucred *cred;
 	uint32_t fibnum;
 	int family;
 	int error;
@@ -833,6 +835,8 @@ dump_iface_addrs(struct netlink_walkargs *wa, struct ifnet *ifp)
 			continue;
 		if (ifa->ifa_addr->sa_family == AF_LINK)
 			continue;
+		if (prison_if(wa->cred, ifa->ifa_addr) != 0)
+			continue;
 		wa->count++;
 		if (!dump_iface_addr(wa->nw, ifp, ifa, &wa->hdr))
 			return (ENOMEM);
@@ -856,6 +860,7 @@ rtnl_handle_getaddr(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *n
 	struct netlink_walkargs wa = {
 		.so = nlp,
 		.nw = npt->nw,
+		.cred = nlp_get_cred(nlp),
 		.family = attrs.ifa_family,
 		.hdr.nlmsg_pid = hdr->nlmsg_pid,
 		.hdr.nlmsg_seq = hdr->nlmsg_seq,
@@ -977,7 +982,7 @@ static const struct rtnl_cmd_handler cmd_handlers[] = {
 		.cmd = NL_RTM_GETLINK,
 		.name = "RTM_GETLINK",
 		.cb = &rtnl_handle_getlink,
-		.flags = RTNL_F_NOEPOCH,
+		.flags = RTNL_F_NOEPOCH | RTNL_F_ALLOW_NONVNET_JAIL,
 	},
 	{
 		.cmd = NL_RTM_DELLINK,
@@ -997,6 +1002,7 @@ static const struct rtnl_cmd_handler cmd_handlers[] = {
 		.cmd = NL_RTM_GETADDR,
 		.name = "RTM_GETADDR",
 		.cb = &rtnl_handle_getaddr,
+		.flags = RTNL_F_ALLOW_NONVNET_JAIL,
 	},
 	{
 		.cmd = NL_RTM_NEWADDR,
diff --git a/sys/netlink/route/route_var.h b/sys/netlink/route/route_var.h
index f3b1d7d929a5..a11857b14a1f 100644
--- a/sys/netlink/route/route_var.h
+++ b/sys/netlink/route/route_var.h
@@ -48,7 +48,8 @@ struct rtnl_cmd_handler {
 	int		flags;
 };
 
-#define	RTNL_F_NOEPOCH	0x01
+#define	RTNL_F_NOEPOCH			0x01	/* Do not enter epoch when handling command */
+#define	RTNL_F_ALLOW_NONVNET_JAIL	0x02	/* Allow command execution inside non-VNET jail */
 
 bool rtnl_register_messages(const struct rtnl_cmd_handler *handlers, int count);
 
diff --git a/sys/netlink/route/rt.c b/sys/netlink/route/rt.c
index badd8d937be2..ef52dbf4edd6 100644
--- a/sys/netlink/route/rt.c
+++ b/sys/netlink/route/rt.c
@@ -513,6 +513,8 @@ dump_rtentry(struct rtentry *rt, void *_arg)
 	wa->count++;
 	if (wa->error != 0)
 		return (0);
+	if (!rt_is_exportable(rt, nlp_get_cred(wa->nlp)))
+		return (0);
 	wa->dumped++;
 
 	rt_get_rnd(rt, &wa->rnd);
@@ -606,6 +608,9 @@ handle_rtm_getroute(struct nlpcb *nlp, struct nl_parsed_route *attrs,
 
 	RIB_RUNLOCK(rnh);
 
+	if (!rt_is_exportable(rt, nlp_get_cred(nlp)))
+		return (ESRCH);
+
 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
 		char rtbuf[NHOP_PRINT_BUFSIZE] __unused, nhbuf[NHOP_PRINT_BUFSIZE] __unused;
 		FIB_LOG(LOG_DEBUG2, fibnum, family, "getroute completed: got %s for %s",
@@ -1026,6 +1031,7 @@ static const struct rtnl_cmd_handler cmd_handlers[] = {
 		.cmd = NL_RTM_GETROUTE,
 		.name = "RTM_GETROUTE",
 		.cb = &rtnl_handle_getroute,
+		.flags = RTNL_F_ALLOW_NONVNET_JAIL,
 	},
 	{
 		.cmd = NL_RTM_DELROUTE,