Re: git: e1882428dcbb - main - ifnet/mbuf: provide KPI to serialize/restore m->m_pkthdr.rcvif

From: Mateusz Guzik <mjguzik_at_gmail.com>
Date: Thu, 27 Jan 2022 16:59:03 UTC
On 1/27/22, Gleb Smirnoff <glebius@freebsd.org> wrote:
> The branch main has been updated by glebius:
>
> URL:
> https://cgit.FreeBSD.org/src/commit/?id=e1882428dcbbafd2814d7e17b977a8f686784b39
>
> commit e1882428dcbbafd2814d7e17b977a8f686784b39
> Author:     Gleb Smirnoff <glebius@FreeBSD.org>
> AuthorDate: 2022-01-27 05:58:50 +0000
> Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
> CommitDate: 2022-01-27 05:58:50 +0000
>
>     ifnet/mbuf: provide KPI to serialize/restore m->m_pkthdr.rcvif
>
>     Supplement ifindex table with generation count and use it to
>     serialize & restore an ifnet pointer.
>

This commit or something from the series reliably causes my box to
crash on boot:

Fatal trap 12: page fault while in kernel mode
cpuid = 7; apic id = 07
fault virtual address	= 0x54
fault code		= supervisor read data, page not present
instruction pointer	= 0x20:0xffffffff80616d98
stack pointer	        = 0x28:0xfffffe06462889e0
frame pointer	        = 0x28:0xfffffe06462889e0
code segment		= base 0x0, limit 0xfffff, type 0x1b
			= DPL 0, pres 1, long 1, def32 0, gran 1
processor eflags	= interrupt enabled, resume, IOPL = 0
current process		= 1855 (route)
trap number		= 12
panic: page fault
cpuid = 7
time = 1643302734
KDB: stack backtrace:
db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xfffffe06462887a0
vpanic() at vpanic+0x17f/frame 0xfffffe06462887f0
panic() at panic+0x43/frame 0xfffffe0646288850
trap_fatal() at trap_fatal+0x385/frame 0xfffffe06462888b0
trap_pfault() at trap_pfault+0x4f/frame 0xfffffe0646288910
calltrap() at calltrap+0x8/frame 0xfffffe0646288910
--- trap 0xc, rip = 0xffffffff80616d98, rsp = 0xfffffe06462889e0, rbp
= 0xfffffe06462889e0 ---
m_rcvif_serialize() at m_rcvif_serialize+0x8/frame 0xfffffe06462889e0
netisr_queue_src() at netisr_queue_src+0x15c/frame 0xfffffe0646288a30
route_output() at route_output+0x890/frame 0xfffffe0646288c70
sosend_generic() at sosend_generic+0x456/frame 0xfffffe0646288d10
soo_write() at soo_write+0x43/frame 0xfffffe0646288d40
dofilewrite() at dofilewrite+0x81/frame 0xfffffe0646288d90
sys_write() at sys_write+0xbc/frame 0xfffffe0646288e00
amd64_syscall() at amd64_syscall+0x107/frame 0xfffffe0646288f30
fast_syscall_common() at fast_syscall_common+0xf8/frame 0xfffffe0646288f30
--- syscall (4, FreeBSD ELF64, sys_write), rip = 0x3b000ba6703a, rsp =
0x3b0009378e48, rbp = 0x3b00093796f0 ---
KDB: enter: panic
[ thread pid 1855 tid 102265 ]
Stopped at      kdb_enter+0x37: movq    $0,0x9cc17e(%rip)


>     Reviewed by:            kp
>     Differential revision:  https://reviews.freebsd.org/D33266
>     Fun note:               git show e6abef09187a
> ---
>  sys/kern/kern_mbuf.c | 22 ++++++++++++++++++++++
>  sys/net/if.c         | 49
> ++++++++++++++++++++++++++++++++++++-------------
>  sys/net/if_var.h     |  9 ++++++++-
>  sys/sys/mbuf.h       |  6 ++++++
>  4 files changed, 72 insertions(+), 14 deletions(-)
>
> diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c
> index f1e76ef00c65..5c69f663c0e2 100644
> --- a/sys/kern/kern_mbuf.c
> +++ b/sys/kern/kern_mbuf.c
> @@ -1635,6 +1635,28 @@ m_snd_tag_destroy(struct m_snd_tag *mst)
>  	counter_u64_add(snd_tag_count, -1);
>  }
>
> +void
> +m_rcvif_serialize(struct mbuf *m)
> +{
> +	u_short idx, gen;
> +
> +	M_ASSERTPKTHDR(m);
> +	idx = m->m_pkthdr.rcvif->if_index;
> +	gen = m->m_pkthdr.rcvif->if_idxgen;
> +	m->m_pkthdr.rcvidx = idx;
> +	m->m_pkthdr.rcvgen = gen;
> +}
> +
> +struct ifnet *
> +m_rcvif_restore(struct mbuf *m)
> +{
> +
> +	M_ASSERTPKTHDR(m);
> +
> +	return ((m->m_pkthdr.rcvif = ifnet_byindexgen(m->m_pkthdr.rcvidx,
> +	    m->m_pkthdr.rcvgen)));
> +}
> +
>  /*
>   * Allocate an mbuf with anonymous external pages.
>   */
> diff --git a/sys/net/if.c b/sys/net/if.c
> index f148ae8c9c6d..e8d65e64518a 100644
> --- a/sys/net/if.c
> +++ b/sys/net/if.c
> @@ -313,7 +313,10 @@ VNET_DEFINE(struct ifgrouphead, ifg_head);
>  /* Table of ifnet by index. */
>  static int if_index;
>  static int if_indexlim = 8;
> -static struct ifnet **ifindex_table;
> +static struct ifindex_entry {
> +	struct ifnet	*ife_ifnet;
> +	uint16_t	ife_gencnt;
> +} *ifindex_table;
>
>  SYSCTL_NODE(_net_link_generic, IFMIB_SYSTEM, system,
>      CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
> @@ -325,8 +328,8 @@ sysctl_ifcount(SYSCTL_HANDLER_ARGS)
>
>  	IFNET_RLOCK();
>  	for (int i = 1; i <= if_index; i++)
> -		if (ifindex_table[i] != NULL &&
> -		    ifindex_table[i]->if_vnet == curvnet)
> +		if (ifindex_table[i].ife_ifnet != NULL &&
> +		    ifindex_table[i].ife_ifnet->if_vnet == curvnet)
>  			rv = i;
>  	IFNET_RUNLOCK();
>
> @@ -370,7 +373,7 @@ ifnet_byindex(u_int idx)
>  	if (__predict_false(idx > if_index))
>  		return (NULL);
>
> -	ifp = ck_pr_load_ptr(&ifindex_table[idx]);
> +	ifp = ck_pr_load_ptr(&ifindex_table[idx].ife_ifnet);
>
>  	if (curvnet != NULL && ifp != NULL && ifp->if_vnet != curvnet)
>  		ifp = NULL;
> @@ -391,6 +394,24 @@ ifnet_byindex_ref(u_int idx)
>  	return (ifp);
>  }
>
> +struct ifnet *
> +ifnet_byindexgen(uint16_t idx, uint16_t gen)
> +{
> +	struct ifnet *ifp;
> +
> +	NET_EPOCH_ASSERT();
> +
> +	if (__predict_false(idx > if_index))
> +		return (NULL);
> +
> +	ifp = ck_pr_load_ptr(&ifindex_table[idx].ife_ifnet);
> +
> +	if (ifindex_table[idx].ife_gencnt == gen)
> +		return (ifp);
> +	else
> +		return (NULL);
> +}
> +
>  struct ifaddr *
>  ifaddr_byindex(u_short idx)
>  {
> @@ -571,13 +592,13 @@ if_alloc_domain(u_char type, int numa_domain)
>  	 * next slot.
>  	 */
>  	for (idx = 1; idx <= if_index; idx++) {
> -		if (ifindex_table[idx] == NULL)
> +		if (ifindex_table[idx].ife_ifnet == NULL)
>  			break;
>  	}
>
>  	/* Catch if_index overflow. */
>  	if (idx >= if_indexlim) {
> -		struct ifnet **new, **old;
> +		struct ifindex_entry *new, *old;
>  		int newlim;
>
>  		newlim = if_indexlim * 2;
> @@ -593,7 +614,8 @@ if_alloc_domain(u_char type, int numa_domain)
>  		if_index = idx;
>
>  	ifp->if_index = idx;
> -	ck_pr_store_ptr(&ifindex_table[idx], ifp);
> +	ifp->if_idxgen = ifindex_table[idx].ife_gencnt;
> +	ck_pr_store_ptr(&ifindex_table[idx].ife_ifnet, ifp);
>  	IFNET_WUNLOCK();
>
>  	return (ifp);
> @@ -668,9 +690,10 @@ if_free(struct ifnet *ifp)
>  	 * virtualized and interface would outlive the vnet.
>  	 */
>  	IFNET_WLOCK();
> -	MPASS(ifindex_table[ifp->if_index] == ifp);
> -	ck_pr_store_ptr(&ifindex_table[ifp->if_index], NULL);
> -	while (if_index > 0 && ifindex_table[if_index] == NULL)
> +	MPASS(ifindex_table[ifp->if_index].ife_ifnet == ifp);
> +	ck_pr_store_ptr(&ifindex_table[ifp->if_index].ife_ifnet, NULL);
> +	ifindex_table[ifp->if_index].ife_gencnt++;
> +	while (if_index > 0 && ifindex_table[if_index].ife_ifnet == NULL)
>  		if_index--;
>  	IFNET_WUNLOCK();
>
> @@ -819,7 +842,7 @@ if_attach_internal(struct ifnet *ifp, bool vmove)
>  	struct sockaddr_dl *sdl;
>  	struct ifaddr *ifa;
>
> -	MPASS(ifindex_table[ifp->if_index] == ifp);
> +	MPASS(ifindex_table[ifp->if_index].ife_ifnet == ifp);
>
>  #ifdef VIMAGE
>  	ifp->if_vnet = curvnet;
> @@ -4508,8 +4531,8 @@ if_show_ifnet(struct ifnet *ifp)
>  	IF_DB_PRINTF("%d", if_dunit);
>  	IF_DB_PRINTF("%s", if_description);
>  	IF_DB_PRINTF("%u", if_index);
> +	IF_DB_PRINTF("%d", if_idxgen);
>  	IF_DB_PRINTF("%u", if_refcount);
> -	IF_DB_PRINTF("%d", if_index_reserved);
>  	IF_DB_PRINTF("%p", if_softc);
>  	IF_DB_PRINTF("%p", if_l2com);
>  	IF_DB_PRINTF("%p", if_llsoftc);
> @@ -4564,7 +4587,7 @@ DB_SHOW_ALL_COMMAND(ifnets, db_show_all_ifnets)
>  	u_short idx;
>
>  	for (idx = 1; idx <= if_index; idx++) {
> -		ifp = ifindex_table[idx];
> +		ifp = ifindex_table[idx].ife_ifnet;
>  		if (ifp == NULL)
>  			continue;
>  		db_printf( "%20s ifp=%p\n", ifp->if_xname, ifp);
> diff --git a/sys/net/if_var.h b/sys/net/if_var.h
> index dedc73718125..21b3687f62c1 100644
> --- a/sys/net/if_var.h
> +++ b/sys/net/if_var.h
> @@ -334,7 +334,7 @@ struct ifnet {
>  	const char *if_dname;		/* driver name */
>  	int	if_dunit;		/* unit or IF_DUNIT_NONE */
>  	u_short	if_index;		/* numeric abbreviation for this if  */
> -	short	if_index_reserved;	/* spare space to grow if_index */
> +	u_short	if_idxgen;		/* ... and its generation count */
>  	char	if_xname[IFNAMSIZ];	/* external name (name + unit) */
>  	char	*if_description;	/* interface description */
>
> @@ -644,6 +644,13 @@ extern	struct sx ifnet_sxlock;
>  struct ifnet	*ifnet_byindex(u_int);
>  struct ifnet	*ifnet_byindex_ref(u_int);
>
> +/*
> + * ifnet_byindexgen() looks up ifnet by index and generation count,
> + * attempting to restore a weak pointer that had been stored across
> + * the epoch.
> + */
> +struct ifnet   *ifnet_byindexgen(uint16_t idx, uint16_t gen);
> +
>  /*
>   * Given the index, ifaddr_byindex() returns the one and only
>   * link-level ifaddr for the interface. You are not supposed to use
> diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h
> index 77364f428b12..ebe8ef205055 100644
> --- a/sys/sys/mbuf.h
> +++ b/sys/sys/mbuf.h
> @@ -159,6 +159,10 @@ struct pkthdr {
>  	union {
>  		struct m_snd_tag *snd_tag;	/* send tag, if any */
>  		struct ifnet	*rcvif;		/* rcv interface */
> +		struct {
> +			uint16_t rcvidx;	/* rcv interface index ... */
> +			uint16_t rcvgen;	/* ... and generation count */
> +		};
>  	};
>  	SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */
>  	int32_t		 len;		/* total packet length */
> @@ -862,6 +866,8 @@ int		 m_snd_tag_alloc(struct ifnet *,
>  void		 m_snd_tag_init(struct m_snd_tag *, struct ifnet *,
>  		    const struct if_snd_tag_sw *);
>  void		 m_snd_tag_destroy(struct m_snd_tag *);
> +void		 m_rcvif_serialize(struct mbuf *);
> +struct ifnet	*m_rcvif_restore(struct mbuf *);
>
>  static __inline int
>  m_gettype(int size)
>


-- 
Mateusz Guzik <mjguzik gmail.com>