git: 1093f16487a9 - main - unix/dgram: reduce mbuf chain traversals in send(2) and recv(2)

From: Gleb Smirnoff <glebius_at_FreeBSD.org>
Date: Fri, 24 Jun 2022 16:10:38 UTC
The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=1093f16487a93c3ecdea910ca7249375873969da

commit 1093f16487a93c3ecdea910ca7249375873969da
Author:     Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2022-06-24 16:09:11 +0000
Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2022-06-24 16:09:11 +0000

    unix/dgram: reduce mbuf chain traversals in send(2) and recv(2)
    
    o Use m_pkthdr.memlen from m_uiotombuf()
    o Modify unp_internalize() to keep track of allocated space and memory
      as well as pointer to the last buffer.
    o Modify unp_addsockcred() to keep track of allocated space and memory
      as well as pointer to the last buffer.
    o Record the datagram len/memlen/ctllen in the first (from) mbuf of the
      chain in uipc_sosend_dgram() and reuse it in uipc_soreceive_dgram().
    
    Reviewed by:            markj
    Differential revision:  https://reviews.freebsd.org/D35302
---
 sys/kern/uipc_usrreq.c | 151 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 113 insertions(+), 38 deletions(-)

diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index d1c87865c632..05fde7675b9f 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -302,11 +302,13 @@ static void	unp_gc(__unused void *, int);
 static void	unp_scan(struct mbuf *, void (*)(struct filedescent **, int));
 static void	unp_discard(struct file *);
 static void	unp_freerights(struct filedescent **, int);
-static int	unp_internalize(struct mbuf **, struct thread *);
+static int	unp_internalize(struct mbuf **, struct thread *,
+		    struct mbuf **, u_int *, u_int *);
 static void	unp_internalize_fp(struct file *);
 static int	unp_externalize(struct mbuf *, struct mbuf **, int);
 static int	unp_externalize_fp(struct file *);
-static struct mbuf	*unp_addsockcred(struct thread *, struct mbuf *, int);
+static struct mbuf	*unp_addsockcred(struct thread *, struct mbuf *,
+		    int, struct mbuf **, u_int *, u_int *);
 static void	unp_process_defers(void * __unused, int);
 
 static void
@@ -1014,7 +1016,8 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
 		error = EOPNOTSUPP;
 		goto release;
 	}
-	if (control != NULL && (error = unp_internalize(&control, td)))
+	if (control != NULL &&
+	    (error = unp_internalize(&control, td, NULL, NULL, NULL)))
 		goto release;
 
 	unp2 = NULL;
@@ -1051,7 +1054,8 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
 		 * SOCK_SEQPACKET (LOCAL_CREDS => WANTCRED_ONESHOT), or
 		 * forever (LOCAL_CREDS_PERSISTENT => WANTCRED_ALWAYS).
 		 */
-		control = unp_addsockcred(td, control, unp2->unp_flags);
+		control = unp_addsockcred(td, control, unp2->unp_flags, NULL,
+		    NULL, NULL);
 		unp2->unp_flags &= ~UNP_WANTCRED_ONESHOT;
 	}
 
@@ -1131,7 +1135,13 @@ release:
  *
  * Allocate a record consisting of 3 mbufs in the sequence of
  * from -> control -> data and append it to the socket buffer.
+ *
+ * The first mbuf carries sender's name and is a pkthdr that stores
+ * overall length of datagram, its memory consumption and control length.
  */
+#define	ctllen	PH_loc.thirtytwo[1]
+_Static_assert(offsetof(struct pkthdr, memlen) + sizeof(u_int) <=
+    offsetof(struct pkthdr, ctllen), "unix/dgram can not store ctllen");
 static int
 uipc_sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
     struct mbuf *m, struct mbuf *c, int flags, struct thread *td)
@@ -1140,14 +1150,16 @@ uipc_sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
 	const struct sockaddr *from;
 	struct socket *so2;
 	struct sockbuf *sb;
-	struct mbuf *f;
-	u_int cc;
+	struct mbuf *f, *clast;
+	u_int cc, ctl, mbcnt;
+	u_int dcc __diagused, dctl __diagused, dmbcnt __diagused;
 	int error;
 
 	MPASS((uio != NULL && m == NULL) || (m != NULL && uio == NULL));
 
 	error = 0;
 	f = NULL;
+	ctl = 0;
 
 	if (__predict_false(flags & MSG_OOB)) {
 		error = EOPNOTSUPP;
@@ -1163,8 +1175,11 @@ uipc_sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
 			error = EFAULT;
 			goto out;
 		}
-		f = m_get(M_WAITOK, MT_SONAME);
-		if (c != NULL && (error = unp_internalize(&c, td)))
+		f = m_gethdr(M_WAITOK, MT_SONAME);
+		cc = m->m_pkthdr.len;
+		mbcnt = MSIZE + m->m_pkthdr.memlen;
+		if (c != NULL &&
+		    (error = unp_internalize(&c, td, &clast, &ctl, &mbcnt)))
 			goto out;
 	} else {
 		/* pru_sosend() with mbuf usually is a kernel thread. */
@@ -1177,7 +1192,7 @@ uipc_sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
 			error = EMSGSIZE;
 			goto out;
 		}
-		if ((f = m_get(M_NOWAIT, MT_SONAME)) == NULL) {
+		if ((f = m_gethdr(M_NOWAIT, MT_SONAME)) == NULL) {
 			error = ENOBUFS;
 			goto out;
 		}
@@ -1189,6 +1204,14 @@ uipc_sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
 		m->m_pkthdr.csum_flags = 0;
 		m->m_pkthdr.fibnum = 0;
 		m->m_pkthdr.rsstype = 0;
+
+		cc = m->m_pkthdr.len;
+		mbcnt = MSIZE;
+		for (struct mbuf *mb = m; mb != NULL; mb = mb->m_next) {
+			mbcnt += MSIZE;
+			if (mb->m_flags & M_EXT)
+				mbcnt += mb->m_ext.ext_size;
+		}
 	}
 
 	unp = sotounpcb(so);
@@ -1240,7 +1263,8 @@ uipc_sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
 	}
 
 	if (unp2->unp_flags & UNP_WANTCRED_MASK)
-		c = unp_addsockcred(td, c, unp2->unp_flags);
+		c = unp_addsockcred(td, c, unp2->unp_flags, &clast, &ctl,
+		    &mbcnt);
 	if (unp->unp_addr != NULL)
 		from = (struct sockaddr *)unp->unp_addr;
 	else
@@ -1248,36 +1272,55 @@ uipc_sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
 	f->m_len = from->sa_len;
 	MPASS(from->sa_len <= MLEN);
 	bcopy(from, mtod(f, void *), from->sa_len);
-	cc = f->m_len + m->m_pkthdr.len;
+	ctl += f->m_len;
 
-	/* Concatenate: from -> control -> data. */
+	/*
+	 * Concatenate mbufs: from -> control -> data.
+	 * Save overall cc and mbcnt in "from" mbuf.
+	 */
 	if (c != NULL) {
-		struct mbuf *clast;
+#ifdef INVARIANTS
+		struct mbuf *mc;
 
-		cc += m_length(c, &clast);
+		for (mc = c; mc->m_next != NULL; mc = mc->m_next);
+		MPASS(mc == clast);
+#endif
 		f->m_next = c;
 		clast->m_next = m;
 		c = NULL;
 	} else
 		f->m_next = m;
 	m = NULL;
+#ifdef INVARIANTS
+	dcc = dctl = dmbcnt = 0;
+	for (struct mbuf *mb = f; mb != NULL; mb = mb->m_next) {
+		if (mb->m_type == MT_DATA)
+			dcc += mb->m_len;
+		else
+			dctl += mb->m_len;
+		dmbcnt += MSIZE;
+		if (mb->m_flags & M_EXT)
+			dmbcnt += mb->m_ext.ext_size;
+	}
+	MPASS(dcc == cc);
+	MPASS(dctl == ctl);
+	MPASS(dmbcnt == mbcnt);
+#endif
+	f->m_pkthdr.len = cc + ctl;
+	f->m_pkthdr.memlen = mbcnt;
+	f->m_pkthdr.ctllen = ctl;
 
 	so2 = unp2->unp_socket;
 	sb = &so2->so_rcv;
 	SOCK_RECVBUF_LOCK(so2);
 	if (cc <= sbspace(sb)) {
 		STAILQ_INSERT_TAIL(&sb->uxdg_mb, f, m_stailqpkt);
-		/* XXX: would be nice if m_uiotombuf() returns count. */
-		for (; f != NULL; f = f->m_next) {
-			if (f->m_type != MT_DATA)
-				sb->sb_ctl += f->m_len;
-			sb->sb_mbcnt += MSIZE;
-			if (f->m_flags & M_EXT)
-				sb->sb_mbcnt += f->m_ext.ext_size;
-		}
-		sb->sb_acc += cc;
-		sb->sb_ccc += cc;
+		sb->sb_acc += cc + ctl;
+		sb->sb_ccc += cc + ctl;
+		sb->sb_ctl += ctl;
+		sb->sb_mbcnt += mbcnt;
 		sorwakeup_locked(so2);
+		f = NULL;
 	} else {
 		soroverflow_locked(so2);
 		error = (so->so_state & SS_NBIO) ? EAGAIN : ENOBUFS;
@@ -1366,7 +1409,7 @@ static int
 uipc_soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 {
-	struct mbuf *m, *m2;
+	struct mbuf *m;
 	int flags, error;
 	ssize_t len;
 	bool nonblock;
@@ -1419,7 +1462,9 @@ uipc_soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
 			return (error);
 		}
 	}
-	SOCK_RECVBUF_LOCK_ASSERT(so);
+
+	M_ASSERTPKTHDR(m);
+	KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type));
 
 	if (uio->uio_td)
 		uio->uio_td->td_ru.ru_msgrcv++;
@@ -1428,18 +1473,12 @@ uipc_soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
 		return (uipc_peek_dgram(so, psa, uio, controlp, flagsp));
 
 	STAILQ_REMOVE_HEAD(&so->so_rcv.uxdg_mb, m_stailqpkt);
-	for (m2 = m; m2 != NULL; m2 = m2->m_next) {
-		if (m2->m_type != MT_DATA)
-			so->so_rcv.sb_ctl -= m2->m_len;
-		so->so_rcv.sb_acc -= m2->m_len;
-		so->so_rcv.sb_ccc -= m2->m_len;
-		so->so_rcv.sb_mbcnt -= MSIZE;
-		if (m2->m_flags & M_EXT)
-			so->so_rcv.sb_mbcnt -= m2->m_ext.ext_size;
-	}
+	so->so_rcv.sb_acc -= m->m_pkthdr.len;
+	so->so_rcv.sb_ccc -= m->m_pkthdr.len;
+	so->so_rcv.sb_ctl -= m->m_pkthdr.ctllen;
+	so->so_rcv.sb_mbcnt -= m->m_pkthdr.memlen;
 	SOCK_RECVBUF_UNLOCK(so);
 
-	KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type));
 	if (psa != NULL)
 		*psa = sodupsockaddr(mtod(m, struct sockaddr *), M_WAITOK);
 	m = m_free(m);
@@ -2510,7 +2549,8 @@ unp_internalize_cleanup_rights(struct mbuf *control)
 }
 
 static int
-unp_internalize(struct mbuf **controlp, struct thread *td)
+unp_internalize(struct mbuf **controlp, struct thread *td,
+    struct mbuf **clast, u_int *space, u_int *mbcnt)
 {
 	struct mbuf *control, **initial_controlp;
 	struct proc *p;
@@ -2527,6 +2567,7 @@ unp_internalize(struct mbuf **controlp, struct thread *td)
 	int i, j, error, *fdp, oldfds;
 	u_int newlen;
 
+	MPASS((*controlp)->m_next == NULL); /* COMPAT_OLDSOCK may violate */
 	UNP_LINK_UNLOCK_ASSERT();
 
 	p = td->td_proc;
@@ -2672,6 +2713,13 @@ unp_internalize(struct mbuf **controlp, struct thread *td)
 			goto out;
 		}
 
+		if (space != NULL) {
+			*space += (*controlp)->m_len;
+			*mbcnt += MSIZE;
+			if ((*controlp)->m_flags & M_EXT)
+				*mbcnt += (*controlp)->m_ext.ext_size;
+			*clast = *controlp;
+		}
 		controlp = &(*controlp)->m_next;
 	}
 	if (clen > 0)
@@ -2685,7 +2733,8 @@ out:
 }
 
 static struct mbuf *
-unp_addsockcred(struct thread *td, struct mbuf *control, int mode)
+unp_addsockcred(struct thread *td, struct mbuf *control, int mode,
+    struct mbuf **clast, u_int *space, u_int *mbcnt)
 {
 	struct mbuf *m, *n, *n_prev;
 	const struct cmsghdr *cm;
@@ -2704,6 +2753,7 @@ unp_addsockcred(struct thread *td, struct mbuf *control, int mode)
 	m = sbcreatecontrol(NULL, ctrlsz, cmsgtype, SOL_SOCKET, M_NOWAIT);
 	if (m == NULL)
 		return (control);
+	MPASS((m->m_flags & M_EXT) == 0 && m->m_next == NULL);
 
 	if (mode & UNP_WANTCRED_ALWAYS) {
 		struct sockcred2 *sc;
@@ -2745,6 +2795,25 @@ unp_addsockcred(struct thread *td, struct mbuf *control, int mode)
 					control = n->m_next;
 				else
 					n_prev->m_next = n->m_next;
+				if (space != NULL) {
+					MPASS(*space >= n->m_len);
+					*space -= n->m_len;
+					MPASS(*mbcnt >= MSIZE);
+					*mbcnt -= MSIZE;
+					if (n->m_flags & M_EXT) {
+						MPASS(*mbcnt >=
+						    n->m_ext.ext_size);
+						*mbcnt -= n->m_ext.ext_size;
+					}
+					MPASS(clast);
+					if (*clast == n) {
+						MPASS(n->m_next == NULL);
+						if (n_prev == NULL)
+							*clast = m;
+						else
+							*clast = n_prev;
+					}
+				}
 				n = m_free(n);
 			} else {
 				n_prev = n;
@@ -2754,6 +2823,12 @@ unp_addsockcred(struct thread *td, struct mbuf *control, int mode)
 
 	/* Prepend it to the head. */
 	m->m_next = control;
+	if (space != NULL) {
+		*space += m->m_len;
+		*mbcnt += MSIZE;
+		if (control == NULL)
+			*clast = m;
+	}
 	return (m);
 }