git: 17083b94a915 - main - netlink: use protocol specific receive buffer

From: Gleb Smirnoff <glebius@FreeBSD.org>
Date: Tue, 02 Jan 2024 21:06:57 UTC
The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=17083b94a91563aba15ba03d1c74796a35bb1c26

commit 17083b94a91563aba15ba03d1c74796a35bb1c26
Author:     Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2024-01-02 21:04:01 +0000
Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2024-01-02 21:04:01 +0000

    netlink: use protocol specific receive buffer
    
    Implement the Netlink socket receive buffer as a simple TAILQ of
    nl_buf's, the same part of struct sockbuf that is already used for
    the send buffer.  This shaves off a lot of code and a lot of extra
    processing.  The pcb gets rid of the I/O queues, as the socket
    buffer is exactly the queue.  The message writer is simplified a
    lot, as we now always deal with a linear buffer.  The notion of
    different buffer types goes away, as do the different kinds of
    writers.  The only things remaining are a socket writer and a
    group writer.
    The impact on the network stack is that we no longer use mbufs, so
    the workaround from d18715475071 disappears.
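    
    For reference, a minimal sketch of the buffer layout this implies,
    using the field names visible in the diff below (exact types and
    field order in sys/netlink/netlink_var.h are assumptions, as that
    header is not shown in full here):
    
        #include <sys/queue.h>                  /* TAILQ_ENTRY, TAILQ_HEAD */
    
        struct nl_buf {
                TAILQ_ENTRY(nl_buf) tailq;      /* so_snd/so_rcv nl_queue linkage */
                struct mbuf     *control;       /* optional NETLINK_MSG_INFO control */
                u_int           offset;         /* read cursor into data[] */
                u_int           buflen;         /* bytes allocated in data[] */
                u_int           datalen;        /* bytes of valid data */
                char            data[];         /* linear message storage */
        };
    
        /* The protocol-specific receive buffer is just a queue of these,
         * kept in struct sockbuf next to the existing send queue: */
        TAILQ_HEAD(, nl_buf)    nl_queue;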
    
    Note on message throttling.  The taskqueue throttling mechanism now
    needs to look both at the socket buffers, protected by their
    respective locks, and at flags in the pcb, protected by the pcb
    lock.  There is definitely some room for optimization, but this
    change tries to preserve the existing behavior as much as possible.
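    
    Condensed from nl_process_received_one() in the diff below, the new
    check looks roughly like this (a sketch, not the verbatim code):
    
        /* Don't drain queued requests if there is no room for replies. */
        SOCK_RECVBUF_LOCK(so);
        full = (so->so_rcv.sb_hiwat <= so->so_rcv.sb_ccc);
        SOCK_RECVBUF_UNLOCK(so);
        if (full)
                return (false); /* stays queued until the app reads */
        /* ...then so_snd.nl_queue is drained under SOCK_SENDBUF_LOCK(so),
         * while nl_task_pending is flipped under NLP_LOCK(nlp). */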
    
    Note on the new nl_soreceive().  It emulates soreceive_generic().
    It needs further optimization; see the large comment placed in it.
    
    Note on tests/sys/netlink/test_netlink_message_writer.py.  This test
    boiled down to almost nothing once mbufs were removed.  However, I
    left it with minimal functionality (it basically checks that when we
    allocate N bytes we get N bytes), as it is one of the few examples
    of the ktest framework, which allows testing KPIs with Python.
    
    Note on Linux support.  It got much simpler: the Netlink message
    writer loses the notion of Linux support; its lifetime is the same
    regardless of process ABI.  On a socket write from a Linux process
    we perform the conversion immediately in nl_receive_message(), and
    on output the conversion to Linux happens in nl_send_one().  XXX:
    both conversions use M_NOWAIT allocation, which was also the case
    before this change.
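    
    Schematically, the two conversion points now look like this (a
    sketch; nl_receive_message() and nl_send_one() are in parts of the
    diff not shown here, and the provider pointer name is assumed from
    netlink_linux.h):
    
        /* Input: a Linux-ABI process writes netlink messages; convert
         * immediately, before the request is queued for the taskqueue. */
        if (nlp->nl_linux)
                linux_netlink_p->msg_from_linux(...);           /* M_NOWAIT */
    
        /* Output: convert the writer's nl_buf right before it is
         * handed to the receiving Linux socket's nl_queue. */
        if (nlp->nl_linux)
                ok = linux_netlink_p->msgs_to_linux(nw, nlp);   /* false => drop */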
    
    Reviewed by:            melifaro
    Differential Revision:  https://reviews.freebsd.org/D42524
---
 sys/compat/linux/linux_netlink.c                 |  83 ++-
 sys/netlink/ktest_netlink_message_writer.c       |  94 +---
 sys/netlink/ktest_netlink_message_writer.h       |  20 +-
 sys/netlink/netlink_domain.c                     | 199 +++++--
 sys/netlink/netlink_glue.c                       |   1 -
 sys/netlink/netlink_io.c                         | 281 +++-------
 sys/netlink/netlink_linux.h                      |   9 +-
 sys/netlink/netlink_message_writer.c             | 638 ++++-------------------
 sys/netlink/netlink_message_writer.h             |  53 +-
 sys/netlink/netlink_module.c                     |   3 -
 sys/netlink/netlink_var.h                        |  22 +-
 sys/netlink/route/rt.c                           |   6 +-
 tests/sys/netlink/test_netlink_message_writer.py |  54 +-
 13 files changed, 406 insertions(+), 1057 deletions(-)

diff --git a/sys/compat/linux/linux_netlink.c b/sys/compat/linux/linux_netlink.c
index 807cdc7a14bc..af172fb27ba7 100644
--- a/sys/compat/linux/linux_netlink.c
+++ b/sys/compat/linux/linux_netlink.c
@@ -32,7 +32,6 @@
 #include <sys/ck.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
-#include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/vnode.h>
 
@@ -44,6 +43,7 @@
 #include <netlink/netlink.h>
 #include <netlink/netlink_ctl.h>
 #include <netlink/netlink_linux.h>
+#include <netlink/netlink_var.h>
 #include <netlink/netlink_route.h>
 
 #include <compat/linux/linux.h>
@@ -187,6 +187,7 @@ handle_default_out(struct nlmsghdr *hdr, struct nl_writer *nw)
 
 	if (out_hdr != NULL) {
 		memcpy(out_hdr, hdr, hdr->nlmsg_len);
+		nw->num_messages++;
 		return (true);
 	}
 	return (false);
@@ -518,8 +519,7 @@ nlmsg_error_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *
 }
 
 static bool
-nlmsg_to_linux(int netlink_family, struct nlmsghdr *hdr, struct nlpcb *nlp,
-    struct nl_writer *nw)
+nlmsg_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw)
 {
 	if (hdr->nlmsg_type < NLMSG_MIN_TYPE) {
 		switch (hdr->nlmsg_type) {
@@ -536,7 +536,7 @@ nlmsg_to_linux(int netlink_family, struct nlmsghdr *hdr, struct nlpcb *nlp,
 		}
 	}
 
-	switch (netlink_family) {
+	switch (nlp->nl_proto) {
 	case NETLINK_ROUTE:
 		return (rtnl_to_linux(hdr, nlp, nw));
 	default:
@@ -544,64 +544,49 @@ nlmsg_to_linux(int netlink_family, struct nlmsghdr *hdr, struct nlpcb *nlp,
 	}
 }
 
-static struct mbuf *
-nlmsgs_to_linux(int netlink_family, char *buf, int data_length, struct nlpcb *nlp)
+static bool
+nlmsgs_to_linux(struct nl_writer *nw, struct nlpcb *nlp)
 {
-	RT_LOG(LOG_DEBUG3, "LINUX: get %p size %d", buf, data_length);
-	struct nl_writer nw = {};
-
-	struct mbuf *m = NULL;
-	if (!nlmsg_get_chain_writer(&nw, data_length, &m)) {
-		RT_LOG(LOG_DEBUG, "unable to setup chain writer for size %d",
-		    data_length);
-		return (NULL);
-	}
+	struct nl_buf *nb, *orig;
+	u_int offset, msglen, orig_messages __diagused;
+
+	RT_LOG(LOG_DEBUG3, "%p: in %u bytes %u messages", __func__,
+	    nw->buf->datalen, nw->num_messages);
+
+	orig = nw->buf;
+	nb = nl_buf_alloc(orig->datalen + SCRATCH_BUFFER_SIZE, M_NOWAIT);
+	if (__predict_false(nb == NULL))
+		return (false);
+	nw->buf = nb;
+#ifdef INVARIANTS
+	orig_messages = nw->num_messages;
+#endif
+	nw->num_messages = 0;
 
 	/* Assume correct headers. Buffer IS mutable */
-	int count = 0;
-	for (int offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) {
-		struct nlmsghdr *hdr = (struct nlmsghdr *)&buf[offset];
-		int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
-		count++;
+	for (offset = 0;
+	    offset + sizeof(struct nlmsghdr) <= orig->datalen;
+	    offset += msglen) {
+		struct nlmsghdr *hdr = (struct nlmsghdr *)&orig->data[offset];
 
-		if (!nlmsg_to_linux(netlink_family, hdr, nlp, &nw)) {
+		msglen = NLMSG_ALIGN(hdr->nlmsg_len);
+		if (!nlmsg_to_linux(hdr, nlp, nw)) {
 			RT_LOG(LOG_DEBUG, "failed to process msg type %d",
 			    hdr->nlmsg_type);
-			m_freem(m);
-			return (NULL);
+			nl_buf_free(nb);
+			return (false);
 		}
-		offset += msglen;
 	}
-	nlmsg_flush(&nw);
-	RT_LOG(LOG_DEBUG3, "Processed %d messages, chain size %d", count,
-	    m ? m_length(m, NULL) : 0);
 
-	return (m);
-}
+	MPASS(nw->num_messages == orig_messages);
+	MPASS(nw->buf == nb);
+	nl_buf_free(orig);
+	RT_LOG(LOG_DEBUG3, "%p: out %u bytes", __func__, offset);
 
-static struct mbuf *
-mbufs_to_linux(int netlink_family, struct mbuf *m, struct nlpcb *nlp)
-{
-	/* XXX: easiest solution, not optimized for performance */
-	int data_length = m_length(m, NULL);
-	char *buf = malloc(data_length, M_LINUX, M_NOWAIT);
-	if (buf == NULL) {
-		RT_LOG(LOG_DEBUG, "unable to allocate %d bytes, dropping message",
-		    data_length);
-		m_freem(m);
-		return (NULL);
-	}
-	m_copydata(m, 0, data_length, buf);
-	m_freem(m);
-
-	m = nlmsgs_to_linux(netlink_family, buf, data_length, nlp);
-	free(buf, M_LINUX);
-
-	return (m);
+	return (true);
 }
 
 static struct linux_netlink_provider linux_netlink_v1 = {
-	.mbufs_to_linux = mbufs_to_linux,
 	.msgs_to_linux = nlmsgs_to_linux,
 	.msg_from_linux = nlmsg_from_linux,
 };
diff --git a/sys/netlink/ktest_netlink_message_writer.c b/sys/netlink/ktest_netlink_message_writer.c
index e46065dd4bd2..805f52197f69 100644
--- a/sys/netlink/ktest_netlink_message_writer.c
+++ b/sys/netlink/ktest_netlink_message_writer.c
@@ -29,9 +29,9 @@
 #include <sys/cdefs.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
-#include <sys/mbuf.h>
 #include <netlink/netlink.h>
 #include <netlink/netlink_ctl.h>
+#include <netlink/netlink_var.h>
 #include <netlink/netlink_message_writer.h>
 
 #define KTEST_CALLER
@@ -39,54 +39,47 @@
 
 #ifdef INVARIANTS
 
-struct test_mbuf_attrs {
+struct test_nlbuf_attrs {
 	uint32_t	size;
 	uint32_t	expected_avail;
-	uint32_t	expected_count;
-	uint32_t	wtype;
 	int		waitok;
 };
 
-#define	_OUT(_field)	offsetof(struct test_mbuf_attrs, _field)
-static const struct nlattr_parser nla_p_mbuf_w[] = {
+#define	_OUT(_field)	offsetof(struct test_nlbuf_attrs, _field)
+static const struct nlattr_parser nla_p_nlbuf_w[] = {
 	{ .type = 1, .off = _OUT(size), .cb = nlattr_get_uint32 },
 	{ .type = 2, .off = _OUT(expected_avail), .cb = nlattr_get_uint32 },
-	{ .type = 3, .off = _OUT(expected_count), .cb = nlattr_get_uint32 },
-	{ .type = 4, .off = _OUT(wtype), .cb = nlattr_get_uint32 },
-	{ .type = 5, .off = _OUT(waitok), .cb = nlattr_get_uint32 },
+	{ .type = 3, .off = _OUT(waitok), .cb = nlattr_get_uint32 },
 };
 #undef _OUT
-NL_DECLARE_ATTR_PARSER(mbuf_w_parser, nla_p_mbuf_w);
+NL_DECLARE_ATTR_PARSER(nlbuf_w_parser, nla_p_nlbuf_w);
 
 static int
-test_mbuf_parser(struct ktest_test_context *ctx, struct nlattr *nla)
+test_nlbuf_parser(struct ktest_test_context *ctx, struct nlattr *nla)
 {
-	struct test_mbuf_attrs *attrs = npt_alloc(ctx->npt, sizeof(*attrs));
+	struct test_nlbuf_attrs *attrs = npt_alloc(ctx->npt, sizeof(*attrs));
 
 	ctx->arg = attrs;
 	if (attrs != NULL)
-		return (nl_parse_nested(nla, &mbuf_w_parser, ctx->npt, attrs));
+		return (nl_parse_nested(nla, &nlbuf_w_parser, ctx->npt, attrs));
 	return (ENOMEM);
 }
 
 static int
-test_mbuf_writer_allocation(struct ktest_test_context *ctx)
+test_nlbuf_writer_allocation(struct ktest_test_context *ctx)
 {
-	struct test_mbuf_attrs *attrs = ctx->arg;
-	bool ret;
+	struct test_nlbuf_attrs *attrs = ctx->arg;
 	struct nl_writer nw = {};
+	u_int alloc_len;
+	bool ret;
 
-	ret = nlmsg_get_buf_type_wrapper(&nw, attrs->size, attrs->wtype, attrs->waitok);
+	ret = nlmsg_get_buf_wrapper(&nw, attrs->size, attrs->waitok);
 	if (!ret)
 		return (EINVAL);
 
-	int alloc_len = nw.alloc_len;
+	alloc_len = nw.buf->buflen;
 	KTEST_LOG(ctx, "requested %u, allocated %d", attrs->size, alloc_len);
 
-	/* Set cleanup callback */
-	nw.writer_target = NS_WRITER_TARGET_SOCKET;
-	nlmsg_set_callback_wrapper(&nw);
-
 	/* Mark enomem to avoid reallocation */
 	nw.enomem = true;
 
@@ -95,9 +88,7 @@ test_mbuf_writer_allocation(struct ktest_test_context *ctx)
 		return (EINVAL);
 	}
 
-	/* Mark as empty to free the storage */
-	nw.offset = 0;
-	nlmsg_flush(&nw);
+	nl_buf_free(nw.buf);
 
 	if (alloc_len < attrs->expected_avail) {
 		KTEST_LOG(ctx, "alloc_len %d, expected %u",
@@ -107,60 +98,15 @@ test_mbuf_writer_allocation(struct ktest_test_context *ctx)
 
 	return (0);
 }
-
-static int
-test_mbuf_chain_allocation(struct ktest_test_context *ctx)
-{
-	struct test_mbuf_attrs *attrs = ctx->arg;
-	int mflags = attrs->waitok ? M_WAITOK : M_NOWAIT;
-	struct mbuf *chain = nl_get_mbuf_chain_wrapper(attrs->size, mflags);
-
-	if (chain == NULL) {
-		KTEST_LOG(ctx, "nl_get_mbuf_chain(%u) returned NULL", attrs->size);
-		return (EINVAL);
-	}
-
-	/* Iterate and check number of mbufs and space */
-	uint32_t allocated_count = 0, allocated_size = 0;
-	for (struct mbuf *m = chain; m != NULL; m = m->m_next) {
-		allocated_count++;
-		allocated_size += M_SIZE(m);
-	}
-	m_freem(chain);
-
-	if (attrs->expected_avail > allocated_size) {
-		KTEST_LOG(ctx, "expected/allocated avail(bytes) %u/%u"
-				" expected/allocated count %u/%u",
-		    attrs->expected_avail, allocated_size,
-		    attrs->expected_count, allocated_count);
-		return (EINVAL);
-	}
-
-	if (attrs->expected_count > 0 && (attrs->expected_count != allocated_count)) {
-		KTEST_LOG(ctx, "expected/allocated avail(bytes) %u/%u"
-				" expected/allocated count %u/%u",
-		    attrs->expected_avail, allocated_size,
-		    attrs->expected_count, allocated_count);
-		return (EINVAL);
-	}
-
-	return (0);
-}
 #endif
 
 static const struct ktest_test_info tests[] = {
 #ifdef INVARIANTS
 	{
-		.name = "test_mbuf_writer_allocation",
-		.desc = "test different mbuf sizes in the mbuf writer",
-		.func = &test_mbuf_writer_allocation,
-		.parse = &test_mbuf_parser,
-	},
-	{
-		.name = "test_mbuf_chain_allocation",
-		.desc = "verify allocation different chain sizes",
-		.func = &test_mbuf_chain_allocation,
-		.parse = &test_mbuf_parser,
+		.name = "test_nlbuf_writer_allocation",
+		.desc = "test different buffer sizes in the netlink writer",
+		.func = &test_nlbuf_writer_allocation,
+		.parse = &test_nlbuf_parser,
 	},
 #endif
 };
diff --git a/sys/netlink/ktest_netlink_message_writer.h b/sys/netlink/ktest_netlink_message_writer.h
index b7864bea59c9..39d2c5e597d6 100644
--- a/sys/netlink/ktest_netlink_message_writer.h
+++ b/sys/netlink/ktest_netlink_message_writer.h
@@ -30,28 +30,14 @@
 
 #if defined(_KERNEL) && defined(INVARIANTS)
 
-bool nlmsg_get_buf_type_wrapper(struct nl_writer *nw, int size, int type, bool waitok);
-void nlmsg_set_callback_wrapper(struct nl_writer *nw);
-struct mbuf *nl_get_mbuf_chain_wrapper(int len, int malloc_flags);
+bool nlmsg_get_buf_wrapper(struct nl_writer *nw, u_int size, bool waitok);
 
 #ifndef KTEST_CALLER
 
 bool
-nlmsg_get_buf_type_wrapper(struct nl_writer *nw, int size, int type, bool waitok)
+nlmsg_get_buf_wrapper(struct nl_writer *nw, u_int size, bool waitok)
 {
-	return (nlmsg_get_buf_type(nw, size, type, waitok));
-}
-
-void
-nlmsg_set_callback_wrapper(struct nl_writer *nw)
-{
-	nlmsg_set_callback(nw);
-}
-
-struct mbuf *
-nl_get_mbuf_chain_wrapper(int len, int malloc_flags)
-{
-	return (nl_get_mbuf_chain(len, malloc_flags));
+	return (nlmsg_get_buf(nw, size, waitok));
 }
 #endif
 
diff --git a/sys/netlink/netlink_domain.c b/sys/netlink/netlink_domain.c
index ecd110d62c1f..3914d402fc04 100644
--- a/sys/netlink/netlink_domain.c
+++ b/sys/netlink/netlink_domain.c
@@ -179,53 +179,76 @@ nl_get_groups_compat(struct nlpcb *nlp)
 }
 
 static void
-nl_send_one_group(struct mbuf *m, struct nlpcb *nlp, int num_messages,
-    int io_flags)
+nl_send_one_group(struct nl_writer *nw, struct nl_buf *nb, struct nlpcb *nlp)
 {
 	if (__predict_false(nlp->nl_flags & NLF_MSG_INFO))
-		nl_add_msg_info(m);
-	nl_send_one(m, nlp, num_messages, io_flags);
+		nl_add_msg_info(nb);
+	nw->buf = nb;
+	(void)nl_send_one(nw);
+}
+
+static struct nl_buf *
+nl_buf_copy(struct nl_buf *nb)
+{
+	struct nl_buf *copy;
+
+	copy = nl_buf_alloc(nb->buflen, M_NOWAIT);
+	if (__predict_false(copy == NULL))
+		return (NULL);
+	memcpy(copy, nb, sizeof(*nb) + nb->buflen);
+	if (nb->control != NULL) {
+		copy->control = m_copym(nb->control, 0, M_COPYALL, M_NOWAIT);
+		if (__predict_false(copy->control == NULL)) {
+			nl_buf_free(copy);
+			return (NULL);
+		}
+	}
+
+	return (copy);
 }
 
 /*
- * Broadcasts message @m to the protocol @proto group specified by @group_id
+ * Broadcasts in the writer's buffer.
  */
-void
-nl_send_group(struct mbuf *m, int num_messages, int proto, int group_id)
+bool
+nl_send_group(struct nl_writer *nw)
 {
+	struct nl_buf *nb = nw->buf;
 	struct nlpcb *nlp_last = NULL;
 	struct nlpcb *nlp;
 	NLCTL_TRACKER;
 
 	IF_DEBUG_LEVEL(LOG_DEBUG2) {
-		struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *);
-		NL_LOG(LOG_DEBUG2, "MCAST mbuf len %u msg type %d len %u to group %d/%d",
-		    m->m_len, hdr->nlmsg_type, hdr->nlmsg_len, proto, group_id);
+		struct nlmsghdr *hdr = (struct nlmsghdr *)nb->data;
+		NL_LOG(LOG_DEBUG2, "MCAST len %u msg type %d len %u to group %d/%d",
+		    nb->datalen, hdr->nlmsg_type, hdr->nlmsg_len,
+		    nw->group.proto, nw->group.id);
 	}
 
+	nw->buf = NULL;
+
 	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
 	if (__predict_false(ctl == NULL)) {
 		/*
 		 * Can be the case when notification is sent within VNET
 		 * which doesn't have any netlink sockets.
 		 */
-		m_freem(m);
-		return;
+		nl_buf_free(nb);
+		return (false);
 	}
 
 	NLCTL_RLOCK(ctl);
 
-	int io_flags = NL_IOF_UNTRANSLATED;
-
 	CK_LIST_FOREACH(nlp, &ctl->ctl_pcb_head, nl_next) {
-		if (nl_isset_group_locked(nlp, group_id) && nlp->nl_proto == proto) {
+		if (nl_isset_group_locked(nlp, nw->group.id) &&
+		    nlp->nl_proto == nw->group.proto) {
 			if (nlp_last != NULL) {
-				struct mbuf *m_copy;
-				m_copy = m_copym(m, 0, M_COPYALL, M_NOWAIT);
-				if (m_copy != NULL)
-					nl_send_one_group(m_copy, nlp_last,
-					    num_messages, io_flags);
-				else {
+				struct nl_buf *copy;
+
+				copy = nl_buf_copy(nb);
+				if (copy != NULL) {
+					nl_send_one_group(nw, copy, nlp_last);
+				} else {
 					NLP_LOCK(nlp_last);
 					if (nlp_last->nl_socket != NULL)
 						sorwakeup(nlp_last->nl_socket);
@@ -236,11 +259,13 @@ nl_send_group(struct mbuf *m, int num_messages, int proto, int group_id)
 		}
 	}
 	if (nlp_last != NULL)
-		nl_send_one_group(m, nlp_last, num_messages, io_flags);
+		nl_send_one_group(nw, nb, nlp_last);
 	else
-		m_freem(m);
+		nl_buf_free(nb);
 
 	NLCTL_RUNLOCK(ctl);
+
+	return (true);
 }
 
 bool
@@ -331,7 +356,7 @@ nl_pru_attach(struct socket *so, int proto, struct thread *td)
 		free(nlp, M_PCB);
 		return (error);
 	}
-	so->so_rcv.sb_mtx = &so->so_rcv_mtx;
+	TAILQ_INIT(&so->so_rcv.nl_queue);
 	TAILQ_INIT(&so->so_snd.nl_queue);
 	so->so_pcb = nlp;
 	nlp->nl_socket = so;
@@ -344,7 +369,6 @@ nl_pru_attach(struct socket *so, int proto, struct thread *td)
 	nlp->nl_need_thread_setup = true;
 	NLP_LOCK_INIT(nlp);
 	refcount_init(&nlp->nl_refcount, 1);
-	nl_init_io(nlp);
 
 	nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK,
 	    taskqueue_thread_enqueue, &nlp->nl_taskqueue);
@@ -467,15 +491,6 @@ nl_pru_connect(struct socket *so, struct sockaddr *sa, struct thread *td)
 	return (0);
 }
 
-static void
-destroy_nlpcb(struct nlpcb *nlp)
-{
-	NLP_LOCK(nlp);
-	nl_free_io(nlp);
-	NLP_LOCK_DESTROY(nlp);
-	free(nlp, M_PCB);
-}
-
 static void
 destroy_nlpcb_epoch(epoch_context_t ctx)
 {
@@ -483,10 +498,10 @@ destroy_nlpcb_epoch(epoch_context_t ctx)
 
 	nlp = __containerof(ctx, struct nlpcb, nl_epoch_ctx);
 
-	destroy_nlpcb(nlp);
+	NLP_LOCK_DESTROY(nlp);
+	free(nlp, M_PCB);
 }
 
-
 static void
 nl_close(struct socket *so)
 {
@@ -522,9 +537,12 @@ nl_close(struct socket *so)
 
 	while ((nb = TAILQ_FIRST(&so->so_snd.nl_queue)) != NULL) {
 		TAILQ_REMOVE(&so->so_snd.nl_queue, nb, tailq);
-		free(nb, M_NETLINK);
+		nl_buf_free(nb);
+	}
+	while ((nb = TAILQ_FIRST(&so->so_rcv.nl_queue)) != NULL) {
+		TAILQ_REMOVE(&so->so_rcv.nl_queue, nb, tailq);
+		nl_buf_free(nb);
 	}
-	sbdestroy(so, SO_RCV);
 
 	NL_LOG(LOG_DEBUG3, "socket %p, detached", so);
 
@@ -597,10 +615,8 @@ nl_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
 	len = roundup2(uio->uio_resid, 8) + SCRATCH_BUFFER_SIZE;
 	if (nlp->nl_linux)
 		len += roundup2(uio->uio_resid, 8);
-	nb = malloc(sizeof(*nb) + len, M_NETLINK, M_WAITOK);
+	nb = nl_buf_alloc(len, M_WAITOK);
 	nb->datalen = uio->uio_resid;
-	nb->buflen = len;
-	nb->offset = 0;
 	error = uiomove(&nb->data[0], uio->uio_resid, uio);
 	if (__predict_false(error))
 		goto out;
@@ -635,19 +651,107 @@ restart:
 
 out:
 	SOCK_IO_SEND_UNLOCK(so);
-	free(nb, M_NETLINK);
+	if (nb != NULL)
+		nl_buf_free(nb);
 	return (error);
 }
 
 static int
-nl_pru_rcvd(struct socket *so, int flags)
+nl_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
+    struct mbuf **mp, struct mbuf **controlp, int *flagsp)
 {
+	static const struct sockaddr_nl nl_empty_src = {
+		.nl_len = sizeof(struct sockaddr_nl),
+		.nl_family = PF_NETLINK,
+		.nl_pid = 0 /* comes from the kernel */
+	};
+	struct sockbuf *sb = &so->so_rcv;
+	struct nl_buf *nb;
+	int flags, error;
+	u_int overflow;
+	bool nonblock, trunc, peek;
+
+	MPASS(mp == NULL && uio != NULL);
+
 	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
-	MPASS(sotonlpcb(so) != NULL);
+
+	if (psa != NULL)
+		*psa = sodupsockaddr((const struct sockaddr *)&nl_empty_src,
+		    M_WAITOK);
+
+	flags = flagsp != NULL ? *flagsp & ~MSG_TRUNC : 0;
+	trunc = flagsp != NULL ? *flagsp & MSG_TRUNC : false;
+	nonblock = (so->so_state & SS_NBIO) ||
+	    (flags & (MSG_DONTWAIT | MSG_NBIO));
+	peek = flags & MSG_PEEK;
+
+	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
+	if (__predict_false(error))
+		return (error);
+
+	SOCK_RECVBUF_LOCK(so);
+	while ((nb = TAILQ_FIRST(&sb->nl_queue)) == NULL) {
+		if (nonblock) {
+			SOCK_RECVBUF_UNLOCK(so);
+			SOCK_IO_RECV_UNLOCK(so);
+			return (EWOULDBLOCK);
+		}
+		error = sbwait(so, SO_RCV);
+		if (error) {
+			SOCK_RECVBUF_UNLOCK(so);
+			SOCK_IO_RECV_UNLOCK(so);
+			return (error);
+		}
+	}
+
+	/*
+	 * XXXGL
+	 * Here we emulate a PR_ATOMIC behavior of soreceive_generic() where
+	 * we take only the first "record" in the socket buffer and send it
+	 * to uio whole or truncated ignoring how many netlink messages are
+	 * in the record and how much space is left in the uio.
+	 * This needs to be fixed at next refactoring. First, we should perform
+	 * truncation only if the very first message doesn't fit into uio.
+	 * That will help an application with small buffer not to lose data.
+	 * Second, we should continue working on the sb->nl_queue as long as
+	 * there is more space in the uio.  That will boost applications with
+	 * large buffers.
+	 */
+	if (__predict_true(!peek)) {
+		TAILQ_REMOVE(&sb->nl_queue, nb, tailq);
+		sb->sb_acc -= nb->datalen;
+		sb->sb_ccc -= nb->datalen;
+	}
+	SOCK_RECVBUF_UNLOCK(so);
+
+	overflow = __predict_false(nb->datalen > uio->uio_resid) ?
+	    nb->datalen - uio->uio_resid : 0;
+	error = uiomove(nb->data, (int)nb->datalen, uio);
+	if (__predict_false(overflow > 0)) {
+		flags |= MSG_TRUNC;
+		if (trunc)
+			uio->uio_resid -= overflow;
+	}
+
+	if (controlp != NULL) {
+		*controlp = nb->control;
+		nb->control = NULL;
+	}
+
+	if (__predict_true(!peek))
+		nl_buf_free(nb);
+
+	if (uio->uio_td)
+		uio->uio_td->td_ru.ru_msgrcv++;
+
+	if (flagsp != NULL)
+		*flagsp |= flags;
+
+	SOCK_IO_RECV_UNLOCK(so);
 
 	nl_on_transmit(sotonlpcb(so));
 
-	return (0);
+	return (error);
 }
 
 static int
@@ -798,8 +902,7 @@ nl_setsbopt(struct socket *so, struct sockopt *sopt)
 }
 
 #define	NETLINK_PROTOSW						\
-	.pr_flags = PR_ATOMIC | PR_ADDR | PR_WANTRCVD |		\
-	    PR_SOCKBUF,						\
+	.pr_flags = PR_ATOMIC | PR_ADDR | PR_SOCKBUF,		\
 	.pr_ctloutput = nl_ctloutput,				\
 	.pr_setsbopt = nl_setsbopt,				\
 	.pr_attach = nl_pru_attach,				\
@@ -807,7 +910,7 @@ nl_setsbopt(struct socket *so, struct sockopt *sopt)
 	.pr_connect = nl_pru_connect,				\
 	.pr_disconnect = nl_pru_disconnect,			\
 	.pr_sosend = nl_sosend,					\
-	.pr_rcvd = nl_pru_rcvd,					\
+	.pr_soreceive = nl_soreceive,				\
 	.pr_shutdown = nl_pru_shutdown,				\
 	.pr_sockaddr = nl_sockaddr,				\
 	.pr_close = nl_close
diff --git a/sys/netlink/netlink_glue.c b/sys/netlink/netlink_glue.c
index e7649c6b13dc..e4b52ffb191b 100644
--- a/sys/netlink/netlink_glue.c
+++ b/sys/netlink/netlink_glue.c
@@ -111,7 +111,6 @@ static bool
 get_stub_writer(struct nl_writer *nw)
 {
 	bzero(nw, sizeof(*nw));
-	nw->writer_type = NS_WRITER_TYPE_STUB;
 	nw->enomem = true;
 
 	return (false);
diff --git a/sys/netlink/netlink_io.c b/sys/netlink/netlink_io.c
index 7e2e098e4a9a..56e430cdcfa8 100644
--- a/sys/netlink/netlink_io.c
+++ b/sys/netlink/netlink_io.c
@@ -51,69 +51,36 @@ _DECLARE_DEBUG(LOG_INFO);
  * sending netlink data between the kernel and userland.
  */
 
-static const struct sockaddr_nl _nl_empty_src = {
-	.nl_len = sizeof(struct sockaddr_nl),
-	.nl_family = PF_NETLINK,
-	.nl_pid = 0 /* comes from the kernel */
-};
-static const struct sockaddr *nl_empty_src = (const struct sockaddr *)&_nl_empty_src;
-
 static bool nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp);
 
-static void
-queue_push(struct nl_io_queue *q, struct mbuf *mq)
-{
-	while (mq != NULL) {
-		struct mbuf *m = mq;
-		mq = mq->m_nextpkt;
-		m->m_nextpkt = NULL;
-
-		q->length += m_length(m, NULL);
-		STAILQ_INSERT_TAIL(&q->head, m, m_stailqpkt);
-	}
-}
-
-static struct mbuf *
-queue_pop(struct nl_io_queue *q)
+struct nl_buf *
+nl_buf_alloc(size_t len, int mflag)
 {
-	if (!STAILQ_EMPTY(&q->head)) {
-		struct mbuf *m = STAILQ_FIRST(&q->head);
-		STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
-		m->m_nextpkt = NULL;
-		q->length -= m_length(m, NULL);
+	struct nl_buf *nb;
 
-		return (m);
+	nb = malloc(sizeof(struct nl_buf) + len, M_NETLINK, mflag);
+	if (__predict_true(nb != NULL)) {
+		nb->buflen = len;
+		nb->datalen = nb->offset = 0;
+		nb->control = NULL;
 	}
-	return (NULL);
-}
 
-static struct mbuf *
-queue_head(const struct nl_io_queue *q)
-{
-	return (STAILQ_FIRST(&q->head));
+	return (nb);
 }
 
-static inline bool
-queue_empty(const struct nl_io_queue *q)
+void
+nl_buf_free(struct nl_buf *nb)
 {
-	return (q->length == 0);
-}
 
-static void
-queue_free(struct nl_io_queue *q)
-{
-	while (!STAILQ_EMPTY(&q->head)) {
-		struct mbuf *m = STAILQ_FIRST(&q->head);
-		STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
-		m->m_nextpkt = NULL;
-		m_freem(m);
-	}
-	q->length = 0;
+	if (nb->control)
+		m_freem(nb->control);
+	free(nb, M_NETLINK);
 }
 
 void
-nl_add_msg_info(struct mbuf *m)
+nl_add_msg_info(struct nl_buf *nb)
 {
+	/* XXXGL pass nlp as arg? */
 	struct nlpcb *nlp = nl_get_thread_nlp(curthread);
 	NL_LOG(LOG_DEBUG2, "Trying to recover nlp from thread %p: %p",
 	    curthread, nlp);
@@ -139,27 +106,15 @@ nl_add_msg_info(struct mbuf *m)
 	};
 
 
-	while (m->m_next != NULL)
-		m = m->m_next;
-	m->m_next = sbcreatecontrol(data, sizeof(data),
+	nb->control = sbcreatecontrol(data, sizeof(data),
 	    NETLINK_MSG_INFO, SOL_NETLINK, M_NOWAIT);
 
-	NL_LOG(LOG_DEBUG2, "Storing %u bytes of data, ctl: %p",
-	    (unsigned)sizeof(data), m->m_next);
-}
-
-static __noinline struct mbuf *
-extract_msg_info(struct mbuf *m)
-{
-	while (m->m_next != NULL) {
-		if (m->m_next->m_type == MT_CONTROL) {
-			struct mbuf *ctl = m->m_next;
-			m->m_next = NULL;
-			return (ctl);
-		}
-		m = m->m_next;
-	}
-	return (NULL);
+	if (__predict_true(nb->control != NULL))
+		NL_LOG(LOG_DEBUG2, "Storing %u bytes of control data, ctl: %p",
+		    (unsigned)sizeof(data), nb->control);
+	else
+		NL_LOG(LOG_DEBUG2, "Failed to allocate %u bytes of control",
+		    (unsigned)sizeof(data));
 }
 
 void
@@ -174,65 +129,31 @@ nl_schedule_taskqueue(struct nlpcb *nlp)
 	}
 }
 
-static bool
-tx_check_locked(struct nlpcb *nlp)
-{
-	if (queue_empty(&nlp->tx_queue))
-		return (true);
-
-	/*
-	 * Check if something can be moved from the internal TX queue
-	 * to the socket queue.
-	 */
-
-	bool appended = false;
-	struct sockbuf *sb = &nlp->nl_socket->so_rcv;
-	SOCKBUF_LOCK(sb);
-
-	while (true) {
-		struct mbuf *m = queue_head(&nlp->tx_queue);
-		if (m != NULL) {
-			struct mbuf *ctl = NULL;
-			if (__predict_false(m->m_next != NULL))
-				ctl = extract_msg_info(m);
-			if (sbappendaddr_locked(sb, nl_empty_src, m, ctl) != 0) {
-				/* appended successfully */
-				queue_pop(&nlp->tx_queue);
-				appended = true;
-			} else
-				break;
-		} else
-			break;
-	}
-
-	SOCKBUF_UNLOCK(sb);
-
-	if (appended)
-		sorwakeup(nlp->nl_socket);
-
-	return (queue_empty(&nlp->tx_queue));
-}
-
 static bool
 nl_process_received_one(struct nlpcb *nlp)
 {
 	struct socket *so = nlp->nl_socket;
-	struct sockbuf *sb = &so->so_snd;
+	struct sockbuf *sb;
 	struct nl_buf *nb;
 	bool reschedule = false;
 
 	NLP_LOCK(nlp);
 	nlp->nl_task_pending = false;
+	NLP_UNLOCK(nlp);
 
-	if (!tx_check_locked(nlp)) {
-		/* TX overflow queue still not empty, ignore RX */
-		NLP_UNLOCK(nlp);
+	/*
+	 * Do not process queued up requests if there is no space to queue
+	 * replies.
+	 */
+	sb = &so->so_rcv;
+	SOCK_RECVBUF_LOCK(so);
+	if (sb->sb_hiwat <= sb->sb_ccc) {
+		SOCK_RECVBUF_UNLOCK(so);
 		return (false);
 	}
+	SOCK_RECVBUF_UNLOCK(so);
 
-	int prev_hiwat = nlp->tx_queue.hiwat;
-	NLP_UNLOCK(nlp);
-
+	sb = &so->so_snd;
 	SOCK_SENDBUF_LOCK(so);
 	while ((nb = TAILQ_FIRST(&sb->nl_queue)) != NULL) {
 		TAILQ_REMOVE(&sb->nl_queue, nb, tailq);
@@ -244,7 +165,7 @@ nl_process_received_one(struct nlpcb *nlp)
 			sb->sb_ccc -= nb->datalen;
 			/* XXXGL: potentially can reduce lock&unlock count. */
 			sowwakeup_locked(so);
-			free(nb, M_NETLINK);
+			nl_buf_free(nb);
 			SOCK_SENDBUF_LOCK(so);
 		} else {
 			TAILQ_INSERT_HEAD(&sb->nl_queue, nb, tailq);
@@ -252,10 +173,6 @@ nl_process_received_one(struct nlpcb *nlp)
 		}
 	}
 	SOCK_SENDBUF_UNLOCK(so);
-	if (nlp->tx_queue.hiwat > prev_hiwat) {
-		NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat);
-
-	}
 
 	return (reschedule);
 }
@@ -276,18 +193,6 @@ nl_process_received(struct nlpcb *nlp)
 		;
 }
 
-void
-nl_init_io(struct nlpcb *nlp)
-{
-	STAILQ_INIT(&nlp->tx_queue.head);
-}
-
-void
-nl_free_io(struct nlpcb *nlp)
-{
-	queue_free(&nlp->tx_queue);
-}
-
 /*
  * Called after some data have been read from the socket.
  */
@@ -306,8 +211,8 @@ nl_on_transmit(struct nlpcb *nlp)
 		struct sockbuf *sb = &so->so_rcv;
 		NLP_LOG(LOG_DEBUG, nlp,
 		    "socket RX overflowed, %lu messages (%lu bytes) dropped. "
-		    "bytes: [%u/%u] mbufs: [%u/%u]", dropped_messages, dropped_bytes,
-		    sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax);
+		    "bytes: [%u/%u]", dropped_messages, dropped_bytes,
+		    sb->sb_ccc, sb->sb_hiwat);
*** 1285 LINES SKIPPED ***