Does FreeBSD have sendmmsg or recvmmsg system calls?
Boris Astardzhiev
boris.astardzhiev at gmail.com
Tue Jan 12 14:53:28 UTC 2016
Hello again,
In my spare time I did the following simple libc-only implementation of the
syscalls.
I did some tests in a VM adapting these experiments:
https://blog.cloudflare.com/how-to-receive-a-million-packets/
Any comments about the diff are greatly appreciated.
Best regards,
Boris Astardzhiev
On Fri, Jan 8, 2016 at 7:02 PM, Adrian Chadd <adrian.chadd at gmail.com> wrote:
> On 8 January 2016 at 03:02, Bruce Evans <brde at optusnet.com.au> wrote:
> > On Fri, 8 Jan 2016, Adrian Chadd wrote:
> >
> >> On 7 January 2016 at 23:58, Mark Delany <c2h at romeo.emu.st> wrote:
> >>>
> >>> On 08Jan16, Bruce Evans allegedly wrote:
> >>>>
> >>>> If the NIC can't reach line rate
> >>>
> >>>
> >>>> Network stack overheads are also enormous.
> >>>
> >>>
> >>> Bruce makes some excellent points.
> >>>
> >>> I challenge anyone to get line rate UDP out of FBSD (or Linux) for a
> >>> 1G NIC yet alone a 10G NIC listening to a single port. It was exactly
> >>> my frustration with UDP performance that led me down the path of
> >>> *mmsg() and netmap.
> >>>
> >>> Frankly this is an opportunity for FBSD as UDP performance appears to
> >>> be a neglected area.
> >>
> >>
> >> I'm there, on 16 threads.
> >>
> >> I'd rather we do it on two or three, as a lot of time is wasted in
> >> producer/consumer locking. but yeah, 500k tx/rx should be doable per
> >> CPU with only locking changes.
>
> .. and I did mean "kernel producer/consumer locking changes."
>
> >
> > Line rate for 1 Gbps is about 1500 kpps (small packets).
> >
> > With I218V2 (em), I see enormous lock contention above 3 or 4 (user)
> > threads, and 8 are slightly slower than 1. 1 doesn't saturate the NIC,
> > and 2 is optimal.
> >
>
> The RSS support in -HEAD lets you get away with parallelising UDP
> streams very nicely.
>
> The framework is pretty simple (!):
>
> * drivers ask the RSS code for the RSS config and RSS hash to use, and
> configure the hardware appropriately;
> * the netisr input paths check the existence of the RSS hash and will
> calculte it in software if reqiured;
> * v4/v6 reassembly is done (at the IP level, /not/ at the protocol
> level) and if it needs a new RSS hash / netisr reinjection, that'll
> happen;
> * the PCB lookup code for listen sockets now allows one listen socket
> per RSS bucket - as the RSS / PCBGROUPS code already extended the PCB
> to have one PCB table per RSS bucket (as well as a global one);
>
> So:
>
> * userland code queries RSS for the CPU and RSS bucket setup;
> * you then create one listen socket per RSS bucket, bind it to the
> local thread (if you want) and tell it "you're in RSS bucket X";
> * .. and then in the UDP case for local-bound sockets, the
> transmit/receive path does not require modifying the global PCB state,
> so the locking is kept per-RSS bucket, and scales linearly with the
> number of CPUs you have (until you hit the NIC queue limits.)
>
> https://github.com/erikarn/freebsd-rss/
>
> and:
>
>
> http://adrianchadd.blogspot.com/2014/06/hacking-on-receive-side-scaling-rss-on.html
>
> http://adrianchadd.blogspot.com/2014/07/application-awareness-of-receive-side.html
>
> http://adrianchadd.blogspot.com/2014/08/receive-side-scaling-figuring-out-how.html
>
> http://adrianchadd.blogspot.com/2014/09/receive-side-scaling-testing-udp.html
>
> http://adrianchadd.blogspot.com/2014/10/more-rss-udp-tests-this-time-on-dell.html
>
>
>
> -adrian
> _______________________________________________
> freebsd-net at freebsd.org mailing list
> https://lists.freebsd.org/mailman/listinfo/freebsd-net
> To unsubscribe, send any mail to "freebsd-net-unsubscribe at freebsd.org"
>
-------------- next part --------------
diff --git a/lib/libc/include/libc_private.h b/lib/libc/include/libc_private.h
index 5caf9a3..9a0d6cf 100644
--- a/lib/libc/include/libc_private.h
+++ b/lib/libc/include/libc_private.h
@@ -224,6 +224,8 @@ enum {
INTERPOS_kevent,
INTERPOS_wait6,
INTERPOS_ppoll,
+ INTERPOS_sendmmsg,
+ INTERPOS_recvmmsg,
INTERPOS_MAX
};
diff --git a/lib/libc/include/namespace.h b/lib/libc/include/namespace.h
index 739d7b1..c95829e 100644
--- a/lib/libc/include/namespace.h
+++ b/lib/libc/include/namespace.h
@@ -208,6 +208,7 @@
#define readv _readv
#define recvfrom _recvfrom
#define recvmsg _recvmsg
+#define recvmmsg _recvmmsg
#define select _select
#define sem_close _sem_close
#define sem_destroy _sem_destroy
@@ -220,6 +221,7 @@
#define sem_unlink _sem_unlink
#define sem_wait _sem_wait
#define sendmsg _sendmsg
+#define sendmmsg _sendmmsg
#define sendto _sendto
#define setsockopt _setsockopt
/*#define sigaction _sigaction*/
diff --git a/lib/libc/include/un-namespace.h b/lib/libc/include/un-namespace.h
index f31fa7a..0233348 100644
--- a/lib/libc/include/un-namespace.h
+++ b/lib/libc/include/un-namespace.h
@@ -189,6 +189,7 @@
#undef readv
#undef recvfrom
#undef recvmsg
+#undef recvmmsg
#undef select
#undef sem_close
#undef sem_destroy
@@ -201,6 +202,7 @@
#undef sem_unlink
#undef sem_wait
#undef sendmsg
+#undef sendmmsg
#undef sendto
#undef setsockopt
#undef sigaction
diff --git a/lib/libc/sys/Makefile.inc b/lib/libc/sys/Makefile.inc
index e4fe1b2..ecb366a 100644
--- a/lib/libc/sys/Makefile.inc
+++ b/lib/libc/sys/Makefile.inc
@@ -28,6 +28,10 @@ SRCS+= futimens.c utimensat.c
NOASM+= futimens.o utimensat.o
PSEUDO+= _futimens.o _utimensat.o
+SRCS+= recvmmsg.c sendmmsg.c
+NOASM+= recvmmsg.o sendmmsg.o
+PSEUDO+= _recvmmsg.o _sendmmsg.o
+
INTERPOSED = \
accept \
accept4 \
diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map
index 7b3257c..724e1b4 100644
--- a/lib/libc/sys/Symbol.map
+++ b/lib/libc/sys/Symbol.map
@@ -399,6 +399,8 @@ FBSD_1.4 {
utimensat;
numa_setaffinity;
numa_getaffinity;
+ sendmmsg;
+ recvmmsg;
};
FBSDprivate_1.0 {
@@ -1051,4 +1053,6 @@ FBSDprivate_1.0 {
gssd_syscall;
__libc_interposing_slot;
__libc_sigwait;
+ _sendmmsg;
+ _recvmmsg;
};
diff --git a/lib/libc/sys/recvmmsg.c b/lib/libc/sys/recvmmsg.c
new file mode 100644
index 0000000..03ab379
--- /dev/null
+++ b/lib/libc/sys/recvmmsg.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016 Boris Astardzhiev, Smartcom-Bulgaria AD
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice(s), this list of conditions and the following disclaimer as
+ * the first lines of this file unmodified other than the possible
+ * addition of one or more copyright notices.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice(s), this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/socket.h>
+#include "libc_private.h"
+
+#define VLEN_MAX 1024
+
+int
+recvmmsg(int s, struct mmsghdr *msgvec, unsigned int vlen, int flags)
+{
+ int i, ret, rcvd;
+
+ if (vlen > VLEN_MAX)
+ vlen = VLEN_MAX;
+
+ rcvd = 0;
+ for (i = 0; i < vlen; i++) {
+ errno = 0;
+ ret = (((int (*)(int, const struct msghdr *, int))
+ __libc_interposing[INTERPOS_recvmsg])(s,
+ &msgvec[i].msg_hdr, flags));
+ if (ret < 0 || errno != 0) {
+ if (rcvd) {
+ /* We've received messages. Let caller know. */
+ errno = 0;
+ return (rcvd);
+ }
+ return (-1);
+ }
+
+ /* Save received bytes */
+ msgvec[i].msg_len = ret;
+
+ rcvd++;
+ }
+
+ return (rcvd);
+}
+
+#undef VLEN_MAX
diff --git a/lib/libc/sys/sendmmsg.c b/lib/libc/sys/sendmmsg.c
new file mode 100644
index 0000000..3387fdc
--- /dev/null
+++ b/lib/libc/sys/sendmmsg.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016 Boris Astardzhiev, Smartcom-Bulgaria AD
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice(s), this list of conditions and the following disclaimer as
+ * the first lines of this file unmodified other than the possible
+ * addition of one or more copyright notices.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice(s), this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/socket.h>
+#include "libc_private.h"
+
+#define VLEN_MAX 1024
+
+int
+sendmmsg(int s, struct mmsghdr *msgvec, unsigned int vlen, int flags)
+{
+ int i, ret, sent;
+
+ if (vlen > VLEN_MAX)
+ vlen = VLEN_MAX;
+
+ sent = 0;
+ for (i = 0; i < vlen; i++) {
+ errno = 0;
+ ret = (((int (*)(int, const struct msghdr *, int))
+ __libc_interposing[INTERPOS_sendmsg])(s,
+ &msgvec[i].msg_hdr, flags));
+ if (ret < 0 || errno != 0) {
+ if (sent) {
+ /* We have sent messages. Let caller know. */
+ errno = 0;
+ return (sent);
+ }
+ return (-1);
+ }
+
+ /* Save sent bytes */
+ msgvec[i].msg_len = ret;
+
+ sent++;
+ }
+
+ return (sent);
+}
+
+#undef VLEN_MAX
diff --git a/lib/libthr/thread/thr_syscalls.c b/lib/libthr/thread/thr_syscalls.c
index 7c05697..7b5458d 100644
--- a/lib/libthr/thread/thr_syscalls.c
+++ b/lib/libthr/thread/thr_syscalls.c
@@ -606,6 +606,84 @@ __thr_writev(int fd, const struct iovec *iov, int iovcnt)
return (ret);
}
+#define VLEN_MAX 1024
+
+static int
+__thr_sendmmsg(int s, struct mmsghdr *msgvec, unsigned int vlen, int flags)
+{
+ struct pthread *curthread;
+ int i, ret, sent;
+
+ curthread = _get_curthread();
+ _thr_cancel_enter(curthread);
+
+ if (vlen > VLEN_MAX)
+ vlen = VLEN_MAX;
+
+ sent = 0;
+ for (i = 0; i < (int)vlen; i++) {
+ errno = 0;
+ ret = __sys_sendmsg(s, &msgvec[i].msg_hdr, flags);
+ if (ret < 0 || errno != 0) {
+ if (sent) {
+ /* We have sent messages. Let caller know. */
+ errno = 0;
+ _thr_cancel_leave(curthread, ret <= 0);
+ return (sent);
+ }
+ return (-1);
+ }
+
+ /* Save sent bytes */
+ msgvec[i].msg_len = ret;
+
+ sent++;
+ }
+
+ _thr_cancel_leave(curthread, ret <= 0);
+
+ return (sent);
+}
+
+static int
+__thr_recvmmsg(int s, struct mmsghdr *msgvec, unsigned int vlen, int flags)
+{
+ struct pthread *curthread;
+ int i, ret, rcvd;
+
+ curthread = _get_curthread();
+ _thr_cancel_enter(curthread);
+
+ if (vlen > VLEN_MAX)
+ vlen = VLEN_MAX;
+
+ rcvd = 0;
+ for (i = 0; i < (int)vlen; i++) {
+ errno = 0;
+ ret = __sys_recvmsg(s, &msgvec[i].msg_hdr, flags);
+ if (ret < 0 || errno != 0) {
+ if (rcvd) {
+ /* We've received messages. Let caller know. */
+ errno = 0;
+ _thr_cancel_leave(curthread, ret == -1);
+ return (rcvd);
+ }
+ return (-1);
+ }
+
+ /* Save received bytes */
+ msgvec[i].msg_len = ret;
+
+ rcvd++;
+ }
+
+ _thr_cancel_leave(curthread, ret == -1);
+
+ return (rcvd);
+}
+
+#undef VLEN_MAX
+
void
__thr_interpose_libc(void)
{
@@ -652,6 +730,8 @@ __thr_interpose_libc(void)
SLOT(kevent);
SLOT(wait6);
SLOT(ppoll);
+ SLOT(sendmmsg);
+ SLOT(recvmmsg);
#undef SLOT
*(__libc_interposing_slot(
INTERPOS__pthread_mutex_init_calloc_cb)) =
diff --git a/sys/sys/socket.h b/sys/sys/socket.h
index 18e2de1..504313e 100644
--- a/sys/sys/socket.h
+++ b/sys/sys/socket.h
@@ -595,6 +595,18 @@ struct sf_hdtr {
#endif /* _KERNEL */
#endif /* __BSD_VISIBLE */
+#ifndef _KERNEL
+#ifdef __BSD_VISIBLE
+/*
+ * Send/recvmmsg specific structure(s)
+ */
+struct mmsghdr {
+ struct msghdr msg_hdr; /* message header */
+ unsigned int msg_len; /* message length */
+};
+#endif /* __BSD_VISIBLE */
+#endif /* !_KERNEL */
+
#ifndef _KERNEL
#include <sys/cdefs.h>
@@ -615,11 +627,17 @@ int listen(int, int);
ssize_t recv(int, void *, size_t, int);
ssize_t recvfrom(int, void *, size_t, int, struct sockaddr * __restrict, socklen_t * __restrict);
ssize_t recvmsg(int, struct msghdr *, int);
+#if __BSD_VISIBLE
+int recvmmsg(int, struct mmsghdr *, unsigned int, int);
+#endif
ssize_t send(int, const void *, size_t, int);
ssize_t sendto(int, const void *,
size_t, int, const struct sockaddr *, socklen_t);
ssize_t sendmsg(int, const struct msghdr *, int);
#if __BSD_VISIBLE
+int sendmmsg(int, struct mmsghdr *, unsigned int, int);
+#endif
+#if __BSD_VISIBLE
int sendfile(int, int, off_t, size_t, struct sf_hdtr *, off_t *, int);
int setfib(int);
#endif
More information about the freebsd-net
mailing list