git: a0069f16fe23 - main - tools: Add a couple of utilities which were useful for testing SO_SPLICE

From: Mark Johnston <markj_at_FreeBSD.org>
Date: Fri, 01 Nov 2024 14:07:32 UTC
The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=a0069f16fe235068fac8fac8d11aee4c5dac6e8c

commit a0069f16fe235068fac8fac8d11aee4c5dac6e8c
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2024-11-01 14:02:40 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2024-11-01 14:02:40 +0000

    tools: Add a couple of utilities which were useful for testing SO_SPLICE
    
    Sponsored by:   Klara, Inc.
    Sponsored by:   Stormshield
    
    Differential Revision:  https://reviews.freebsd.org/D47308
---
 tools/tools/README               |   1 +
 tools/tools/so_splice/Makefile   |  12 ++
 tools/tools/so_splice/pingpong.c | 197 +++++++++++++++++
 tools/tools/so_splice/proxy.c    | 451 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 661 insertions(+)

diff --git a/tools/tools/README b/tools/tools/README
index 45f1d09f2d21..c5b42d08ff0d 100644
--- a/tools/tools/README
+++ b/tools/tools/README
@@ -52,6 +52,7 @@ pciid		Generate src/share/misc/pci_vendors.
 pciroms		A tool for dumping PCI ROM images. WARNING: alpha quality.
 pirtool		A tool for dumping the $PIR table on i386 machines at runtime.
 scsi-defects	Get at the primary or grown defect list of a SCSI disk.
+so_splice	Utilities relating to the SO_SPLICE socket option.
 sysdoc		Build a manual page with available sysctls for a specific
 		kernel configuration.
 track		Track the progress of a world / kernel build
diff --git a/tools/tools/so_splice/Makefile b/tools/tools/so_splice/Makefile
new file mode 100644
index 000000000000..57ced2fc7b47
--- /dev/null
+++ b/tools/tools/so_splice/Makefile
@@ -0,0 +1,12 @@
+PROGS=	pingpong proxy
+# MAN is left empty: these utilities have no manual pages.
+MAN=
+
+WARNS?=	6
+# When SRCDIR is set, add its include/ and sys/ directories to the header path.
+.if defined(SRCDIR)
+CFLAGS+=	-I${SRCDIR}/include	\
+		-I${SRCDIR}/sys
+.endif
+
+.include <bsd.progs.mk>
diff --git a/tools/tools/so_splice/pingpong.c b/tools/tools/so_splice/pingpong.c
new file mode 100644
index 000000000000..bfcd7f4e0736
--- /dev/null
+++ b/tools/tools/so_splice/pingpong.c
@@ -0,0 +1,197 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Klara, Inc.
+ */
+
+/*
+ * A utility which implements a simple ping-pong over TCP, and prints the amount
+ * of time elapsed for each round trip.
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+
+#include <err.h>
+#include <errno.h>
+#include <netdb.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+static void
+usage(void)
+{
+	fprintf(stderr,	/* synopsis: client (-c) or server (-s) mode */
+"usage: pingpong [-c <target addr> [-l <msgsz>] [-n <count>]]|[-s <listen addr>]\n");
+	exit(1);	/* a usage error always terminates the program */
+}
+
+/*
+ * Resolve "<addr>:<port>" and copy the first TCP result into *ss.  The string
+ * is split at its last ':' so that numeric IPv6 hosts such as "::1:8080" are
+ * accepted too.  Any failure is fatal: the process exits via err(3)/errx(3).
+ */
+static void
+addrinfo(struct sockaddr_storage *ss, const char *addr)
+{
+	struct addrinfo hints, *res, *res1;
+	char *host, *port;
+	int error;
+
+	if ((host = strdup(addr)) == NULL)	/* private copy we may edit */
+		err(1, "strdup");
+	if ((port = strrchr(host, ':')) == NULL)
+		errx(1, "invalid address '%s', should be <addr>:<port>", host);
+	*port++ = '\0';		/* terminate host; port follows the ':' */
+	memset(&hints, 0, sizeof(hints));
+	hints.ai_socktype = SOCK_STREAM;
+	if ((error = getaddrinfo(host, port, &hints, &res)) != 0)
+		errx(1, "%s", gai_strerror(error));
+	/* Keep the list head in res1 so that it can be freed afterwards. */
+	for (res1 = res; res != NULL; res = res->ai_next)
+		if (res->ai_protocol == IPPROTO_TCP)
+			break;
+	if (res == NULL)
+		errx(1, "no TCP address found for '%s'", host);
+	memcpy(ss, res->ai_addr, res->ai_addrlen);
+	free(host);
+	freeaddrinfo(res1);
+}
+
+/* -c: time <count> round trips.  -s: answer all received data with 'b's. */
+int
+main(int argc, char **argv)
+{
+	char buf[1024];
+	struct sockaddr_storage ss;
+	char *addr;
+	size_t len;
+	int ch, count, s;
+	enum { NONE, CLIENT, SERVER } mode;
+
+	count = 0;
+	len = 0;
+	mode = NONE;
+	while ((ch = getopt(argc, argv, "c:l:n:s:")) != -1) {
+		switch (ch) {
+		case 'c':
+			addr = optarg;
+			mode = CLIENT;
+			break;
+		case 'l':
+			len = strtol(optarg, NULL, 10);
+			if (len > sizeof(buf))
+				errx(1, "message size too large");
+			if (len == 0)
+				errx(1, "message size must be non-zero");
+			break;
+		case 'n':
+			count = strtol(optarg, NULL, 10);
+			if (count <= 0)
+				errx(1, "count must be positive");
+			break;
+		case 's':
+			addr = optarg;
+			mode = SERVER;
+			break;
+		default:
+			usage();
+		}
+	}
+	/* A mode is mandatory, and -l is meaningful for the client only. */
+	if (mode == NONE)
+		usage();
+	if (mode == SERVER && len != 0)
+		usage();
+	if (mode == CLIENT) {
+		if (len == 0)
+			len = 1;
+		if (count == 0)
+			count = 1;
+	}
+	addrinfo(&ss, addr);
+
+	s = socket(ss.ss_family, SOCK_STREAM, 0);
+	if (s == -1)
+		err(1, "socket");
+	if (setsockopt(s, IPPROTO_TCP, TCP_NODELAY, &(int){1}, sizeof(int)) ==
+	    -1)
+		err(1, "setsockopt");
+
+	if (mode == CLIENT) {
+		struct timespec ts1, ts2;
+		ssize_t n;
+
+		if (connect(s, (struct sockaddr *)&ss, ss.ss_len) == -1)
+			err(1, "connect");
+
+		for (int i = 0; i < count; i++) {
+			int64_t ns;
+
+			/* Refill: the previous reply left buf full of 'b's. */
+			memset(buf, 'a', len);
+			clock_gettime(CLOCK_MONOTONIC, &ts1);
+			n = write(s, buf, len);
+			if (n < 0)
+				err(1, "write");
+			n = read(s, buf, len);
+			if (n < 0)
+				err(1, "read");
+			if ((size_t)n < len)
+				errx(1, "unexpected short read");
+			clock_gettime(CLOCK_MONOTONIC, &ts2);
+			/* Widen first: time_t may be 32 bits, long too. */
+			ns = (int64_t)(ts2.tv_sec - ts1.tv_sec) * 1000000000 +
+			    (ts2.tv_nsec - ts1.tv_nsec);
+			printf("round trip time: %lld.%02lld us\n",
+			    (long long)(ns / 1000),
+			    (long long)((ns % 1000) / 10));
+			for (size_t j = 0; j < len; j++) {
+				if (buf[j] != 'b')
+					errx(1, "unexpected data");
+			}
+		}
+	} else /* if (mode == SERVER) */ {
+		int cs;
+
+		if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &(int){1},
+		    sizeof(int)) == -1)
+			err(1, "setsockopt");
+		if (bind(s, (struct sockaddr *)&ss, ss.ss_len) == -1)
+			err(1, "bind");
+		if (listen(s, 1) == -1)
+			err(1, "listen");
+		/* Serve connections one at a time; only a fatal error ends it. */
+		for (;;) {
+			ssize_t n;
+
+			cs = accept(s, NULL, NULL);
+			if (cs == -1)
+				err(1, "accept");
+			/* Answer each chunk with the same number of 'b's. */
+			for (;;) {
+				n = read(cs, buf, sizeof(buf));
+				if (n < 0) {
+					if (errno == ECONNRESET)
+						break;
+					err(1, "read");
+				}
+				if (n == 0)
+					break;
+				memset(buf, 'b', n);
+				n = write(cs, buf, n);
+				if (n < 0)
+					err(1, "write");
+			}
+			(void)close(cs);
+		}
+	}
+
+	return (0);
+}
diff --git a/tools/tools/so_splice/proxy.c b/tools/tools/so_splice/proxy.c
new file mode 100644
index 000000000000..409a47225522
--- /dev/null
+++ b/tools/tools/so_splice/proxy.c
@@ -0,0 +1,451 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Klara, Inc.
+ */
+
+/*
+ * A simple TCP proxy.  Listens on a local address until a connection appears,
+ * then opens a TCP connection to the target address and shuttles data between
+ * the two until one side closes its connection.
+ *
+ * For example:
+ *
+ *   $ proxy -l 127.0.0.1:8080 www.example.com:80
+ *
+ * The -m flag selects the mode of the transfer.  Specify "-m copy" to enable
+ * copying through userspace, and "-m splice" to use SO_SPLICE.
+ *
+ * The -L flag enables a loopback mode, wherein all data is additionally proxied
+ * through a loopback TCP connection.  This exists mostly to help test a
+ * specific use-case in custom proxy software where we would like to use
+ * SO_SPLICE.
+ */
+
+#include <sys/types.h>
+#include <sys/event.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+
+#include <assert.h>
+#include <err.h>
+#include <errno.h>
+#include <netdb.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+struct proxy_softc {
+	struct sockaddr_storage	lss;	/* address to listen on (-l) */
+	struct sockaddr_storage	tss;	/* target address to connect to */
+	size_t bufsz;			/* userspace copy buffer size (-s) */
+	enum proxy_mode { PROXY_MODE_COPY, PROXY_MODE_SPLICE } mode; /* -m */
+	bool loopback;			/* -L: add a loopback TCP hop */
+};
+
+static void
+usage(void)
+{
+	fputs("usage: proxy [-m copy|splice] [-s <buf-size>] [-L] "
+	    "-l <listen addr> <target addr>\n", stderr);
+	exit(1);	/* never returns to the caller */
+}
+
+/*
+ * Copy data between cs and ts through a userspace buffer of sc->bufsz bytes
+ * until a read hits EOF or a connection reset.
+ */
+static void
+proxy_copy(struct proxy_softc *sc, int cs, int ts)
+{
+	struct kevent kev[2];
+	uint8_t *buf;
+	int kq;
+
+	if ((kq = kqueue()) == -1)
+		err(1, "kqueue");
+	/* Each read filter carries the opposite socket as its udata. */
+	EV_SET(&kev[0], cs, EVFILT_READ, EV_ADD, 0, 0, (void *)(uintptr_t)ts);
+	EV_SET(&kev[1], ts, EVFILT_READ, EV_ADD, 0, 0, (void *)(uintptr_t)cs);
+	if (kevent(kq, kev, 2, NULL, 0, NULL) == -1)
+		err(1, "kevent");
+	if ((buf = malloc(sc->bufsz)) == NULL)
+		err(1, "malloc");
+
+	for (;;) {
+		uint8_t *data;
+		ssize_t n, resid;
+		int rs, ws;
+
+		if (kevent(kq, NULL, 0, kev, 2, NULL) == -1) {
+			if (errno == EINTR)
+				continue;
+			err(1, "kevent");
+		}
+		rs = (int)kev[0].ident;	/* only the first event is serviced */
+		ws = (int)(uintptr_t)kev[0].udata;
+		n = read(rs, buf, sc->bufsz);
+		if (n == -1) {
+			if (errno == EINTR)	/* retry, as kevent/write do */
+				continue;
+			if (errno == ECONNRESET)
+				break;
+			err(1, "read");
+		}
+		if (n == 0)
+			break;
+		/* Write out everything that was read, handling short writes. */
+		data = buf;
+		resid = n;
+		do {
+			n = write(ws, data, resid);
+			if (n == -1) {
+				if (errno == EINTR)
+					continue;
+				if (errno == ECONNRESET || errno == EPIPE)
+					break;	/* leaves the write loop only */
+				err(1, "write");
+			}
+			assert(n > 0);
+			data += n;
+			resid -= n;
+		} while (resid > 0);
+	}
+	free(buf);
+	close(kq);
+}
+
+/* Set SO_SPLICE on s1, naming s2 as the request's sp_fd; errors are fatal. */
+static void
+splice(int s1, int s2)
+{
+	struct splice spl;
+	memset(&spl, 0, sizeof(spl));
+	spl.sp_fd = s2;
+	if (setsockopt(s1, SOL_SOCKET, SO_SPLICE, &spl, sizeof(spl)) != 0)
+		err(1, "setsockopt");
+}
+
+/*
+ * Splice cs and ts to each other in both directions, then sleep in kevent(2)
+ * until it reports an event for either socket.
+ */
+static void
+proxy_splice(struct proxy_softc *sc __unused, int cs, int ts)
+{
+	struct kevent evs[2];
+	int kq, nev;
+
+	splice(cs, ts);
+	splice(ts, cs);
+
+	if ((kq = kqueue()) == -1)
+		err(1, "kqueue");
+	EV_SET(&evs[0], cs, EVFILT_READ, EV_ADD, 0, 0, NULL);
+	EV_SET(&evs[1], ts, EVFILT_READ, EV_ADD, 0, 0, NULL);
+	/* The same array serves as change list and event list on each try. */
+	while ((nev = kevent(kq, evs, 2, evs, 2, NULL)) <= 0) {
+		if (nev == -1 && errno != EINTR)
+			err(1, "kevent");
+	}
+	close(kq);
+}
+
+static void
+nodelay(int s)
+{
+	const int on = 1;	/* option value: turn TCP_NODELAY on */
+	if (setsockopt(s, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)) != 0)
+		err(1, "setsockopt");
+}
+
+/*
+ * Like socketpair(2), but for TCP sockets on the loopback address.  out[0]
+ * receives the connecting socket and out[1] the accepted one; errors are fatal.
+ */
+static void
+tcp_socketpair(int out[2], int af)
+{
+	struct sockaddr_in sin;
+	struct sockaddr_in6 sin6;
+	struct sockaddr *sa;
+	int csock, lsock;
+
+	if ((lsock = socket(af, SOCK_STREAM, 0)) == -1)
+		err(1, "socket");
+	if ((csock = socket(af, SOCK_STREAM, 0)) == -1)
+		err(1, "socket");
+	nodelay(lsock);
+	nodelay(csock);
+
+	/* Build the loopback address for the requested family, port 0. */
+	switch (af) {
+	case AF_INET:
+		memset(&sin, 0, sizeof(sin));
+		sin.sin_len = sizeof(sin);
+		sin.sin_family = AF_INET;
+		sin.sin_port = 0;
+		sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+		sa = (struct sockaddr *)&sin;
+		break;
+	case AF_INET6:
+		memset(&sin6, 0, sizeof(sin6));
+		sin6.sin6_len = sizeof(sin6);
+		sin6.sin6_family = AF_INET6;
+		sin6.sin6_port = 0;
+		sin6.sin6_addr = in6addr_loopback;
+		sa = (struct sockaddr *)&sin6;
+		break;
+	default:
+		errx(1, "unsupported address family %d", af);
+	}
+
+	if (bind(lsock, sa, sa->sa_len) == -1)
+		err(1, "bind");
+	if (listen(lsock, 1) == -1)
+		err(1, "listen");
+	/* Read back the address actually bound, then connect to it. */
+	if (getsockname(lsock, sa, &(socklen_t){sa->sa_len}) == -1)
+		err(1, "getsockname");
+	if (connect(csock, sa, sa->sa_len) == -1)
+		err(1, "connect");
+	out[0] = csock;
+	if ((out[1] = accept(lsock, NULL, NULL)) == -1)
+		err(1, "accept");
+	close(lsock);
+}
+
+/*
+ * Fork a child that proxies data between the connected TCP sockets s1 and s2,
+ * and return the child's PID.  The parent closes its copies of s1 and s2.
+ * NOTE(review): the child keeps every other inherited descriptor open; in
+ * loopback mode this presumably delays EOF on the loopback hop -- confirm.
+ */
+static pid_t
+proxy(struct proxy_softc *sc, int s1, int s2)
+{
+	pid_t pid;
+
+	if ((pid = fork()) == -1)
+		err(1, "fork");
+	if (pid == 0) {		/* child: transfer data, then exit */
+		if (sc->mode == PROXY_MODE_COPY)
+			proxy_copy(sc, s1, s2);
+		else
+			proxy_splice(sc, s1, s2);
+		_exit(0);
+	}
+	close(s1);
+	close(s2);
+	return (pid);
+}
+
+/*
+ * Accept connections and fork children to proxy them; reap children on exit
+ * (triggered by one side closing).  NOTE(review): in loopback mode the first
+ * child also inherits s and ls[1], which may delay EOF downstream -- confirm.
+ */
+static void
+eventloop(struct proxy_softc *sc)
+{
+	struct kevent kev;
+	int kq, lsd;
+	pid_t child;
+
+	lsd = socket(sc->lss.ss_family, SOCK_STREAM, 0);
+	if (lsd == -1)
+		err(1, "socket");
+	if (setsockopt(lsd, SOL_SOCKET, SO_REUSEADDR, &(int){1}, sizeof(int)) ==
+	    -1)
+		err(1, "setsockopt");
+	if (bind(lsd, (struct sockaddr *)&sc->lss, sc->lss.ss_len) == -1)
+		err(1, "bind");
+	if (listen(lsd, 5) == -1)
+		err(1, "listen");
+	/* Watch the listening socket for incoming connections. */
+	kq = kqueue();
+	if (kq == -1)
+		err(1, "kqueue");
+	EV_SET(&kev, lsd, EVFILT_READ, EV_ADD, 0, 0, NULL);
+	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
+		err(1, "kevent");
+
+	for (;;) {
+		if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1) {
+			if (errno == EINTR)
+				continue;
+			err(1, "kevent");
+		}
+
+		switch (kev.filter) {
+		case EVFILT_READ: {
+			int s, ts;
+			/* lsd is the only read filter that was registered. */
+			if ((int)kev.ident != lsd)
+				errx(1, "unexpected event ident %d",
+				    (int)kev.ident);
+
+			s = accept(lsd, NULL, NULL);
+			if (s == -1)
+				err(1, "accept");
+			nodelay(s);
+			/* Open the outbound connection to the target. */
+			ts = socket(sc->tss.ss_family, SOCK_STREAM, 0);
+			if (ts == -1)
+				err(1, "socket");
+			nodelay(ts);
+			if (connect(ts, (struct sockaddr *)&sc->tss,
+			    sc->tss.ss_len) == -1)
+				err(1, "connect");
+
+			if (sc->loopback) {
+				int ls[2];
+				/* s <-> ls[1] and ls[0] <-> ts, one child each. */
+				tcp_socketpair(ls, sc->tss.ss_family);
+				child = proxy(sc, ls[0], ts);
+				EV_SET(&kev, child, EVFILT_PROC, EV_ADD,
+				    NOTE_EXIT, 0, NULL);
+				if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
+					err(1, "kevent");
+				child = proxy(sc, s, ls[1]);
+				EV_SET(&kev, child, EVFILT_PROC, EV_ADD,
+				    NOTE_EXIT, 0, NULL);
+				if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
+					err(1, "kevent");
+			} else {
+				child = proxy(sc, s, ts);
+				EV_SET(&kev, child, EVFILT_PROC, EV_ADD,
+				    NOTE_EXIT, 0, NULL);
+				if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
+					err(1, "kevent");
+			}
+
+			break;
+			}
+		case EVFILT_PROC: {
+			int status;
+			/* The code treats kev.data as a wait(2)-style status. */
+			child = kev.ident;
+			status = (int)kev.data;
+			if (WIFEXITED(status)) {
+				if (WEXITSTATUS(status) != 0) {
+					errx(1, "child exited with status %d",
+					    WEXITSTATUS(status));
+				}
+			} else if (WIFSIGNALED(status)) {
+				warnx("child %d terminated by signal %d",
+				    (pid_t)kev.ident, WTERMSIG(status));
+			}
+			if (waitpid(child, NULL, 0) == -1)	/* reap it */
+				err(1, "waitpid");
+			break;
+			}
+		}
+	}
+}
+
+/*
+ * Resolve "<addr>:<port>" and copy the first TCP result into *ss.  The string
+ * is split at its last ':' so that numeric IPv6 hosts such as "::1:8080" are
+ * accepted too.  Any failure is fatal: the process exits via err(3)/errx(3).
+ */
+static void
+addrinfo(struct sockaddr_storage *ss, const char *addr)
+{
+	struct addrinfo hints, *res, *res1;
+	char *host, *port;
+	int error;
+
+	if ((host = strdup(addr)) == NULL)	/* private copy we may edit */
+		err(1, "strdup");
+	if ((port = strrchr(host, ':')) == NULL)
+		errx(1, "invalid address '%s', should be <addr>:<port>", host);
+	*port++ = '\0';		/* terminate host; port follows the ':' */
+	memset(&hints, 0, sizeof(hints));
+	hints.ai_socktype = SOCK_STREAM;
+	if ((error = getaddrinfo(host, port, &hints, &res)) != 0)
+		errx(1, "%s", gai_strerror(error));
+	/* Keep the list head in res1 so that it can be freed afterwards. */
+	for (res1 = res; res != NULL; res = res->ai_next)
+		if (res->ai_protocol == IPPROTO_TCP)
+			break;
+	if (res == NULL)
+		errx(1, "no TCP address found for '%s'", host);
+	memcpy(ss, res->ai_addr, res->ai_addrlen);
+	free(host);
+	freeaddrinfo(res1);
+}
+
+/* Fill in *sc: resolve both address strings and record the other settings. */
+static void
+proxy_init(struct proxy_softc *sc, const char *laddr, const char *taddr,
+    size_t bufsz, enum proxy_mode mode, bool loopback)
+{
+	sc->loopback = loopback;
+	sc->mode = mode;
+	sc->bufsz = bufsz;
+	addrinfo(&sc->lss, laddr);
+	addrinfo(&sc->tss, taddr);
+}
+
+/* Parse the command line, then accept and proxy connections indefinitely. */
+int
+main(int argc, char **argv)
+{
+	struct proxy_softc sc;
+	char *end, *laddr, *taddr;
+	size_t bufsz;
+	long val;
+	enum proxy_mode mode;
+	int ch;
+	bool loopback;
+
+	(void)signal(SIGPIPE, SIG_IGN);
+	loopback = false;
+	mode = PROXY_MODE_COPY;
+	bufsz = 2 * 1024 * 1024ul;
+	laddr = taddr = NULL;
+	while ((ch = getopt(argc, argv, "Ll:m:s:")) != -1) {
+		switch (ch) {
+		case 'l':
+			laddr = optarg;
+			break;
+		case 'L':
+			loopback = true;
+			break;
+		case 'm':
+			if (strcmp(optarg, "copy") == 0)
+				mode = PROXY_MODE_COPY;
+			else if (strcmp(optarg, "splice") == 0)
+				mode = PROXY_MODE_SPLICE;
+			else
+				usage();
+			break;
+		case 's':
+			val = strtol(optarg, &end, 10);	/* reject junk, 0, < 0 */
+			if (end == optarg || *end != '\0' || val <= 0)
+				errx(1, "invalid buffer size '%s'", optarg);
+			bufsz = (size_t)val;
+			break;
+		default:
+			usage();
+		}
+	}
+	argc -= optind;
+	argv += optind;
+	if (laddr == NULL || argc != 1)
+		usage();
+	taddr = argv[0];
+
+	/* Marshal the parameters into sc, then start handling connections. */
+	proxy_init(&sc, laddr, taddr, bufsz, mode, loopback);
+	eventloop(&sc);
+	return (0);
+}