git: c9114f9f86f9 - main - Add new vnode dumper to support live minidumps

From: Mitchell Horne <mhorne_at_FreeBSD.org>
Date: Tue, 05 Apr 2022 18:35:13 UTC
The branch main has been updated by mhorne:

URL: https://cgit.FreeBSD.org/src/commit/?id=c9114f9f86f92742eacd1d802c34009a57e81055

commit c9114f9f86f92742eacd1d802c34009a57e81055
Author:     Mitchell Horne <mhorne@FreeBSD.org>
AuthorDate: 2021-03-23 20:47:14 +0000
Commit:     Mitchell Horne <mhorne@FreeBSD.org>
CommitDate: 2022-04-05 18:35:05 +0000

    Add new vnode dumper to support live minidumps
    
    This dumper can instantiate and write the dump's contents to a
    file-backed vnode.
    
    Unlike existing disk or network dumpers, the vnode dumper should not be
    invoked during a system panic, and therefore is not added to the global
    dumper_configs list. Instead, the vnode dumper is constructed ad-hoc
    when a live dump is requested using the new ioctl on /dev/mem. This is
    similar in spirit to a kgdb session against the live system via
    /dev/mem.
    
    As described briefly in the mem(4) man page, live dumps are not
    guaranteed to result in a usable output file, but offer some debugging
    value where forcefully panicking a system to dump its memory is not
    desirable/feasible.
    
    A future change to savecore(8) will add an option to save a live dump.
    
    Reviewed by:    markj, Pau Amma <pauamma@gundo.com> (manpages)
    Discussed with: kib
    MFC after:      3 weeks
    Sponsored by:   Juniper Networks, Inc.
    Sponsored by:   Klara, Inc.
    Differential Revision:  https://reviews.freebsd.org/D33813
---
 share/man/man4/mem.4        |  62 ++++++++++++++
 sys/conf/files              |   1 +
 sys/dev/mem/memdev.c        |   6 ++
 sys/kern/kern_shutdown.c    |  14 ++-
 sys/kern/kern_vnodedumper.c | 202 ++++++++++++++++++++++++++++++++++++++++++++
 sys/sys/conf.h              |   1 +
 sys/sys/kerneldump.h        |   2 +
 sys/sys/memrange.h          |  10 +++
 8 files changed, 296 insertions(+), 2 deletions(-)

diff --git a/share/man/man4/mem.4 b/share/man/man4/mem.4
index f860df036428..6370d2a95525 100644
--- a/share/man/man4/mem.4
+++ b/share/man/man4/mem.4
@@ -202,6 +202,50 @@ to update an existing or establish a new range, or to
 .Dv MEMRANGE_SET_REMOVE
 to remove a range.
 .El
+.Ss Live Kernel Dumps
+.Pp
+The
+.Dv MEM_KERNELDUMP
+ioctl will initiate a kernel dump against the running system, the contents of
+which will be written to a process-owned file descriptor.
+The resulting dump output will be in minidump format.
+The request is described by
+.Bd -literal
+struct mem_livedump_arg {
+	int	fd;		/* input */
+	int	flags;		/* input */
+	uint8_t	compression;	/* input */
+};
+.Ed
+.Pp
+The
+.Va fd
+field is used to pass the file descriptor.
+.Pp
+The
+.Va flags
+field is currently unused and must be set to zero.
+.Pp
+The
+.Va compression
+field can be used to specify the desired compression to
+be applied to the dump output.
+The supported values are defined in
+.In sys/kerneldump.h ;
+that is,
+.Dv KERNELDUMP_COMP_NONE ,
+.Dv KERNELDUMP_COMP_GZIP ,
+or
+.Dv KERNELDUMP_COMP_ZSTD .
+.Pp
+Kernel dumps taken against the running system may have inconsistent kernel data
+structures due to allocation, deallocation, or modification of memory
+concurrent to the dump procedure.
+Thus, the resulting core dump is not guaranteed to be usable.
+A system under load is more likely to produce an inconsistent result.
+Despite this, live kernel dumps can be useful for offline debugging of certain
+types of kernel bugs, such as deadlocks, or in inspecting a particular part of
+the system's state.
 .Sh RETURN VALUES
 .Ss MEM_EXTRACT_PADDR
 The
@@ -229,6 +273,24 @@ base/length supplied.
 An attempt to remove a range failed because the range is permanently
 enabled.
 .El
+.Ss MEM_KERNELDUMP
+.Bl -tag -width Er
+.It Bq Er EOPNOTSUPP
+Kernel minidumps are not supported on this architecture.
+.It Bq Er EPERM
+An attempt to begin the kernel dump failed because the calling thread lacks the
+.Dv PRIV_KMEM_READ
+privilege.
+.It Bq Er EBADF
+The supplied file descriptor was invalid, or does not have write permission.
+.It Bq Er EBUSY
+An attempt to begin the kernel dump failed because one is already in progress.
+.It Bq Er EINVAL
+An invalid or unsupported value was specified in
+.Va flags .
+.It Bq Er EINVAL
+An invalid or unsupported compression type was specified.
+.El
 .Sh FILES
 .Bl -tag -width /dev/kmem -compact
 .It Pa /dev/mem
diff --git a/sys/conf/files b/sys/conf/files
index 57bd2693f532..9b907da0dd4b 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3839,6 +3839,7 @@ kern/kern_tslog.c		optional tslog
 kern/kern_ubsan.c		optional kubsan
 kern/kern_umtx.c		standard
 kern/kern_uuid.c		standard
+kern/kern_vnodedumper.c		standard
 kern/kern_xxx.c			standard
 kern/link_elf.c			standard
 kern/linker_if.m		standard
diff --git a/sys/dev/mem/memdev.c b/sys/dev/mem/memdev.c
index f03550aaa495..7d33066f5678 100644
--- a/sys/dev/mem/memdev.c
+++ b/sys/dev/mem/memdev.c
@@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/fcntl.h>
 #include <sys/ioccom.h>
 #include <sys/kernel.h>
+#include <sys/kerneldump.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
@@ -96,6 +97,7 @@ memioctl(struct cdev *dev, u_long cmd, caddr_t data, int flags,
 {
 	vm_map_t map;
 	vm_map_entry_t entry;
+	const struct mem_livedump_arg *marg;
 	struct mem_extract *me;
 	int error;
 
@@ -120,6 +122,10 @@ memioctl(struct cdev *dev, u_long cmd, caddr_t data, int flags,
 		}
 		vm_map_unlock_read(map);
 		break;
+	case MEM_KERNELDUMP:
+		marg = (const struct mem_livedump_arg *)data;
+		error = livedump_start(marg->fd, marg->flags, marg->compression);
+		break;
 	default:
 		error = memioctl_md(dev, cmd, data, flags, td);
 		break;
diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c
index 7d0f913961cb..f7e72d53a566 100644
--- a/sys/kern/kern_shutdown.c
+++ b/sys/kern/kern_shutdown.c
@@ -390,6 +390,17 @@ print_uptime(void)
 	printf("%lds\n", (long)ts.tv_sec);
 }
 
+/*
+ * Set up a context that can be extracted from the dump.
+ */
+void
+dump_savectx(void)
+{
+
+	savectx(&dumppcb);
+	dumptid = curthread->td_tid;
+}
+
 int
 doadump(boolean_t textdump)
 {
@@ -402,8 +413,7 @@ doadump(boolean_t textdump)
 	if (TAILQ_EMPTY(&dumper_configs))
 		return (ENXIO);
 
-	savectx(&dumppcb);
-	dumptid = curthread->td_tid;
+	dump_savectx();
 	dumping++;
 
 	coredump = TRUE;
diff --git a/sys/kern/kern_vnodedumper.c b/sys/kern/kern_vnodedumper.c
new file mode 100644
index 000000000000..c8fdce5e550a
--- /dev/null
+++ b/sys/kern/kern_vnodedumper.c
@@ -0,0 +1,202 @@
+/*-
+ * Copyright (c) 2021-2022 Juniper Networks
+ *
+ * This software was developed by Mitchell Horne <mhorne@FreeBSD.org>
+ * under sponsorship from Juniper Networks and Klara Systems.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/caprights.h>
+#include <sys/disk.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/kerneldump.h>
+#include <sys/limits.h>
+#include <sys/malloc.h>
+#include <sys/namei.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+
+#include <machine/vmparam.h>
+
+static dumper_start_t vnode_dumper_start;
+static dumper_t vnode_dump;
+static dumper_hdr_t vnode_write_headers;
+
+static struct sx livedump_sx;
+SX_SYSINIT(livedump, &livedump_sx, "Livedump sx");
+
+/*
+ * Invoke a live minidump on the system.
+ */
+int
+livedump_start(int fd, int flags, uint8_t compression)
+{
+#if MINIDUMP_PAGE_TRACKING == 1
+	struct dumperinfo di, *livedi;
+	struct diocskerneldump_arg kda;
+	struct vnode *vp;
+	struct file *fp;
+	void *rl_cookie;
+	int error;
+
+	error = priv_check(curthread, PRIV_KMEM_READ);
+	if (error != 0)
+		return (error);
+
+	if (flags != 0)
+		return (EINVAL);
+
+	error = getvnode(curthread, fd, &cap_write_rights, &fp);
+	if (error != 0)
+		return (error);
+	vp = fp->f_vnode;
+
+	if ((fp->f_flag & FWRITE) == 0) {
+		error = EBADF;
+		goto drop;
+	}
+
+	/* Set up a new dumper. */
+	bzero(&di, sizeof(di));
+	di.dumper_start = vnode_dumper_start;
+	di.dumper = vnode_dump;
+	di.dumper_hdr = vnode_write_headers;
+	di.blocksize = PAGE_SIZE; /* Arbitrary. */
+	di.maxiosize = MAXDUMPPGS * PAGE_SIZE;
+
+	bzero(&kda, sizeof(kda));
+	kda.kda_compression = compression;
+	error = dumper_create(&di, "livedump", &kda, &livedi);
+	if (error != 0)
+		goto drop;
+
+	/* Only allow one livedump to proceed at a time. */
+	if (sx_try_xlock(&livedump_sx) == 0) {
+		dumper_destroy(livedi);
+		error = EBUSY;
+		goto drop;
+	}
+
+	/* To be used by the callback functions. */
+	livedi->priv = vp;
+
+	/* Lock the entire file range and vnode. */
+	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+
+	dump_savectx();
+	error = minidumpsys(livedi, true);
+
+	VOP_UNLOCK(vp);
+	vn_rangelock_unlock(vp, rl_cookie);
+	sx_xunlock(&livedump_sx);
+	dumper_destroy(livedi);
+drop:
+	fdrop(fp, curthread);
+	return (error);
+#else
+	return (EOPNOTSUPP);
+#endif /* MINIDUMP_PAGE_TRACKING == 1 */
+}
+
+int
+vnode_dumper_start(struct dumperinfo *di, void *key, uint32_t keysize)
+{
+
+	/* Always begin with an offset of zero. */
+	di->dumpoff = 0;
+
+	KASSERT(keysize == 0, ("encryption not supported for livedumps"));
+	return (0);
+}
+
+/*
+ * Callback from dumpsys() to dump a chunk of memory.
+ *
+ * Parameters:
+ *	arg	 Opaque private pointer to vnode
+ *	virtual  Virtual address (where to read the data from)
+ *	physical Physical memory address (unused)
+ *	offset	 Offset from start of core file
+ *	length	 Data length
+ *
+ * Return value:
+ *	0 on success
+ *	errno on error
+ */
+int
+vnode_dump(void *arg, void *virtual, vm_offset_t physical __unused,
+    off_t offset, size_t length)
+{
+	struct vnode *vp;
+	int error = 0;
+
+	vp = arg;
+	MPASS(vp != NULL);
+	ASSERT_VOP_LOCKED(vp, __func__);
+
+	/* Done? */
+	if (virtual == NULL)
+		return (0);
+
+	error = vn_rdwr(UIO_WRITE, vp, virtual, length, offset, UIO_SYSSPACE,
+	    IO_NODELOCKED, curthread->td_ucred, NOCRED, NULL, curthread);
+	if (error != 0)
+		uprintf("%s: error writing livedump block at offset %jx: %d\n",
+		    __func__, (uintmax_t)offset, error);
+	return (error);
+}
+
+/*
+ * Callback from dumpsys() to write out the dump header, placed at the end.
+ */
+int
+vnode_write_headers(struct dumperinfo *di, struct kerneldumpheader *kdh)
+{
+	struct vnode *vp;
+	int error;
+	off_t offset;
+
+	vp = di->priv;
+	MPASS(vp != NULL);
+	ASSERT_VOP_LOCKED(vp, __func__);
+
+	/* Compensate for compression/encryption adjustment of dumpoff. */
+	offset = roundup2(di->dumpoff, di->blocksize);
+
+	/* Write the kernel dump header to the end of the file. */
+	error = vn_rdwr(UIO_WRITE, vp, kdh, sizeof(*kdh), offset,
+	    UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred, NOCRED, NULL,
+	    curthread);
+	if (error != 0)
+		uprintf("%s: error writing livedump header: %d\n", __func__,
+		    error);
+	return (error);
+}
diff --git a/sys/sys/conf.h b/sys/sys/conf.h
index 6f84a3f03dbc..4808de511d6b 100644
--- a/sys/sys/conf.h
+++ b/sys/sys/conf.h
@@ -362,6 +362,7 @@ struct dumperinfo {
 
 extern int dumping;		/* system is dumping */
 
+void dump_savectx(void);
 int doadump(boolean_t);
 struct diocskerneldump_arg;
 int dumper_create(const struct dumperinfo *di_template, const char *devname,
diff --git a/sys/sys/kerneldump.h b/sys/sys/kerneldump.h
index c293491eadc9..2c73790bc81d 100644
--- a/sys/sys/kerneldump.h
+++ b/sys/sys/kerneldump.h
@@ -162,6 +162,8 @@ void dumpsys_pb_progress(size_t);
 
 extern int do_minidump;
 
+int livedump_start(int, int, uint8_t);
+
 #endif
 
 #endif /* _SYS_KERNELDUMP_H */
diff --git a/sys/sys/memrange.h b/sys/sys/memrange.h
index 454b033775f4..d3eeeb79b664 100644
--- a/sys/sys/memrange.h
+++ b/sys/sys/memrange.h
@@ -59,6 +59,16 @@ struct mem_extract {
 
 #define	MEM_EXTRACT_PADDR	_IOWR('m', 52, struct mem_extract)
 
+struct mem_livedump_arg {
+	int		fd;
+	int		flags;
+	uint8_t		compression;
+	uint8_t		pad1[7];
+	uint64_t	pad2[2];
+};
+
+#define	MEM_KERNELDUMP	_IOW('m', 53, struct mem_livedump_arg)
+
 #ifdef _KERNEL
 
 MALLOC_DECLARE(M_MEMDESC);