git: 69d94f4c7608 - main - Add tarfs, a filesystem backed by tarballs.

From: Dag-Erling Smørgrav <des_at_FreeBSD.org>
Date: Thu, 02 Feb 2023 17:20:13 UTC
The branch main has been updated by des:

URL: https://cgit.FreeBSD.org/src/commit/?id=69d94f4c7608e41505996559367450706e91fbb8

commit 69d94f4c7608e41505996559367450706e91fbb8
Author:     Dag-Erling Smørgrav <des@FreeBSD.org>
AuthorDate: 2023-02-02 17:18:41 +0000
Commit:     Dag-Erling Smørgrav <des@FreeBSD.org>
CommitDate: 2023-02-02 17:19:29 +0000

    Add tarfs, a filesystem backed by tarballs.
    
    Sponsored by:   Juniper Networks, Inc.
    Sponsored by:   Klara, Inc.
    Reviewed by:    pauamma, imp
    Differential Revision:  https://reviews.freebsd.org/D37753
---
 etc/mtree/BSD.tests.dist         |    2 +
 share/man/man5/Makefile          |    1 +
 share/man/man5/tarfs.5           |  103 ++++
 sys/conf/files                   |    4 +
 sys/conf/options                 |    4 +
 sys/fs/tarfs/tarfs.h             |  254 +++++++++
 sys/fs/tarfs/tarfs_dbg.h         |   65 +++
 sys/fs/tarfs/tarfs_io.c          |  727 +++++++++++++++++++++++
 sys/fs/tarfs/tarfs_subr.c        |  603 ++++++++++++++++++++
 sys/fs/tarfs/tarfs_vfsops.c      | 1173 ++++++++++++++++++++++++++++++++++++++
 sys/fs/tarfs/tarfs_vnops.c       |  642 +++++++++++++++++++++
 sys/kern/subr_witness.c          |    6 +
 sys/modules/Makefile             |    1 +
 sys/modules/tarfs/Makefile       |   23 +
 tests/sys/fs/Makefile            |    1 +
 tests/sys/fs/tarfs/Makefile      |   10 +
 tests/sys/fs/tarfs/mktar.c       |  238 ++++++++
 tests/sys/fs/tarfs/tarfs_test.sh |   54 ++
 18 files changed, 3911 insertions(+)

diff --git a/etc/mtree/BSD.tests.dist b/etc/mtree/BSD.tests.dist
index 0d05ecaf06fc..b4b18997b7f9 100644
--- a/etc/mtree/BSD.tests.dist
+++ b/etc/mtree/BSD.tests.dist
@@ -757,6 +757,8 @@
         fs
             fusefs
             ..
+            tarfs
+            ..
             tmpfs
             ..
         ..
diff --git a/share/man/man5/Makefile b/share/man/man5/Makefile
index 2d49d981c2f9..f6e91e4ed00b 100644
--- a/share/man/man5/Makefile
+++ b/share/man/man5/Makefile
@@ -70,6 +70,7 @@ MAN=	acct.5 \
 	style.Makefile.5 \
 	style.mdoc.5 \
 	sysctl.conf.5 \
+	tarfs.5 \
 	tmpfs.5 \
 	unionfs.5
 
diff --git a/share/man/man5/tarfs.5 b/share/man/man5/tarfs.5
new file mode 100644
index 000000000000..b25131c323c1
--- /dev/null
+++ b/share/man/man5/tarfs.5
@@ -0,0 +1,103 @@
+.\"-
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2022 Klara, Inc.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.Dd February 2, 2023
+.Dt TARFS 5
+.Os
+.Sh NAME
+.Nm tarfs
+.Nd tarball filesystem
+.Sh SYNOPSIS
+To compile this driver into the kernel, place the following line in
+your kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "options TARFS"
+.Ed
+.Pp
+Alternatively, to load the driver as a module at boot time, place the
+following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+tarfs_load="YES"
+.Ed
+.Sh DESCRIPTION
+The
+.Nm
+driver implements a read-only filesystem backed by a
+.Xr tar 5
+file.
+Currently, only POSIX archives, optionally compressed with
+.Xr zstd 1 ,
+are supported.
+.Pp
+The preferred I/O size for
+.Nm
+filesystems can be adjusted using the
+.Va vfs.tarfs.ioshift
+sysctl setting and tunable.
+Setting it to 0 will reset it to its default value.
+Note that changes to this setting only apply to filesystems mounted
+after the change.
+.Sh DIAGNOSTICS
+If enabled by the
+.Dv TARFS_DEBUG
+kernel option, the
+.Va vfs.tarfs.debug
+sysctl setting can be used to control debugging output from the
+.Nm
+driver.
+Debugging output for individual sections of the driver can be enabled
+by adding together the relevant values from the table below.
+.Bl -column Value Description
+.It 0x01 Ta Memory allocations
+.It 0x02 Ta Checksum calculations
+.It 0x04 Ta Filesystem operations (vfsops)
+.It 0x08 Ta Path lookups
+.It 0x10 Ta File operations (vnops)
+.It 0x20 Ta General I/O
+.It 0x40 Ta Decompression
+.It 0x80 Ta Decompression index
+.It 0x100 Ta Sparse file mapping
+.El
+.Sh SEE ALSO
+.Xr tar 1 ,
+.Xr zstd 1 ,
+.Xr fstab 5 ,
+.Xr tar 5 ,
+.Xr mount 8 ,
+.Xr sysctl 8
+.Sh HISTORY
+.An -nosplit
+The
+.Nm
+driver was developed by
+.An Stephen J. Kiernan Aq Mt stevek@FreeBSD.org
+and
+.An Dag-Erling Smørgrav Aq Mt des@FreeBSD.org
+for Juniper Networks and Klara Systems.
+This manual page was written by
+.An Dag-Erling Smørgrav Aq Mt des@FreeBSD.org
+for Juniper Networks and Klara Systems.
diff --git a/sys/conf/files b/sys/conf/files
index 6cb4abcd9223..08966a9b46e4 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3615,6 +3615,10 @@ fs/smbfs/smbfs_smb.c		optional smbfs
 fs/smbfs/smbfs_subr.c		optional smbfs
 fs/smbfs/smbfs_vfsops.c		optional smbfs
 fs/smbfs/smbfs_vnops.c		optional smbfs
+fs/tarfs/tarfs_io.c		optional tarfs compile-with "${NORMAL_C} -I$S/contrib/zstd/lib/freebsd"
+fs/tarfs/tarfs_subr.c		optional tarfs
+fs/tarfs/tarfs_vfsops.c		optional tarfs
+fs/tarfs/tarfs_vnops.c		optional tarfs
 fs/udf/osta.c			optional udf
 fs/udf/udf_iconv.c		optional udf_iconv
 fs/udf/udf_vfsops.c		optional udf
diff --git a/sys/conf/options b/sys/conf/options
index 1f5003507539..3b2be66ba602 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -265,6 +265,7 @@ NULLFS		opt_dontuse.h
 PROCFS		opt_dontuse.h
 PSEUDOFS	opt_dontuse.h
 SMBFS		opt_dontuse.h
+TARFS		opt_dontuse.h
 TMPFS		opt_dontuse.h
 UDF		opt_dontuse.h
 UNIONFS		opt_dontuse.h
@@ -273,6 +274,9 @@ ZFS		opt_dontuse.h
 # Pseudofs debugging
 PSEUDOFS_TRACE	opt_pseudofs.h
 
+# Tarfs debugging
+TARFS_DEBUG	opt_tarfs.h
+
 # In-kernel GSS-API
 KGSSAPI		opt_kgssapi.h
 KGSSAPI_DEBUG	opt_kgssapi.h
diff --git a/sys/fs/tarfs/tarfs.h b/sys/fs/tarfs/tarfs.h
new file mode 100644
index 000000000000..dffd60ee6d8a
--- /dev/null
+++ b/sys/fs/tarfs/tarfs.h
@@ -0,0 +1,254 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Juniper Networks, Inc.
+ * Copyright (c) 2022-2023 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef	_FS_TARFS_TARFS_H_
+#define	_FS_TARFS_TARFS_H_
+
+#ifndef _KERNEL
+#error Should only be included by kernel
+#endif
+
+MALLOC_DECLARE(M_TARFSMNT);
+MALLOC_DECLARE(M_TARFSNODE);
+MALLOC_DECLARE(M_TARFSNAME);
+
+#ifdef SYSCTL_DECL
+SYSCTL_DECL(_vfs_tarfs);
+#endif
+
+struct componentname;
+struct mount;
+struct vnode;
+
+/*
+ * Internal representation of a tarfs file system node.
+ */
+struct tarfs_node {
+	TAILQ_ENTRY(tarfs_node)	entries;
+	TAILQ_ENTRY(tarfs_node)	dirents;
+
+	struct mtx		 lock;
+
+	struct vnode		*vnode;
+	struct tarfs_mount	*tmp;
+	enum vtype		 type;
+	ino_t			 ino;
+	off_t			 offset;
+	size_t			 size;
+	size_t			 physize;
+	char			*name;
+	size_t			 namelen;
+
+	/* Node attributes */
+	uid_t			 uid;
+	gid_t			 gid;
+	mode_t			 mode;
+	unsigned int		 flags;
+	nlink_t			 nlink;
+	struct timespec		 atime;
+	struct timespec		 mtime;
+	struct timespec		 ctime;
+	struct timespec		 birthtime;
+	unsigned long		 gen;
+
+	/* Block map */
+	size_t			 nblk;
+	struct tarfs_blk	*blk;
+
+	struct tarfs_node	*parent;
+	union {
+		/* VDIR */
+		struct {
+			TAILQ_HEAD(, tarfs_node) dirhead;
+			off_t			 lastcookie;
+			struct tarfs_node	*lastnode;
+		} dir;
+
+		/* VLNK */
+		struct {
+			char			*name;
+			size_t			 namelen;
+		} link;
+
+		/* VBLK or VCHR */
+		dev_t			 rdev;
+
+		/* VREG */
+		struct tarfs_node	*other;
+	};
+};
+
+/*
+ * Entry in sparse file block map.
+ */
+struct tarfs_blk {
+	off_t	 i;		/* input (physical) offset */
+	off_t	 o;		/* output (logical) offset */
+	size_t	 l;		/* length */
+};
+
+/*
+ * Decompression buffer.
+ */
+#define TARFS_ZBUF_SIZE 1048576
+struct tarfs_zbuf {
+	u_char		 buf[TARFS_ZBUF_SIZE];
+	size_t		 off; /* offset of contents */
+	size_t		 len; /* length of contents */
+};
+
+/*
+ * Internal representation of a tarfs mount point.
+ */
+struct tarfs_mount {
+	TAILQ_HEAD(, tarfs_node) allnodes;
+	struct mtx		 allnode_lock;
+
+	struct tarfs_node	*root;
+	struct vnode		*vp;
+	struct mount		*vfs;
+	ino_t			 ino;
+	struct unrhdr		*ino_unr;
+	size_t			 iosize;
+	size_t			 nblocks;
+	size_t			 nfiles;
+	time_t			 mtime; /* default mtime for directories */
+
+	struct tarfs_zio	*zio;
+	struct vnode		*znode;
+};
+
+struct tarfs_zio {
+	struct tarfs_mount	*tmp; /* back pointer to the mount */
+
+	/* decompression state */
+#ifdef ZSTDIO
+	struct tarfs_zstd	*zstd; /* decompression state (zstd) */
+#endif
+	off_t			 ipos; /* current input position */
+	off_t			 opos; /* current output position */
+
+	/* index of compression frames */
+	unsigned int		 curidx; /* current index position */
+	unsigned int		 nidx; /* number of index entries */
+	unsigned int		 szidx; /* index capacity */
+	struct tarfs_idx { off_t i, o; } *idx; /* input / output offsets */
+};
+
+struct tarfs_fid {
+	u_short			 len;	/* length of data in bytes */
+	u_short			 data0;	/* force alignment */
+	ino_t			 ino;
+	unsigned long		 gen;
+};
+
+#define	TARFS_NODE_LOCK(tnp) \
+	mtx_lock(&(tnp)->lock)
+#define	TARFS_NODE_UNLOCK(tnp) \
+	mtx_unlock(&(tnp)->lock)
+/* Takes the mount (struct tarfs_mount *), which holds allnode_lock. */
+#define	TARFS_ALLNODES_LOCK(tmp)	mtx_lock(&(tmp)->allnode_lock)
+/* Counterpart of TARFS_ALLNODES_LOCK; takes the same argument. */
+#define	TARFS_ALLNODES_UNLOCK(tmp)	mtx_unlock(&(tmp)->allnode_lock)
+
+/*
+ * Data and metadata within tar files are aligned on 512-byte boundaries,
+ * to match the block size of the magnetic tapes they were originally
+ * intended for.
+ */
+#define	TARFS_BSHIFT		9
+#define	TARFS_BLOCKSIZE		(size_t)(1U << TARFS_BSHIFT)
+#define	TARFS_BLKOFF(l)		((l) % TARFS_BLOCKSIZE)
+#define	TARFS_BLKNUM(l)		((l) >> TARFS_BSHIFT)
+#define	TARFS_SZ2BLKS(sz)	(((sz) + TARFS_BLOCKSIZE - 1) / TARFS_BLOCKSIZE)
+
+/*
+ * Our preferred I/O size.
+ */
+extern unsigned int tarfs_ioshift;
+#define	TARFS_IOSHIFT_MIN	TARFS_BSHIFT
+#define	TARFS_IOSHIFT_DEFAULT	PAGE_SHIFT
+#define	TARFS_IOSHIFT_MAX	PAGE_SHIFT
+
+#define	TARFS_ROOTINO		((ino_t)3)
+#define	TARFS_ZIOINO		((ino_t)4)
+#define	TARFS_MININO		((ino_t)65535)
+
+#define	TARFS_COOKIE_DOT	0
+#define	TARFS_COOKIE_DOTDOT	1
+#define	TARFS_COOKIE_EOF	OFF_MAX
+
+#define	TARFS_ZIO_NAME		".tar"
+#define	TARFS_ZIO_NAMELEN	(sizeof(TARFS_ZIO_NAME) - 1)
+
+extern struct vop_vector tarfs_vnodeops;
+
+static inline
+struct tarfs_mount *
+MP_TO_TARFS_MOUNT(struct mount *mp)
+{
+
+	MPASS(mp != NULL && mp->mnt_data != NULL);
+	return (mp->mnt_data);
+}
+
+static inline
+struct tarfs_node *
+VP_TO_TARFS_NODE(struct vnode *vp)
+{
+
+	MPASS(vp != NULL && vp->v_data != NULL);
+	return (vp->v_data);
+}
+
+int	tarfs_alloc_node(struct tarfs_mount *tmp, const char *name,
+	    size_t namelen, enum vtype type, off_t off, size_t sz,
+	    time_t mtime, uid_t uid, gid_t gid, mode_t mode,
+	    unsigned int flags, const char *linkname, dev_t rdev,
+	    struct tarfs_node *parent, struct tarfs_node **node);
+int	tarfs_load_blockmap(struct tarfs_node *tnp, size_t realsize);
+void	tarfs_dump_tree(struct tarfs_node *tnp);
+void	tarfs_free_node(struct tarfs_node *tnp);
+struct tarfs_node *
+	tarfs_lookup_dir(struct tarfs_node *tnp, off_t cookie);
+struct tarfs_node *
+	tarfs_lookup_node(struct tarfs_node *tnp, struct tarfs_node *f,
+	    struct componentname *cnp);
+void	tarfs_print_node(struct tarfs_node *tnp);
+int	tarfs_read_file(struct tarfs_node *tnp, size_t len, struct uio *uiop);
+
+int	tarfs_io_init(struct tarfs_mount *tmp);
+int	tarfs_io_fini(struct tarfs_mount *tmp);
+int	tarfs_io_read(struct tarfs_mount *tmp, bool raw,
+    struct uio *uiop);
+ssize_t	tarfs_io_read_buf(struct tarfs_mount *tmp, bool raw,
+    void *buf, off_t off, size_t len);
+unsigned int
+	tarfs_strtofflags(const char *str, char **end);
+
+#endif	/* _FS_TARFS_TARFS_H_ */
diff --git a/sys/fs/tarfs/tarfs_dbg.h b/sys/fs/tarfs/tarfs_dbg.h
new file mode 100644
index 000000000000..45d11d679719
--- /dev/null
+++ b/sys/fs/tarfs/tarfs_dbg.h
@@ -0,0 +1,65 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Juniper Networks, Inc.
+ * Copyright (c) 2022 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef	_FS_TARFS_TARFS_DBG_H_
+#define	_FS_TARFS_TARFS_DBG_H_
+
+#ifndef _KERNEL
+#error Should only be included by kernel
+#endif
+
+#ifdef	TARFS_DEBUG
+extern int tarfs_debug;
+
+#define	TARFS_DEBUG_ALLOC	0x01
+#define	TARFS_DEBUG_CHECKSUM	0x02
+#define	TARFS_DEBUG_FS		0x04
+#define	TARFS_DEBUG_LOOKUP	0x08
+#define	TARFS_DEBUG_VNODE	0x10
+#define	TARFS_DEBUG_IO		0x20
+#define	TARFS_DEBUG_ZIO		0x40
+#define	TARFS_DEBUG_ZIDX	0x80
+#define	TARFS_DEBUG_MAP		0x100
+
+#define	TARFS_DPF(category, fmt, ...)					\
+	do {								\
+		if ((tarfs_debug & TARFS_DEBUG_##category) != 0)	\
+			printf(fmt, ## __VA_ARGS__);			\
+	} while (0)
+#define	TARFS_DPF_IFF(category, cond, fmt, ...)				\
+	do {								\
+		if ((cond)						\
+		    && (tarfs_debug & TARFS_DEBUG_##category) != 0)	\
+			printf(fmt, ## __VA_ARGS__);			\
+	} while (0)
+#else
+#define	TARFS_DPF(category, fmt, ...)
+#define	TARFS_DPF_IFF(category, cond, fmt, ...)
+#endif
+
+#endif	/* _FS_TARFS_TARFS_DBG_H_ */
diff --git a/sys/fs/tarfs/tarfs_io.c b/sys/fs/tarfs/tarfs_io.c
new file mode 100644
index 000000000000..b957ac11ff51
--- /dev/null
+++ b/sys/fs/tarfs/tarfs_io.c
@@ -0,0 +1,727 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Juniper Networks, Inc.
+ * Copyright (c) 2022-2023 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_tarfs.h"
+#include "opt_zstdio.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+#include <sys/vnode.h>
+
+#ifdef ZSTDIO
+#define ZSTD_STATIC_LINKING_ONLY
+#include <contrib/zstd/lib/zstd.h>
+#endif
+
+#include <fs/tarfs/tarfs.h>
+#include <fs/tarfs/tarfs_dbg.h>
+
+#ifdef TARFS_DEBUG
+SYSCTL_NODE(_vfs_tarfs, OID_AUTO, zio, CTLFLAG_RD, 0,
+    "Tar filesystem decompression layer");
+COUNTER_U64_DEFINE_EARLY(tarfs_zio_inflated);
+SYSCTL_COUNTER_U64(_vfs_tarfs_zio, OID_AUTO, inflated, CTLFLAG_RD,
+    &tarfs_zio_inflated, "Amount of compressed data inflated.");
+COUNTER_U64_DEFINE_EARLY(tarfs_zio_consumed);
+SYSCTL_COUNTER_U64(_vfs_tarfs_zio, OID_AUTO, consumed, CTLFLAG_RD,
+    &tarfs_zio_consumed, "Amount of compressed data consumed.");
+COUNTER_U64_DEFINE_EARLY(tarfs_zio_bounced);
+SYSCTL_COUNTER_U64(_vfs_tarfs_zio, OID_AUTO, bounced, CTLFLAG_RD,
+    &tarfs_zio_bounced, "Amount of decompressed data bounced.");
+
+static int
+tarfs_sysctl_handle_zio_reset(SYSCTL_HANDLER_ARGS)
+{
+	unsigned int tmp;
+	int error;
+
+	tmp = 0;
+	if ((error = SYSCTL_OUT(req, &tmp, sizeof(tmp))) != 0)
+		return (error);
+	if (req->newptr != NULL) {
+		if ((error = SYSCTL_IN(req, &tmp, sizeof(tmp))) != 0)
+			return (error);
+		counter_u64_zero(tarfs_zio_inflated);
+		counter_u64_zero(tarfs_zio_consumed);
+		counter_u64_zero(tarfs_zio_bounced);
+	}
+	return (0);
+}
+
+SYSCTL_PROC(_vfs_tarfs_zio, OID_AUTO, reset,
+    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW,
+    NULL, 0, tarfs_sysctl_handle_zio_reset, "IU",
+    "Reset compression counters.");
+#endif
+
+MALLOC_DEFINE(M_TARFSZSTATE, "tarfs zstate", "tarfs decompression state");
+MALLOC_DEFINE(M_TARFSZBUF, "tarfs zbuf", "tarfs decompression buffers");
+
+#define XZ_MAGIC		(uint8_t[]){ 0xfd, 0x37, 0x7a, 0x58, 0x5a }
+#define ZLIB_MAGIC		(uint8_t[]){ 0x1f, 0x8b, 0x08 }
+#define ZSTD_MAGIC		(uint8_t[]){ 0x28, 0xb5, 0x2f, 0xfd }
+
+#ifdef ZSTDIO
+struct tarfs_zstd {
+	ZSTD_DStream *zds;
+};
+#endif
+
+/* XXX review use of curthread / uio_td / td_cred */
+
+/*
+ * Reads from the tar file according to the provided uio.  If the archive
+ * is compressed and raw is false, reads the decompressed stream;
+ * otherwise, reads directly from the original file.  Returns 0 on success
+ * and a positive errno value on failure.
+ */
+int
+tarfs_io_read(struct tarfs_mount *tmp, bool raw, struct uio *uiop)
+{
+	void *rl = NULL;
+	off_t off = uiop->uio_offset;
+	size_t len = uiop->uio_resid;
+	int error;
+
+	if (raw || tmp->znode == NULL) {
+		rl = vn_rangelock_rlock(tmp->vp, off, off + len);
+		error = vn_lock(tmp->vp, LK_SHARED);
+		if (error == 0) {
+			error = VOP_READ(tmp->vp, uiop,
+			    IO_DIRECT|IO_NODELOCKED,
+			    uiop->uio_td->td_ucred);
+			VOP_UNLOCK(tmp->vp);
+		}
+		vn_rangelock_unlock(tmp->vp, rl);
+	} else {
+		error = vn_lock(tmp->znode, LK_EXCLUSIVE);
+		if (error == 0) {
+			error = VOP_READ(tmp->znode, uiop,
+			    IO_DIRECT | IO_NODELOCKED,
+			    uiop->uio_td->td_ucred);
+			VOP_UNLOCK(tmp->znode);
+		}
+	}
+	TARFS_DPF(IO, "%s(%zu, %zu) = %d (resid %zd)\n", __func__,
+	    (size_t)off, len, error, uiop->uio_resid);
+	return (error);
+}
+
+/*
+ * Reads from the tar file into the provided buffer.  If the archive is
+ * compressed and raw is false, reads the decompressed stream; otherwise,
+ * reads directly from the original file.  Returns the number of bytes
+ * read on success, 0 on EOF, and a negative errno value on failure.
+ */
+ssize_t
+tarfs_io_read_buf(struct tarfs_mount *tmp, bool raw,
+    void *buf, off_t off, size_t len)
+{
+	struct uio auio;
+	struct iovec aiov;
+	ssize_t res;
+	int error;
+
+	if (len == 0) {
+		TARFS_DPF(IO, "%s(%zu, %zu) null\n", __func__,
+		    (size_t)off, len);
+		return (0);
+	}
+	aiov.iov_base = buf;
+	aiov.iov_len = len;
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_offset = off;
+	auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_rw = UIO_READ;
+	auio.uio_resid = len;
+	auio.uio_td = curthread;
+	error = tarfs_io_read(tmp, raw, &auio);
+	if (error != 0) {
+		TARFS_DPF(IO, "%s(%zu, %zu) error %d\n", __func__,
+		    (size_t)off, len, error);
+		return (-error);
+	}
+	res = len - auio.uio_resid;
+	if (res == 0 && len != 0) {
+		TARFS_DPF(IO, "%s(%zu, %zu) eof\n", __func__,
+		    (size_t)off, len);
+	} else {
+		TARFS_DPF(IO, "%s(%zu, %zu) read %zd | %*D\n", __func__,
+		    (size_t)off, len, res,
+		    (int)(res > 8 ? 8 : res), (uint8_t *)buf, " ");
+	}
+	return (res);
+}
+
+#ifdef ZSTDIO
+static void *
+tarfs_zstate_alloc(void *opaque, size_t size)
+{
+
+	(void)opaque;
+	return (malloc(size, M_TARFSZSTATE, M_WAITOK));
+}
+#endif
+
+#ifdef ZSTDIO
+static void
+tarfs_zstate_free(void *opaque, void *address)
+{
+
+	(void)opaque;
+	free(address, M_TARFSZSTATE);
+}
+#endif
+
+#ifdef ZSTDIO
+static ZSTD_customMem tarfs_zstd_mem = {
+	tarfs_zstate_alloc,
+	tarfs_zstate_free,
+	NULL,
+};
+#endif
+
+/*
+ * Advances to the next frame index entry, which must match the given input
+ * and output offsets.  If it does not exist yet, it is appended first, and
+ * the index capacity is doubled if it was full.
+ */
+static void
+tarfs_zio_update_index(struct tarfs_zio *zio, off_t i, off_t o)
+{
+
+	if (++zio->curidx >= zio->nidx) {
+		if (++zio->nidx > zio->szidx) {
+			zio->szidx *= 2;
+			zio->idx = realloc(zio->idx,
+			    zio->szidx * sizeof(*zio->idx),
+			    M_TARFSZSTATE, M_ZERO | M_WAITOK);
+			TARFS_DPF(ALLOC, "%s: resized zio index\n", __func__);
+		}
+		zio->idx[zio->curidx].i = i;
+		zio->idx[zio->curidx].o = o;
+		TARFS_DPF(ZIDX, "%s: index %u = i %zu o %zu\n", __func__,
+		    zio->curidx, (size_t)zio->idx[zio->curidx].i,
+		    (size_t)zio->idx[zio->curidx].o);
+	}
+	MPASS(zio->idx[zio->curidx].i == i);
+	MPASS(zio->idx[zio->curidx].o == o);
+}
+
+/*
+ * VOP_ACCESS for zio node.
+ */
+static int
+tarfs_zaccess(struct vop_access_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct tarfs_zio *zio = vp->v_data;
+	struct tarfs_mount *tmp = zio->tmp;
+	accmode_t accmode = ap->a_accmode;
+	int error = EPERM;
+
+	if (accmode == VREAD) {
+		error = vn_lock(tmp->vp, LK_SHARED);
+		if (error == 0) {
+			error = VOP_ACCESS(tmp->vp, accmode, ap->a_cred, ap->a_td);
+			VOP_UNLOCK(tmp->vp);
+		}
+	}
+	TARFS_DPF(ZIO, "%s(%d) = %d\n", __func__, accmode, error);
+	return (error);
+}
+
+/*
+ * VOP_GETATTR for zio node.
+ */
+static int
+tarfs_zgetattr(struct vop_getattr_args *ap)
+{
+	struct vattr va;
+	struct vnode *vp = ap->a_vp;
+	struct tarfs_zio *zio = vp->v_data;
+	struct tarfs_mount *tmp = zio->tmp;
+	struct vattr *vap = ap->a_vap;
+	int error = 0;
+
+	VATTR_NULL(vap);
+	error = vn_lock(tmp->vp, LK_SHARED);
+	if (error == 0) {
+		error = VOP_GETATTR(tmp->vp, &va, ap->a_cred);
+		VOP_UNLOCK(tmp->vp);
+		if (error == 0) {
+			vap->va_type = VREG;
+			vap->va_mode = va.va_mode;
+			vap->va_nlink = 1;
+			vap->va_gid = va.va_gid;
+			vap->va_uid = va.va_uid;
+			vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
+			vap->va_fileid = TARFS_ZIOINO;
+			vap->va_size = zio->idx[zio->nidx - 1].o;
+			vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
+			vap->va_atime = va.va_atime;
+			vap->va_ctime = va.va_ctime;
+			vap->va_mtime = va.va_mtime;
+			vap->va_birthtime = tmp->root->birthtime;
+			vap->va_bytes = va.va_bytes;
+		}
+	}
+	TARFS_DPF(ZIO, "%s() = %d\n", __func__, error);
+	return (error);
+}
+
+#ifdef ZSTDIO
+/*
+ * VOP_READ for zio node, zstd edition.
+ */
+static int
+tarfs_zread_zstd(struct tarfs_zio *zio, struct uio *uiop)
+{
+	void *ibuf = NULL, *obuf = NULL, *rl = NULL;
+	struct uio auio;
+	struct iovec aiov;
+	struct tarfs_mount *tmp = zio->tmp;
+	struct tarfs_zstd *zstd = zio->zstd;
+	struct thread *td = curthread;
+	ZSTD_inBuffer zib;
+	ZSTD_outBuffer zob;
+	off_t zsize;
+	off_t ipos, opos;
+	size_t ilen, olen;
+	size_t zerror;
+	off_t off = uiop->uio_offset;
+	size_t len = uiop->uio_resid;
+	size_t resid = uiop->uio_resid;
+	size_t bsize;
+	int error;
+	bool reset = false;
+
+	/* do we have to rewind? */
+	if (off < zio->opos) {
+		while (zio->curidx > 0 && off < zio->idx[zio->curidx].o)
+			zio->curidx--;
+		reset = true;
+	}
+	/* advance to the nearest index entry */
+	if (off > zio->opos) {
+		// XXX maybe do a binary search instead
+		while (zio->curidx < zio->nidx - 1 &&
+		    off >= zio->idx[zio->curidx + 1].o) {
+			zio->curidx++;
+			reset = true;
+		}
+	}
+	/* reset the decompression stream if needed */
+	if (reset) {
+		zio->ipos = zio->idx[zio->curidx].i;
+		zio->opos = zio->idx[zio->curidx].o;
+		ZSTD_resetDStream(zstd->zds);
+		TARFS_DPF(ZIDX, "%s: skipping to index %u = i %zu o %zu\n", __func__,
+		    zio->curidx, (size_t)zio->ipos, (size_t)zio->opos);
+	} else {
+		TARFS_DPF(ZIDX, "%s: continuing at i %zu o %zu\n", __func__,
+		    (size_t)zio->ipos, (size_t)zio->opos);
+	}
+
+	/*
+	 * Set up a temporary buffer for compressed data.  Use the size
+	 * recommended by the zstd library; this is usually 128 kB, but
+	 * just in case, make sure it's a multiple of the page size and no
+	 * larger than MAXBSIZE.
+	 */
+	bsize = roundup(ZSTD_CStreamOutSize(), PAGE_SIZE);
+	if (bsize > MAXBSIZE)
+		bsize = MAXBSIZE;
+	ibuf = malloc(bsize, M_TEMP, M_WAITOK);
+	zib.src = NULL;
+	zib.size = 0;
+	zib.pos = 0;
+
+	/*
+	 * Set up the decompression buffer.  If the target is not in
+	 * kernel space, we will have to set up a bounce buffer.
+	 *
+	 * TODO: to avoid using a bounce buffer, map destination pages
+	 * using vm_fault_quick_hold_pages().
+	 */
+	MPASS(zio->opos <= off);
+	MPASS(uiop->uio_iovcnt == 1);
+	MPASS(uiop->uio_iov->iov_len >= len);
+	if (uiop->uio_segflg == UIO_SYSSPACE) {
+		zob.dst = uiop->uio_iov->iov_base;
+	} else {
+		TARFS_DPF(ALLOC, "%s: allocating %zu-byte bounce buffer\n",
+		    __func__, len);
+		zob.dst = obuf = malloc(len, M_TEMP, M_WAITOK);
+	}
+	zob.size = len;
+	zob.pos = 0;
+
+	/* lock tarball */
+	rl = vn_rangelock_rlock(tmp->vp, zio->ipos, OFF_MAX);
+	error = vn_lock(tmp->vp, LK_SHARED);
+	if (error != 0) {
+		goto fail_unlocked;
+	}
+	/* check size */
+	error = vn_getsize_locked(tmp->vp, &zsize, td->td_ucred);
+	if (error != 0) {
+		goto fail;
+	}
+	if (zio->ipos >= zsize) {
+		/* beyond EOF */
+		goto fail;
+	}
+
+	while (resid > 0) {
+		if (zib.pos == zib.size) {
+			/* request data from the underlying file */
+			aiov.iov_base = ibuf;
+			aiov.iov_len = bsize;
+			auio.uio_iov = &aiov;
+			auio.uio_iovcnt = 1;
+			auio.uio_offset = zio->ipos;
+			auio.uio_segflg = UIO_SYSSPACE;
+			auio.uio_rw = UIO_READ;
+			auio.uio_resid = aiov.iov_len;
+			auio.uio_td = td;
+			error = VOP_READ(tmp->vp, &auio,
+			    IO_DIRECT | IO_NODELOCKED,
+			    td->td_ucred);
+			if (error != 0)
+				goto fail;
+			TARFS_DPF(ZIO, "%s: req %zu+%zu got %zu+%zu\n", __func__,
+			    (size_t)zio->ipos, bsize,
+			    (size_t)zio->ipos, bsize - auio.uio_resid);
+			zib.src = ibuf;
+			zib.size = bsize - auio.uio_resid;
+			zib.pos = 0;
+		}
+		MPASS(zib.pos <= zib.size);
+		if (zib.pos == zib.size) {
+			TARFS_DPF(ZIO, "%s: end of file after i %zu o %zu\n", __func__,
+			    (size_t)zio->ipos, (size_t)zio->opos);
+			goto fail;
+		}
+		if (zio->opos < off) {
+			/* to be discarded */
+			zob.size = min(off - zio->opos, len);
+			zob.pos = 0;
*** 3111 LINES SKIPPED ***