git: f9693bef8dc8 - main - zfs: merge OpenZFS master-891568c99
Martin Matuska
mm at FreeBSD.org
Sun Mar 21 01:25:37 UTC 2021
The branch main has been updated by mm:
URL: https://cgit.FreeBSD.org/src/commit/?id=f9693bef8dc83284e7ac905adc346f7d866b5245
commit f9693bef8dc83284e7ac905adc346f7d866b5245
Merge: 815209920f1d 48a1c304e82e
Author: Martin Matuska <mm at FreeBSD.org>
AuthorDate: 2021-03-21 00:46:08 +0000
Commit: Martin Matuska <mm at FreeBSD.org>
CommitDate: 2021-03-21 01:17:59 +0000
zfs: merge OpenZFS master-891568c99
Notable upstream pull request merges:
#11652 Split dmu_zfetch() speculation and execution parts
#11682 Fix zfs_get_data access to files with wrong generation
#11735 Clean up RAIDZ/DRAID ereport code
#11737 Initialize metaslab range trees in metaslab_init
#11739 FreeBSD: make seqc asserts conditional on replay
#11763 Allow setting bootfs property on pools with indirect vdevs
#11767 FreeBSD: Fix memory leaks in kstats
Obtained from: OpenZFS
MFC after: 2 weeks
sys/contrib/openzfs/README.md | 2 +-
sys/contrib/openzfs/cmd/raidz_test/raidz_test.c | 2 -
sys/contrib/openzfs/cmd/ztest/ztest.c | 4 +-
sys/contrib/openzfs/config/kernel-bio_max_segs.m4 | 23 ++
.../openzfs/config/kernel-generic_fillattr.m4 | 28 +++
sys/contrib/openzfs/config/kernel-inode-create.m4 | 43 +++-
sys/contrib/openzfs/config/kernel-inode-getattr.m4 | 63 ++++-
.../openzfs/config/kernel-is_owner_or_cap.m4 | 23 +-
sys/contrib/openzfs/config/kernel-mkdir-umode-t.m4 | 32 ---
sys/contrib/openzfs/config/kernel-mkdir.m4 | 65 +++++
sys/contrib/openzfs/config/kernel-mknod.m4 | 30 +++
sys/contrib/openzfs/config/kernel-rename.m4 | 50 +++-
.../openzfs/config/kernel-setattr-prepare.m4 | 45 +++-
sys/contrib/openzfs/config/kernel-symlink.m4 | 30 +++
sys/contrib/openzfs/config/kernel-xattr-handler.m4 | 78 ++++--
sys/contrib/openzfs/config/kernel.m4 | 20 +-
sys/contrib/openzfs/configure.ac | 1 +
.../include/os/linux/kernel/linux/kmap_compat.h | 4 +-
.../include/os/linux/kernel/linux/vfs_compat.h | 24 +-
.../include/os/linux/kernel/linux/xattr_compat.h | 17 +-
.../include/os/linux/zfs/sys/zfs_vnops_os.h | 3 +-
.../include/os/linux/zfs/sys/zfs_znode_impl.h | 8 +-
sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h | 18 ++
sys/contrib/openzfs/include/sys/dmu_zfetch.h | 23 +-
sys/contrib/openzfs/include/sys/vdev_raidz.h | 2 +
sys/contrib/openzfs/include/sys/vdev_raidz_impl.h | 7 +-
sys/contrib/openzfs/include/sys/zil.h | 3 +-
sys/contrib/openzfs/include/sys/zio.h | 10 +-
sys/contrib/openzfs/include/sys/zvol_impl.h | 4 +-
.../openzfs/man/man5/zfs-module-parameters.5 | 25 +-
sys/contrib/openzfs/man/man8/zfs-allow.8 | 3 +
sys/contrib/openzfs/man/man8/zgenhostid.8 | 4 +-
sys/contrib/openzfs/man/man8/zpoolconcepts.8 | 17 ++
.../openzfs/module/os/freebsd/spl/spl_kstat.c | 11 +-
.../openzfs/module/os/freebsd/zfs/sysctl_os.c | 6 -
sys/contrib/openzfs/module/os/linux/zfs/abd_os.c | 10 +-
sys/contrib/openzfs/module/os/linux/zfs/policy.c | 2 +-
.../openzfs/module/os/linux/zfs/vdev_disk.c | 5 +
.../openzfs/module/os/linux/zfs/zfs_ctldir.c | 3 +-
sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c | 4 +-
.../openzfs/module/os/linux/zfs/zfs_vfsops.c | 6 +-
.../openzfs/module/os/linux/zfs/zfs_vnops_os.c | 5 +-
.../openzfs/module/os/linux/zfs/zpl_ctldir.c | 51 +++-
sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c | 2 +-
.../openzfs/module/os/linux/zfs/zpl_inode.c | 52 +++-
.../openzfs/module/os/linux/zfs/zpl_xattr.c | 4 +-
sys/contrib/openzfs/module/zfs/dbuf.c | 5 +-
sys/contrib/openzfs/module/zfs/dmu.c | 35 ++-
sys/contrib/openzfs/module/zfs/dmu_zfetch.c | 250 +++++++++++--------
sys/contrib/openzfs/module/zfs/metaslab.c | 149 +++++-------
sys/contrib/openzfs/module/zfs/refcount.c | 10 +-
sys/contrib/openzfs/module/zfs/vdev.c | 4 +-
sys/contrib/openzfs/module/zfs/vdev_draid.c | 240 +------------------
sys/contrib/openzfs/module/zfs/vdev_indirect.c | 1 -
sys/contrib/openzfs/module/zfs/vdev_mirror.c | 5 +-
sys/contrib/openzfs/module/zfs/vdev_raidz.c | 266 +++------------------
sys/contrib/openzfs/module/zfs/zfs_fm.c | 8 +-
sys/contrib/openzfs/module/zfs/zfs_fuid.c | 4 -
sys/contrib/openzfs/module/zfs/zfs_log.c | 5 +
sys/contrib/openzfs/module/zfs/zfs_vnops.c | 14 +-
sys/contrib/openzfs/module/zfs/zil.c | 3 +-
sys/contrib/openzfs/module/zfs/zio.c | 4 +-
sys/contrib/openzfs/module/zfs/zvol.c | 3 +-
sys/contrib/openzfs/tests/runfiles/common.run | 8 +-
sys/contrib/openzfs/tests/runfiles/freebsd.run | 4 +
sys/contrib/openzfs/tests/runfiles/sanity.run | 4 +
.../zfs-tests/tests/functional/acl/Makefile.am | 2 +-
.../zfs-tests/tests/functional/acl/off/.gitignore | 1 +
.../zfs-tests/tests/functional/acl/off/Makefile.am | 16 ++
.../zfs-tests/tests/functional/acl/off/cleanup.ksh | 33 +++
.../zfs-tests/tests/functional/acl/off/dosmode.ksh | 199 +++++++++++++++
.../functional/acl/off/dosmode_readonly_write.c | 61 +++++
.../tests/functional/acl/off/posixmode.ksh | 145 +++++++++++
.../zfs-tests/tests/functional/acl/off/setup.ksh | 44 ++++
.../tests/functional/redacted_send/Makefile.am | 1 +
.../functional/redacted_send/redacted_panic.ksh | 44 ++++
sys/modules/zfs/zfs_config.h | 4 +-
77 files changed, 1561 insertions(+), 883 deletions(-)
diff --cc sys/contrib/openzfs/README.md
index 31d99386e90e,000000000000..d666df7af309
mode 100644,000000..100644
--- a/sys/contrib/openzfs/README.md
+++ b/sys/contrib/openzfs/README.md
@@@ -1,35 -1,0 +1,35 @@@
+
+
+OpenZFS is an advanced file system and volume manager which was originally
+developed for Solaris and is now maintained by the OpenZFS community.
+This repository contains the code for running OpenZFS on Linux and FreeBSD.
+
+[![codecov](https://codecov.io/gh/openzfs/zfs/branch/master/graph/badge.svg)](https://codecov.io/gh/openzfs/zfs)
+[![coverity](https://scan.coverity.com/projects/16759/badge.svg)](https://scan.coverity.com/projects/openzfs-zfs)
+
+# Official Resources
+
+ * [Documentation](https://openzfs.github.io/openzfs-docs/) - for using and developing this repo
+ * [ZoL Site](https://zfsonlinux.org) - Linux release info & links
+ * [Mailing lists](https://openzfs.github.io/openzfs-docs/Project%20and%20Community/Mailing%20Lists.html)
+ * [OpenZFS site](http://open-zfs.org/) - for conference videos and info on other platforms (illumos, OSX, Windows, etc)
+
+# Installation
+
+Full documentation for installing OpenZFS on your favorite operating system can
+be found at the [Getting Started Page](https://openzfs.github.io/openzfs-docs/Getting%20Started/index.html).
+
+# Contribute & Develop
+
+We have a separate document with [contribution guidelines](./.github/CONTRIBUTING.md).
+
+We have a [Code of Conduct](./CODE_OF_CONDUCT.md).
+
+# Release
+
+OpenZFS is released under a CDDL license.
+For more details see the NOTICE, LICENSE and COPYRIGHT files; `UCRL-CODE-235197`
+
+# Supported Kernels
+ * The `META` file contains the officially recognized supported Linux kernel versions.
- * Supported FreeBSD versions are 12-STABLE and 13-CURRENT.
++ * Supported FreeBSD versions are any supported branches and releases starting from 12.2-RELEASE.
diff --cc sys/contrib/openzfs/config/kernel-bio_max_segs.m4
index 000000000000,a90d75455c13..a90d75455c13
mode 000000,100644..100644
--- a/sys/contrib/openzfs/config/kernel-bio_max_segs.m4
+++ b/sys/contrib/openzfs/config/kernel-bio_max_segs.m4
diff --cc sys/contrib/openzfs/config/kernel-generic_fillattr.m4
index 000000000000,50c8031305b3..50c8031305b3
mode 000000,100644..100644
--- a/sys/contrib/openzfs/config/kernel-generic_fillattr.m4
+++ b/sys/contrib/openzfs/config/kernel-generic_fillattr.m4
diff --cc sys/contrib/openzfs/config/kernel-mkdir.m4
index 000000000000,a162bcd880ff..a162bcd880ff
mode 000000,100644..100644
--- a/sys/contrib/openzfs/config/kernel-mkdir.m4
+++ b/sys/contrib/openzfs/config/kernel-mkdir.m4
diff --cc sys/contrib/openzfs/config/kernel-mknod.m4
index 000000000000,ffe45106003a..ffe45106003a
mode 000000,100644..100644
--- a/sys/contrib/openzfs/config/kernel-mknod.m4
+++ b/sys/contrib/openzfs/config/kernel-mknod.m4
diff --cc sys/contrib/openzfs/config/kernel-symlink.m4
index 000000000000,d90366d04b72..d90366d04b72
mode 000000,100644..100644
--- a/sys/contrib/openzfs/config/kernel-symlink.m4
+++ b/sys/contrib/openzfs/config/kernel-symlink.m4
diff --cc sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
index 3b0f824115f8,000000000000..3e3fda20c72c
mode 100644,000000..100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
@@@ -1,333 -1,0 +1,333 @@@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+/*
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ */
+
+#ifdef _KERNEL
+
+#include <sys/types.h>
+#include <sys/uio_impl.h>
+#include <sys/sysmacros.h>
+#include <sys/strings.h>
+#include <linux/kmap_compat.h>
+#include <linux/uaccess.h>
+
+/*
+ * Move "n" bytes at byte address "p"; "rw" indicates the direction
+ * of the move, and the I/O parameters are provided in "uio", which is
+ * update to reflect the data which was moved. Returns 0 on success or
+ * a non-zero errno on failure.
+ */
+static int
+zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+{
+ const struct iovec *iov = uio->uio_iov;
+ size_t skip = uio->uio_skip;
+ ulong_t cnt;
+
+ while (n && uio->uio_resid) {
+ cnt = MIN(iov->iov_len - skip, n);
+ switch (uio->uio_segflg) {
+ case UIO_USERSPACE:
+ /*
+ * p = kernel data pointer
+ * iov->iov_base = user data pointer
+ */
+ if (rw == UIO_READ) {
+ if (copy_to_user(iov->iov_base+skip, p, cnt))
+ return (EFAULT);
+ } else {
+ unsigned long b_left = 0;
+ if (uio->uio_fault_disable) {
+ if (!zfs_access_ok(VERIFY_READ,
+ (iov->iov_base + skip), cnt)) {
+ return (EFAULT);
+ }
+ pagefault_disable();
+ b_left =
+ __copy_from_user_inatomic(p,
+ (iov->iov_base + skip), cnt);
+ pagefault_enable();
+ } else {
+ b_left =
+ copy_from_user(p,
+ (iov->iov_base + skip), cnt);
+ }
+ if (b_left > 0) {
+ unsigned long c_bytes =
+ cnt - b_left;
+ uio->uio_skip += c_bytes;
+ ASSERT3U(uio->uio_skip, <,
+ iov->iov_len);
+ uio->uio_resid -= c_bytes;
+ uio->uio_loffset += c_bytes;
+ return (EFAULT);
+ }
+ }
+ break;
+ case UIO_SYSSPACE:
+ if (rw == UIO_READ)
+ bcopy(p, iov->iov_base + skip, cnt);
+ else
+ bcopy(iov->iov_base + skip, p, cnt);
+ break;
+ default:
+ ASSERT(0);
+ }
+ skip += cnt;
+ if (skip == iov->iov_len) {
+ skip = 0;
+ uio->uio_iov = (++iov);
+ uio->uio_iovcnt--;
+ }
+ uio->uio_skip = skip;
+ uio->uio_resid -= cnt;
+ uio->uio_loffset += cnt;
+ p = (caddr_t)p + cnt;
+ n -= cnt;
+ }
+ return (0);
+}
+
+static int
+zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+{
+ const struct bio_vec *bv = uio->uio_bvec;
+ size_t skip = uio->uio_skip;
+ ulong_t cnt;
+
+ while (n && uio->uio_resid) {
+ void *paddr;
+ cnt = MIN(bv->bv_len - skip, n);
+
- paddr = zfs_kmap_atomic(bv->bv_page, KM_USER1);
++ paddr = zfs_kmap_atomic(bv->bv_page);
+ if (rw == UIO_READ)
+ bcopy(p, paddr + bv->bv_offset + skip, cnt);
+ else
+ bcopy(paddr + bv->bv_offset + skip, p, cnt);
- zfs_kunmap_atomic(paddr, KM_USER1);
++ zfs_kunmap_atomic(paddr);
+
+ skip += cnt;
+ if (skip == bv->bv_len) {
+ skip = 0;
+ uio->uio_bvec = (++bv);
+ uio->uio_iovcnt--;
+ }
+ uio->uio_skip = skip;
+ uio->uio_resid -= cnt;
+ uio->uio_loffset += cnt;
+ p = (caddr_t)p + cnt;
+ n -= cnt;
+ }
+ return (0);
+}
+
+#if defined(HAVE_VFS_IOV_ITER)
+static int
+zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio,
+ boolean_t revert)
+{
+ size_t cnt = MIN(n, uio->uio_resid);
+
+ if (uio->uio_skip)
+ iov_iter_advance(uio->uio_iter, uio->uio_skip);
+
+ if (rw == UIO_READ)
+ cnt = copy_to_iter(p, cnt, uio->uio_iter);
+ else
+ cnt = copy_from_iter(p, cnt, uio->uio_iter);
+
+ /*
+ * When operating on a full pipe no bytes are processed.
+ * In which case return EFAULT which is converted to EAGAIN
+ * by the kernel's generic_file_splice_read() function.
+ */
+ if (cnt == 0)
+ return (EFAULT);
+
+ /*
+ * Revert advancing the uio_iter. This is set by zfs_uiocopy()
+ * to avoid consuming the uio and its iov_iter structure.
+ */
+ if (revert)
+ iov_iter_revert(uio->uio_iter, cnt);
+
+ uio->uio_resid -= cnt;
+ uio->uio_loffset += cnt;
+
+ return (0);
+}
+#endif
+
+int
+zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+{
+ if (uio->uio_segflg == UIO_BVEC)
+ return (zfs_uiomove_bvec(p, n, rw, uio));
+#if defined(HAVE_VFS_IOV_ITER)
+ else if (uio->uio_segflg == UIO_ITER)
+ return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE));
+#endif
+ else
+ return (zfs_uiomove_iov(p, n, rw, uio));
+}
+EXPORT_SYMBOL(zfs_uiomove);
+
+/*
+ * Fault in the pages of the first n bytes specified by the uio structure.
+ * 1 byte in each page is touched and the uio struct is unmodified. Any
+ * error will terminate the process as this is only a best attempt to get
+ * the pages resident.
+ */
+int
+zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio)
+{
+ if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC) {
+ /* There's never a need to fault in kernel pages */
+ return (0);
+#if defined(HAVE_VFS_IOV_ITER)
+ } else if (uio->uio_segflg == UIO_ITER) {
+ /*
+ * At least a Linux 4.9 kernel, iov_iter_fault_in_readable()
+ * can be relied on to fault in user pages when referenced.
+ */
+ if (iov_iter_fault_in_readable(uio->uio_iter, n))
+ return (EFAULT);
+#endif
+ } else {
+ /* Fault in all user pages */
+ ASSERT3S(uio->uio_segflg, ==, UIO_USERSPACE);
+ const struct iovec *iov = uio->uio_iov;
+ int iovcnt = uio->uio_iovcnt;
+ size_t skip = uio->uio_skip;
+ uint8_t tmp;
+ caddr_t p;
+
+ for (; n > 0 && iovcnt > 0; iov++, iovcnt--, skip = 0) {
+ ulong_t cnt = MIN(iov->iov_len - skip, n);
+ /* empty iov */
+ if (cnt == 0)
+ continue;
+ n -= cnt;
+ /* touch each page in this segment. */
+ p = iov->iov_base + skip;
+ while (cnt) {
+ if (get_user(tmp, (uint8_t *)p))
+ return (EFAULT);
+ ulong_t incr = MIN(cnt, PAGESIZE);
+ p += incr;
+ cnt -= incr;
+ }
+ /* touch the last byte in case it straddles a page. */
+ p--;
+ if (get_user(tmp, (uint8_t *)p))
+ return (EFAULT);
+ }
+ }
+
+ if (iterp && iov_iter_fault_in_readable(iterp, n))
+ return (EFAULT);
+#endif
+ return (0);
+}
+EXPORT_SYMBOL(zfs_uio_prefaultpages);
+
+/*
+ * The same as zfs_uiomove() but doesn't modify uio structure.
+ * return in cbytes how many bytes were copied.
+ */
+int
+zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes)
+{
+ zfs_uio_t uio_copy;
+ int ret;
+
+ bcopy(uio, &uio_copy, sizeof (zfs_uio_t));
+
+ if (uio->uio_segflg == UIO_BVEC)
+ ret = zfs_uiomove_bvec(p, n, rw, &uio_copy);
+#if defined(HAVE_VFS_IOV_ITER)
+ else if (uio->uio_segflg == UIO_ITER)
+ ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE);
+#endif
+ else
+ ret = zfs_uiomove_iov(p, n, rw, &uio_copy);
+
+ *cbytes = uio->uio_resid - uio_copy.uio_resid;
+
+ return (ret);
+}
+EXPORT_SYMBOL(zfs_uiocopy);
+
+/*
+ * Drop the next n chars out of *uio.
+ */
+void
+zfs_uioskip(zfs_uio_t *uio, size_t n)
+{
+ if (n > uio->uio_resid)
+ return;
+
+ if (uio->uio_segflg == UIO_BVEC) {
+ uio->uio_skip += n;
+ while (uio->uio_iovcnt &&
+ uio->uio_skip >= uio->uio_bvec->bv_len) {
+ uio->uio_skip -= uio->uio_bvec->bv_len;
+ uio->uio_bvec++;
+ uio->uio_iovcnt--;
+ }
+#if defined(HAVE_VFS_IOV_ITER)
+ } else if (uio->uio_segflg == UIO_ITER) {
+ iov_iter_advance(uio->uio_iter, n);
+#endif
+ } else {
+ uio->uio_skip += n;
+ while (uio->uio_iovcnt &&
+ uio->uio_skip >= uio->uio_iov->iov_len) {
+ uio->uio_skip -= uio->uio_iov->iov_len;
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ }
+ }
+ uio->uio_loffset += n;
+ uio->uio_resid -= n;
+}
+EXPORT_SYMBOL(zfs_uioskip);
+
+#endif /* _KERNEL */
diff --cc sys/contrib/openzfs/module/zfs/zio.c
index 262ca24b1443,000000000000..a7820e75670b
mode 100644,000000..100644
--- a/sys/contrib/openzfs/module/zfs/zio.c
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@@ -1,5036 -1,0 +1,5036 @@@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
+#include <sys/zio_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/ddt.h>
+#include <sys/blkptr.h>
+#include <sys/zfeature.h>
+#include <sys/dsl_scan.h>
+#include <sys/metaslab_impl.h>
+#include <sys/time.h>
+#include <sys/trace_zfs.h>
+#include <sys/abd.h>
+#include <sys/dsl_crypt.h>
+#include <cityhash.h>
+
+/*
+ * ==========================================================================
+ * I/O type descriptions
+ * ==========================================================================
+ */
+const char *zio_type_name[ZIO_TYPES] = {
+ /*
+ * Note: Linux kernel thread name length is limited
+ * so these names will differ from upstream open zfs.
+ */
+ "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim"
+};
+
+int zio_dva_throttle_enabled = B_TRUE;
+int zio_deadman_log_all = B_FALSE;
+
+/*
+ * ==========================================================================
+ * I/O kmem caches
+ * ==========================================================================
+ */
+kmem_cache_t *zio_cache;
+kmem_cache_t *zio_link_cache;
+kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+#endif
+
+/* Mark IOs as "slow" if they take longer than 30 seconds */
+int zio_slow_io_ms = (30 * MILLISEC);
+
+#define BP_SPANB(indblkshift, level) \
+ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
+#define COMPARE_META_LEVEL 0x80000000ul
+/*
+ * The following actions directly effect the spa's sync-to-convergence logic.
+ * The values below define the sync pass when we start performing the action.
+ * Care should be taken when changing these values as they directly impact
+ * spa_sync() performance. Tuning these values may introduce subtle performance
+ * pathologies and should only be done in the context of performance analysis.
+ * These tunables will eventually be removed and replaced with #defines once
+ * enough analysis has been done to determine optimal values.
+ *
+ * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
+ * regular blocks are not deferred.
+ *
+ * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable
+ * compression (including of metadata). In practice, we don't have this
+ * many sync passes, so this has no effect.
+ *
+ * The original intent was that disabling compression would help the sync
+ * passes to converge. However, in practice disabling compression increases
+ * the average number of sync passes, because when we turn compression off, a
+ * lot of block's size will change and thus we have to re-allocate (not
+ * overwrite) them. It also increases the number of 128KB allocations (e.g.
+ * for indirect blocks and spacemaps) because these will not be compressed.
+ * The 128K allocations are especially detrimental to performance on highly
+ * fragmented systems, which may have very few free segments of this size,
+ * and may need to load new metaslabs to satisfy 128K allocations.
+ */
+int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
+int zfs_sync_pass_dont_compress = 8; /* don't compress starting in this pass */
+int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
+
+/*
+ * An allocating zio is one that either currently has the DVA allocate
+ * stage set or will have it later in its lifetime.
+ */
+#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
+
+/*
+ * Enable smaller cores by excluding metadata
+ * allocations as well.
+ */
+int zio_exclude_metadata = 0;
+int zio_requeue_io_start_cut_in_line = 1;
+
+#ifdef ZFS_DEBUG
+int zio_buf_debug_limit = 16384;
+#else
+int zio_buf_debug_limit = 0;
+#endif
+
+static inline void __zio_execute(zio_t *zio);
+
+static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
+
+void
+zio_init(void)
+{
+ size_t c;
+
+ zio_cache = kmem_cache_create("zio_cache",
+ sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ zio_link_cache = kmem_cache_create("zio_link_cache",
+ sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ /*
+ * For small buffers, we want a cache for each multiple of
+ * SPA_MINBLOCKSIZE. For larger buffers, we want a cache
+ * for each quarter-power of 2.
+ */
+ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
+ size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
+ size_t p2 = size;
+ size_t align = 0;
+ size_t data_cflags, cflags;
+
+ data_cflags = KMC_NODEBUG;
+ cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ?
+ KMC_NODEBUG : 0;
+
+#if defined(_ILP32) && defined(_KERNEL)
+ /*
+ * Cache size limited to 1M on 32-bit platforms until ARC
+ * buffers no longer require virtual address space.
+ */
+ if (size > zfs_max_recordsize)
+ break;
+#endif
+
+ while (!ISP2(p2))
+ p2 &= p2 - 1;
+
+#ifndef _KERNEL
+ /*
+ * If we are using watchpoints, put each buffer on its own page,
+ * to eliminate the performance overhead of trapping to the
+ * kernel when modifying a non-watched buffer that shares the
+ * page with a watched buffer.
+ */
+ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
+ continue;
+ /*
+ * Here's the problem - on 4K native devices in userland on
+ * Linux using O_DIRECT, buffers must be 4K aligned or I/O
+ * will fail with EINVAL, causing zdb (and others) to coredump.
+ * Since userland probably doesn't need optimized buffer caches,
+ * we just force 4K alignment on everything.
+ */
+ align = 8 * SPA_MINBLOCKSIZE;
+#else
+ if (size < PAGESIZE) {
+ align = SPA_MINBLOCKSIZE;
+ } else if (IS_P2ALIGNED(size, p2 >> 2)) {
+ align = PAGESIZE;
+ }
+#endif
+
+ if (align != 0) {
+ char name[36];
+ if (cflags == data_cflags) {
+ /*
+ * Resulting kmem caches would be identical.
+ * Save memory by creating only one.
+ */
+ (void) snprintf(name, sizeof (name),
+ "zio_buf_comb_%lu", (ulong_t)size);
+ zio_buf_cache[c] = kmem_cache_create(name,
+ size, align, NULL, NULL, NULL, NULL, NULL,
+ cflags);
+ zio_data_buf_cache[c] = zio_buf_cache[c];
+ continue;
+ }
+ (void) snprintf(name, sizeof (name), "zio_buf_%lu",
+ (ulong_t)size);
+ zio_buf_cache[c] = kmem_cache_create(name, size,
+ align, NULL, NULL, NULL, NULL, NULL, cflags);
+
+ (void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
+ (ulong_t)size);
+ zio_data_buf_cache[c] = kmem_cache_create(name, size,
+ align, NULL, NULL, NULL, NULL, NULL, data_cflags);
+ }
+ }
+
+ while (--c != 0) {
+ ASSERT(zio_buf_cache[c] != NULL);
+ if (zio_buf_cache[c - 1] == NULL)
+ zio_buf_cache[c - 1] = zio_buf_cache[c];
+
+ ASSERT(zio_data_buf_cache[c] != NULL);
+ if (zio_data_buf_cache[c - 1] == NULL)
+ zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
+ }
+
+ zio_inject_init();
+
+ lz4_init();
+}
+
+void
+zio_fini(void)
+{
+ size_t i, j, n;
+ kmem_cache_t *cache;
+
+ n = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT;
+
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+ for (i = 0; i < n; i++) {
+ if (zio_buf_cache_allocs[i] != zio_buf_cache_frees[i])
+ (void) printf("zio_fini: [%d] %llu != %llu\n",
+ (int)((i + 1) << SPA_MINBLOCKSHIFT),
+ (long long unsigned)zio_buf_cache_allocs[i],
+ (long long unsigned)zio_buf_cache_frees[i]);
+ }
+#endif
+
+ /*
+ * The same kmem cache can show up multiple times in both zio_buf_cache
+ * and zio_data_buf_cache. Do a wasteful but trivially correct scan to
+ * sort it out.
+ */
+ for (i = 0; i < n; i++) {
+ cache = zio_buf_cache[i];
+ if (cache == NULL)
+ continue;
+ for (j = i; j < n; j++) {
+ if (cache == zio_buf_cache[j])
+ zio_buf_cache[j] = NULL;
+ if (cache == zio_data_buf_cache[j])
+ zio_data_buf_cache[j] = NULL;
+ }
+ kmem_cache_destroy(cache);
+ }
+
+ for (i = 0; i < n; i++) {
+ cache = zio_data_buf_cache[i];
+ if (cache == NULL)
+ continue;
+ for (j = i; j < n; j++) {
+ if (cache == zio_data_buf_cache[j])
+ zio_data_buf_cache[j] = NULL;
+ }
+ kmem_cache_destroy(cache);
+ }
+
+ for (i = 0; i < n; i++) {
+ if (zio_buf_cache[i] != NULL)
+ panic("zio_fini: zio_buf_cache[%d] != NULL", (int)i);
+ if (zio_data_buf_cache[i] != NULL)
+ panic("zio_fini: zio_data_buf_cache[%d] != NULL", (int)i);
+ }
+
+ kmem_cache_destroy(zio_link_cache);
+ kmem_cache_destroy(zio_cache);
+
+ zio_inject_fini();
+
+ lz4_fini();
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free I/O buffers
+ * ==========================================================================
+ */
+
+/*
+ * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
+ * crashdump if the kernel panics, so use it judiciously. Obviously, it's
+ * useful to inspect ZFS metadata, but if possible, we should avoid keeping
+ * excess / transient data in-core during a crashdump.
+ */
+void *
+zio_buf_alloc(size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+ atomic_add_64(&zio_buf_cache_allocs[c], 1);
+#endif
+
+ return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
+}
+
+/*
+ * Use zio_data_buf_alloc to allocate data. The data will not appear in a
+ * crashdump if the kernel panics. This exists so that we will limit the amount
+ * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount
+ * of kernel heap dumped to disk when the kernel panics)
+ */
+void *
+zio_data_buf_alloc(size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
+}
+
+void
+zio_buf_free(void *buf, size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+ atomic_add_64(&zio_buf_cache_frees[c], 1);
+#endif
+
+ kmem_cache_free(zio_buf_cache[c], buf);
+}
+
+void
+zio_data_buf_free(void *buf, size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ kmem_cache_free(zio_data_buf_cache[c], buf);
+}
+
+static void
+zio_abd_free(void *abd, size_t size)
+{
+ abd_free((abd_t *)abd);
+}
+
+/*
+ * ==========================================================================
+ * Push and pop I/O transform buffers
+ * ==========================================================================
+ */
+void
+zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
+ zio_transform_func_t *transform)
+{
+ zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
+
+ zt->zt_orig_abd = zio->io_abd;
+ zt->zt_orig_size = zio->io_size;
+ zt->zt_bufsize = bufsize;
+ zt->zt_transform = transform;
+
+ zt->zt_next = zio->io_transform_stack;
+ zio->io_transform_stack = zt;
+
+ zio->io_abd = data;
+ zio->io_size = size;
+}
+
+void
+zio_pop_transforms(zio_t *zio)
+{
+ zio_transform_t *zt;
+
+ while ((zt = zio->io_transform_stack) != NULL) {
+ if (zt->zt_transform != NULL)
+ zt->zt_transform(zio,
+ zt->zt_orig_abd, zt->zt_orig_size);
+
+ if (zt->zt_bufsize != 0)
+ abd_free(zio->io_abd);
+
+ zio->io_abd = zt->zt_orig_abd;
+ zio->io_size = zt->zt_orig_size;
+ zio->io_transform_stack = zt->zt_next;
+
+ kmem_free(zt, sizeof (zio_transform_t));
+ }
+}
+
+/*
+ * ==========================================================================
+ * I/O transform callbacks for subblocks, decompression, and decryption
+ * ==========================================================================
+ */
+static void
+zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
+{
+ ASSERT(zio->io_size > size);
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ abd_copy(data, zio->io_abd, size);
+}
+
+static void
+zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
+{
+ if (zio->io_error == 0) {
+ void *tmp = abd_borrow_buf(data, size);
+ int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
+ zio->io_abd, tmp, zio->io_size, size,
+ &zio->io_prop.zp_complevel);
+ abd_return_buf_copy(data, tmp, size);
+
+ if (zio_injection_enabled && ret == 0)
+ ret = zio_handle_fault_injection(zio, EINVAL);
+
+ if (ret != 0)
+ zio->io_error = SET_ERROR(EIO);
+ }
+}
+
+static void
+zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
+{
+ int ret;
+ void *tmp;
+ blkptr_t *bp = zio->io_bp;
+ spa_t *spa = zio->io_spa;
+ uint64_t dsobj = zio->io_bookmark.zb_objset;
+ uint64_t lsize = BP_GET_LSIZE(bp);
+ dmu_object_type_t ot = BP_GET_TYPE(bp);
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+ boolean_t no_crypt = B_FALSE;
+
*** 5391 LINES SKIPPED ***
More information about the dev-commits-src-all
mailing list