svn commit: r318821 - in head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys

Andriy Gapon avg at FreeBSD.org
Wed May 24 21:43:36 UTC 2017


Author: avg
Date: Wed May 24 21:43:34 2017
New Revision: 318821
URL: https://svnweb.freebsd.org/changeset/base/318821

Log:
  MFC r316912: 7793 ztest fails assertion in dmu_tx_willuse_space
  
  illumos/illumos-gate at 61e255ce7267b52208af9daf434b77d37fb75622
  https://github.com/illumos/illumos-gate/commit/61e255ce7267b52208af9daf434b77d37fb75622
  
  https://www.illumos.org/issues/7793
    Background information: This assertion about tx_space_* verifies that we
    are not dirtying more stuff than we thought we would. We “need” to know
    how much we will dirty so that we can check if we should fail this
    transaction with ENOSPC/EDQUOT, in dmu_tx_assign(). While the
    transaction is open (i.e. between dmu_tx_assign() and dmu_tx_commit() —
    typically less than a millisecond), we call dbuf_dirty() on the exact
    blocks that will be modified. Once this happens, the temporary
    accounting in tx_space_* is unnecessary, because we know exactly what
    blocks are newly dirtied; we call dnode_willuse_space() to track this
    more exact accounting.
    The fundamental problem causing this bug is that dmu_tx_hold_*() relies
    on the current state in the DMU (e.g. dn_nlevels) to predict how much
    will be dirtied by this transaction, but this state can change before we
    actually perform the transaction (i.e. call dbuf_dirty()).
    This bug will be fixed by removing the assertion that the tx_space_*
    accounting is perfectly accurate (i.e. we never dirty more than was
    predicted by dmu_tx_hold_*()). By removing the requirement that this
    accounting be perfectly accurate, we can also vastly simplify it, e.g.
    removing most of the logic in dmu_tx_count_*().
    The new tx space accounting will be very approximate, and may be more or
    less than what is actually dirtied. It will still be used to determine
    if this transaction will put us over quota. Transactions that are marked
    by dmu_tx_mark_netfree() will be excepted from this check. We won’t make
    an attempt to determine how much space will be freed by the transaction
    — this was rarely accurate enough to determine if a transaction should
    be permitted when we are over quota, which is why dmu_tx_mark_netfree()
    was introduced in 2014.
    We also won’t attempt to give “credit” when overwriting existing blocks,
    if those blocks may be freed. This allows us to remove the
    do_free_accounting logic in dbuf_dirty(), and associated routines. This
  
  Reviewed by: Steve Gonczi <steve.gonczi at delphix.com>
  Reviewed by: George Wilson <george.wilson at delphix.com>
  Reviewed by: Pavel Zakharov <pavel.zakharov at delphix.com>
  Reviewed by: Brian Behlendorf <behlendorf1 at llnl.gov>
  Approved by: Robert Mustacchi <rm at joyent.com>
  Author: Matthew Ahrens <mahrens at delphix.com>
  MFC after:	3 weeks

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
Directory Properties:
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Wed May 24 21:42:48 2017	(r318820)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Wed May 24 21:43:34 2017	(r318821)
@@ -1346,41 +1346,6 @@ dbuf_free_range(dnode_t *dn, uint64_t st
 	mutex_exit(&dn->dn_dbufs_mtx);
 }
 
-static int
-dbuf_block_freeable(dmu_buf_impl_t *db)
-{
-	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
-	uint64_t birth_txg = 0;
-
-	/*
-	 * We don't need any locking to protect db_blkptr:
-	 * If it's syncing, then db_last_dirty will be set
-	 * so we'll ignore db_blkptr.
-	 *
-	 * This logic ensures that only block births for
-	 * filled blocks are considered.
-	 */
-	ASSERT(MUTEX_HELD(&db->db_mtx));
-	if (db->db_last_dirty && (db->db_blkptr == NULL ||
-	    !BP_IS_HOLE(db->db_blkptr))) {
-		birth_txg = db->db_last_dirty->dr_txg;
-	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
-		birth_txg = db->db_blkptr->blk_birth;
-	}
-
-	/*
-	 * If this block don't exist or is in a snapshot, it can't be freed.
-	 * Don't pass the bp to dsl_dataset_block_freeable() since we
-	 * are holding the db_mtx lock and might deadlock if we are
-	 * prefetching a dedup-ed block.
-	 */
-	if (birth_txg != 0)
-		return (ds == NULL ||
-		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
-	else
-		return (B_FALSE);
-}
-
 void
 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 {
@@ -1430,7 +1395,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int si
 	}
 	mutex_exit(&db->db_mtx);
 
-	dnode_willuse_space(dn, size-osize, tx);
+	dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
 	DB_DNODE_EXIT(db);
 }
 
@@ -1480,7 +1445,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t 
 	objset_t *os;
 	dbuf_dirty_record_t **drp, *dr;
 	int drop_struct_lock = FALSE;
-	boolean_t do_free_accounting = B_FALSE;
 	int txgoff = tx->tx_txg & TXG_MASK;
 
 	ASSERT(tx->tx_txg != 0);
@@ -1602,15 +1566,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
 	if (db->db_blkid != DMU_BONUS_BLKID) {
-		/*
-		 * Update the accounting.
-		 * Note: we delay "free accounting" until after we drop
-		 * the db_mtx.  This keeps us from grabbing other locks
-		 * (and possibly deadlocking) in bp_get_dsize() while
-		 * also holding the db_mtx.
-		 */
-		dnode_willuse_space(dn, db->db.db_size, tx);
-		do_free_accounting = dbuf_block_freeable(db);
+		dmu_objset_willuse_space(os, db->db.db_size, tx);
 	}
 
 	/*
@@ -1703,21 +1659,13 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t 
 		drop_struct_lock = TRUE;
 	}
 
-	if (do_free_accounting) {
-		blkptr_t *bp = db->db_blkptr;
-		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
-		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
-		/*
-		 * This is only a guess -- if the dbuf is dirty
-		 * in a previous txg, we don't know how much
-		 * space it will use on disk yet.  We should
-		 * really have the struct_rwlock to access
-		 * db_blkptr, but since this is just a guess,
-		 * it's OK if we get an odd answer.
-		 */
-		ddt_prefetch(os->os_spa, bp);
-		dnode_willuse_space(dn, -willfree, tx);
-	}
+	/*
+	 * If we are overwriting a dedup BP, then unless it is snapshotted,
+	 * when we get to syncing context we will need to decrement its
+	 * refcount in the DDT.  Prefetch the relevant DDT block so that
+	 * syncing context won't have to wait for the i/o.
+	 */
+	ddt_prefetch(os->os_spa, db->db_blkptr);
 
 	if (db->db_level == 0) {
 		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
@@ -2927,19 +2875,6 @@ dmu_buf_user_evict_wait()
 	taskq_wait(dbu_evict_taskq);
 }
 
-boolean_t
-dmu_buf_freeable(dmu_buf_t *dbuf)
-{
-	boolean_t res = B_FALSE;
-	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
-
-	if (db->db_blkptr)
-		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
-		    db->db_blkptr, db->db_blkptr->blk_birth);
-
-	return (res);
-}
-
 blkptr_t *
 dmu_buf_get_blkptr(dmu_buf_t *db)
 {

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	Wed May 24 21:42:48 2017	(r318820)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	Wed May 24 21:43:34 2017	(r318821)
@@ -2103,3 +2103,20 @@ dmu_fsname(const char *snapname, char *b
 	(void) strlcpy(buf, snapname, atp - snapname + 1);
 	return (0);
 }
+
+/*
+ * Call when we think we're going to write/free space in open context to track
+ * the amount of dirty data in the open txg, which is also the amount
+ * of memory that can not be evicted until this txg syncs.
+ */
+void
+dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds = os->os_dsl_dataset;
+	int64_t aspace = spa_get_worst_case_asize(os->os_spa, space);
+
+	if (ds != NULL) {
+		dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
+		dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
+	}
+}

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	Wed May 24 21:42:48 2017	(r318820)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	Wed May 24 21:43:34 2017	(r318821)
@@ -30,10 +30,10 @@
 #include <sys/dbuf.h>
 #include <sys/dmu_tx.h>
 #include <sys/dmu_objset.h>
-#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
-#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
 #include <sys/dsl_pool.h>
-#include <sys/zap_impl.h> /* for fzap_default_block_shift */
+#include <sys/zap_impl.h>
 #include <sys/spa.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
@@ -56,10 +56,6 @@ dmu_tx_create_dd(dsl_dir_t *dd)
 	list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
 	    offsetof(dmu_tx_callback_t, dcb_node));
 	tx->tx_start = gethrtime();
-#ifdef ZFS_DEBUG
-	refcount_create(&tx->tx_space_written);
-	refcount_create(&tx->tx_space_freed);
-#endif
 	return (tx);
 }
 
@@ -68,7 +64,6 @@ dmu_tx_create(objset_t *os)
 {
 	dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
 	tx->tx_objset = os;
-	tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
 	return (tx);
 }
 
@@ -130,16 +125,10 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, ob
 	txh->txh_tx = tx;
 	txh->txh_dnode = dn;
 	refcount_create(&txh->txh_space_towrite);
-	refcount_create(&txh->txh_space_tofree);
-	refcount_create(&txh->txh_space_tooverwrite);
-	refcount_create(&txh->txh_space_tounref);
 	refcount_create(&txh->txh_memory_tohold);
-	refcount_create(&txh->txh_fudge);
-#ifdef ZFS_DEBUG
 	txh->txh_type = type;
 	txh->txh_arg1 = arg1;
 	txh->txh_arg2 = arg2;
-#endif
 	list_insert_tail(&tx->tx_holds, txh);
 
 	return (txh);
@@ -158,6 +147,34 @@ dmu_tx_add_new_object(dmu_tx_t *tx, objs
 	}
 }
 
+/*
+ * This function reads specified data from disk.  The specified data will
+ * be needed to perform the transaction -- i.e, it will be read after
+ * we do dmu_tx_assign().  There are two reasons that we read the data now
+ * (before dmu_tx_assign()):
+ *
+ * 1. Reading it now has potentially better performance.  The transaction
+ * has not yet been assigned, so the TXG is not held open, and also the
+ * caller typically has less locks held when calling dmu_tx_hold_*() than
+ * after the transaction has been assigned.  This reduces the lock (and txg)
+ * hold times, thus reducing lock contention.
+ *
+ * 2. It is easier for callers (primarily the ZPL) to handle i/o errors
+ * that are detected before they start making changes to the DMU state
+ * (i.e. now).  Once the transaction has been assigned, and some DMU
+ * state has been changed, it can be difficult to recover from an i/o
+ * error (e.g. to undo the changes already made in memory at the DMU
+ * layer).  Typically code to do so does not exist in the caller -- it
+ * assumes that the data has already been cached and thus i/o errors are
+ * not possible.
+ *
+ * It has been observed that the i/o initiated here can be a performance
+ * problem, and it appears to be optional, because we don't look at the
+ * data which is read.  However, removing this read would only serve to
+ * move the work elsewhere (after the dmu_tx_assign()), where it may
+ * have a greater impact on performance (in addition to the impact on
+ * fault tolerance noted above).
+ */
 static int
 dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
 {
@@ -174,259 +191,84 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *
 	return (err);
 }
 
-static void
-dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
-    int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
-{
-	objset_t *os = dn->dn_objset;
-	dsl_dataset_t *ds = os->os_dsl_dataset;
-	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-	dmu_buf_impl_t *parent = NULL;
-	blkptr_t *bp = NULL;
-	uint64_t space;
-
-	if (level >= dn->dn_nlevels || history[level] == blkid)
-		return;
-
-	history[level] = blkid;
-
-	space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
-
-	if (db == NULL || db == dn->dn_dbuf) {
-		ASSERT(level != 0);
-		db = NULL;
-	} else {
-		ASSERT(DB_DNODE(db) == dn);
-		ASSERT(db->db_level == level);
-		ASSERT(db->db.db_size == space);
-		ASSERT(db->db_blkid == blkid);
-		bp = db->db_blkptr;
-		parent = db->db_parent;
-	}
-
-	freeable = (bp && (freeable ||
-	    dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
-
-	if (freeable) {
-		(void) refcount_add_many(&txh->txh_space_tooverwrite,
-		    space, FTAG);
-	} else {
-		(void) refcount_add_many(&txh->txh_space_towrite,
-		    space, FTAG);
-	}
-
-	if (bp) {
-		(void) refcount_add_many(&txh->txh_space_tounref,
-		    bp_get_dsize(os->os_spa, bp), FTAG);
-	}
-
-	dmu_tx_count_twig(txh, dn, parent, level + 1,
-	    blkid >> epbs, freeable, history);
-}
-
 /* ARGSUSED */
 static void
 dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 {
 	dnode_t *dn = txh->txh_dnode;
-	uint64_t start, end, i;
-	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
 	int err = 0;
 
 	if (len == 0)
 		return;
 
-	min_bs = SPA_MINBLOCKSHIFT;
-	max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
-	min_ibs = DN_MIN_INDBLKSHIFT;
-	max_ibs = DN_MAX_INDBLKSHIFT;
-
-	if (dn) {
-		uint64_t history[DN_MAX_LEVELS];
-		int nlvls = dn->dn_nlevels;
-		int delta;
+	(void) refcount_add_many(&txh->txh_space_towrite, len, FTAG);
 
-		/*
-		 * For i/o error checking, read the first and last level-0
-		 * blocks (if they are not aligned), and all the level-1 blocks.
-		 */
-		if (dn->dn_maxblkid == 0) {
-			delta = dn->dn_datablksz;
-			start = (off < dn->dn_datablksz) ? 0 : 1;
-			end = (off+len <= dn->dn_datablksz) ? 0 : 1;
-			if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
-				err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
-				if (err)
-					goto out;
-				delta -= off;
-			}
-		} else {
-			zio_t *zio = zio_root(dn->dn_objset->os_spa,
-			    NULL, NULL, ZIO_FLAG_CANFAIL);
-
-			/* first level-0 block */
-			start = off >> dn->dn_datablkshift;
-			if (P2PHASE(off, dn->dn_datablksz) ||
-			    len < dn->dn_datablksz) {
-				err = dmu_tx_check_ioerr(zio, dn, 0, start);
-				if (err)
-					goto out;
-			}
+	if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS)
+		err = SET_ERROR(EFBIG);
 
-			/* last level-0 block */
-			end = (off+len-1) >> dn->dn_datablkshift;
-			if (end != start && end <= dn->dn_maxblkid &&
-			    P2PHASE(off+len, dn->dn_datablksz)) {
-				err = dmu_tx_check_ioerr(zio, dn, 0, end);
-				if (err)
-					goto out;
-			}
+	if (dn == NULL)
+		return;
 
-			/* level-1 blocks */
-			if (nlvls > 1) {
-				int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-				for (i = (start>>shft)+1; i < end>>shft; i++) {
-					err = dmu_tx_check_ioerr(zio, dn, 1, i);
-					if (err)
-						goto out;
-				}
+	/*
+	 * For i/o error checking, read the blocks that will be needed
+	 * to perform the write: the first and last level-0 blocks (if
+	 * they are not aligned, i.e. if they are partial-block writes),
+	 * and all the level-1 blocks.
+	 */
+	if (dn->dn_maxblkid == 0) {
+		if (off < dn->dn_datablksz &&
+		    (off > 0 || len < dn->dn_datablksz)) {
+			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+			if (err != 0) {
+				txh->txh_tx->tx_err = err;
 			}
-
-			err = zio_wait(zio);
-			if (err)
-				goto out;
-			delta = P2NPHASE(off, dn->dn_datablksz);
-		}
-
-		min_ibs = max_ibs = dn->dn_indblkshift;
-		if (dn->dn_maxblkid > 0) {
-			/*
-			 * The blocksize can't change,
-			 * so we can make a more precise estimate.
-			 */
-			ASSERT(dn->dn_datablkshift != 0);
-			min_bs = max_bs = dn->dn_datablkshift;
-		} else {
-			/*
-			 * The blocksize can increase up to the recordsize,
-			 * or if it is already more than the recordsize,
-			 * up to the next power of 2.
-			 */
-			min_bs = highbit64(dn->dn_datablksz - 1);
-			max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
 		}
+	} else {
+		zio_t *zio = zio_root(dn->dn_objset->os_spa,
+		    NULL, NULL, ZIO_FLAG_CANFAIL);
 
-		/*
-		 * If this write is not off the end of the file
-		 * we need to account for overwrites/unref.
-		 */
-		if (start <= dn->dn_maxblkid) {
-			for (int l = 0; l < DN_MAX_LEVELS; l++)
-				history[l] = -1ULL;
+		/* first level-0 block */
+		uint64_t start = off >> dn->dn_datablkshift;
+		if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
+			err = dmu_tx_check_ioerr(zio, dn, 0, start);
+			if (err != 0) {
+				txh->txh_tx->tx_err = err;
+			}
 		}
-		while (start <= dn->dn_maxblkid) {
-			dmu_buf_impl_t *db;
-
-			rw_enter(&dn->dn_struct_rwlock, RW_READER);
-			err = dbuf_hold_impl(dn, 0, start,
-			    FALSE, FALSE, FTAG, &db);
-			rw_exit(&dn->dn_struct_rwlock);
 
-			if (err) {
+		/* last level-0 block */
+		uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
+		if (end != start && end <= dn->dn_maxblkid &&
+		    P2PHASE(off + len, dn->dn_datablksz)) {
+			err = dmu_tx_check_ioerr(zio, dn, 0, end);
+			if (err != 0) {
 				txh->txh_tx->tx_err = err;
-				return;
 			}
+		}
 
-			dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
-			    history);
-			dbuf_rele(db, FTAG);
-			if (++start > end) {
-				/*
-				 * Account for new indirects appearing
-				 * before this IO gets assigned into a txg.
-				 */
-				bits = 64 - min_bs;
-				epbs = min_ibs - SPA_BLKPTRSHIFT;
-				for (bits -= epbs * (nlvls - 1);
-				    bits >= 0; bits -= epbs) {
-					(void) refcount_add_many(
-					    &txh->txh_fudge,
-					    1ULL << max_ibs, FTAG);
+		/* level-1 blocks */
+		if (dn->dn_nlevels > 1) {
+			int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+			for (uint64_t i = (start >> shft) + 1;
+			    i < end >> shft; i++) {
+				err = dmu_tx_check_ioerr(zio, dn, 1, i);
+				if (err != 0) {
+					txh->txh_tx->tx_err = err;
 				}
-				goto out;
 			}
-			off += delta;
-			if (len >= delta)
-				len -= delta;
-			delta = dn->dn_datablksz;
 		}
-	}
-
-	/*
-	 * 'end' is the last thing we will access, not one past.
-	 * This way we won't overflow when accessing the last byte.
-	 */
-	start = P2ALIGN(off, 1ULL << max_bs);
-	end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
-	(void) refcount_add_many(&txh->txh_space_towrite,
-	    end - start + 1, FTAG);
-
-	start >>= min_bs;
-	end >>= min_bs;
 
-	epbs = min_ibs - SPA_BLKPTRSHIFT;
-
-	/*
-	 * The object contains at most 2^(64 - min_bs) blocks,
-	 * and each indirect level maps 2^epbs.
-	 */
-	for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
-		start >>= epbs;
-		end >>= epbs;
-		ASSERT3U(end, >=, start);
-		(void) refcount_add_many(&txh->txh_space_towrite,
-		    (end - start + 1) << max_ibs, FTAG);
-		if (start != 0) {
-			/*
-			 * We also need a new blkid=0 indirect block
-			 * to reference any existing file data.
-			 */
-			(void) refcount_add_many(&txh->txh_space_towrite,
-			    1ULL << max_ibs, FTAG);
+		err = zio_wait(zio);
+		if (err != 0) {
+			txh->txh_tx->tx_err = err;
 		}
 	}
-
-out:
-	if (refcount_count(&txh->txh_space_towrite) +
-	    refcount_count(&txh->txh_space_tooverwrite) >
-	    2 * DMU_MAX_ACCESS)
-		err = SET_ERROR(EFBIG);
-
-	if (err)
-		txh->txh_tx->tx_err = err;
 }
 
 static void
 dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 {
-	dnode_t *dn = txh->txh_dnode;
-	dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
-	uint64_t space = mdn->dn_datablksz +
-	    ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
-
-	if (dn && dn->dn_dbuf->db_blkptr &&
-	    dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
-	    dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
-		(void) refcount_add_many(&txh->txh_space_tooverwrite,
-		    space, FTAG);
-		(void) refcount_add_many(&txh->txh_space_tounref, space, FTAG);
-	} else {
-		(void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
-		if (dn && dn->dn_dbuf->db_blkptr) {
-			(void) refcount_add_many(&txh->txh_space_tounref,
-			    space, FTAG);
-		}
-	}
+	(void) refcount_add_many(&txh->txh_space_towrite, DNODE_SIZE, FTAG);
 }
 
 void
@@ -434,8 +276,8 @@ dmu_tx_hold_write(dmu_tx_t *tx, uint64_t
 {
 	dmu_tx_hold_t *txh;
 
-	ASSERT(tx->tx_txg == 0);
-	ASSERT(len < DMU_MAX_ACCESS);
+	ASSERT0(tx->tx_txg);
+	ASSERT3U(len, <=, DMU_MAX_ACCESS);
 	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 
 	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
@@ -447,179 +289,6 @@ dmu_tx_hold_write(dmu_tx_t *tx, uint64_t
 	dmu_tx_count_dnode(txh);
 }
 
-static void
-dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
-{
-	uint64_t blkid, nblks, lastblk;
-	uint64_t space = 0, unref = 0, skipped = 0;
-	dnode_t *dn = txh->txh_dnode;
-	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
-	spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
-	int epbs;
-	uint64_t l0span = 0, nl1blks = 0;
-
-	if (dn->dn_nlevels == 0)
-		return;
-
-	/*
-	 * The struct_rwlock protects us against dn_nlevels
-	 * changing, in case (against all odds) we manage to dirty &
-	 * sync out the changes after we check for being dirty.
-	 * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
-	 */
-	rw_enter(&dn->dn_struct_rwlock, RW_READER);
-	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-	if (dn->dn_maxblkid == 0) {
-		if (off == 0 && len >= dn->dn_datablksz) {
-			blkid = 0;
-			nblks = 1;
-		} else {
-			rw_exit(&dn->dn_struct_rwlock);
-			return;
-		}
-	} else {
-		blkid = off >> dn->dn_datablkshift;
-		nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
-
-		if (blkid > dn->dn_maxblkid) {
-			rw_exit(&dn->dn_struct_rwlock);
-			return;
-		}
-		if (blkid + nblks > dn->dn_maxblkid)
-			nblks = dn->dn_maxblkid - blkid + 1;
-
-	}
-	l0span = nblks;    /* save for later use to calc level > 1 overhead */
-	if (dn->dn_nlevels == 1) {
-		int i;
-		for (i = 0; i < nblks; i++) {
-			blkptr_t *bp = dn->dn_phys->dn_blkptr;
-			ASSERT3U(blkid + i, <, dn->dn_nblkptr);
-			bp += blkid + i;
-			if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
-				dprintf_bp(bp, "can free old%s", "");
-				space += bp_get_dsize(spa, bp);
-			}
-			unref += BP_GET_ASIZE(bp);
-		}
-		nl1blks = 1;
-		nblks = 0;
-	}
-
-	lastblk = blkid + nblks - 1;
-	while (nblks) {
-		dmu_buf_impl_t *dbuf;
-		uint64_t ibyte, new_blkid;
-		int epb = 1 << epbs;
-		int err, i, blkoff, tochk;
-		blkptr_t *bp;
-
-		ibyte = blkid << dn->dn_datablkshift;
-		err = dnode_next_offset(dn,
-		    DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
-		new_blkid = ibyte >> dn->dn_datablkshift;
-		if (err == ESRCH) {
-			skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
-			break;
-		}
-		if (err) {
-			txh->txh_tx->tx_err = err;
-			break;
-		}
-		if (new_blkid > lastblk) {
-			skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
-			break;
-		}
-
-		if (new_blkid > blkid) {
-			ASSERT((new_blkid >> epbs) > (blkid >> epbs));
-			skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
-			nblks -= new_blkid - blkid;
-			blkid = new_blkid;
-		}
-		blkoff = P2PHASE(blkid, epb);
-		tochk = MIN(epb - blkoff, nblks);
-
-		err = dbuf_hold_impl(dn, 1, blkid >> epbs,
-		    FALSE, FALSE, FTAG, &dbuf);
-		if (err) {
-			txh->txh_tx->tx_err = err;
-			break;
-		}
-
-		(void) refcount_add_many(&txh->txh_memory_tohold,
-		    dbuf->db.db_size, FTAG);
-
-		/*
-		 * We don't check memory_tohold against DMU_MAX_ACCESS because
-		 * memory_tohold is an over-estimation (especially the >L1
-		 * indirect blocks), so it could fail.  Callers should have
-		 * already verified that they will not be holding too much
-		 * memory.
-		 */
-
-		err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
-		if (err != 0) {
-			txh->txh_tx->tx_err = err;
-			dbuf_rele(dbuf, FTAG);
-			break;
-		}
-
-		bp = dbuf->db.db_data;
-		bp += blkoff;
-
-		for (i = 0; i < tochk; i++) {
-			if (dsl_dataset_block_freeable(ds, &bp[i],
-			    bp[i].blk_birth)) {
-				dprintf_bp(&bp[i], "can free old%s", "");
-				space += bp_get_dsize(spa, &bp[i]);
-			}
-			unref += BP_GET_ASIZE(bp);
-		}
-		dbuf_rele(dbuf, FTAG);
-
-		++nl1blks;
-		blkid += tochk;
-		nblks -= tochk;
-	}
-	rw_exit(&dn->dn_struct_rwlock);
-
-	/*
-	 * Add in memory requirements of higher-level indirects.
-	 * This assumes a worst-possible scenario for dn_nlevels and a
-	 * worst-possible distribution of l1-blocks over the region to free.
-	 */
-	{
-		uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
-		int level = 2;
-		/*
-		 * Here we don't use DN_MAX_LEVEL, but calculate it with the
-		 * given datablkshift and indblkshift. This makes the
-		 * difference between 19 and 8 on large files.
-		 */
-		int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
-		    (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
-
-		while (level++ < maxlevel) {
-			(void) refcount_add_many(&txh->txh_memory_tohold,
-			    MAX(MIN(blkcnt, nl1blks), 1) << dn->dn_indblkshift,
-			    FTAG);
-			blkcnt = 1 + (blkcnt >> epbs);
-		}
-	}
-
-	/* account for new level 1 indirect blocks that might show up */
-	if (skipped > 0) {
-		(void) refcount_add_many(&txh->txh_fudge,
-		    skipped << dn->dn_indblkshift, FTAG);
-		skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
-		(void) refcount_add_many(&txh->txh_memory_tohold,
-		    skipped << dn->dn_indblkshift, FTAG);
-	}
-	(void) refcount_add_many(&txh->txh_space_tofree, space, FTAG);
-	(void) refcount_add_many(&txh->txh_space_tounref, unref, FTAG);
-}
-
 /*
  * This function marks the transaction as being a "net free".  The end
  * result is that refquotas will be disabled for this transaction, and
@@ -631,45 +300,27 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, ui
 void
 dmu_tx_mark_netfree(dmu_tx_t *tx)
 {
-	dmu_tx_hold_t *txh;
-
-	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
-	    DMU_NEW_OBJECT, THT_FREE, 0, 0);
-
-	/*
-	 * Pretend that this operation will free 1GB of space.  This
-	 * should be large enough to cancel out the largest write.
-	 * We don't want to use something like UINT64_MAX, because that would
-	 * cause overflows when doing math with these values (e.g. in
-	 * dmu_tx_try_assign()).
-	 */
-	(void) refcount_add_many(&txh->txh_space_tofree,
-	    1024 * 1024 * 1024, FTAG);
-	(void) refcount_add_many(&txh->txh_space_tounref,
-	    1024 * 1024 * 1024, FTAG);
+	tx->tx_netfree = B_TRUE;
 }
 
 void
 dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 {
-	dmu_tx_hold_t *txh;
-	dnode_t *dn;
 	int err;
-	zio_t *zio;
 
 	ASSERT(tx->tx_txg == 0);
 
-	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+	dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 	    object, THT_FREE, off, len);
 	if (txh == NULL)
 		return;
-	dn = txh->txh_dnode;
+	dnode_t *dn = txh->txh_dnode;
 	dmu_tx_count_dnode(txh);
 
-	if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
+	if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
 		return;
 	if (len == DMU_OBJECT_END)
-		len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
+		len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;
 
 
 	/*
@@ -690,7 +341,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t 
 			dmu_tx_count_write(txh, off, 1);
 		/* last block will be modified if it is not aligned */
 		if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
-			dmu_tx_count_write(txh, off+len, 1);
+			dmu_tx_count_write(txh, off + len, 1);
 	}
 
 	/*
@@ -712,7 +363,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t 
 		if (dn->dn_datablkshift == 0)
 			start = end = 0;
 
-		zio = zio_root(tx->tx_pool->dp_spa,
+		zio_t *zio = zio_root(tx->tx_pool->dp_spa,
 		    NULL, NULL, ZIO_FLAG_CANFAIL);
 		for (uint64_t i = start; i <= end; i++) {
 			uint64_t ibyte = i << shift;
@@ -720,127 +371,80 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t 
 			i = ibyte >> shift;
 			if (err == ESRCH || i > end)
 				break;
-			if (err) {
+			if (err != 0) {
 				tx->tx_err = err;
+				(void) zio_wait(zio);
 				return;
 			}
 
+			(void) refcount_add_many(&txh->txh_memory_tohold,
+			    1 << dn->dn_indblkshift, FTAG);
+
 			err = dmu_tx_check_ioerr(zio, dn, 1, i);
-			if (err) {
+			if (err != 0) {
 				tx->tx_err = err;
+				(void) zio_wait(zio);
 				return;
 			}
 		}
 		err = zio_wait(zio);
-		if (err) {
+		if (err != 0) {
 			tx->tx_err = err;
 			return;
 		}
 	}
-
-	dmu_tx_count_free(txh, off, len);
 }
 
 void
 dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 {
-	dmu_tx_hold_t *txh;
-	dnode_t *dn;
 	int err;
 
 	ASSERT(tx->tx_txg == 0);
 
-	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+	dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 	    object, THT_ZAP, add, (uintptr_t)name);
 	if (txh == NULL)
 		return;
-	dn = txh->txh_dnode;
+	dnode_t *dn = txh->txh_dnode;
 
 	dmu_tx_count_dnode(txh);
 
-	if (dn == NULL) {
-		/*
-		 * We will be able to fit a new object's entries into one leaf
-		 * block.  So there will be at most 2 blocks total,
-		 * including the header block.
-		 */
-		dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
+	/*
+	 * Modifying a almost-full microzap is around the worst case (128KB)
+	 *
+	 * If it is a fat zap, the worst case would be 7*16KB=112KB:
+	 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
+	 * - 4 new blocks written if adding:
+	 *    - 2 blocks for possibly split leaves,
+	 *    - 2 grown ptrtbl blocks
+	 */
+	(void) refcount_add_many(&txh->txh_space_towrite,
+	    MZAP_MAX_BLKSZ, FTAG);
+
+	if (dn == NULL)
 		return;
-	}
 
 	ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
 
-	if (dn->dn_maxblkid == 0 && !add) {
-		blkptr_t *bp;
-
+	if (dn->dn_maxblkid == 0 || name == NULL) {
 		/*
-		 * If there is only one block  (i.e. this is a micro-zap)
-		 * and we are not adding anything, the accounting is simple.
+		 * This is a microzap (only one block), or we don't know
+		 * the name.  Check the first block for i/o errors.
 		 */
 		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
-		if (err) {
+		if (err != 0) {
 			tx->tx_err = err;
-			return;
-		}
-
-		/*
-		 * Use max block size here, since we don't know how much
-		 * the size will change between now and the dbuf dirty call.
-		 */
-		bp = &dn->dn_phys->dn_blkptr[0];
-		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
-		    bp, bp->blk_birth)) {
-			(void) refcount_add_many(&txh->txh_space_tooverwrite,
-			    MZAP_MAX_BLKSZ, FTAG);
-		} else {
-			(void) refcount_add_many(&txh->txh_space_towrite,
-			    MZAP_MAX_BLKSZ, FTAG);
 		}
-		if (!BP_IS_HOLE(bp)) {
-			(void) refcount_add_many(&txh->txh_space_tounref,
-			    MZAP_MAX_BLKSZ, FTAG);
-		}
-		return;
-	}
-
-	if (dn->dn_maxblkid > 0 && name) {
+	} else {
 		/*
-		 * access the name in this fat-zap so that we'll check
-		 * for i/o errors to the leaf blocks, etc.
+		 * Access the name so that we'll check for i/o errors to
+		 * the leaf blocks, etc.  We ignore ENOENT, as this name
+		 * may not yet exist.
 		 */
 		err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
-		if (err == EIO) {
+		if (err == EIO || err == ECKSUM || err == ENXIO) {
 			tx->tx_err = err;
-			return;
-		}
-	}
-
-	err = zap_count_write_by_dnode(dn, name, add,
-	    &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
-
-	/*
-	 * If the modified blocks are scattered to the four winds,
-	 * we'll have to modify an indirect twig for each.  We can make
-	 * modifications at up to 3 locations:
-	 *  - header block at the beginning of the object
-	 *  - target leaf block
-	 *  - end of the object, where we might need to write:
-	 *	- a new leaf block if the target block needs to be split
-	 *	- the new pointer table, if it is growing
-	 *	- the new cookie table, if it is growing
-	 */
-	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-	dsl_dataset_phys_t *ds_phys =
-	    dsl_dataset_phys(dn->dn_objset->os_dsl_dataset);
-	for (int lvl = 1; lvl < dn->dn_nlevels; lvl++) {
-		uint64_t num_indirects = 1 + (dn->dn_maxblkid >> (epbs * lvl));
-		uint64_t spc = MIN(3, num_indirects) << dn->dn_indblkshift;
-		if (ds_phys->ds_prev_snap_obj != 0) {
-			(void) refcount_add_many(&txh->txh_space_towrite,
-			    spc, FTAG);
-		} else {
-			(void) refcount_add_many(&txh->txh_space_tooverwrite,
-			    spc, FTAG);
 		}
 	}
 }
@@ -870,42 +474,15 @@ dmu_tx_hold_space(dmu_tx_t *tx, uint64_t
 	(void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
 }
 
-int
-dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
-{
-	dmu_tx_hold_t *txh;
-	int holds = 0;
-
-	/*
-	 * By asserting that the tx is assigned, we're counting the
-	 * number of dn_tx_holds, which is the same as the number of
-	 * dn_holds.  Otherwise, we'd be counting dn_holds, but
-	 * dn_tx_holds could be 0.
-	 */
-	ASSERT(tx->tx_txg != 0);
-
-	/* if (tx->tx_anyobj == TRUE) */
-		/* return (0); */
-
-	for (txh = list_head(&tx->tx_holds); txh;
-	    txh = list_next(&tx->tx_holds, txh)) {
-		if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
-			holds++;
-	}
-
-	return (holds);
-}
-
 #ifdef ZFS_DEBUG
 void
 dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 {
-	dmu_tx_hold_t *txh;
-	int match_object = FALSE, match_offset = FALSE;
-	dnode_t *dn;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-head mailing list