svn commit: r285202 - in stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys
Andriy Gapon
avg at FreeBSD.org
Mon Jul 6 10:40:55 UTC 2015
Author: avg
Date: Mon Jul 6 10:40:51 2015
New Revision: 285202
URL: https://svnweb.freebsd.org/changeset/base/285202
Log:
MFC r284593: MFV r284412: 5911 ZFS "hangs" while deleting file
illumos/illumos-gate at 46e1baa6cf6d5432f5fd231bb588df8f9570c858
https://www.illumos.org/issues/5911
Sometimes ZFS appears to hang while deleting a file. It is actually
making slow progress at the file deletion, but other operations
(administrative and writes via the data path) "hang" until the file
removal completes, which can take a long time if the file has many
blocks. The deletion (or most of it) happens in a single txg, and the
sync thread spends most of its time reading indirect blocks...
Reviewed by: Bayard Bell <buffer.g.overflow at gmail.com>
Reviewed by: Alek Pinchuk <alek at nexenta.com>
Reviewed by: Simon Klinkert <simon.klinkert at gmail.com>
Reviewed by: Dan McDonald <danmcd at omniti.com>
Approved by: Richard Lowe <richlowe at richlowe.net>
Author: Matthew Ahrens <mahrens at delphix.com>
PR: 199775
Approved by: re(kib)
Modified:
stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
Directory Properties:
stable/10/ (props changed)
Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c Mon Jul 6 09:57:40 2015 (r285201)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c Mon Jul 6 10:40:51 2015 (r285202)
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
@@ -1298,6 +1298,16 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_
dbuf_dirty_record_t *dr, **drp;
ASSERT(txg != 0);
+
+ /*
+ * Due to our use of dn_nlevels below, this can only be called
+ * in open context, unless we are operating on the MOS.
+ * From syncing context, dn_nlevels may be different from the
+ * dn_nlevels used when dbuf was dirtied.
+ */
+ ASSERT(db->db_objset ==
+ dmu_objset_pool(db->db_objset)->dp_meta_objset ||
+ txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT0(db->db_level);
ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1320,11 +1330,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_
ASSERT(db->db.db_size != 0);
- /*
- * Any space we accounted for in dp_dirty_* will be cleaned up by
- * dsl_pool_sync(). This is relatively rare so the discrepancy
- * is not a big deal.
- */
+ dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
+ dr->dr_accounted, txg);
*drp = dr->dr_next;
@@ -1339,7 +1346,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_
list_remove(&dr->dr_parent->dt.di.dr_children, dr);
mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
} else if (db->db_blkid == DMU_SPILL_BLKID ||
- db->db_level+1 == dn->dn_nlevels) {
+ db->db_level + 1 == dn->dn_nlevels) {
ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
mutex_enter(&dn->dn_mtx);
list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
@@ -1356,11 +1363,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_
VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
}
- if (db->db_level != 0) {
- mutex_destroy(&dr->dt.di.dr_mtx);
- list_destroy(&dr->dt.di.dr_children);
- }
-
kmem_free(dr, sizeof (dbuf_dirty_record_t));
ASSERT(db->db_dirtycnt > 0);
@@ -2318,7 +2320,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *
zio = dr->dr_zio;
mutex_enter(&dr->dt.di.dr_mtx);
- dbuf_sync_list(&dr->dt.di.dr_children, tx);
+ dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
mutex_exit(&dr->dt.di.dr_mtx);
zio_nowait(zio);
@@ -2464,7 +2466,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr,
}
void
-dbuf_sync_list(list_t *list, dmu_tx_t *tx)
+dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
{
dbuf_dirty_record_t *dr;
@@ -2481,6 +2483,10 @@ dbuf_sync_list(list_t *list, dmu_tx_t *t
DMU_META_DNODE_OBJECT);
break;
}
+ if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+ dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+ VERIFY3U(dr->dr_dbuf->db_level, ==, level);
+ }
list_remove(list, dr);
if (dr->dr_dbuf->db_level > 0)
dbuf_sync_indirect(dr, tx);
Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c Mon Jul 6 09:57:40 2015 (r285201)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c Mon Jul 6 10:40:51 2015 (r285202)
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#include <sys/dmu.h>
@@ -686,7 +686,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t
uint64_t ibyte = i << shift;
err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
i = ibyte >> shift;
- if (err == ESRCH)
+ if (err == ESRCH || i > end)
break;
if (err) {
tx->tx_err = err;
Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c Mon Jul 6 09:57:40 2015 (r285201)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c Mon Jul 6 10:40:51 2015 (r285202)
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -1492,6 +1492,16 @@ out:
rw_downgrade(&dn->dn_struct_rwlock);
}
+static void
+dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+ if (db != NULL) {
+ dmu_buf_will_dirty(&db->db, tx);
+ dbuf_rele(db, FTAG);
+ }
+}
+
void
dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
{
@@ -1612,27 +1622,67 @@ dnode_free_range(dnode_t *dn, uint64_t o
nblks += 1;
/*
- * Dirty the first and last indirect blocks, as they (and/or their
- * parents) will need to be written out if they were only
- * partially freed. Interior indirect blocks will be themselves freed,
- * by free_children(), so they need not be dirtied. Note that these
- * interior blocks have already been prefetched by dmu_tx_hold_free().
+ * Dirty all the indirect blocks in this range. Note that only
+ * the first and last indirect blocks can actually be written
+ * (if they were partially freed) -- they must be dirtied, even if
+ * they do not exist on disk yet. The interior blocks will
+ * be freed by free_children(), so they will not actually be written.
+ * Even though these interior blocks will not be written, we
+ * dirty them for two reasons:
+ *
+ * - It ensures that the indirect blocks remain in memory until
+ * syncing context. (They have already been prefetched by
+ * dmu_tx_hold_free(), so we don't have to worry about reading
+ * them serially here.)
+ *
+ * - The dirty space accounting will put pressure on the txg sync
+ * mechanism to begin syncing, and to delay transactions if there
+ * is a large amount of freeing. Even though these indirect
+ * blocks will not be written, we could need to write the same
+ * amount of space if we copy the freed BPs into deadlists.
*/
if (dn->dn_nlevels > 1) {
uint64_t first, last;
first = blkid >> epbs;
- if (db = dbuf_hold_level(dn, 1, first, FTAG)) {
- dmu_buf_will_dirty(&db->db, tx);
- dbuf_rele(db, FTAG);
- }
+ dnode_dirty_l1(dn, first, tx);
if (trunc)
last = dn->dn_maxblkid >> epbs;
else
last = (blkid + nblks - 1) >> epbs;
- if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
- dmu_buf_will_dirty(&db->db, tx);
- dbuf_rele(db, FTAG);
+ if (last != first)
+ dnode_dirty_l1(dn, last, tx);
+
+ int shift = dn->dn_datablkshift + dn->dn_indblkshift -
+ SPA_BLKPTRSHIFT;
+ for (uint64_t i = first + 1; i < last; i++) {
+ /*
+ * Set i to the blockid of the next non-hole
+ * level-1 indirect block at or after i. Note
+ * that dnode_next_offset() operates in terms of
+ * level-0-equivalent bytes.
+ */
+ uint64_t ibyte = i << shift;
+ int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
+ &ibyte, 2, 1, 0);
+ i = ibyte >> shift;
+ if (i >= last)
+ break;
+
+ /*
+ * Normally we should not see an error, either
+ * from dnode_next_offset() or dbuf_hold_level()
+ * (except for ESRCH from dnode_next_offset).
+ * If there is an i/o error, then when we read
+ * this block in syncing context, it will use
+ * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
+ * to the "failmode" property. dnode_next_offset()
+ * doesn't have a flag to indicate MUSTSUCCEED.
+ */
+ if (err != 0)
+ break;
+
+ dnode_dirty_l1(dn, i, tx);
}
}
Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c Mon Jul 6 09:57:40 2015 (r285201)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c Mon Jul 6 10:40:51 2015 (r285202)
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -712,7 +712,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
}
- dbuf_sync_list(list, tx);
+ dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx);
if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
ASSERT3P(list_head(list), ==, NULL);
Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h Mon Jul 6 09:57:40 2015 (r285201)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h Mon Jul 6 10:40:51 2015 (r285202)
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
*/
@@ -287,7 +287,7 @@ void dbuf_evict(dmu_buf_impl_t *db);
void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dbuf_unoverride(dbuf_dirty_record_t *dr);
-void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
+void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
void dbuf_release_bp(dmu_buf_impl_t *db);
void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
More information about the svn-src-stable-10
mailing list