svn commit: r350898 - vendor-sys/illumos/dist/common/zfs vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor-sys/illumos/dist/uts/common/sys/fs vendor/ill...
Andriy Gapon
avg@FreeBSD.org
Mon Aug 12 12:05:43 UTC 2019
Author: avg
Date: Mon Aug 12 12:05:40 2019
New Revision: 350898
URL: https://svnweb.freebsd.org/changeset/base/350898
Log:
8423 8199 7432 Implement large_dnode pool feature
8423 Implement large_dnode pool feature
8199 multi-threaded dmu_object_alloc()
7432 Large dnode pool feature
illumos/illumos-gate@54811da5ac6b517992fdc173df5d605e4e61fdc0
https://github.com/illumos/illumos-gate/commit/54811da5ac6b517992fdc173df5d605e4e61fdc0
https://www.illumos.org/issues/8423
https://www.illumos.org/issues/8199
https://www.illumos.org/issues/7432
ZoL issues:
Improved dnode allocation #6564
Clean up large dnode code #6262
Fix dnode_hold() freeing dnode behavior #8172
Fix dnode allocation race #6414, #6439
Partial: Raw sends must be able to decrease nlevels #6821, #6864
Remove unnecessary txg syncs from receive_object() Closes #7197
Author: Toomas Soome <tsoome@me.com>
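
For reference, dnodes are stored in multiples of 512-byte slots: a legacy dnode occupies one slot, and the new dnodesize property allows up to 32. A minimal standalone sketch of that arithmetic follows (the DNODE_* constants are assumed to match sys/dnode.h; this is an illustration, not code from the commit):

    #include <stdio.h>

    /* Assumed constants, per sys/dnode.h: a slot is a 512-byte unit. */
    #define DNODE_SHIFT     9                       /* 512 bytes */
    #define DNODE_MIN_SIZE  (1 << DNODE_SHIFT)      /* 512: legacy dnode size */
    #define DNODE_MAX_SIZE  (1 << 14)               /* 16K: largest dnodesize */

    int
    main(void)
    {
            for (int size = DNODE_MIN_SIZE; size <= DNODE_MAX_SIZE; size <<= 1) {
                    int slots = size >> DNODE_SHIFT;        /* dn_num_slots */
                    printf("dnodesize=%5d -> %2d slot(s), dn_extra_slots=%2d\n",
                        size, slots, slots - 1);
            }
            return (0);
    }
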
Modified:
vendor-sys/illumos/dist/common/zfs/zfeature_common.c
vendor-sys/illumos/dist/common/zfs/zfeature_common.h
vendor-sys/illumos/dist/common/zfs/zfs_prop.c
vendor-sys/illumos/dist/common/zfs/zpool_prop.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_object.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_tx.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c
vendor-sys/illumos/dist/uts/common/fs/zfs/sa.c
vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c
vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/arc.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu_objset.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dnode.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_dataset.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/sa_impl.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zap.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_ioctl.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_znode.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zil.h
vendor-sys/illumos/dist/uts/common/fs/zfs/zap.c
vendor-sys/illumos/dist/uts/common/fs/zfs/zap_micro.c
vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_acl.c
vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ioctl.c
vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_log.c
vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_replay.c
vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_sa.c
vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_znode.c
vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c
vendor-sys/illumos/dist/uts/common/sys/fs/zfs.h
Changes in other areas also in this revision:
Modified:
vendor/illumos/dist/cmd/zdb/zdb.c
vendor/illumos/dist/cmd/zdb/zdb_il.c
vendor/illumos/dist/cmd/zstreamdump/zstreamdump.c
vendor/illumos/dist/cmd/ztest/ztest.c
vendor/illumos/dist/man/man5/zpool-features.5
Modified: vendor-sys/illumos/dist/common/zfs/zfeature_common.c
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfeature_common.c Mon Aug 12 11:42:16 2019 (r350897)
+++ vendor-sys/illumos/dist/common/zfs/zfeature_common.c Mon Aug 12 12:05:40 2019 (r350898)
@@ -245,6 +245,17 @@ zpool_feature_init(void)
"Support for blocks larger than 128KB.",
ZFEATURE_FLAG_PER_DATASET, large_blocks_deps);
+ {
+ static const spa_feature_t large_dnode_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_LARGE_DNODE,
+ "org.zfsonlinux:large_dnode", "large_dnode",
+ "Variable on-disk size of dnodes.",
+ ZFEATURE_FLAG_PER_DATASET, large_dnode_deps);
+ }
+
static const spa_feature_t sha512_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
SPA_FEATURE_NONE
Modified: vendor-sys/illumos/dist/common/zfs/zfeature_common.h
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfeature_common.h Mon Aug 12 11:42:16 2019 (r350897)
+++ vendor-sys/illumos/dist/common/zfs/zfeature_common.h Mon Aug 12 12:05:40 2019 (r350898)
@@ -53,6 +53,7 @@ typedef enum spa_feature {
SPA_FEATURE_BOOKMARKS,
SPA_FEATURE_FS_SS_LIMIT,
SPA_FEATURE_LARGE_BLOCKS,
+ SPA_FEATURE_LARGE_DNODE,
SPA_FEATURE_SHA512,
SPA_FEATURE_SKEIN,
SPA_FEATURE_EDONR,
Modified: vendor-sys/illumos/dist/common/zfs/zfs_prop.c
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfs_prop.c Mon Aug 12 11:42:16 2019 (r350897)
+++ vendor-sys/illumos/dist/common/zfs/zfs_prop.c Mon Aug 12 12:05:40 2019 (r350898)
@@ -210,6 +210,17 @@ zfs_prop_init(void)
{ NULL }
};
+ static zprop_index_t dnsize_table[] = {
+ { "legacy", ZFS_DNSIZE_LEGACY },
+ { "auto", ZFS_DNSIZE_AUTO },
+ { "1k", ZFS_DNSIZE_1K },
+ { "2k", ZFS_DNSIZE_2K },
+ { "4k", ZFS_DNSIZE_4K },
+ { "8k", ZFS_DNSIZE_8K },
+ { "16k", ZFS_DNSIZE_16K },
+ { NULL }
+ };
+
static zprop_index_t redundant_metadata_table[] = {
{ "all", ZFS_REDUNDANT_METADATA_ALL },
{ "most", ZFS_REDUNDANT_METADATA_MOST },
@@ -265,6 +276,10 @@ zfs_prop_init(void)
zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"latency | throughput", "LOGBIAS", logbias_table);
+
+ zprop_register_index(ZFS_PROP_DNODESIZE, "dnodesize",
+ ZFS_DNSIZE_LEGACY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "legacy | auto | 1k | 2k | 4k | 8k | 16k", "DNSIZE", dnsize_table);
/* inherit index (boolean) properties */
zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
Modified: vendor-sys/illumos/dist/common/zfs/zpool_prop.c
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zpool_prop.c Mon Aug 12 11:42:16 2019 (r350897)
+++ vendor-sys/illumos/dist/common/zfs/zpool_prop.c Mon Aug 12 12:05:40 2019 (r350898)
@@ -138,6 +138,8 @@ zpool_prop_init(void)
PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE");
zprop_register_hidden(ZPOOL_PROP_TNAME, "tname", PROP_TYPE_STRING,
PROP_ONETIME, ZFS_TYPE_POOL, "TNAME");
+ zprop_register_hidden(ZPOOL_PROP_MAXDNODESIZE, "maxdnodesize",
+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXDNODESIZE");
}
/*
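The new maxdnodesize pool property is backed by spa_maxdnodesize() in spa_misc.c, whose hunk falls past the truncation point below. A sketch of what it presumably looks like, going by the ZoL code this was ported from:

    uint64_t
    spa_maxdnodesize(spa_t *spa)
    {
            /*
             * Presumed implementation (matching the ZoL original):
             * 16K dnodes are only allowed once large_dnode is enabled.
             */
            if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE))
                    return (DNODE_MAX_SIZE);        /* 16384 */
            return (DNODE_MIN_SIZE);                /* 512 */
    }
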
Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c Mon Aug 12 11:42:16 2019 (r350897)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c Mon Aug 12 12:05:40 2019 (r350898)
@@ -742,7 +742,6 @@ dbuf_verify(dmu_buf_impl_t *db)
ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
} else if (db->db_blkid == DMU_SPILL_BLKID) {
ASSERT(dn != NULL);
- ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
ASSERT0(db->db.db_offset);
} else {
ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
@@ -995,13 +994,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_
ASSERT(db->db_buf == NULL);
if (db->db_blkid == DMU_BONUS_BLKID) {
+ /*
+ * The bonus length stored in the dnode may be less than
+ * the maximum available space in the bonus buffer.
+ */
int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
+ int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
ASSERT3U(bonuslen, <=, db->db.db_size);
- db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
- arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
- if (bonuslen < DN_MAX_BONUSLEN)
- bzero(db->db.db_data, DN_MAX_BONUSLEN);
+ db->db.db_data = zio_buf_alloc(max_bonuslen);
+ arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
+ if (bonuslen < max_bonuslen)
+ bzero(db->db.db_data, max_bonuslen);
if (bonuslen)
bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
DB_DNODE_EXIT(db);
@@ -1108,9 +1112,11 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
ASSERT(dr->dr_txg >= txg - 2);
if (db->db_blkid == DMU_BONUS_BLKID) {
/* Note that the data bufs here are zio_bufs */
- dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
- arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
- bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
+ dnode_t *dn = DB_DNODE(db);
+ int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+ dr->dt.dl.dr_data = zio_buf_alloc(bonuslen);
+ arc_space_consume(bonuslen, ARC_SPACE_BONUS);
+ bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
int size = arc_buf_size(db->db_buf);
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
@@ -2081,10 +2087,13 @@ dbuf_destroy(dmu_buf_impl_t *db)
}
if (db->db_blkid == DMU_BONUS_BLKID) {
- ASSERT(db->db.db_data != NULL);
- zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
- arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
- db->db_state = DB_UNCACHED;
+ int slots = DB_DNODE(db)->dn_num_slots;
+ int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
+ if (db->db.db_data != NULL) {
+ zio_buf_free(db->db.db_data, bonuslen);
+ arc_space_return(bonuslen, ARC_SPACE_BONUS);
+ db->db_state = DB_UNCACHED;
+ }
}
dbuf_clear_data(db);
@@ -2188,7 +2197,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, in
mutex_enter(&dn->dn_mtx);
if (dn->dn_have_spill &&
(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
- *bpp = &dn->dn_phys->dn_spill;
+ *bpp = DN_SPILL_BLKPTR(dn->dn_phys);
else
*bpp = NULL;
dbuf_add_ref(dn->dn_dbuf, NULL);
@@ -2289,7 +2298,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid
if (blkid == DMU_BONUS_BLKID) {
ASSERT3P(parent, ==, dn->dn_dbuf);
- db->db.db_size = DN_MAX_BONUSLEN -
+ db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
(dn->dn_nblkptr-1) * sizeof (blkptr_t);
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
db->db.db_offset = DMU_BONUS_BLKID;
@@ -3031,7 +3040,7 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
return;
if (db->db_blkid == DMU_SPILL_BLKID) {
- db->db_blkptr = &dn->dn_phys->dn_spill;
+ db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys);
BP_ZERO(db->db_blkptr);
return;
}
@@ -3162,13 +3171,17 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT(*datap != NULL);
ASSERT0(db->db_level);
- ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
- bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+ ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
+ DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
+ bcopy(*datap, DN_BONUS(dn->dn_phys),
+ DN_MAX_BONUS_LEN(dn->dn_phys));
DB_DNODE_EXIT(db);
if (*datap != db->db.db_data) {
- zio_buf_free(*datap, DN_MAX_BONUSLEN);
- arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
+ int slots = DB_DNODE(db)->dn_num_slots;
+ int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
+ zio_buf_free(*datap, bonuslen);
+ arc_space_return(bonuslen, ARC_SPACE_BONUS);
}
db->db_data_pending = NULL;
drp = &db->db_last_dirty;
@@ -3324,7 +3337,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb
if (db->db_blkid == DMU_SPILL_BLKID) {
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
ASSERT(!(BP_IS_HOLE(bp)) &&
- db->db_blkptr == &dn->dn_phys->dn_spill);
+ db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
}
#endif
@@ -3336,11 +3349,17 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb
mutex_exit(&dn->dn_mtx);
if (dn->dn_type == DMU_OT_DNODE) {
- dnode_phys_t *dnp = db->db.db_data;
- for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
- i--, dnp++) {
- if (dnp->dn_type != DMU_OT_NONE)
+ i = 0;
+ while (i < db->db.db_size) {
+ dnode_phys_t *dnp =
+ (void *)(((char *)db->db.db_data) + i);
+
+ i += DNODE_MIN_SIZE;
+ if (dnp->dn_type != DMU_OT_NONE) {
fill++;
+ i += dnp->dn_extra_slots *
+ DNODE_MIN_SIZE;
+ }
}
} else {
if (BP_IS_HOLE(bp)) {
@@ -3493,7 +3512,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
dn = DB_DNODE(db);
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
- db->db_blkptr == &dn->dn_phys->dn_spill);
+ db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
DB_DNODE_EXIT(db);
}
#endif
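The DN_SLOTS_TO_BONUSLEN() substitutions above size bonus buffers from the dnode's actual slot count rather than the fixed DN_MAX_BONUSLEN. A standalone sketch of the arithmetic, with the macro bodies assumed from sys/dnode.h (a 64-byte fixed header plus one trailing 128-byte block pointer are subtracted from the dnode size); one slot yields the old 320-byte maximum that dmu_bonus_max() now reports as DN_OLD_MAX_BONUSLEN:

    #include <stdio.h>

    /* Assumed, per sys/dnode.h and sys/spa.h. */
    #define DNODE_SHIFT         9   /* 512-byte slots */
    #define DNODE_CORE_SIZE     64  /* fixed dnode_phys_t header */
    #define SPA_BLKPTRSHIFT     7   /* 128-byte block pointers */

    /* Bonus space left after the header and one trailing block pointer. */
    #define DN_BONUS_SIZE(dnsize) \
            ((dnsize) - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
    #define DN_SLOTS_TO_BONUSLEN(slots) DN_BONUS_SIZE((slots) << DNODE_SHIFT)

    int
    main(void)
    {
            /* 1 slot: 512 - 64 - 128 = 320, the old DN_MAX_BONUSLEN. */
            printf("1 slot:   %d bytes of bonus\n", DN_SLOTS_TO_BONUSLEN(1));
            /* 2 slots: 1024 - 64 - 128 = 832. */
            printf("2 slots:  %d bytes of bonus\n", DN_SLOTS_TO_BONUSLEN(2));
            /* 32 slots (16K dnode): 16384 - 64 - 128 = 16192. */
            printf("32 slots: %d bytes of bonus\n", DN_SLOTS_TO_BONUSLEN(32));
            return (0);
    }
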
Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c Mon Aug 12 11:42:16 2019 (r350897)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c Mon Aug 12 12:05:40 2019 (r350898)
@@ -254,7 +254,7 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t o
int
dmu_bonus_max(void)
{
- return (DN_MAX_BONUSLEN);
+ return (DN_OLD_MAX_BONUSLEN);
}
int
@@ -2264,6 +2264,7 @@ dmu_object_info_from_dnode(dnode_t *dn, dmu_object_inf
doi->doi_type = dn->dn_type;
doi->doi_bonus_type = dn->dn_bonustype;
doi->doi_bonus_size = dn->dn_bonuslen;
+ doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT;
doi->doi_indirection = dn->dn_nlevels;
doi->doi_checksum = dn->dn_checksum;
doi->doi_compress = dn->dn_compress;
@@ -2326,9 +2327,21 @@ dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *
dn = DB_DNODE(db);
*blksize = dn->dn_datablksz;
- /* add 1 for dnode space */
+ /* add in number of slots used for the dnode itself */
*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
- SPA_MINBLOCKSHIFT) + 1;
+ SPA_MINBLOCKSHIFT) + dn->dn_num_slots;
+ DB_DNODE_EXIT(db);
+}
+
+void
+dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ *dnsize = dn->dn_num_slots << DNODE_SHIFT;
DB_DNODE_EXIT(db);
}
Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_object.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_object.c Mon Aug 12 11:42:16 2019 (r350897)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_object.c Mon Aug 12 12:05:40 2019 (r350898)
@@ -30,101 +30,227 @@
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
+#include <sys/dsl_dataset.h>
-uint64_t
-dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
- int indirect_blockshift,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+/*
+ * Each of the concurrent object allocators will grab
+ * 2^dmu_object_alloc_chunk_shift dnode slots at a time. The default is to
+ * grab 128 slots, which is 4 blocks worth. This was experimentally
+ * determined to be the lowest value that eliminates the measurable effect
+ * of lock contention from this code path.
+ */
+int dmu_object_alloc_chunk_shift = 7;
+
+static uint64_t
+dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
+ int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx)
{
uint64_t object;
uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
(DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
dnode_t *dn = NULL;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
+ boolean_t restarted = B_FALSE;
+ uint64_t *cpuobj = &os->os_obj_next_percpu[CPU_SEQID %
+ os->os_obj_next_percpu_len];
+ int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
+ int error;
- mutex_enter(&os->os_obj_lock);
+ if (dn_slots == 0) {
+ dn_slots = DNODE_MIN_SLOTS;
+ } else {
+ ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
+ ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
+ }
+
+ /*
+ * The "chunk" of dnodes that is assigned to a CPU-specific
+ * allocator needs to be at least one block's worth, to avoid
+ * lock contention on the dbuf. It can be at most one L1 block's
+ * worth, so that the "rescan after polishing off a L1's worth"
+ * logic below will be sure to kick in.
+ */
+ if (dnodes_per_chunk < DNODES_PER_BLOCK)
+ dnodes_per_chunk = DNODES_PER_BLOCK;
+ if (dnodes_per_chunk > L1_dnode_count)
+ dnodes_per_chunk = L1_dnode_count;
+
+ object = *cpuobj;
+
for (;;) {
- object = os->os_obj_next;
/*
- * Each time we polish off a L1 bp worth of dnodes (2^12
- * objects), move to another L1 bp that's still reasonably
- * sparse (at most 1/4 full). Look from the beginning at most
- * once per txg, but after that keep looking from here.
- * os_scan_dnodes is set during txg sync if enough objects
- * have been freed since the previous rescan to justify
- * backfilling again. If we can't find a suitable block, just
- * keep going from here.
- *
- * Note that dmu_traverse depends on the behavior that we use
- * multiple blocks of the dnode object before going back to
- * reuse objects. Any change to this algorithm should preserve
- * that property or find another solution to the issues
- * described in traverse_visitbp.
+ * If we finished a chunk of dnodes, get a new one from
+ * the global allocator.
*/
+ if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
+ (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
+ dn_slots)) {
+ DNODE_STAT_BUMP(dnode_alloc_next_chunk);
+ mutex_enter(&os->os_obj_lock);
+ ASSERT0(P2PHASE(os->os_obj_next_chunk,
+ dnodes_per_chunk));
+ object = os->os_obj_next_chunk;
- if (P2PHASE(object, L1_dnode_count) == 0) {
- uint64_t offset;
- int error;
- if (os->os_rescan_dnodes) {
- offset = 0;
- os->os_rescan_dnodes = B_FALSE;
- } else {
- offset = object << DNODE_SHIFT;
+ /*
+ * Each time we polish off a L1 bp worth of dnodes
+ * (2^12 objects), move to another L1 bp that's
+ * still reasonably sparse (at most 1/4 full). Look
+ * from the beginning at most once per txg. If we
+ * still can't allocate from that L1 block, search
+ * for an empty L0 block, which will quickly skip
+ * to the end of the metadnode if no nearby L0
+ * blocks are empty. This fallback avoids a
+ * pathology where full dnode blocks containing
+ * large dnodes appear sparse because they have a
+ * low blk_fill, leading to many failed allocation
+ * attempts. In the long term a better mechanism to
+ * search for sparse metadnode regions, such as
+ * spacemaps, could be implemented.
+ *
+ * os_scan_dnodes is set during txg sync if enough
+ * objects have been freed since the previous
+ * rescan to justify backfilling again.
+ *
+ * Note that dmu_traverse depends on the behavior
+ * that we use multiple blocks of the dnode object
+ * before going back to reuse objects. Any change
+ * to this algorithm should preserve that property
+ * or find another solution to the issues described
+ * in traverse_visitbp.
+ */
+ if (P2PHASE(object, L1_dnode_count) == 0) {
+ uint64_t offset;
+ uint64_t blkfill;
+ int minlvl;
+ if (os->os_rescan_dnodes) {
+ offset = 0;
+ os->os_rescan_dnodes = B_FALSE;
+ } else {
+ offset = object << DNODE_SHIFT;
+ }
+ blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
+ minlvl = restarted ? 1 : 2;
+ restarted = B_TRUE;
+ error = dnode_next_offset(DMU_META_DNODE(os),
+ DNODE_FIND_HOLE, &offset, minlvl,
+ blkfill, 0);
+ if (error == 0) {
+ object = offset >> DNODE_SHIFT;
+ }
}
- error = dnode_next_offset(DMU_META_DNODE(os),
- DNODE_FIND_HOLE,
- &offset, 2, DNODES_PER_BLOCK >> 2, 0);
- if (error == 0)
- object = offset >> DNODE_SHIFT;
+ /*
+ * Note: if "restarted", we may find a L0 that
+ * is not suitably aligned.
+ */
+ os->os_obj_next_chunk =
+ P2ALIGN(object, dnodes_per_chunk) +
+ dnodes_per_chunk;
+ (void) atomic_swap_64(cpuobj, object);
+ mutex_exit(&os->os_obj_lock);
}
- os->os_obj_next = ++object;
/*
+ * The value of (*cpuobj) before adding dn_slots is the object
+ * ID assigned to us. The value afterwards is the object ID
+ * assigned to whoever wants to do an allocation next.
+ */
+ object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;
+
+ /*
* XXX We should check for an i/o error here and return
* up to our caller. Actually we should pre-read it in
* dmu_tx_assign(), but there is currently no mechanism
* to do so.
*/
- (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
- FTAG, &dn);
- if (dn)
- break;
+ error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
+ dn_slots, FTAG, &dn);
+ if (error == 0) {
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ /*
+ * Another thread could have allocated it; check
+ * again now that we have the struct lock.
+ */
+ if (dn->dn_type == DMU_OT_NONE) {
+ dnode_allocate(dn, ot, blocksize, 0,
+ bonustype, bonuslen, dn_slots, tx);
+ rw_exit(&dn->dn_struct_rwlock);
+ dmu_tx_add_new_object(tx, dn);
+ dnode_rele(dn, FTAG);
+ return (object);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ DNODE_STAT_BUMP(dnode_alloc_race);
+ }
- if (dmu_object_next(os, &object, B_TRUE, 0) == 0)
- os->os_obj_next = object - 1;
+ /*
+ * Skip to next known valid starting point on error. This
+ * is the start of the next block of dnodes.
+ */
+ if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
+ object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
+ DNODE_STAT_BUMP(dnode_alloc_next_block);
+ }
+ (void) atomic_swap_64(cpuobj, object);
}
-
- dnode_allocate(dn, ot, blocksize, indirect_blockshift,
- bonustype, bonuslen, tx);
- mutex_exit(&os->os_obj_lock);
-
- dmu_tx_add_new_object(tx, dn);
- dnode_rele(dn, FTAG);
-
- return (object);
}
uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- return (dmu_object_alloc_ibs(os, ot, blocksize, 0,
- bonustype, bonuslen, tx));
+ return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
+ bonuslen, 0, tx));
}
+uint64_t
+dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
+ int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
+ dmu_tx_t *tx)
+{
+ return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
+ bonustype, bonuslen, 0, tx));
+}
+
+uint64_t
+dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
+ bonuslen, dnodesize, tx));
+}
+
int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
+ return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
+ bonuslen, 0, tx));
+}
+
+int
+dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx)
+{
dnode_t *dn;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
int err;
+ if (dn_slots == 0)
+ dn_slots = DNODE_MIN_SLOTS;
+ ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
+ ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
+
if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
return (SET_ERROR(EBADF));
- err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn);
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
+ FTAG, &dn);
if (err)
return (err);
- dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
+ dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
dmu_tx_add_new_object(tx, dn);
dnode_rele(dn, FTAG);
@@ -136,18 +262,28 @@ int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
+ return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
+ bonuslen, 0, tx));
+}
+
+int
+dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
+ dmu_tx_t *tx)
+{
dnode_t *dn;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
int err;
if (object == DMU_META_DNODE_OBJECT)
return (SET_ERROR(EBADF));
- err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
FTAG, &dn);
if (err)
return (err);
- dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx);
+ dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx);
dnode_rele(dn, FTAG);
return (err);
@@ -161,7 +297,7 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_
ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
- err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
FTAG, &dn);
if (err)
return (err);
@@ -186,8 +322,53 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
- uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
+ uint64_t offset;
+ uint64_t start_obj;
+ struct dsl_dataset *ds = os->os_dsl_dataset;
int error;
+
+ if (*objectp == 0) {
+ start_obj = 1;
+ } else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
+ uint64_t i = *objectp + 1;
+ uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
+ dmu_object_info_t doi;
+
+ /*
+ * Scan through the remaining meta dnode block. The contents
+ * of each slot in the block are known so it can be quickly
+ * checked. If the block is exhausted without a match then
+ * hand off to dnode_next_offset() for further scanning.
+ */
+ while (i <= last_obj) {
+ error = dmu_object_info(os, i, &doi);
+ if (error == ENOENT) {
+ if (hole) {
+ *objectp = i;
+ return (0);
+ } else {
+ i++;
+ }
+ } else if (error == EEXIST) {
+ i++;
+ } else if (error == 0) {
+ if (hole) {
+ i += doi.doi_dnodesize >> DNODE_SHIFT;
+ } else {
+ *objectp = i;
+ return (0);
+ }
+ } else {
+ return (error);
+ }
+ }
+
+ start_obj = i;
+ } else {
+ start_obj = *objectp + 1;
+ }
+
+ offset = start_obj << DNODE_SHIFT;
error = dnode_next_offset(DMU_META_DNODE(os),
(hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
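
The rewritten allocator replaces the single os_obj_next cursor with per-CPU cursors (os_obj_next_percpu) that draw 2^dmu_object_alloc_chunk_shift-slot chunks from a global cursor under os_obj_lock, so the common allocation takes no lock. A stripped-down user-level model of that fast path (hypothetical names; the hole-scanning and DNODE_MUST_BE_FREE retry logic are omitted):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    #define CHUNK_SLOTS 128  /* 1 << dmu_object_alloc_chunk_shift */
    #define NCPU        4

    static pthread_mutex_t obj_lock = PTHREAD_MUTEX_INITIALIZER;
    static uint64_t obj_next_chunk = CHUNK_SLOTS;   /* global cursor */
    static _Atomic uint64_t cpu_cursor[NCPU];       /* per-CPU cursors */

    static uint64_t
    alloc_object(int cpu, uint64_t dn_slots)
    {
            _Atomic uint64_t *cur = &cpu_cursor[cpu];
            uint64_t obj = atomic_load(cur);

            /*
             * Refill from the global cursor (under os_obj_lock in the
             * real code) when this CPU's chunk is exhausted or the
             * request would straddle a chunk boundary.
             */
            if (obj % CHUNK_SLOTS == 0 ||
                (obj + dn_slots - 1) % CHUNK_SLOTS < dn_slots) {
                    pthread_mutex_lock(&obj_lock);
                    atomic_store(cur, obj_next_chunk);
                    obj_next_chunk += CHUNK_SLOTS;
                    pthread_mutex_unlock(&obj_lock);
            }

            /*
             * Common case: claim dn_slots contiguous slots lock-free,
             * like atomic_add_64_nv(cpuobj, dn_slots) - dn_slots above.
             */
            return (atomic_fetch_add(cur, dn_slots));
    }

    int
    main(void)
    {
            for (int i = 0; i < 5; i++)
                    printf("allocated object %llu\n",
                        (unsigned long long)alloc_object(0, 2));
            return (0);
    }
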
Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c Mon Aug 12 11:42:16 2019 (r350897)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c Mon Aug 12 12:05:40 2019 (r350898)
@@ -140,6 +140,12 @@ dmu_objset_id(objset_t *os)
return (ds ? ds->ds_object : 0);
}
+uint64_t
+dmu_objset_dnodesize(objset_t *os)
+{
+ return (os->os_dnodesize);
+}
+
zfs_sync_type_t
dmu_objset_syncprop(objset_t *os)
{
@@ -270,6 +276,34 @@ redundant_metadata_changed_cb(void *arg, uint64_t newv
}
static void
+dnodesize_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ switch (newval) {
+ case ZFS_DNSIZE_LEGACY:
+ os->os_dnodesize = DNODE_MIN_SIZE;
+ break;
+ case ZFS_DNSIZE_AUTO:
+ /*
+ * Choose a dnode size that will work well for most
+ * workloads if the user specified "auto". Future code
+ * improvements could dynamically select a dnode size
+ * based on observed workload patterns.
+ */
+ os->os_dnodesize = DNODE_MIN_SIZE * 2;
+ break;
+ case ZFS_DNSIZE_1K:
+ case ZFS_DNSIZE_2K:
+ case ZFS_DNSIZE_4K:
+ case ZFS_DNSIZE_8K:
+ case ZFS_DNSIZE_16K:
+ os->os_dnodesize = newval;
+ break;
+ }
+}
+
+static void
logbias_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
@@ -477,6 +511,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, bl
zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
recordsize_changed_cb, os);
}
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_DNODESIZE),
+ dnodesize_changed_cb, os);
+ }
}
if (needlock)
dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
@@ -496,6 +535,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, bl
os->os_sync = ZFS_SYNC_STANDARD;
os->os_primary_cache = ZFS_CACHE_ALL;
os->os_secondary_cache = ZFS_CACHE_ALL;
+ os->os_dnodesize = DNODE_MIN_SIZE;
}
/*
* These properties will be filled in by the logic in zfs_get_zplprop()
@@ -524,6 +564,9 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, bl
mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
+ os->os_obj_next_percpu_len = boot_ncpus;
+ os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len *
+ sizeof (os->os_obj_next_percpu[0]), KM_SLEEP);
dnode_special_open(os, &os->os_phys->os_meta_dnode,
DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
@@ -802,6 +845,9 @@ dmu_objset_evict_done(objset_t *os)
rw_enter(&os_lock, RW_READER);
rw_exit(&os_lock);
+ kmem_free(os->os_obj_next_percpu,
+ os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0]));
+
mutex_destroy(&os->os_lock);
mutex_destroy(&os->os_userused_lock);
mutex_destroy(&os->os_obj_lock);
@@ -836,8 +882,8 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds,
mdn = DMU_META_DNODE(os);
- dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
- DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
+ dnode_allocate(mdn, DMU_OT_DNODE, DNODE_BLOCK_SIZE, DN_MAX_INDBLKSHIFT,
+ DMU_OT_NONE, 0, DNODE_MIN_SLOTS, tx);
/*
* We don't want to have to increase the meta-dnode's nlevels
@@ -1496,7 +1542,7 @@ do_userquota_update(userquota_cache_t *cache, uint64_t
uint64_t user, uint64_t group, boolean_t subtract)
{
if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
- int64_t delta = DNODE_SIZE + used;
+ int64_t delta = DNODE_MIN_SIZE + used;
if (subtract)
delta = -delta;
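
One detail behind dnodesize_changed_cb() above: the 1k through 16k cases can assign newval directly because the property's index values are assumed (per sys/fs/zfs.h) to be the byte sizes themselves, with only legacy and auto as small symbolic values:

    /* Assumed definition, per sys/fs/zfs.h. */
    typedef enum {
            ZFS_DNSIZE_LEGACY = 0,
            ZFS_DNSIZE_AUTO = 1,
            ZFS_DNSIZE_1K = 1024,
            ZFS_DNSIZE_2K = 2048,
            ZFS_DNSIZE_4K = 4096,
            ZFS_DNSIZE_8K = 8192,
            ZFS_DNSIZE_16K = 16384
    } zfs_dnsize_type_t;
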
Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c Mon Aug 12 11:42:16 2019 (r350897)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c Mon Aug 12 12:05:40 2019 (r350898)
@@ -469,6 +469,7 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_
drro->drr_bonustype = dnp->dn_bonustype;
drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
drro->drr_bonuslen = dnp->dn_bonuslen;
+ drro->drr_dn_slots = dnp->dn_extra_slots + 1;
drro->drr_checksumtype = dnp->dn_checksum;
drro->drr_compress = dnp->dn_compress;
drro->drr_toguid = dsp->dsa_toguid;
@@ -621,7 +622,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *
} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
return (0);
} else if (type == DMU_OT_DNODE) {
- int blksz = BP_GET_LSIZE(bp);
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf;
@@ -633,8 +634,8 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *
return (SET_ERROR(EIO));
dnode_phys_t *blk = abuf->b_data;
- uint64_t dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT);
- for (int i = 0; i < blksz >> DNODE_SHIFT; i++) {
+ uint64_t dnobj = zb->zb_blkid * epb;
+ for (int i = 0; i < epb; i += blk[i].dn_extra_slots + 1) {
err = dump_dnode(dsa, dnobj + i, blk + i);
if (err != 0)
break;
@@ -802,6 +803,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t
if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS])
featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
+ if (to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE])
+ featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE;
if (embedok &&
spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
@@ -1396,11 +1399,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
/*
* The receiving code doesn't know how to translate large blocks
* to smaller ones, so the pool must have the LARGE_BLOCKS
- * feature enabled if the stream has LARGE_BLOCKS.
+ * feature enabled if the stream has LARGE_BLOCKS. Same with
+ * large dnodes.
*/
if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE))
+ return (SET_ERROR(ENOTSUP));
error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
if (error == 0) {
@@ -1605,6 +1612,9 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
dsl_dataset_t *ds;
const char *tofs = drba->drba_cookie->drc_tofs;
+ /* 6 extra bytes for /%recv */
+ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+
/* already checked */
ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING);
@@ -1632,8 +1642,18 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
return (SET_ERROR(ENOTSUP));
- /* 6 extra bytes for /%recv */
- char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+ /*
+ * The receiving code doesn't know how to translate large blocks
+ * to smaller ones, so the pool must have the LARGE_BLOCKS
+ * feature enabled if the stream has LARGE_BLOCKS. Same with
+ * large dnodes.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE))
+ return (SET_ERROR(ENOTSUP));
(void) snprintf(recvname, sizeof (recvname), "%s/%s",
tofs, recv_clone_name);
@@ -2024,7 +2044,8 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t
return (1);
} else {
return (1 +
- ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT));
+ ((DN_OLD_MAX_BONUSLEN -
+ MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT));
}
}
@@ -2082,15 +2103,17 @@ receive_object(struct receive_writer_arg *rwa, struct
P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
drro->drr_blksz < SPA_MINBLOCKSIZE ||
drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
- drro->drr_bonuslen > DN_MAX_BONUSLEN) {
+ drro->drr_bonuslen >
+ DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) ||
+ drro->drr_dn_slots >
+ (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) {
return (SET_ERROR(EINVAL));
}
err = dmu_object_info(rwa->os, drro->drr_object, &doi);
- if (err != 0 && err != ENOENT)
+ if (err != 0 && err != ENOENT && err != EEXIST)
return (SET_ERROR(EINVAL));
- object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
if (drro->drr_object > rwa->max_object)
rwa->max_object = drro->drr_object;
@@ -2103,18 +2126,66 @@ receive_object(struct receive_writer_arg *rwa, struct
if (err == 0) {
int nblkptr;
+ object = drro->drr_object;
+
nblkptr = deduce_nblkptr(drro->drr_bonustype,
drro->drr_bonuslen);
if (drro->drr_blksz != doi.doi_data_block_size ||
- nblkptr < doi.doi_nblkptr) {
+ nblkptr < doi.doi_nblkptr ||
+ drro->drr_dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
err = dmu_free_long_range(rwa->os, drro->drr_object,
0, DMU_OBJECT_END);
if (err != 0)
return (SET_ERROR(EINVAL));
}
+ } else if (err == EEXIST) {
+ /*
+ * The object requested is currently an interior slot of a
+ * multi-slot dnode. This will be resolved when the next txg
+ * is synced out, since the send stream will have told us
+ * to free this slot when we freed the associated dnode
+ * earlier in the stream.
+ */
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+ object = drro->drr_object;
+ } else {
+ /* object is free and we are about to allocate a new one */
+ object = DMU_NEW_OBJECT;
}
+ /*
+ * If this is a multi-slot dnode there is a chance that this
+ * object will expand into a slot that is already used by
+ * another object from the previous snapshot. We must free
+ * these objects before we attempt to allocate the new dnode.
+ */
+ if (drro->drr_dn_slots > 1) {
+ boolean_t need_sync = B_FALSE;
+
+ for (uint64_t slot = drro->drr_object + 1;
+ slot < drro->drr_object + drro->drr_dn_slots;
+ slot++) {
+ dmu_object_info_t slot_doi;
+
+ err = dmu_object_info(rwa->os, slot, &slot_doi);
+ if (err == ENOENT || err == EEXIST)
+ continue;
+ else if (err != 0)
+ return (err);
+
+ err = dmu_free_long_object(rwa->os, slot);
+
+ if (err != 0)
+ return (err);
+
+ need_sync = B_TRUE;
+ }
+
+ if (need_sync)
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+ }
+
tx = dmu_tx_create(rwa->os);
dmu_tx_hold_bonus(tx, object);
err = dmu_tx_assign(tx, TXG_WAIT);
@@ -2125,9 +2196,10 @@ receive_object(struct receive_writer_arg *rwa, struct
if (object == DMU_NEW_OBJECT) {
/* currently free, want to be allocated */
- err = dmu_object_claim(rwa->os, drro->drr_object,
+ err = dmu_object_claim_dnsize(rwa->os, drro->drr_object,
drro->drr_type, drro->drr_blksz,
- drro->drr_bonustype, drro->drr_bonuslen, tx);
+ drro->drr_bonustype, drro->drr_bonuslen,
+ drro->drr_dn_slots << DNODE_SHIFT, tx);
} else if (drro->drr_type != doi.doi_type ||
drro->drr_blksz != doi.doi_data_block_size ||
drro->drr_bonustype != doi.doi_bonus_type ||
@@ -2179,13 +2251,18 @@ receive_freeobjects(struct receive_writer_arg *rwa,
if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
return (SET_ERROR(EINVAL));
- for (obj = drrfo->drr_firstobj;
+ for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj;
obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0;
next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
int err;
- if (dmu_object_info(rwa->os, obj, NULL) != 0)
+ err = dmu_object_info(rwa->os, obj, NULL);
+ if (err == ENOENT) {
+ obj++;
continue;
+ } else if (err != 0) {
+ return (err);
+ }
err = dmu_free_long_object(rwa->os, obj);
if (err != 0)
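
Both do_dump() above and the traversal code below now step through metadnode blocks in dn_extra_slots + 1 increments, so the interior slots of a large dnode are never misread as independent dnodes. A standalone sketch of that walk, with dnode_phys_t reduced to the two fields that matter here:

    #include <stdint.h>
    #include <stdio.h>

    /* Reduced stand-in for dnode_phys_t; dn_extra_slots is the new field. */
    typedef struct {
            uint8_t dn_type;        /* 0 (DMU_OT_NONE) if the slot is free */
            uint8_t dn_extra_slots; /* extra 512-byte slots this dnode spans */
    } fake_dnode_phys_t;

    static void
    walk_dnode_block(const fake_dnode_phys_t *blk, int epb)
    {
            /* epb = block size >> DNODE_SHIFT, e.g. 32 for a 16K block. */
            for (int i = 0; i < epb; i += blk[i].dn_extra_slots + 1) {
                    if (blk[i].dn_type != 0)
                            printf("object in slot %d spans %d slot(s)\n",
                                i, blk[i].dn_extra_slots + 1);
            }
    }

    int
    main(void)
    {
            /*
             * A 512-byte dnode in slot 0, a 1K (2-slot) dnode in slot 1;
             * slot 2 is its interior (zeroed) and slot 3 is free.
             */
            fake_dnode_phys_t blk[4] = { { 1, 0 }, { 1, 1 }, { 0, 0 } };

            walk_dnode_block(blk, 4);
            return (0);
    }
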
Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c Mon Aug 12 11:42:16 2019 (r350897)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c Mon Aug 12 12:05:40 2019 (r350898)
@@ -327,13 +327,13 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys
goto post;
dnode_phys_t *child_dnp = buf->b_data;
- for (i = 0; i < epb; i++) {
+ for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
prefetch_dnode_metadata(td, &child_dnp[i],
zb->zb_objset, zb->zb_blkid * epb + i);
}
/* recursively visitbp() blocks below this */
- for (i = 0; i < epb; i++) {
+ for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
err = traverse_dnode(td, &child_dnp[i],
zb->zb_objset, zb->zb_blkid * epb + i);
if (err != 0)
@@ -435,7 +435,7 @@ prefetch_dnode_metadata(traverse_data_t *td, const dno
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***