svn commit: r264829 - vendor-sys/illumos/dist/common/zfs vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor-sys/illumos/dist/uts/common/sys/fs vendor/ill...
Xin LI
delphij at FreeBSD.org
Wed Apr 23 19:09:17 UTC 2014
Author: delphij
Date: Wed Apr 23 19:09:14 2014
New Revision: 264829
URL: http://svnweb.freebsd.org/changeset/base/264829
Log:
3897 zfs filesystem and snapshot limits
illumos/illumos at a2afb611b30628fb74ad9eade4ae465f9031e262
Modified:
vendor-sys/illumos/dist/common/zfs/zfeature_common.c
vendor-sys/illumos/dist/common/zfs/zfeature_common.h
vendor-sys/illumos/dist/common/zfs/zfs_prop.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dataset.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_destroy.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dir.c
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu_send.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_dataset.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_dir.h
vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ioctl.c
vendor-sys/illumos/dist/uts/common/sys/fs/zfs.h
Changes in other areas also in this revision:
Modified:
vendor/illumos/dist/lib/libzfs/common/libzfs_dataset.c
vendor/illumos/dist/lib/libzfs/common/libzfs_util.c
vendor/illumos/dist/man/man1m/zfs.1m
vendor/illumos/dist/man/man5/zpool-features.5
Modified: vendor-sys/illumos/dist/common/zfs/zfeature_common.c
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfeature_common.c Wed Apr 23 18:36:32 2014 (r264828)
+++ vendor-sys/illumos/dist/common/zfs/zfeature_common.c Wed Apr 23 19:09:14 2014 (r264829)
@@ -206,4 +206,13 @@ zpool_feature_init(void)
"com.delphix:bookmarks", "bookmarks",
"\"zfs bookmark\" command",
B_TRUE, B_FALSE, B_FALSE, bookmarks_deps);
+
+ static const spa_feature_t filesystem_limits_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_FS_SS_LIMIT,
+ "com.joyent:filesystem_limits", "filesystem_limits",
+ "Filesystem and snapshot limits.", B_TRUE, B_FALSE, B_FALSE,
+ filesystem_limits_deps);
}
Modified: vendor-sys/illumos/dist/common/zfs/zfeature_common.h
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfeature_common.h Wed Apr 23 18:36:32 2014 (r264828)
+++ vendor-sys/illumos/dist/common/zfs/zfeature_common.h Wed Apr 23 19:09:14 2014 (r264829)
@@ -49,6 +49,7 @@ typedef enum spa_feature {
SPA_FEATURE_HOLE_BIRTH,
SPA_FEATURE_EXTENSIBLE_DATASET,
SPA_FEATURE_BOOKMARKS,
+ SPA_FEATURE_FS_SS_LIMIT,
SPA_FEATURES
} spa_feature_t;
Modified: vendor-sys/illumos/dist/common/zfs/zfs_prop.c
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfs_prop.c Wed Apr 23 18:36:32 2014 (r264828)
+++ vendor-sys/illumos/dist/common/zfs/zfs_prop.c Wed Apr 23 19:09:14 2014 (r264829)
@@ -371,6 +371,18 @@ zfs_prop_init(void)
zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"<size> | none", "REFRESERV");
+ zprop_register_number(ZFS_PROP_FILESYSTEM_LIMIT, "filesystem_limit",
+ UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM,
+ "<count> | none", "FSLIMIT");
+ zprop_register_number(ZFS_PROP_SNAPSHOT_LIMIT, "snapshot_limit",
+ UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<count> | none", "SSLIMIT");
+ zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count",
+ UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM,
+ "<count>", "FSCOUNT");
+ zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count",
+ UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<count>", "SSCOUNT");
/* inherit number properties */
zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c Wed Apr 23 18:36:32 2014 (r264828)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c Wed Apr 23 19:09:14 2014 (r264829)
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -758,9 +759,11 @@ dmu_objset_create_check(void *arg, dmu_t
dsl_dir_rele(pdd, FTAG);
return (SET_ERROR(EEXIST));
}
+ error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
+ doca->doca_cred);
dsl_dir_rele(pdd, FTAG);
- return (0);
+ return (error);
}
static void
@@ -844,6 +847,12 @@ dmu_objset_clone_check(void *arg, dmu_tx
dsl_dir_rele(pdd, FTAG);
return (SET_ERROR(EXDEV));
}
+ error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
+ doca->doca_cred);
+ if (error != 0) {
+ dsl_dir_rele(pdd, FTAG);
+ return (SET_ERROR(EDQUOT));
+ }
dsl_dir_rele(pdd, FTAG);
error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c Wed Apr 23 18:36:32 2014 (r264828)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c Wed Apr 23 19:09:14 2014 (r264829)
@@ -22,7 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
#include <sys/dmu.h>
@@ -776,6 +776,20 @@ recv_begin_check_existing_impl(dmu_recv_
if (error != ENOENT)
return (error == 0 ? EEXIST : error);
+ /*
+ * Check snapshot limit before receiving. We'll recheck again at the
+ * end, but might as well abort before receiving if we're already over
+ * the limit.
+ *
+ * Note that we do not check the file system limit with
+ * dsl_dir_fscount_check because the temporary %clones don't count
+ * against that limit.
+ */
+ error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
+ NULL, drba->drba_cred);
+ if (error != 0)
+ return (error);
+
if (fromguid != 0) {
dsl_dataset_t *snap;
uint64_t obj = ds->ds_phys->ds_prev_snap_obj;
@@ -882,6 +896,25 @@ dmu_recv_begin_check(void *arg, dmu_tx_t
if (error != 0)
return (error);
+ /*
+ * Check filesystem and snapshot limits before receiving. We'll
+ * recheck snapshot limits again at the end (we create the
+ * filesystems and increment those counts during begin_sync).
+ */
+ error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
+ ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
+ ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
if (drba->drba_origin != NULL) {
dsl_dataset_t *origin;
error = dsl_dataset_hold(dp, drba->drba_origin,
@@ -991,6 +1024,7 @@ dmu_recv_begin(char *tofs, char *tosnap,
drc->drc_tosnap = tosnap;
drc->drc_tofs = tofs;
drc->drc_force = force;
+ drc->drc_cred = CRED();
if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
drc->drc_byteswap = B_TRUE;
@@ -1684,7 +1718,7 @@ dmu_recv_end_check(void *arg, dmu_tx_t *
return (error);
}
error = dsl_dataset_snapshot_check_impl(origin_head,
- drc->drc_tosnap, tx, B_TRUE);
+ drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
dsl_dataset_rele(origin_head, FTAG);
if (error != 0)
return (error);
@@ -1692,7 +1726,7 @@ dmu_recv_end_check(void *arg, dmu_tx_t *
error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
} else {
error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
- drc->drc_tosnap, tx, B_TRUE);
+ drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
}
return (error);
}
Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dataset.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dataset.c Wed Apr 23 18:36:32 2014 (r264828)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dataset.c Wed Apr 23 19:09:14 2014 (r264829)
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 RackTop Systems.
*/
@@ -316,7 +316,8 @@ dsl_dataset_snap_lookup(dsl_dataset_t *d
}
int
-dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx)
+dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
+ boolean_t adj_cnt)
{
objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
@@ -333,6 +334,11 @@ dsl_dataset_snap_remove(dsl_dataset_t *d
err = zap_remove_norm(mos, snapobj, name, mt, tx);
if (err == ENOTSUP && mt == MT_FIRST)
err = zap_remove(mos, snapobj, name, tx);
+
+ if (err == 0 && adj_cnt)
+ dsl_fs_ss_count_adjust(ds->ds_dir, -1,
+ DD_FIELD_SNAPSHOT_COUNT, tx);
+
return (err);
}
@@ -758,6 +764,21 @@ dsl_dataset_create_sync(dsl_dir_t *pdd,
dsl_deleg_set_create_perms(dd, tx, cr);
+ /*
+ * Since we're creating a new node we know it's a leaf, so we can
+ * initialize the counts if the limit feature is active.
+ */
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
+ uint64_t cnt = 0;
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+
+ dsl_dir_zapify(dd, tx);
+ VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+ sizeof (cnt), 1, &cnt, tx));
+ VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+ sizeof (cnt), 1, &cnt, tx));
+ }
+
dsl_dir_rele(dd, FTAG);
/*
@@ -924,11 +945,12 @@ typedef struct dsl_dataset_snapshot_arg
nvlist_t *ddsa_snaps;
nvlist_t *ddsa_props;
nvlist_t *ddsa_errors;
+ cred_t *ddsa_cr;
} dsl_dataset_snapshot_arg_t;
int
dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
- dmu_tx_t *tx, boolean_t recv)
+ dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
{
int error;
uint64_t value;
@@ -966,6 +988,18 @@ dsl_dataset_snapshot_check_impl(dsl_data
if (!recv && DS_IS_INCONSISTENT(ds))
return (SET_ERROR(EBUSY));
+ /*
+ * Skip the check for temporary snapshots or if we have already checked
+ * the counts in dsl_dataset_snapshot_check. This means we really only
+ * check the count here when we're receiving a stream.
+ */
+ if (cnt != 0 && cr != NULL) {
+ error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
+ ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
+ if (error != 0)
+ return (error);
+ }
+
error = dsl_dataset_snapshot_reserve_space(ds, tx);
if (error != 0)
return (error);
@@ -981,6 +1015,99 @@ dsl_dataset_snapshot_check(void *arg, dm
nvpair_t *pair;
int rv = 0;
+ /*
+ * Pre-compute how many total new snapshots will be created for each
+ * level in the tree and below. This is needed for validating the
+ * snapshot limit when either taking a recursive snapshot or when
+ * taking multiple snapshots.
+ *
+ * The problem is that the counts are not actually adjusted when
+ * we are checking, only when we finally sync. For a single snapshot,
+ * this is easy, the count will increase by 1 at each node up the tree,
+ * but its more complicated for the recursive/multiple snapshot case.
+ *
+ * The dsl_fs_ss_limit_check function does recursively check the count
+ * at each level up the tree but since it is validating each snapshot
+ * independently we need to be sure that we are validating the complete
+ * count for the entire set of snapshots. We do this by rolling up the
+ * counts for each component of the name into an nvlist and then
+ * checking each of those cases with the aggregated count.
+ *
+ * This approach properly handles not only the recursive snapshot
+ * case (where we get all of those on the ddsa_snaps list) but also
+ * the sibling case (e.g. snapshot a/b and a/c so that we will also
+ * validate the limit on 'a' using a count of 2).
+ *
+ * We validate the snapshot names in the third loop and only report
+ * name errors once.
+ */
+ if (dmu_tx_is_syncing(tx)) {
+ nvlist_t *cnt_track = NULL;
+ cnt_track = fnvlist_alloc();
+
+ /* Rollup aggregated counts into the cnt_track list */
+ for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+ pair != NULL;
+ pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+ char *pdelim;
+ uint64_t val;
+ char nm[MAXPATHLEN];
+
+ (void) strlcpy(nm, nvpair_name(pair), sizeof (nm));
+ pdelim = strchr(nm, '@');
+ if (pdelim == NULL)
+ continue;
+ *pdelim = '\0';
+
+ do {
+ if (nvlist_lookup_uint64(cnt_track, nm,
+ &val) == 0) {
+ /* update existing entry */
+ fnvlist_add_uint64(cnt_track, nm,
+ val + 1);
+ } else {
+ /* add to list */
+ fnvlist_add_uint64(cnt_track, nm, 1);
+ }
+
+ pdelim = strrchr(nm, '/');
+ if (pdelim != NULL)
+ *pdelim = '\0';
+ } while (pdelim != NULL);
+ }
+
+ /* Check aggregated counts at each level */
+ for (pair = nvlist_next_nvpair(cnt_track, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
+ int error = 0;
+ char *name;
+ uint64_t cnt = 0;
+ dsl_dataset_t *ds;
+
+ name = nvpair_name(pair);
+ cnt = fnvpair_value_uint64(pair);
+ ASSERT(cnt > 0);
+
+ error = dsl_dataset_hold(dp, name, FTAG, &ds);
+ if (error == 0) {
+ error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
+ ZFS_PROP_SNAPSHOT_LIMIT, NULL,
+ ddsa->ddsa_cr);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+ if (error != 0) {
+ if (ddsa->ddsa_errors != NULL)
+ fnvlist_add_int32(ddsa->ddsa_errors,
+ name, error);
+ rv = error;
+ /* only report one error for this check */
+ break;
+ }
+ }
+ nvlist_free(cnt_track);
+ }
+
for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
int error = 0;
@@ -1001,8 +1128,9 @@ dsl_dataset_snapshot_check(void *arg, dm
if (error == 0)
error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
if (error == 0) {
+ /* passing 0/NULL skips dsl_fs_ss_limit_check */
error = dsl_dataset_snapshot_check_impl(ds,
- atp + 1, tx, B_FALSE);
+ atp + 1, tx, B_FALSE, 0, NULL);
dsl_dataset_rele(ds, FTAG);
}
@@ -1014,6 +1142,7 @@ dsl_dataset_snapshot_check(void *arg, dm
rv = error;
}
}
+
return (rv);
}
@@ -1041,6 +1170,7 @@ dsl_dataset_snapshot_sync_impl(dsl_datas
bcmp(&os->os_phys->os_zil_header, &zero_zil,
sizeof (zero_zil)) == 0);
+ dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);
/*
* The origin's ds_creation_txg has to be < TXG_INITIAL
@@ -1217,6 +1347,7 @@ dsl_dataset_snapshot(nvlist_t *snaps, nv
ddsa.ddsa_snaps = snaps;
ddsa.ddsa_props = props;
ddsa.ddsa_errors = errors;
+ ddsa.ddsa_cr = CRED();
if (error == 0) {
error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
@@ -1255,8 +1386,9 @@ dsl_dataset_snapshot_tmp_check(void *arg
if (error != 0)
return (error);
+ /* NULL cred means no limit check for tmp snapshot */
error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
- tx, B_FALSE);
+ tx, B_FALSE, 0, NULL);
if (error != 0) {
dsl_dataset_rele(ds, FTAG);
return (error);
@@ -1624,7 +1756,8 @@ dsl_dataset_rename_snapshot_sync_impl(ds
spa_history_log_internal_ds(ds, "rename", tx,
"-> @%s", ddrsa->ddrsa_newsnapname);
- VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx));
+ VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
+ B_FALSE));
mutex_enter(&ds->ds_lock);
(void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
mutex_exit(&ds->ds_lock);
@@ -1856,6 +1989,7 @@ typedef struct dsl_dataset_promote_arg {
dsl_dataset_t *origin_origin; /* origin of the origin */
uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
char *err_ds;
+ cred_t *cr;
} dsl_dataset_promote_arg_t;
static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
@@ -1873,6 +2007,7 @@ dsl_dataset_promote_check(void *arg, dmu
dsl_dataset_t *origin_ds;
int err;
uint64_t unused;
+ uint64_t ss_mv_cnt;
err = promote_hold(ddpa, dp, FTAG);
if (err != 0)
@@ -1919,6 +2054,7 @@ dsl_dataset_promote_check(void *arg, dmu
* Note however, if we stop before we reach the ORIGIN we get:
* uN + kN + kN-1 + ... + kM - uM-1
*/
+ ss_mv_cnt = 0;
ddpa->used = origin_ds->ds_phys->ds_referenced_bytes;
ddpa->comp = origin_ds->ds_phys->ds_compressed_bytes;
ddpa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
@@ -1927,6 +2063,8 @@ dsl_dataset_promote_check(void *arg, dmu
uint64_t val, dlused, dlcomp, dluncomp;
dsl_dataset_t *ds = snap->ds;
+ ss_mv_cnt++;
+
/*
* If there are long holds, we won't be able to evict
* the objset.
@@ -1969,9 +2107,9 @@ dsl_dataset_promote_check(void *arg, dmu
ddpa->origin_origin->ds_phys->ds_uncompressed_bytes;
}
- /* Check that there is enough space here */
+ /* Check that there is enough space and limit headroom here */
err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
- ddpa->used);
+ 0, ss_mv_cnt, ddpa->used, ddpa->cr);
if (err != 0)
goto out;
@@ -2111,10 +2249,12 @@ dsl_dataset_promote_sync(void *arg, dmu_
/* move snap name entry */
VERIFY0(dsl_dataset_get_snapname(ds));
VERIFY0(dsl_dataset_snap_remove(origin_head,
- ds->ds_snapname, tx));
+ ds->ds_snapname, tx, B_TRUE));
VERIFY0(zap_add(dp->dp_meta_objset,
hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
8, 1, &ds->ds_object, tx));
+ dsl_fs_ss_count_adjust(hds->ds_dir, 1,
+ DD_FIELD_SNAPSHOT_COUNT, tx);
/* change containing dsl_dir */
dmu_buf_will_dirty(ds->ds_dbuf, tx);
@@ -2352,6 +2492,7 @@ dsl_dataset_promote(const char *name, ch
ddpa.ddpa_clonename = name;
ddpa.err_ds = conflsnap;
+ ddpa.cr = CRED();
return (dsl_sync_task(name, dsl_dataset_promote_check,
dsl_dataset_promote_sync, &ddpa, 2 + numsnaps));
Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_destroy.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_destroy.c Wed Apr 23 18:36:32 2014 (r264828)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_destroy.c Wed Apr 23 19:09:14 2014 (r264829)
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2013 by Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -430,7 +431,7 @@ dsl_destroy_snapshot_sync_impl(dsl_datas
ASSERT3U(val, ==, obj);
}
#endif
- VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx));
+ VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE));
dsl_dataset_rele(ds_head, FTAG);
if (ds_prev != NULL)
@@ -657,6 +658,17 @@ dsl_dir_destroy_sync(uint64_t ddobj, dmu
ASSERT0(dd->dd_phys->dd_head_dataset_obj);
/*
+ * Decrement the filesystem count for all parent filesystems.
+ *
+ * When we receive an incremental stream into a filesystem that already
+ * exists, a temporary clone is created. We never count this temporary
+ * clone, whose name begins with a '%'.
+ */
+ if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL)
+ dsl_fs_ss_count_adjust(dd->dd_parent, -1,
+ DD_FIELD_FILESYSTEM_COUNT, tx);
+
+ /*
* Remove our reservation. The impl() routine avoids setting the
* actual property, which would require the (already destroyed) ds.
*/
Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dir.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dir.c Wed Apr 23 18:36:32 2014 (r264828)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dir.c Wed Apr 23 19:09:14 2014 (r264829)
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2013 Martin Matuska. All rights reserved.
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
*/
#include <sys/dmu.h>
@@ -39,7 +40,86 @@
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/sunddi.h>
+#include <sys/zfeature.h>
+#include <sys/policy.h>
+#include <sys/zfs_znode.h>
#include "zfs_namecheck.h"
+#include "zfs_prop.h"
+
+/*
+ * Filesystem and Snapshot Limits
+ * ------------------------------
+ *
+ * These limits are used to restrict the number of filesystems and/or snapshots
+ * that can be created at a given level in the tree or below. A typical
+ * use-case is with a delegated dataset where the administrator wants to ensure
+ * that a user within the zone is not creating too many additional filesystems
+ * or snapshots, even though they're not exceeding their space quota.
+ *
+ * The filesystem and snapshot counts are stored as extensible properties. This
+ * capability is controlled by a feature flag and must be enabled to be used.
+ * Once enabled, the feature is not active until the first limit is set. At
+ * that point, future operations to create/destroy filesystems or snapshots
+ * will validate and update the counts.
+ *
+ * Because the count properties will not exist before the feature is active,
+ * the counts are updated when a limit is first set on an uninitialized
+ * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
+ * all of the nested filesystems/snapshots. Thus, a new leaf node has a
+ * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
+ * snapshot count properties on a node indicate uninitialized counts on that
+ * node.) When first setting a limit on an uninitialized node, the code starts
+ * at the filesystem with the new limit and descends into all sub-filesystems
+ * to add the count properties.
+ *
+ * In practice this is lightweight since a limit is typically set when the
+ * filesystem is created and thus has no children. Once valid, changing the
+ * limit value won't require a re-traversal since the counts are already valid.
+ * When recursively fixing the counts, if a node with a limit is encountered
+ * during the descent, the counts are known to be valid and there is no need to
+ * descend into that filesystem's children. The counts on filesystems above the
+ * one with the new limit will still be uninitialized, unless a limit is
+ * eventually set on one of those filesystems. The counts are always recursively
+ * updated when a limit is set on a dataset, unless there is already a limit.
+ * When a new limit value is set on a filesystem with an existing limit, it is
+ * possible for the new limit to be less than the current count at that level
+ * since a user who can change the limit is also allowed to exceed the limit.
+ *
+ * Once the feature is active, then whenever a filesystem or snapshot is
+ * created, the code recurses up the tree, validating the new count against the
+ * limit at each initialized level. In practice, most levels will not have a
+ * limit set. If there is a limit at any initialized level up the tree, the
+ * check must pass or the creation will fail. Likewise, when a filesystem or
+ * snapshot is destroyed, the counts are recursively adjusted all the way up
+ * the initizized nodes in the tree. Renaming a filesystem into different point
+ * in the tree will first validate, then update the counts on each branch up to
+ * the common ancestor. A receive will also validate the counts and then update
+ * them.
+ *
+ * An exception to the above behavior is that the limit is not enforced if the
+ * user has permission to modify the limit. This is primarily so that
+ * recursive snapshots in the global zone always work. We want to prevent a
+ * denial-of-service in which a lower level delegated dataset could max out its
+ * limit and thus block recursive snapshots from being taken in the global zone.
+ * Because of this, it is possible for the snapshot count to be over the limit
+ * and snapshots taken in the global zone could cause a lower level dataset to
+ * hit or exceed its limit. The administrator taking the global zone recursive
+ * snapshot should be aware of this side-effect and behave accordingly.
+ * For consistency, the filesystem limit is also not enforced if the user can
+ * modify the limit.
+ *
+ * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
+ * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in
+ * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
+ * dsl_dir_init_fs_ss_count().
+ *
+ * There is a special case when we receive a filesystem that already exists. In
+ * this case a temporary clone name of %X is created (see dmu_recv_begin). We
+ * never update the filesystem counts for temporary clones.
+ *
+ * Likewise, we do not update the snapshot counts for temporary snapshots,
+ * such as those created by zfs diff.
+ */
static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
@@ -378,6 +458,398 @@ dsl_dir_hold(dsl_pool_t *dp, const char
return (err);
}
+/*
+ * If the counts are already initialized for this filesystem and its
+ * descendants then do nothing, otherwise initialize the counts.
+ *
+ * The counts on this filesystem, and those below, may be uninitialized due to
+ * either the use of a pre-existing pool which did not support the
+ * filesystem/snapshot limit feature, or one in which the feature had not yet
+ * been enabled.
+ *
+ * Recursively descend the filesystem tree and update the filesystem/snapshot
+ * counts on each filesystem below, then update the cumulative count on the
+ * current filesystem. If the filesystem already has a count set on it,
+ * then we know that its counts, and the counts on the filesystems below it,
+ * are already correct, so we don't have to update this filesystem.
+ */
+static void
+dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+ uint64_t my_fs_cnt = 0;
+ uint64_t my_ss_cnt = 0;
+ dsl_pool_t *dp = dd->dd_pool;
+ objset_t *os = dp->dp_meta_objset;
+ zap_cursor_t *zc;
+ zap_attribute_t *za;
+ dsl_dataset_t *ds;
+
+ ASSERT(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
+ ASSERT(dsl_pool_config_held(dp));
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dsl_dir_zapify(dd, tx);
+
+ /*
+ * If the filesystem count has already been initialized then we
+ * don't need to recurse down any further.
+ */
+ if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
+ return;
+
+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ /* Iterate my child dirs */
+ for (zap_cursor_init(zc, os, dd->dd_phys->dd_child_dir_zapobj);
+ zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
+ dsl_dir_t *chld_dd;
+ uint64_t count;
+
+ VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
+ &chld_dd));
+
+ /*
+ * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and
+ * temporary datasets.
+ */
+ if (chld_dd->dd_myname[0] == '$' ||
+ chld_dd->dd_myname[0] == '%') {
+ dsl_dir_rele(chld_dd, FTAG);
+ continue;
+ }
+
+ my_fs_cnt++; /* count this child */
+
+ dsl_dir_init_fs_ss_count(chld_dd, tx);
+
+ VERIFY0(zap_lookup(os, chld_dd->dd_object,
+ DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
+ my_fs_cnt += count;
+ VERIFY0(zap_lookup(os, chld_dd->dd_object,
+ DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
+ my_ss_cnt += count;
+
+ dsl_dir_rele(chld_dd, FTAG);
+ }
+ zap_cursor_fini(zc);
+ /* Count my snapshots (we counted children's snapshots above) */
+ VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj, FTAG, &ds));
+
+ for (zap_cursor_init(zc, os, ds->ds_phys->ds_snapnames_zapobj);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+ /* Don't count temporary snapshots */
+ if (za->za_name[0] != '%')
+ my_ss_cnt++;
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+
+ kmem_free(zc, sizeof (zap_cursor_t));
+ kmem_free(za, sizeof (zap_attribute_t));
+
+ /* we're in a sync task, update counts */
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+ sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
+ VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+ sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
+}
+
+static int
+dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
+{
+ char *ddname = (char *)arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ dsl_dir_t *dd;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ dd = ds->ds_dir;
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
+ dsl_dir_is_zapified(dd) &&
+ zap_contains(dp->dp_meta_objset, dd->dd_object,
+ DD_FIELD_FILESYSTEM_COUNT) == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EALREADY));
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
+{
+ char *ddname = (char *)arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ spa_t *spa;
+
+ VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));
+
+ spa = dsl_dataset_get_spa(ds);
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
+ /*
+ * Since the feature was not active and we're now setting a
+ * limit, increment the feature-active counter so that the
+ * feature becomes active for the first time.
+ *
+ * We are already in a sync task so we can update the MOS.
+ */
+ spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
+ }
+
+ /*
+ * Since we are now setting a non-UINT64_MAX limit on the filesystem,
+ * we need to ensure the counts are correct. Descend down the tree from
+ * this point and update all of the counts to be accurate.
+ */
+ dsl_dir_init_fs_ss_count(ds->ds_dir, tx);
+
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * Make sure the feature is enabled and activate it if necessary.
+ * Since we're setting a limit, ensure the on-disk counts are valid.
+ * This is only called by the ioctl path when setting a limit value.
+ *
+ * We do not need to validate the new limit, since users who can change the
+ * limit are also allowed to exceed the limit.
+ */
+int
+dsl_dir_activate_fs_ss_limit(const char *ddname)
+{
+ int error;
+
+ error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
+ dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0);
+
+ if (error == EALREADY)
+ error = 0;
+
+ return (error);
+}
+
+/*
+ * Used to determine if the filesystem_limit or snapshot_limit should be
+ * enforced. We allow the limit to be exceeded if the user has permission to
+ * write the property value. We pass in the creds that we got in the open
+ * context since we will always be the GZ root in syncing context. We also have
+ * to handle the case where we are allowed to change the limit on the current
+ * dataset, but there may be another limit in the tree above.
+ *
+ * We can never modify these two properties within a non-global zone. In
+ * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
+ * can't use that function since we are already holding the dp_config_rwlock.
+ * In addition, we already have the dd and dealing with snapshots is simplified
+ * in this code.
+ */
+
+typedef enum {
+ ENFORCE_ALWAYS,
+ ENFORCE_NEVER,
+ ENFORCE_ABOVE
+} enforce_res_t;
+
+static enforce_res_t
+dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr)
+{
+ enforce_res_t enforce = ENFORCE_ALWAYS;
+ uint64_t obj;
+ dsl_dataset_t *ds;
+ uint64_t zoned;
+
+ ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
+ prop == ZFS_PROP_SNAPSHOT_LIMIT);
+
+#ifdef _KERNEL
+ if (crgetzoneid(cr) != GLOBAL_ZONEID)
+ return (ENFORCE_ALWAYS);
+
+ if (secpolicy_zfs(cr) == 0)
+ return (ENFORCE_NEVER);
+#endif
+
+ if ((obj = dd->dd_phys->dd_head_dataset_obj) == 0)
+ return (ENFORCE_ALWAYS);
+
+ ASSERT(dsl_pool_config_held(dd->dd_pool));
+
+ if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
+ return (ENFORCE_ALWAYS);
+
+ if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) {
+ /* Only root can access zoned fs's from the GZ */
+ enforce = ENFORCE_ALWAYS;
+ } else {
+ if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
+ enforce = ENFORCE_ABOVE;
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (enforce);
+}
+
+/*
+ * Check if adding additional child filesystem(s) would exceed any filesystem
+ * limits or adding additional snapshot(s) would exceed any snapshot limits.
+ * The prop argument indicates which limit to check.
+ *
+ * Note that all filesystem limits up to the root (or the highest
+ * initialized) filesystem or the given ancestor must be satisfied.
+ */
+int
+dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
+ dsl_dir_t *ancestor, cred_t *cr)
+{
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ uint64_t limit, count;
+ char *count_prop;
+ enforce_res_t enforce;
+ int err = 0;
+
+ ASSERT(dsl_pool_config_held(dd->dd_pool));
+ ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
+ prop == ZFS_PROP_SNAPSHOT_LIMIT);
+
+ /*
+ * If we're allowed to change the limit, don't enforce the limit
+ * e.g. this can happen if a snapshot is taken by an administrative
+ * user in the global zone (i.e. a recursive snapshot by root).
+ * However, we must handle the case of delegated permissions where we
+ * are allowed to change the limit on the current dataset, but there
+ * is another limit in the tree above.
+ */
+ enforce = dsl_enforce_ds_ss_limits(dd, prop, cr);
+ if (enforce == ENFORCE_NEVER)
+ return (0);
+
+ /*
+ * e.g. if renaming a dataset with no snapshots, count adjustment
+ * is 0.
+ */
+ if (delta == 0)
+ return (0);
+
+ if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
+ /*
+ * We don't enforce the limit for temporary snapshots. This is
+ * indicated by a NULL cred_t argument.
+ */
+ if (cr == NULL)
+ return (0);
+
+ count_prop = DD_FIELD_SNAPSHOT_COUNT;
+ } else {
+ count_prop = DD_FIELD_FILESYSTEM_COUNT;
+ }
+
+ /*
+ * If an ancestor has been provided, stop checking the limit once we
+ * hit that dir. We need this during rename so that we don't overcount
+ * the check once we recurse up to the common ancestor.
+ */
+ if (ancestor == dd)
+ return (0);
+
+ /*
+ * If we hit an uninitialized node while recursing up the tree, we can
+ * stop since we know there is no limit here (or above). The counts are
+ * not valid on this node and we know we won't touch this node's counts.
+ */
+ if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object,
+ count_prop, sizeof (count), 1, &count) == ENOENT)
+ return (0);
+
+ err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
+ B_FALSE);
+ if (err != 0)
+ return (err);
+
+ /* Is there a limit which we've hit? */
+ if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
+ return (SET_ERROR(EDQUOT));
+
+ if (dd->dd_parent != NULL)
+ err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
+ ancestor, cr);
+
+ return (err);
+}
+
+/*
+ * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
+ * parents. When a new filesystem/snapshot is created, increment the count on
+ * all parents, and when a filesystem/snapshot is destroyed, decrement the
+ * count.
+ */
+void
+dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
+ dmu_tx_t *tx)
+{
+ int err;
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ uint64_t count;
+
+ ASSERT(dsl_pool_config_held(dd->dd_pool));
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
+ strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);
+
+ /*
+ * When we receive an incremental stream into a filesystem that already
+ * exists, a temporary clone is created. We don't count this temporary
+ * clone, whose name begins with a '%'. We also ignore hidden ($FREE,
+ * $MOS & $ORIGIN) objsets.
+ */
+ if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') &&
+ strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0)
+ return;
+
+ /*
+ * e.g. if renaming a dataset with no snapshots, count adjustment is 0
+ */
+ if (delta == 0)
+ return;
+
+ /*
+ * If we hit an uninitialized node while recursing up the tree, we can
+ * stop since we know the counts are not valid on this node and we
+ * know we shouldn't touch this node's counts. An uninitialized count
+ * on the node indicates that either the feature has not yet been
+ * activated or there are no limits on this part of the tree.
+ */
+ if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
+ prop, sizeof (count), 1, &count)) == ENOENT)
+ return;
+ VERIFY0(err);
+
+ count += delta;
+ /* Use a signed verify to make sure we're not neg. */
+ VERIFY3S(count, >=, 0);
+
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-all
mailing list