svn commit: r260766 - in stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys
Andriy Gapon
avg at FreeBSD.org
Thu Jan 16 16:04:21 UTC 2014
Author: avg
Date: Thu Jan 16 16:04:20 2014
New Revision: 260766
URL: http://svnweb.freebsd.org/changeset/base/260766
Log:
MFC r258633: MFV r255256: 3954 metaslabs continue to load even after
hitting zfs_mg_alloc_failure limit
Modified:
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
Directory Properties:
stable/9/sys/ (props changed)
stable/9/sys/cddl/contrib/opensolaris/ (props changed)
Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c Thu Jan 16 16:00:05 2014 (r260765)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c Thu Jan 16 16:04:20 2014 (r260766)
@@ -58,7 +58,8 @@ int zfs_condense_pct = 200;
/*
* This value defines the number of allowed allocation failures per vdev.
* If a device reaches this threshold in a given txg then we consider skipping
- * allocations on that device.
+ * allocations on that device. The value of zfs_mg_alloc_failures is computed
+ * in zio_init() unless it has been overridden in /etc/system.
*/
int zfs_mg_alloc_failures = 0;
@@ -69,6 +70,21 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_alloc_
TUNABLE_INT("vfs.zfs.mg_alloc_failures", &zfs_mg_alloc_failures);
/*
+ * The zfs_mg_noalloc_threshold defines which metaslab groups should
+ * be eligible for allocation. The value is defined as a percentage of
+ * free space. Metaslab groups that have more free space than
+ * zfs_mg_noalloc_threshold are always eligible for allocations. Once
+ * a metaslab group's free space is less than or equal to the
+ * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
+ * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
+ * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
+ * groups are allowed to accept allocations. Gang blocks are always
+ * eligible to allocate on any metaslab group. The default value of 0 means
+ * no metaslab group will be excluded based on this criterion.
+ */
+int zfs_mg_noalloc_threshold = 0;
+
+/*
* Metaslab debugging: when set, keeps all space maps in core to verify frees.
*/
static int metaslab_debug = 0;
@@ -234,6 +250,53 @@ metaslab_compare(const void *x1, const v
return (0);
}
+/*
+ * Update the allocatable flag and the metaslab group's capacity.
+ * The allocatable flag is set to true if the free capacity is above
+ * the zfs_mg_noalloc_threshold. If a metaslab group transitions
+ * from allocatable to non-allocatable or vice versa then the metaslab
+ * group's class is updated to reflect the transition.
+ */
+static void
+metaslab_group_alloc_update(metaslab_group_t *mg)
+{
+ vdev_t *vd = mg->mg_vd;
+ metaslab_class_t *mc = mg->mg_class;
+ vdev_stat_t *vs = &vd->vdev_stat;
+ boolean_t was_allocatable;
+
+ ASSERT(vd == vd->vdev_top);
+
+ mutex_enter(&mg->mg_lock);
+ was_allocatable = mg->mg_allocatable;
+
+ mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
+ (vs->vs_space + 1);
+
+ mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
+
+ /*
+ * The mc_alloc_groups maintains a count of the number of
+ * groups in this metaslab class that are still above the
+ * zfs_mg_noalloc_threshold. This is used by the allocating
+ * threads to determine if they should avoid allocations to
+ * a given group. The allocator will avoid allocations to a group
+ * if that group has reached or is below the zfs_mg_noalloc_threshold
+ * and there are still other groups that are above the threshold.
+ * When a group transitions from allocatable to non-allocatable or
+ * vice versa we update the metaslab class to reflect that change.
+ * When the mc_alloc_groups value drops to 0 that means that all
+ * groups have reached the zfs_mg_noalloc_threshold making all groups
+ * eligible for allocations. This effectively means that all devices
+ * are balanced again.
+ */
+ if (was_allocatable && !mg->mg_allocatable)
+ mc->mc_alloc_groups--;
+ else if (!was_allocatable && mg->mg_allocatable)
+ mc->mc_alloc_groups++;
+ mutex_exit(&mg->mg_lock);
+}
+
metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
@@ -284,6 +347,7 @@ metaslab_group_activate(metaslab_group_t
return;
mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
+ metaslab_group_alloc_update(mg);
if ((mgprev = mc->mc_rotor) == NULL) {
mg->mg_prev = mg;
@@ -369,6 +433,29 @@ metaslab_group_sort(metaslab_group_t *mg
}
/*
+ * Determine if a given metaslab group should skip allocations. A metaslab
+ * group should avoid allocations if its free capacity has fallen to or below
+ * the zfs_mg_noalloc_threshold and there is at least one metaslab group
+ * that can still handle allocations.
+ */
+static boolean_t
+metaslab_group_allocatable(metaslab_group_t *mg)
+{
+ vdev_t *vd = mg->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ metaslab_class_t *mc = mg->mg_class;
+
+ /*
+ * A metaslab group is considered allocatable if its free capacity
+ * is greater than the set value of zfs_mg_noalloc_threshold, it's
+ * associated with a slog, or there are no other metaslab groups
+ * with free capacity greater than zfs_mg_noalloc_threshold.
+ */
+ return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
+ mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
+}
+
+/*
* ==========================================================================
* Common allocator routines
* ==========================================================================
@@ -1317,6 +1404,8 @@ metaslab_sync_reassess(metaslab_group_t
vdev_t *vd = mg->mg_vd;
int64_t failures = mg->mg_alloc_failures;
+ metaslab_group_alloc_update(mg);
+
/*
* Re-evaluate all metaslabs which have lower offsets than the
* bonus area.
@@ -1418,6 +1507,8 @@ metaslab_group_alloc(metaslab_group_t *m
if (msp == NULL)
return (-1ULL);
+ mutex_enter(&msp->ms_lock);
+
/*
* If we've already reached the allowable number of failed
* allocation attempts on this metaslab group then we
@@ -1434,11 +1525,10 @@ metaslab_group_alloc(metaslab_group_t *m
"asize %llu, failures %llu", spa_name(spa),
mg->mg_vd->vdev_id, txg, mg, psize, asize,
mg->mg_alloc_failures);
+ mutex_exit(&msp->ms_lock);
return (-1ULL);
}
- mutex_enter(&msp->ms_lock);
-
/*
* Ensure that the metaslab we have selected is still
* capable of handling our request. It's possible that
@@ -1591,6 +1681,21 @@ top:
} else {
allocatable = vdev_allocatable(vd);
}
+
+ /*
+ * Determine if the selected metaslab group is eligible
+ * for allocations. If we're ganging or have requested
+ * an allocation for the smallest gang block size
+ * then we don't want to avoid allocating to this
+ * metaslab group. If we're in this condition we should
+ * try to allocate from any device possible so that we
+ * don't inadvertently return ENOSPC and suspend the pool
+ * even though space is still available.
+ */
+ if (allocatable && CAN_FASTGANG(flags) &&
+ psize > SPA_GANGBLOCKSIZE)
+ allocatable = metaslab_group_allocatable(mg);
+
if (!allocatable)
goto next;
Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h Thu Jan 16 16:00:05 2014 (r260765)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h Thu Jan 16 16:04:20 2014 (r260766)
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_IMPL_H
@@ -45,6 +45,7 @@ struct metaslab_class {
metaslab_group_t *mc_rotor;
space_map_ops_t *mc_ops;
uint64_t mc_aliquot;
+ uint64_t mc_alloc_groups; /* # of allocatable groups */
uint64_t mc_alloc; /* total allocated space */
uint64_t mc_deferred; /* total deferred frees */
uint64_t mc_space; /* total space (alloc + free) */
@@ -57,6 +58,8 @@ struct metaslab_group {
uint64_t mg_aliquot;
uint64_t mg_bonus_area;
uint64_t mg_alloc_failures;
+ boolean_t mg_allocatable; /* can we allocate? */
+ uint64_t mg_free_capacity; /* percentage free */
int64_t mg_bias;
int64_t mg_activation_count;
metaslab_class_t *mg_class;
Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c Thu Jan 16 16:00:05 2014 (r260765)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c Thu Jan 16 16:04:20 2014 (r260766)
@@ -2418,7 +2418,7 @@ zio_alloc_zil(spa_t *spa, uint64_t txg,
if (error) {
error = metaslab_alloc(spa, spa_normal_class(spa), size,
new_bp, 1, txg, old_bp,
- METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
+ METASLAB_HINTBP_AVOID);
}
if (error == 0) {
More information about the svn-src-stable-9
mailing list