svn commit: r277431 - vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor/illumos/dist/lib/libzpool/common/sys
Xin LI
delphij at FreeBSD.org
Tue Jan 20 20:17:31 UTC 2015
Author: delphij
Date: Tue Jan 20 20:17:29 2015
New Revision: 277431
URL: https://svnweb.freebsd.org/changeset/base/277431
Log:
5497 lock contention on arcs_mtx
Reviewed by: George Wilson <george.wilson at delphix.com>
Reviewed by: Matthew Ahrens <mahrens at delphix.com>
Reviewed by: Richard Elling <richard.elling at richardelling.com>
Approved by: Dan McDonald <danmcd at omniti.com>
Author: Prakash Surya <prakash.surya at delphix.com>
illumos/illumos-gate@244781f10dcd82684fd8163c016540667842f203
Added:
vendor-sys/illumos/dist/uts/common/fs/zfs/multilist.c (contents, props changed)
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/multilist.h (contents, props changed)
Modified:
vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_pool.c
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/arc.h
vendor-sys/illumos/dist/uts/common/fs/zfs/zio_inject.c
Changes in other areas also in this revision:
Modified:
vendor/illumos/dist/lib/libzpool/common/sys/zfs_context.h
Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c Tue Jan 20 20:14:50 2015 (r277430)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c Tue Jan 20 20:17:29 2015 (r277431)
@@ -129,6 +129,7 @@
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
+#include <sys/multilist.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
@@ -145,21 +146,39 @@ boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif
-static kmutex_t arc_reclaim_thr_lock;
-static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
-static uint8_t arc_thread_exit;
+static kmutex_t arc_reclaim_lock;
+static kcondvar_t arc_reclaim_thread_cv;
+static boolean_t arc_reclaim_thread_exit;
+static kcondvar_t arc_reclaim_waiters_cv;
+
+static kmutex_t arc_user_evicts_lock;
+static kcondvar_t arc_user_evicts_cv;
+static boolean_t arc_user_evicts_thread_exit;
uint_t arc_reduce_dnlc_percent = 3;
/*
- * The number of iterations through arc_evict_*() before we
- * drop & reacquire the lock.
+ * The number of headers to evict in arc_evict_state_impl() before
+ * dropping the sublist lock and evicting from another sublist. A lower
+ * value means we're more likely to evict the "correct" header (i.e. the
+ * oldest header in the arc state), but comes with higher overhead
+ * (i.e. more invocations of arc_evict_state_impl()).
*/
-int arc_evict_iterations = 100;
+int zfs_arc_evict_batch_limit = 10;
+
+/*
+ * The number of sublists used for each of the arc state lists. If this
+ * is not set to a suitable value by the user, it will be configured to
+ * the number of CPUs on the system in arc_init().
+ */
+int zfs_arc_num_sublists_per_state = 0;
/* number of seconds before growing cache again */
static int arc_grow_retry = 60;
+/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
+int zfs_arc_overflow_shift = 8;
+
/* shift of arc_c for calculating both min and max arc_p */
static int arc_p_min_shift = 4;
@@ -242,10 +261,19 @@ int zfs_arc_average_blocksize = 8 * 1024
*/
typedef struct arc_state {
- list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
- uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
- uint64_t arcs_size; /* total amount of data in this state */
- kmutex_t arcs_mtx;
+ /*
+ * list of evictable buffers
+ */
+ multilist_t arcs_list[ARC_BUFC_NUMTYPES];
+ /*
+ * total amount of evictable data in this state
+ */
+ uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
+ /*
+ * total amount of data in this state; this includes: evictable,
+ * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
+ */
+ uint64_t arcs_size;
} arc_state_t;
/* The 6 states: */
@@ -272,7 +300,6 @@ typedef struct arc_stats {
kstat_named_t arcstat_mfu_hits;
kstat_named_t arcstat_mfu_ghost_hits;
kstat_named_t arcstat_deleted;
- kstat_named_t arcstat_recycle_miss;
/*
* Number of buffers that could not be evicted because the hash lock
* was held by another thread. The lock may not necessarily be held
@@ -286,9 +313,15 @@ typedef struct arc_stats {
* not from the spa we're trying to evict from.
*/
kstat_named_t arcstat_evict_skip;
+ /*
+ * Number of times arc_evict_state() was unable to evict enough
+ * buffers to reach its target amount.
+ */
+ kstat_named_t arcstat_evict_not_enough;
kstat_named_t arcstat_evict_l2_cached;
kstat_named_t arcstat_evict_l2_eligible;
kstat_named_t arcstat_evict_l2_ineligible;
+ kstat_named_t arcstat_evict_l2_skip;
kstat_named_t arcstat_hash_elements;
kstat_named_t arcstat_hash_elements_max;
kstat_named_t arcstat_hash_collisions;
@@ -439,11 +472,12 @@ typedef struct arc_stats {
kstat_named_t arcstat_l2_writes_sent;
kstat_named_t arcstat_l2_writes_done;
kstat_named_t arcstat_l2_writes_error;
- kstat_named_t arcstat_l2_writes_hdr_miss;
+ kstat_named_t arcstat_l2_writes_lock_retry;
kstat_named_t arcstat_l2_evict_lock_retry;
kstat_named_t arcstat_l2_evict_reading;
kstat_named_t arcstat_l2_evict_l1cached;
kstat_named_t arcstat_l2_free_on_write;
+ kstat_named_t arcstat_l2_cdata_free_on_write;
kstat_named_t arcstat_l2_abort_lowmem;
kstat_named_t arcstat_l2_cksum_bad;
kstat_named_t arcstat_l2_io_error;
@@ -479,12 +513,13 @@ static arc_stats_t arc_stats = {
{ "mfu_hits", KSTAT_DATA_UINT64 },
{ "mfu_ghost_hits", KSTAT_DATA_UINT64 },
{ "deleted", KSTAT_DATA_UINT64 },
- { "recycle_miss", KSTAT_DATA_UINT64 },
{ "mutex_miss", KSTAT_DATA_UINT64 },
{ "evict_skip", KSTAT_DATA_UINT64 },
+ { "evict_not_enough", KSTAT_DATA_UINT64 },
{ "evict_l2_cached", KSTAT_DATA_UINT64 },
{ "evict_l2_eligible", KSTAT_DATA_UINT64 },
{ "evict_l2_ineligible", KSTAT_DATA_UINT64 },
+ { "evict_l2_skip", KSTAT_DATA_UINT64 },
{ "hash_elements", KSTAT_DATA_UINT64 },
{ "hash_elements_max", KSTAT_DATA_UINT64 },
{ "hash_collisions", KSTAT_DATA_UINT64 },
@@ -523,11 +558,12 @@ static arc_stats_t arc_stats = {
{ "l2_writes_sent", KSTAT_DATA_UINT64 },
{ "l2_writes_done", KSTAT_DATA_UINT64 },
{ "l2_writes_error", KSTAT_DATA_UINT64 },
- { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
+ { "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
{ "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
{ "l2_evict_reading", KSTAT_DATA_UINT64 },
{ "l2_evict_l1cached", KSTAT_DATA_UINT64 },
{ "l2_free_on_write", KSTAT_DATA_UINT64 },
+ { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 },
{ "l2_abort_lowmem", KSTAT_DATA_UINT64 },
{ "l2_cksum_bad", KSTAT_DATA_UINT64 },
{ "l2_io_error", KSTAT_DATA_UINT64 },
@@ -687,7 +723,7 @@ typedef struct l1arc_buf_hdr {
/* protected by arc state mutex */
arc_state_t *b_state;
- list_node_t b_arc_node;
+ multilist_node_t b_arc_node;
/* updated atomically */
clock_t b_arc_access;
@@ -738,7 +774,6 @@ struct arc_buf_hdr {
};
static arc_buf_t *arc_eviction_list;
-static kmutex_t arc_eviction_mtx;
static arc_buf_hdr_t arc_eviction_hdr;
#define GHOST_STATE(state) \
@@ -897,8 +932,7 @@ static uint8_t l2arc_thread_exit;
static void arc_get_data_buf(arc_buf_t *);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
-static int arc_evict_needed(arc_buf_contents_t);
-static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t);
+static boolean_t arc_is_overflowing();
static void arc_buf_watch(arc_buf_t *);
static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
@@ -1079,6 +1113,7 @@ hdr_full_cons(void *vbuf, void *unused,
cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
refcount_create(&hdr->b_l1hdr.b_refcnt);
mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+ multilist_link_init(&hdr->b_l1hdr.b_arc_node);
arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
return (0);
@@ -1123,6 +1158,7 @@ hdr_full_dest(void *vbuf, void *unused)
cv_destroy(&hdr->b_l1hdr.b_cv);
refcount_destroy(&hdr->b_l1hdr.b_refcnt);
mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
}
@@ -1159,7 +1195,7 @@ hdr_recl(void *unused)
* which is after we do arc_fini().
*/
if (!arc_dead)
- cv_signal(&arc_reclaim_thr_cv);
+ cv_signal(&arc_reclaim_thread_cv);
}
static void
@@ -1237,18 +1273,31 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem
* l2c_only even though it's about to change.
*/
nhdr->b_l1hdr.b_state = arc_l2c_only;
+
+ /* Verify previous threads set to NULL before freeing */
+ ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
} else {
ASSERT(hdr->b_l1hdr.b_buf == NULL);
ASSERT0(hdr->b_l1hdr.b_datacnt);
- ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+
/*
- * We might be removing the L1hdr of a buffer which was just
- * written out to L2ARC. If such a buffer is compressed then we
- * need to free its b_tmp_cdata before destroying the header.
- */
- if (hdr->b_l1hdr.b_tmp_cdata != NULL &&
- HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
- l2arc_release_cdata_buf(hdr);
+ * If we've reached here, we must have been called from
+ * arc_evict_hdr(), as such we should have already been
+ * removed from any ghost list we were previously on
+ * (which protects us from racing with arc_evict_state),
+ * thus no locking is needed during this check.
+ */
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+
+ /*
+ * A buffer must not be moved into the arc_l2c_only
+ * state if it's not finished being written out to the
+ * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
+ * might try to be accessed, even though it was removed.
+ */
+ VERIFY(!HDR_L2_WRITING(hdr));
+ VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+
nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
}
/*
@@ -1461,14 +1510,13 @@ add_reference(arc_buf_hdr_t *hdr, kmutex
(state != arc_anon)) {
/* We don't use the L2-only state list. */
if (state != arc_l2c_only) {
+ arc_buf_contents_t type = arc_buf_type(hdr);
uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
- list_t *list = &state->arcs_list[arc_buf_type(hdr)];
- uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
+ multilist_t *list = &state->arcs_list[type];
+ uint64_t *size = &state->arcs_lsize[type];
+
+ multilist_remove(list, hdr);
- ASSERT(!MUTEX_HELD(&state->arcs_mtx));
- mutex_enter(&state->arcs_mtx);
- ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
- list_remove(list, hdr);
if (GHOST_STATE(state)) {
ASSERT0(hdr->b_l1hdr.b_datacnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
@@ -1477,7 +1525,6 @@ add_reference(arc_buf_hdr_t *hdr, kmutex
ASSERT(delta > 0);
ASSERT3U(*size, >=, delta);
atomic_add_64(size, -delta);
- mutex_exit(&state->arcs_mtx);
}
/* remove the prefetch flag if we get a reference */
hdr->b_flags &= ~ARC_FLAG_PREFETCH;
@@ -1500,22 +1547,21 @@ remove_reference(arc_buf_hdr_t *hdr, kmu
*/
if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
(state != arc_anon)) {
- uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
+ arc_buf_contents_t type = arc_buf_type(hdr);
+ multilist_t *list = &state->arcs_list[type];
+ uint64_t *size = &state->arcs_lsize[type];
+
+ multilist_insert(list, hdr);
- ASSERT(!MUTEX_HELD(&state->arcs_mtx));
- mutex_enter(&state->arcs_mtx);
- ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
- list_insert_head(&state->arcs_list[arc_buf_type(hdr)], hdr);
ASSERT(hdr->b_l1hdr.b_datacnt > 0);
atomic_add_64(size, hdr->b_size *
hdr->b_l1hdr.b_datacnt);
- mutex_exit(&state->arcs_mtx);
}
return (cnt);
}
/*
- * Move the supplied buffer to the indicated state. The mutex
+ * Move the supplied buffer to the indicated state. The hash lock
* for the buffer must be held by the caller.
*/
static void
@@ -1559,15 +1605,10 @@ arc_change_state(arc_state_t *new_state,
*/
if (refcnt == 0) {
if (old_state != arc_anon && old_state != arc_l2c_only) {
- int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
uint64_t *size = &old_state->arcs_lsize[buftype];
- if (use_mutex)
- mutex_enter(&old_state->arcs_mtx);
-
ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
- list_remove(&old_state->arcs_list[buftype], hdr);
+ multilist_remove(&old_state->arcs_list[buftype], hdr);
/*
* If prefetching out of the ghost cache,
@@ -1580,12 +1621,8 @@ arc_change_state(arc_state_t *new_state,
}
ASSERT3U(*size, >=, from_delta);
atomic_add_64(size, -from_delta);
-
- if (use_mutex)
- mutex_exit(&old_state->arcs_mtx);
}
if (new_state != arc_anon && new_state != arc_l2c_only) {
- int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
uint64_t *size = &new_state->arcs_lsize[buftype];
/*
@@ -1595,10 +1632,7 @@ arc_change_state(arc_state_t *new_state,
* beforehand.
*/
ASSERT(HDR_HAS_L1HDR(hdr));
- if (use_mutex)
- mutex_enter(&new_state->arcs_mtx);
-
- list_insert_head(&new_state->arcs_list[buftype], hdr);
+ multilist_insert(&new_state->arcs_list[buftype], hdr);
/* ghost elements have a ghost size */
if (GHOST_STATE(new_state)) {
@@ -1607,9 +1641,6 @@ arc_change_state(arc_state_t *new_state,
to_delta = hdr->b_size;
}
atomic_add_64(size, to_delta);
-
- if (use_mutex)
- mutex_exit(&new_state->arcs_mtx);
}
}
@@ -1631,8 +1662,8 @@ arc_change_state(arc_state_t *new_state,
* L2 headers should never be on the L2 state list since they don't
* have L1 headers allocated.
*/
- ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
- list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
+ ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
+ multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
}
void
@@ -1725,6 +1756,7 @@ arc_buf_alloc(spa_t *spa, int32_t size,
hdr->b_l1hdr.b_state = arc_anon;
hdr->b_l1hdr.b_arc_access = 0;
hdr->b_l1hdr.b_datacnt = 1;
+ hdr->b_l1hdr.b_tmp_cdata = NULL;
arc_get_data_buf(buf);
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
@@ -1854,6 +1886,21 @@ arc_buf_add_ref(arc_buf_t *buf, void* ta
data, metadata, hits);
}
+static void
+arc_buf_free_on_write(void *data, size_t size,
+ void (*free_func)(void *, size_t))
+{
+ l2arc_data_free_t *df;
+
+ df = kmem_alloc(sizeof (*df), KM_SLEEP);
+ df->l2df_data = data;
+ df->l2df_size = size;
+ df->l2df_func = free_func;
+ mutex_enter(&l2arc_free_on_write_mtx);
+ list_insert_head(l2arc_free_on_write, df);
+ mutex_exit(&l2arc_free_on_write_mtx);
+}
+
/*
* Free the arc data buffer. If it is an l2arc write in progress,
* the buffer is placed on l2arc_free_on_write to be freed later.
@@ -1864,26 +1911,74 @@ arc_buf_data_free(arc_buf_t *buf, void (
arc_buf_hdr_t *hdr = buf->b_hdr;
if (HDR_L2_WRITING(hdr)) {
- l2arc_data_free_t *df;
- df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
- df->l2df_data = buf->b_data;
- df->l2df_size = hdr->b_size;
- df->l2df_func = free_func;
- mutex_enter(&l2arc_free_on_write_mtx);
- list_insert_head(l2arc_free_on_write, df);
- mutex_exit(&l2arc_free_on_write_mtx);
+ arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
ARCSTAT_BUMP(arcstat_l2_free_on_write);
} else {
free_func(buf->b_data, hdr->b_size);
}
}
+static void
+arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
+{
+ ASSERT(HDR_HAS_L2HDR(hdr));
+ ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
+
+ /*
+ * The b_tmp_cdata field is linked off of the b_l1hdr, so if
+ * that doesn't exist, the header is in the arc_l2c_only state,
+ * and there isn't anything to free (it's already been freed).
+ */
+ if (!HDR_HAS_L1HDR(hdr))
+ return;
+
+ /*
+ * The header isn't being written to the l2arc device, thus it
+ * shouldn't have a b_tmp_cdata to free.
+ */
+ if (!HDR_L2_WRITING(hdr)) {
+ ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+ return;
+ }
+
+ /*
+ * The header does not have compression enabled. This can be due
+ * to the buffer not being compressible, or because we're
+ * freeing the buffer before the second phase of
+ * l2arc_write_buffers() has started (which does the compression
+ * step). In either case, b_tmp_cdata does not point to a
+ * separately compressed buffer, so there's nothing to free (it
+ * points to the same buffer as the arc_buf_t's b_data field).
+ */
+ if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
+ hdr->b_l1hdr.b_tmp_cdata = NULL;
+ return;
+ }
+
+ /*
+ * There's nothing to free since the buffer was all zeros and
+ * compressed to a zero length buffer.
+ */
+ if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_EMPTY) {
+ ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+ return;
+ }
+
+ ASSERT(L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)));
+
+ arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
+ hdr->b_size, zio_data_buf_free);
+
+ ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
+ hdr->b_l1hdr.b_tmp_cdata = NULL;
+}
+
/*
* Free up buf->b_data and if 'remove' is set, then pull the
* arc_buf_t off of the arc_buf_hdr_t's list and free it.
*/
static void
-arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
+arc_buf_destroy(arc_buf_t *buf, boolean_t remove)
{
arc_buf_t **bufp;
@@ -1896,17 +1991,17 @@ arc_buf_destroy(arc_buf_t *buf, boolean_
arc_cksum_verify(buf);
arc_buf_unwatch(buf);
- if (!recycle) {
- if (type == ARC_BUFC_METADATA) {
- arc_buf_data_free(buf, zio_buf_free);
- arc_space_return(size, ARC_SPACE_META);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- arc_buf_data_free(buf, zio_data_buf_free);
- arc_space_return(size, ARC_SPACE_DATA);
- }
+ if (type == ARC_BUFC_METADATA) {
+ arc_buf_data_free(buf, zio_buf_free);
+ arc_space_return(size, ARC_SPACE_META);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ arc_buf_data_free(buf, zio_data_buf_free);
+ arc_space_return(size, ARC_SPACE_DATA);
}
- if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
+
+ /* protected by hash lock, if in the hash table */
+ if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
uint64_t *cnt = &state->arcs_lsize[type];
ASSERT(refcount_is_zero(
@@ -1974,6 +2069,12 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
list_remove(&l2hdr->b_dev->l2ad_buflist, hdr);
+ /*
+ * We don't want to leak the b_tmp_cdata buffer that was
+ * allocated in l2arc_write_buffers()
+ */
+ arc_buf_l2_cdata_free(hdr);
+
ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
@@ -1996,20 +2097,19 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
arc_buf_t *buf = hdr->b_l1hdr.b_buf;
if (buf->b_efunc != NULL) {
- mutex_enter(&arc_eviction_mtx);
+ mutex_enter(&arc_user_evicts_lock);
mutex_enter(&buf->b_evict_lock);
ASSERT(buf->b_hdr != NULL);
- arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
- FALSE);
+ arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE);
hdr->b_l1hdr.b_buf = buf->b_next;
buf->b_hdr = &arc_eviction_hdr;
buf->b_next = arc_eviction_list;
arc_eviction_list = buf;
mutex_exit(&buf->b_evict_lock);
- mutex_exit(&arc_eviction_mtx);
+ cv_signal(&arc_user_evicts_cv);
+ mutex_exit(&arc_user_evicts_lock);
} else {
- arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
- TRUE);
+ arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE);
}
}
#ifdef ZFS_DEBUG
@@ -2022,7 +2122,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
ASSERT3P(hdr->b_hash_next, ==, NULL);
if (HDR_HAS_L1HDR(hdr)) {
- ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
kmem_cache_free(hdr_full_cache, hdr);
} else {
@@ -2048,7 +2148,7 @@ arc_buf_free(arc_buf_t *buf, void *tag)
(void) remove_reference(hdr, hash_lock, tag);
if (hdr->b_l1hdr.b_datacnt > 1) {
- arc_buf_destroy(buf, FALSE, TRUE);
+ arc_buf_destroy(buf, TRUE);
} else {
ASSERT(buf == hdr->b_l1hdr.b_buf);
ASSERT(buf->b_efunc == NULL);
@@ -2062,16 +2162,16 @@ arc_buf_free(arc_buf_t *buf, void *tag)
* this buffer unless the write completes before we finish
* decrementing the reference count.
*/
- mutex_enter(&arc_eviction_mtx);
+ mutex_enter(&arc_user_evicts_lock);
(void) remove_reference(hdr, NULL, tag);
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
- mutex_exit(&arc_eviction_mtx);
+ mutex_exit(&arc_user_evicts_lock);
if (destroy_hdr)
arc_hdr_destroy(hdr);
} else {
if (remove_reference(hdr, NULL, tag) > 0)
- arc_buf_destroy(buf, FALSE, TRUE);
+ arc_buf_destroy(buf, TRUE);
else
arc_hdr_destroy(hdr);
}
@@ -2100,7 +2200,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void*
(void) remove_reference(hdr, hash_lock, tag);
if (hdr->b_l1hdr.b_datacnt > 1) {
if (no_callback)
- arc_buf_destroy(buf, FALSE, TRUE);
+ arc_buf_destroy(buf, TRUE);
} else if (no_callback) {
ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
ASSERT(buf->b_efunc == NULL);
@@ -2161,418 +2261,675 @@ arc_buf_eviction_needed(arc_buf_t *buf)
}
/*
- * Evict buffers from list until we've removed the specified number of
- * bytes. Move the removed buffers to the appropriate evict state.
- * If the recycle flag is set, then attempt to "recycle" a buffer:
- * - look for a buffer to evict that is `bytes' long.
- * - return the data block from this buffer rather than freeing it.
- * This flag is used by callers that are trying to make space for a
- * new buffer in a full arc cache.
+ * Evict the arc_buf_hdr that is provided as a parameter. The resultant
+ * state of the header is dependent on its state prior to entering this
+ * function. The following transitions are possible:
*
- * This function makes a "best effort". It skips over any buffers
- * it can't get a hash_lock on, and so may not catch all candidates.
- * It may also return without evicting as much space as requested.
+ * - arc_mru -> arc_mru_ghost
+ * - arc_mfu -> arc_mfu_ghost
+ * - arc_mru_ghost -> arc_l2c_only
+ * - arc_mru_ghost -> deleted
+ * - arc_mfu_ghost -> arc_l2c_only
+ * - arc_mfu_ghost -> deleted
*/
-static void *
-arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
- arc_buf_contents_t type)
+static int64_t
+arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
{
- arc_state_t *evicted_state;
- uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
- arc_buf_hdr_t *hdr, *hdr_prev = NULL;
- kmutex_t *hash_lock;
- boolean_t have_lock;
- void *stolen = NULL;
- arc_buf_hdr_t marker = { 0 };
- int count = 0;
-
- ASSERT(state == arc_mru || state == arc_mfu);
+ arc_state_t *evicted_state, *state;
+ int64_t bytes_evicted = 0;
- evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+ ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(HDR_HAS_L1HDR(hdr));
- /*
- * The ghost list lock must be acquired first in order to prevent
- * a 3 party deadlock:
- *
- * - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by
- * l2ad_mtx in arc_hdr_realloc
- * - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx
- * - arc_evict acquires arc_*_ghost->arcs_mtx, followed by
- * arc_*_ghost->arcs_mtx and forms a deadlock cycle.
- *
- * This situation is avoided by acquiring the ghost list lock first.
- */
- mutex_enter(&evicted_state->arcs_mtx);
- mutex_enter(&state->arcs_mtx);
+ state = hdr->b_l1hdr.b_state;
+ if (GHOST_STATE(state)) {
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(hdr->b_l1hdr.b_buf == NULL);
- /*
- * Decide which "type" (data vs metadata) to recycle from.
- *
- * If we are over the metadata limit, recycle from metadata.
- * If we are under the metadata minimum, recycle from data.
- * Otherwise, recycle from whichever type has the oldest (least
- * recently accessed) header.
- */
- if (recycle) {
- arc_buf_hdr_t *data_hdr =
- list_tail(&state->arcs_list[ARC_BUFC_DATA]);
- arc_buf_hdr_t *metadata_hdr =
- list_tail(&state->arcs_list[ARC_BUFC_METADATA]);
- arc_buf_contents_t realtype;
-
- if (data_hdr == NULL) {
- realtype = ARC_BUFC_METADATA;
- } else if (metadata_hdr == NULL) {
- realtype = ARC_BUFC_DATA;
- } else if (arc_meta_used >= arc_meta_limit) {
- realtype = ARC_BUFC_METADATA;
- } else if (arc_meta_used <= arc_meta_min) {
- realtype = ARC_BUFC_DATA;
- } else if (HDR_HAS_L1HDR(data_hdr) &&
- HDR_HAS_L1HDR(metadata_hdr) &&
- data_hdr->b_l1hdr.b_arc_access <
- metadata_hdr->b_l1hdr.b_arc_access) {
- realtype = ARC_BUFC_DATA;
- } else {
- realtype = ARC_BUFC_METADATA;
+ /*
+ * l2arc_write_buffers() relies on a header's L1 portion
+ * (i.e. its b_tmp_cdata field) during its write phase.
+ * Thus, we cannot push a header onto the arc_l2c_only
+ * state (removing its L1 piece) until the header is
+ * done being written to the l2arc.
+ */
+ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
+ ARCSTAT_BUMP(arcstat_evict_l2_skip);
+ return (bytes_evicted);
}
- if (realtype != type) {
+
+ ARCSTAT_BUMP(arcstat_deleted);
+ bytes_evicted += hdr->b_size;
+
+ DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
+
+ if (HDR_HAS_L2HDR(hdr)) {
/*
- * If we want to evict from a different list,
- * we can not recycle, because DATA vs METADATA
- * buffers are segregated into different kmem
- * caches (and vmem arenas).
+ * This buffer is cached on the 2nd Level ARC;
+ * don't destroy the header.
*/
- type = realtype;
- recycle = B_FALSE;
+ arc_change_state(arc_l2c_only, hdr, hash_lock);
+ /*
+ * dropping from L1+L2 cached to L2-only,
+ * realloc to remove the L1 header.
+ */
+ hdr = arc_hdr_realloc(hdr, hdr_full_cache,
+ hdr_l2only_cache);
+ } else {
+ arc_change_state(arc_anon, hdr, hash_lock);
+ arc_hdr_destroy(hdr);
}
+ return (bytes_evicted);
}
- list_t *list = &state->arcs_list[type];
+ ASSERT(state == arc_mru || state == arc_mfu);
+ evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
- for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
- hdr_prev = list_prev(list, hdr);
- /* prefetch buffers have a minimum lifespan */
- if (HDR_IO_IN_PROGRESS(hdr) ||
- (spa && hdr->b_spa != spa) ||
- ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
- ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
- arc_min_prefetch_lifespan)) {
- skipped++;
- continue;
+ /* prefetch buffers have a minimum lifespan */
+ if (HDR_IO_IN_PROGRESS(hdr) ||
+ ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
+ ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
+ arc_min_prefetch_lifespan)) {
+ ARCSTAT_BUMP(arcstat_evict_skip);
+ return (bytes_evicted);
+ }
+
+ ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
+ while (hdr->b_l1hdr.b_buf) {
+ arc_buf_t *buf = hdr->b_l1hdr.b_buf;
+ if (!mutex_tryenter(&buf->b_evict_lock)) {
+ ARCSTAT_BUMP(arcstat_mutex_miss);
+ break;
}
- /* "lookahead" for better eviction candidate */
- if (recycle && hdr->b_size != bytes &&
- hdr_prev && hdr_prev->b_size == bytes)
- continue;
+ if (buf->b_data != NULL)
+ bytes_evicted += hdr->b_size;
+ if (buf->b_efunc != NULL) {
+ mutex_enter(&arc_user_evicts_lock);
+ arc_buf_destroy(buf, FALSE);
+ hdr->b_l1hdr.b_buf = buf->b_next;
+ buf->b_hdr = &arc_eviction_hdr;
+ buf->b_next = arc_eviction_list;
+ arc_eviction_list = buf;
+ cv_signal(&arc_user_evicts_cv);
+ mutex_exit(&arc_user_evicts_lock);
+ mutex_exit(&buf->b_evict_lock);
+ } else {
+ mutex_exit(&buf->b_evict_lock);
+ arc_buf_destroy(buf, TRUE);
+ }
+ }
- /* ignore markers */
- if (hdr->b_spa == 0)
- continue;
+ if (HDR_HAS_L2HDR(hdr)) {
+ ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size);
+ } else {
+ if (l2arc_write_eligible(hdr->b_spa, hdr))
+ ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size);
+ else
+ ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size);
+ }
+
+ if (hdr->b_l1hdr.b_datacnt == 0) {
+ arc_change_state(evicted_state, hdr, hash_lock);
+ ASSERT(HDR_IN_HASH_TABLE(hdr));
+ hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
+ hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
+ DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
+ }
+
+ return (bytes_evicted);
+}
+
+static uint64_t
+arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
+ uint64_t spa, int64_t bytes)
+{
+ multilist_sublist_t *mls;
+ uint64_t bytes_evicted = 0;
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ int evict_count = 0;
+
+ ASSERT3P(marker, !=, NULL);
+ IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
+
+ mls = multilist_sublist_lock(ml, idx);
+
+ for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
+ hdr = multilist_sublist_prev(mls, marker)) {
+ if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
+ (evict_count >= zfs_arc_evict_batch_limit))
+ break;
/*
- * It may take a long time to evict all the bufs requested.
- * To avoid blocking all arc activity, periodically drop
- * the arcs_mtx and give other threads a chance to run
- * before reacquiring the lock.
- *
- * If we are looking for a buffer to recycle, we are in
- * the hot code path, so don't sleep.
+ * To keep our iteration location, move the marker
+ * forward. Since we're not holding hdr's hash lock, we
+ * must be very careful and not remove 'hdr' from the
+ * sublist. Otherwise, other consumers might mistake the
+ * 'hdr' as not being on a sublist when they call the
+ * multilist_link_active() function (they all rely on
+ * the hash lock protecting concurrent insertions and
+ * removals). multilist_sublist_move_forward() was
+ * specifically implemented to ensure this is the case
+ * (only 'marker' will be removed and re-inserted).
+ */
+ multilist_sublist_move_forward(mls, marker);
+
+ /*
+ * The only case where the b_spa field should ever be
+ * zero, is the marker headers inserted by
+ * arc_evict_state(). It's possible for multiple threads
+ * to be calling arc_evict_state() concurrently (e.g.
+ * dsl_pool_close() and zio_inject_fault()), so we must
+ * skip any markers we see from these other threads.
*/
- if (!recycle && count++ > arc_evict_iterations) {
- list_insert_after(list, hdr, &marker);
- mutex_exit(&state->arcs_mtx);
- mutex_exit(&evicted_state->arcs_mtx);
- kpreempt(KPREEMPT_SYNC);
- mutex_enter(&evicted_state->arcs_mtx);
- mutex_enter(&state->arcs_mtx);
- hdr_prev = list_prev(list, &marker);
- list_remove(list, &marker);
- count = 0;
+ if (hdr->b_spa == 0)
+ continue;
+
+ /* we're only interested in evicting buffers of a certain spa */
+ if (spa != 0 && hdr->b_spa != spa) {
+ ARCSTAT_BUMP(arcstat_evict_skip);
continue;
}
hash_lock = HDR_LOCK(hdr);
- have_lock = MUTEX_HELD(hash_lock);
- if (have_lock || mutex_tryenter(hash_lock)) {
- ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
- ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
- while (hdr->b_l1hdr.b_buf) {
- arc_buf_t *buf = hdr->b_l1hdr.b_buf;
- if (!mutex_tryenter(&buf->b_evict_lock)) {
- missed += 1;
- break;
- }
- if (buf->b_data != NULL) {
- bytes_evicted += hdr->b_size;
- if (recycle &&
- arc_buf_type(hdr) == type &&
- hdr->b_size == bytes &&
- !HDR_L2_WRITING(hdr)) {
- stolen = buf->b_data;
- recycle = FALSE;
- }
- }
- if (buf->b_efunc != NULL) {
- mutex_enter(&arc_eviction_mtx);
- arc_buf_destroy(buf,
- buf->b_data == stolen, FALSE);
- hdr->b_l1hdr.b_buf = buf->b_next;
- buf->b_hdr = &arc_eviction_hdr;
- buf->b_next = arc_eviction_list;
- arc_eviction_list = buf;
- mutex_exit(&arc_eviction_mtx);
- mutex_exit(&buf->b_evict_lock);
- } else {
- mutex_exit(&buf->b_evict_lock);
- arc_buf_destroy(buf,
- buf->b_data == stolen, TRUE);
- }
- }
- if (HDR_HAS_L2HDR(hdr)) {
- ARCSTAT_INCR(arcstat_evict_l2_cached,
- hdr->b_size);
- } else {
- if (l2arc_write_eligible(hdr->b_spa, hdr)) {
- ARCSTAT_INCR(arcstat_evict_l2_eligible,
- hdr->b_size);
- } else {
- ARCSTAT_INCR(
- arcstat_evict_l2_ineligible,
- hdr->b_size);
- }
- }
+ /*
+ * We aren't calling this function from any code path
+ * that would already be holding a hash lock, so we're
+ * asserting on this assumption to be defensive in case
+ * this ever changes. Without this check, it would be
+ * possible to incorrectly increment arcstat_mutex_miss
+ * below (e.g. if the code changed such that we called
+ * this function with a hash lock held).
+ */
+ ASSERT(!MUTEX_HELD(hash_lock));
- if (hdr->b_l1hdr.b_datacnt == 0) {
- arc_change_state(evicted_state, hdr, hash_lock);
- ASSERT(HDR_IN_HASH_TABLE(hdr));
- hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
- hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
- DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
- }
- if (!have_lock)
- mutex_exit(hash_lock);
- if (bytes >= 0 && bytes_evicted >= bytes)
- break;
+ if (mutex_tryenter(hash_lock)) {
+ uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
+ mutex_exit(hash_lock);
+
+ bytes_evicted += evicted;
+
+ /*
+ * If evicted is zero, arc_evict_hdr() must have
+ * decided to skip this header, don't increment
+ * evict_count in this case.
+ */
+ if (evicted != 0)
+ evict_count++;
+
+ /*
+ * If arc_size isn't overflowing, signal any
+ * threads that might happen to be waiting.
+ *
+ * For each header evicted, we wake up a single
+ * thread. If we used cv_broadcast, we could
+ * wake up "too many" threads causing arc_size
+ * to significantly overflow arc_c; since
+ * arc_get_data_buf() doesn't check for overflow
+ * when it's woken up (it doesn't because it's
+ * possible for the ARC to be overflowing while
+ * full of un-evictable buffers, and the
+ * function should proceed in this case).
+ *
+ * If threads are left sleeping, due to not
+ * using cv_broadcast, they will be woken up
+ * just before arc_reclaim_thread() sleeps.
+ */
+ mutex_enter(&arc_reclaim_lock);
+ if (!arc_is_overflowing())
+ cv_signal(&arc_reclaim_waiters_cv);
+ mutex_exit(&arc_reclaim_lock);
} else {
- missed += 1;
+ ARCSTAT_BUMP(arcstat_mutex_miss);
}
}
- mutex_exit(&state->arcs_mtx);
- mutex_exit(&evicted_state->arcs_mtx);
+ multilist_sublist_unlock(mls);
- if (bytes_evicted < bytes)
- dprintf("only evicted %lld bytes from %x",
- (longlong_t)bytes_evicted, state);
+ return (bytes_evicted);
+}
- if (skipped)
- ARCSTAT_INCR(arcstat_evict_skip, skipped);
+/*
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***