svn commit: r260764 - in stable/9: cddl/contrib/opensolaris/cmd/ztest cddl/contrib/opensolaris/lib/libzpool/common/sys sys/cddl/compat/opensolaris/sys sys/cddl/contrib/opensolaris/uts/common/fs/zfs...
Author: avg
Date: Thu Jan 16 15:59:08 2014
New Revision: 260764
URL: http://svnweb.freebsd.org/changeset/base/260764
Log:
MFC r258632,258704: MFV r255255: 4045 zfs write throttle & i/o scheduler
performance work
Note a change in dmu_tx_delay: pause_sbt is not available in this
branch.
Sponsored by: HybridCluster [merge]
Added:
stable/9/sys/cddl/compat/opensolaris/sys/disp.h
- copied unchanged from r258632, head/sys/cddl/compat/opensolaris/sys/disp.h
Modified:
stable/9/cddl/contrib/opensolaris/cmd/ztest/ztest.c
stable/9/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
Directory Properties:
stable/9/cddl/contrib/opensolaris/ (props changed)
stable/9/cddl/contrib/opensolaris/cmd/dtrace/test/tst/common/print/ (props changed)
stable/9/cddl/contrib/opensolaris/cmd/zfs/ (props changed)
stable/9/cddl/contrib/opensolaris/lib/libzfs/ (props changed)
stable/9/sys/ (props changed)
stable/9/sys/cddl/contrib/opensolaris/ (props changed)
Modified: stable/9/cddl/contrib/opensolaris/cmd/ztest/ztest.c
==============================================================================
--- stable/9/cddl/contrib/opensolaris/cmd/ztest/ztest.c Thu Jan 16 15:57:39 2014 (r260763)
+++ stable/9/cddl/contrib/opensolaris/cmd/ztest/ztest.c Thu Jan 16 15:59:08 2014 (r260764)
@@ -186,7 +186,7 @@ static const ztest_shared_opts_t ztest_o
extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
-extern uint64_t zfs_deadman_synctime;
+extern uint64_t zfs_deadman_synctime_ms;
static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
@@ -5315,10 +5315,10 @@ ztest_deadman_thread(void *arg)
hrtime_t delta, total = 0;
for (;;) {
- delta = (zs->zs_thread_stop - zs->zs_thread_start) /
- NANOSEC + zfs_deadman_synctime;
+ delta = zs->zs_thread_stop - zs->zs_thread_start +
+ MSEC2NSEC(zfs_deadman_synctime_ms);
- (void) poll(NULL, 0, (int)(1000 * delta));
+ (void) poll(NULL, 0, (int)NSEC2MSEC(delta));
/*
* If the pool is suspended then fail immediately. Otherwise,
@@ -5329,12 +5329,12 @@ ztest_deadman_thread(void *arg)
if (spa_suspended(spa)) {
fatal(0, "aborting test after %llu seconds because "
"pool has transitioned to a suspended state.",
- zfs_deadman_synctime);
+ zfs_deadman_synctime_ms / 1000);
return (NULL);
}
vdev_deadman(spa->spa_root_vdev);
- total += zfs_deadman_synctime;
+ total += zfs_deadman_synctime_ms/1000;
(void) printf("ztest has been running for %lld seconds\n",
total);
}
@@ -6066,7 +6066,7 @@ main(int argc, char **argv)
(void) setvbuf(stdout, NULL, _IOLBF, 0);
dprintf_setup(&argc, argv);
- zfs_deadman_synctime = 300;
+ zfs_deadman_synctime_ms = 300000;
ztest_fd_rand = open("/dev/urandom", O_RDONLY);
ASSERT3S(ztest_fd_rand, >=, 0);
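
The ztest hunks above move the deadman tunable from seconds (zfs_deadman_synctime) to milliseconds (zfs_deadman_synctime_ms) and keep the intermediate arithmetic in nanoseconds. A minimal userland sketch of the new computation, assuming the MSEC2NSEC/NSEC2MSEC helpers carry the usual illumos definitions and using illustrative constants:

#include <stdint.h>
#include <stdio.h>

typedef long long hrtime_t;
#define NANOSEC         1000000000LL
#define MILLISEC        1000LL
/* Assumed to match the illumos <sys/time.h> compat definitions. */
#define MSEC2NSEC(m)    ((hrtime_t)(m) * (NANOSEC / MILLISEC))
#define NSEC2MSEC(n)    ((n) / (NANOSEC / MILLISEC))

int
main(void)
{
        hrtime_t zs_thread_start = 0;              /* hypothetical window */
        hrtime_t zs_thread_stop = 60 * NANOSEC;    /* a 60-second run */
        uint64_t zfs_deadman_synctime_ms = 300000; /* new default: 300 s */

        /* Stay in nanoseconds until the poll(2) boundary... */
        hrtime_t delta = zs_thread_stop - zs_thread_start +
            MSEC2NSEC(zfs_deadman_synctime_ms);

        /* ...and convert exactly once, as the new ztest code does. */
        printf("deadman sleeps %d ms per cycle\n", (int)NSEC2MSEC(delta));
        return (0);
}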
Modified: stable/9/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
==============================================================================
--- stable/9/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h Thu Jan 16 15:57:39 2014 (r260763)
+++ stable/9/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h Thu Jan 16 15:59:08 2014 (r260764)
@@ -65,6 +65,7 @@ extern "C" {
#include <inttypes.h>
#include <fsshare.h>
#include <pthread.h>
+#include <sched.h>
#include <sys/debug.h>
#include <sys/note.h>
#include <sys/types.h>
@@ -204,6 +205,8 @@ extern int aok;
*/
#define curthread ((void *)(uintptr_t)thr_self())
+#define kpreempt(x) sched_yield()
+
typedef struct kthread kthread_t;
#define thread_create(stk, stksize, func, arg, len, pp, state, pri) \
Copied: stable/9/sys/cddl/compat/opensolaris/sys/disp.h (from r258632, head/sys/cddl/compat/opensolaris/sys/disp.h)
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ stable/9/sys/cddl/compat/opensolaris/sys/disp.h Thu Jan 16 15:59:08 2014 (r260764, copy of r258632, head/sys/cddl/compat/opensolaris/sys/disp.h)
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2013 Andriy Gapon
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_DISP_H_
+#define _OPENSOLARIS_SYS_DISP_H_
+
+#ifdef _KERNEL
+
+#include <sys/proc.h>
+
+#define kpreempt(x) kern_yield(PRI_USER)
+
+#endif /* _KERNEL */
+
+#endif /* _OPENSOLARIS_SYS_DISP_H_ */
Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Thu Jan 16 15:57:39 2014 (r260763)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Thu Jan 16 15:59:08 2014 (r260764)
@@ -127,6 +127,7 @@
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
+#include <sys/dsl_pool.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#endif
@@ -150,10 +151,6 @@ static kmutex_t arc_reclaim_thr_lock;
static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
static uint8_t arc_thread_exit;
-extern int zfs_write_limit_shift;
-extern uint64_t zfs_write_limit_max;
-extern kmutex_t zfs_write_limit_lock;
-
#define ARC_REDUCE_DNLC_PERCENT 3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
@@ -162,6 +159,12 @@ typedef enum arc_reclaim_strategy {
ARC_RECLAIM_CONS /* Conservative reclaim strategy */
} arc_reclaim_strategy_t;
+/*
+ * The number of iterations through arc_evict_*() before we
+ * drop & reacquire the lock.
+ */
+int arc_evict_iterations = 100;
+
/* number of seconds before growing cache again */
static int arc_grow_retry = 60;
@@ -177,6 +180,11 @@ static int arc_shrink_shift = 5;
*/
static int arc_min_prefetch_lifespan;
+/*
+ * If this percent of memory is free, don't throttle.
+ */
+int arc_lotsfree_percent = 10;
+
static int arc_dead;
extern int zfs_prefetch_disable;
@@ -526,6 +534,7 @@ typedef struct arc_write_callback arc_wr
struct arc_write_callback {
void *awcb_private;
arc_done_func_t *awcb_ready;
+ arc_done_func_t *awcb_physdone;
arc_done_func_t *awcb_done;
arc_buf_t *awcb_buf;
};
@@ -1312,7 +1321,7 @@ arc_change_state(arc_state_t *new_state,
kmutex_t *lock;
ASSERT(MUTEX_HELD(hash_lock));
- ASSERT(new_state != old_state);
+ ASSERT3P(new_state, !=, old_state);
ASSERT(refcnt == 0 || ab->b_datacnt > 0);
ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
@@ -1937,8 +1946,10 @@ arc_evict(arc_state_t *state, uint64_t s
kmutex_t *hash_lock;
boolean_t have_lock;
void *stolen = NULL;
+ arc_buf_hdr_t marker = { 0 };
+ int count = 0;
static int evict_metadata_offset, evict_data_offset;
- int i, idx, offset, list_count, count;
+ int i, idx, offset, list_count, lists;
ASSERT(state == arc_mru || state == arc_mfu);
@@ -1958,7 +1969,7 @@ arc_evict(arc_state_t *state, uint64_t s
idx = evict_data_offset;
}
bytes_remaining = evicted_state->arcs_lsize[type];
- count = 0;
+ lists = 0;
evict_start:
list = &list_start[idx];
@@ -1985,6 +1996,33 @@ evict_start:
if (recycle && ab->b_size != bytes &&
ab_prev && ab_prev->b_size == bytes)
continue;
+
+ /* ignore markers */
+ if (ab->b_spa == 0)
+ continue;
+
+ /*
+ * It may take a long time to evict all the bufs requested.
+ * To avoid blocking all arc activity, periodically drop
+ * the arcs_mtx and give other threads a chance to run
+ * before reacquiring the lock.
+ *
+ * If we are looking for a buffer to recycle, we are in
+ * the hot code path, so don't sleep.
+ */
+ if (!recycle && count++ > arc_evict_iterations) {
+ list_insert_after(list, ab, &marker);
+ mutex_exit(evicted_lock);
+ mutex_exit(lock);
+ kpreempt(KPREEMPT_SYNC);
+ mutex_enter(lock);
+ mutex_enter(evicted_lock);
+ ab_prev = list_prev(list, &marker);
+ list_remove(list, &marker);
+ count = 0;
+ continue;
+ }
+
hash_lock = HDR_LOCK(ab);
have_lock = MUTEX_HELD(hash_lock);
if (have_lock || mutex_tryenter(hash_lock)) {
@@ -2051,7 +2089,7 @@ evict_start:
mutex_exit(evicted_lock);
mutex_exit(lock);
idx = ((idx + 1) & (list_count - 1));
- count++;
+ lists++;
goto evict_start;
}
} else {
@@ -2063,10 +2101,10 @@ evict_start:
mutex_exit(lock);
idx = ((idx + 1) & (list_count - 1));
- count++;
+ lists++;
if (bytes_evicted < bytes) {
- if (count < list_count)
+ if (lists < list_count)
goto evict_start;
else
dprintf("only evicted %lld bytes from %x",
@@ -2084,28 +2122,14 @@ evict_start:
ARCSTAT_INCR(arcstat_mutex_miss, missed);
/*
- * We have just evicted some data into the ghost state, make
- * sure we also adjust the ghost state size if necessary.
+ * Note: we have just evicted some data into the ghost state,
+ * potentially putting the ghost size over the desired size. Rather
+ * than evicting from the ghost list in this hot code path, leave
+ * this chore to the arc_reclaim_thread().
*/
- if (arc_no_grow &&
- arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
- int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
- arc_mru_ghost->arcs_size - arc_c;
-
- if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
- int64_t todelete =
- MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
- arc_evict_ghost(arc_mru_ghost, 0, todelete);
- } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
- int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
- arc_mru_ghost->arcs_size +
- arc_mfu_ghost->arcs_size - arc_c);
- arc_evict_ghost(arc_mfu_ghost, 0, todelete);
- }
- }
+
if (stolen)
ARCSTAT_BUMP(arcstat_stolen);
-
return (stolen);
}
@@ -2122,9 +2146,10 @@ arc_evict_ghost(arc_state_t *state, uint
kmutex_t *hash_lock, *lock;
uint64_t bytes_deleted = 0;
uint64_t bufs_skipped = 0;
+ int count = 0;
static int evict_offset;
int list_count, idx = evict_offset;
- int offset, count = 0;
+ int offset, lists = 0;
ASSERT(GHOST_STATE(state));
@@ -2142,6 +2167,8 @@ evict_start:
mutex_enter(lock);
for (ab = list_tail(list); ab; ab = ab_prev) {
ab_prev = list_prev(list, ab);
+ if (ab->b_type > ARC_BUFC_NUMTYPES)
+ panic("invalid ab=%p", (void *)ab);
if (spa && ab->b_spa != spa)
continue;
@@ -2153,6 +2180,23 @@ evict_start:
/* caller may be trying to modify this buffer, skip it */
if (MUTEX_HELD(hash_lock))
continue;
+
+ /*
+ * It may take a long time to evict all the bufs requested.
+ * To avoid blocking all arc activity, periodically drop
+ * the arcs_mtx and give other threads a chance to run
+ * before reacquiring the lock.
+ */
+ if (count++ > arc_evict_iterations) {
+ list_insert_after(list, ab, &marker);
+ mutex_exit(lock);
+ kpreempt(KPREEMPT_SYNC);
+ mutex_enter(lock);
+ ab_prev = list_prev(list, &marker);
+ list_remove(list, &marker);
+ count = 0;
+ continue;
+ }
if (mutex_tryenter(hash_lock)) {
ASSERT(!HDR_IO_IN_PROGRESS(ab));
ASSERT(ab->b_buf == NULL);
@@ -2188,14 +2232,16 @@ evict_start:
mutex_enter(lock);
ab_prev = list_prev(list, &marker);
list_remove(list, &marker);
- } else
+ } else {
bufs_skipped += 1;
+ }
+
}
mutex_exit(lock);
idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
- count++;
+ lists++;
- if (count < list_count)
+ if (lists < list_count)
goto evict_start;
evict_offset = idx;
@@ -2203,7 +2249,7 @@ evict_start:
(bytes < 0 || bytes_deleted < bytes)) {
list_start = &state->arcs_lists[0];
list_count = ARC_BUFC_NUMMETADATALISTS;
- offset = count = 0;
+ offset = lists = 0;
goto evict_start;
}
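
The eviction changes above bound how long arcs_mtx is held: after arc_evict_iterations buffers, a stack-allocated marker header is linked into the list, the locks are dropped, the thread yields (kpreempt(KPREEMPT_SYNC) in the kernel, sched_yield() via the new zfs_context.h shim in userland), and iteration resumes from the marker. A self-contained sketch of the same pattern, using a <sys/queue.h> TAILQ and pthreads purely for illustration:

#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <sys/queue.h>

struct buf {
        int is_marker;          /* 0 for real buffers, 1 for markers */
        int id;
        TAILQ_ENTRY(buf) link;
};
TAILQ_HEAD(buflist, buf);

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static int evict_iterations = 2;        /* tiny, to force a yield below */

static void
evict_some(struct buflist *list)
{
        struct buf marker = { .is_marker = 1 };
        struct buf *b, *b_prev;
        int count = 0;

        pthread_mutex_lock(&list_lock);
        /* Walk tail-to-head, like the ARC eviction loops. */
        for (b = TAILQ_LAST(list, buflist); b != NULL; b = b_prev) {
                b_prev = TAILQ_PREV(b, buflist, link);
                if (b->is_marker)       /* skip other threads' markers */
                        continue;
                if (count++ > evict_iterations) {
                        /* Park the marker, drop the lock, let others run. */
                        TAILQ_INSERT_AFTER(list, b, &marker, link);
                        pthread_mutex_unlock(&list_lock);
                        sched_yield();
                        pthread_mutex_lock(&list_lock);
                        /* Resume where the marker kept our place. */
                        b_prev = TAILQ_PREV(&marker, buflist, link);
                        TAILQ_REMOVE(list, &marker, link);
                        count = 0;
                        continue;
                }
                printf("evicting buf %d\n", b->id);
        }
        pthread_mutex_unlock(&list_lock);
}

int
main(void)
{
        struct buflist list = TAILQ_HEAD_INITIALIZER(list);
        struct buf bufs[8];

        for (int i = 0; i < 8; i++) {
                bufs[i].is_marker = 0;
                bufs[i].id = i;
                TAILQ_INSERT_TAIL(&list, &bufs[i], link);
        }
        evict_some(&list);
        return (0);
}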
@@ -3083,7 +3129,7 @@ arc_read_done(zio_t *zio)
*/
int
arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
- void *private, int priority, int zio_flags, uint32_t *arc_flags,
+ void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr;
@@ -3699,6 +3745,18 @@ arc_write_ready(zio_t *zio)
hdr->b_flags |= ARC_IO_IN_PROGRESS;
}
+/*
+ * The SPA calls this callback for each physical write that happens on behalf
+ * of a logical write. See the comment in dbuf_write_physdone() for details.
+ */
+static void
+arc_write_physdone(zio_t *zio)
+{
+ arc_write_callback_t *cb = zio->io_private;
+ if (cb->awcb_physdone != NULL)
+ cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
+}
+
static void
arc_write_done(zio_t *zio)
{
@@ -3779,8 +3837,9 @@ arc_write_done(zio_t *zio)
zio_t *
arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
- const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
- void *private, int priority, int zio_flags, const zbookmark_t *zb)
+ const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
+ arc_done_func_t *done, void *private, zio_priority_t priority,
+ int zio_flags, const zbookmark_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
@@ -3797,18 +3856,20 @@ arc_write(zio_t *pio, spa_t *spa, uint64
hdr->b_flags |= ARC_L2COMPRESS;
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
callback->awcb_ready = ready;
+ callback->awcb_physdone = physdone;
callback->awcb_done = done;
callback->awcb_private = private;
callback->awcb_buf = buf;
zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
- arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
+ arc_write_ready, arc_write_physdone, arc_write_done, callback,
+ priority, zio_flags, zb);
return (zio);
}
static int
-arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
+arc_memory_throttle(uint64_t reserve, uint64_t txg)
{
#ifdef _KERNEL
uint64_t available_memory =
@@ -3822,7 +3883,9 @@ arc_memory_throttle(uint64_t reserve, ui
MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
#endif
#endif /* sun */
- if (available_memory >= zfs_write_limit_max)
+
+ if (cnt.v_free_count + cnt.v_cache_count >
+ (uint64_t)physmem * arc_lotsfree_percent / 100)
return (0);
if (txg > last_txg) {
@@ -3846,20 +3909,6 @@ arc_memory_throttle(uint64_t reserve, ui
return (SET_ERROR(EAGAIN));
}
page_load = 0;
-
- if (arc_size > arc_c_min) {
- uint64_t evictable_memory =
- arc_mru->arcs_lsize[ARC_BUFC_DATA] +
- arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
- arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
- arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
- available_memory += MIN(evictable_memory, arc_size - arc_c_min);
- }
-
- if (inflight_data > available_memory / 4) {
- ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
- return (SET_ERROR(ERESTART));
- }
#endif
return (0);
}
@@ -3877,15 +3926,6 @@ arc_tempreserve_space(uint64_t reserve,
int error;
uint64_t anon_size;
-#ifdef ZFS_DEBUG
- /*
- * Once in a while, fail for no reason. Everything should cope.
- */
- if (spa_get_random(10000) == 0) {
- dprintf("forcing random failure\n");
- return (SET_ERROR(ERESTART));
- }
-#endif
if (reserve > arc_c/4 && !arc_no_grow)
arc_c = MIN(arc_c_max, reserve * 4);
if (reserve > arc_c)
@@ -3903,7 +3943,8 @@ arc_tempreserve_space(uint64_t reserve,
* in order to compress/encrypt/etc the data. We therefore need to
* make sure that there is sufficient available memory for this.
*/
- if (error = arc_memory_throttle(reserve, anon_size, txg))
+ error = arc_memory_throttle(reserve, txg);
+ if (error != 0)
return (error);
/*
@@ -4094,11 +4135,20 @@ arc_init(void)
arc_dead = FALSE;
arc_warm = B_FALSE;
- if (zfs_write_limit_max == 0)
- zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
- else
- zfs_write_limit_shift = 0;
- mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
+ /*
+ * Calculate maximum amount of dirty data per pool.
+ *
+ * If it has been set by /etc/system, take that.
+ * Otherwise, use a percentage of physical memory defined by
+ * zfs_dirty_data_max_percent (default 10%) with a cap at
+ * zfs_dirty_data_max_max (default 4GB).
+ */
+ if (zfs_dirty_data_max == 0) {
+ zfs_dirty_data_max = ptob(physmem) *
+ zfs_dirty_data_max_percent / 100;
+ zfs_dirty_data_max = MIN(zfs_dirty_data_max,
+ zfs_dirty_data_max_max);
+ }
#ifdef _KERNEL
if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
@@ -4177,8 +4227,6 @@ arc_fini(void)
mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
}
- mutex_destroy(&zfs_write_limit_lock);
-
buf_fini();
ASSERT(arc_loaned_bytes == 0);
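
The arc_init() hunk replaces the old zfs_write_limit_max machinery with a single dirty-data cap. A back-of-the-envelope sketch of that calculation, where the 10% and 4 GB figures come from the comment above and the 8 GiB machine and 4K page size are assumptions:

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b)       ((a) < (b) ? (a) : (b))

int
main(void)
{
        uint64_t pagesize = 4096;                       /* assumed page size */
        uint64_t physmem = (8ULL << 30) / pagesize;     /* 8 GiB, in pages */
        uint64_t zfs_dirty_data_max_percent = 10;       /* default 10% */
        uint64_t zfs_dirty_data_max_max = 4ULL << 30;   /* default cap: 4 GB */

        /* ptob(physmem) is pages-to-bytes, modeled as physmem * pagesize. */
        uint64_t zfs_dirty_data_max =
            physmem * pagesize * zfs_dirty_data_max_percent / 100;
        zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max);

        /* 8 GiB * 10% = ~819 MiB, well under the 4 GB cap. */
        printf("zfs_dirty_data_max = %llu bytes\n",
            (unsigned long long)zfs_dirty_data_max);
        return (0);
}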
Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c Thu Jan 16 15:57:39 2014 (r260763)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c Thu Jan 16 15:59:08 2014 (r260764)
@@ -841,7 +841,7 @@ dbuf_free_range(dnode_t *dn, uint64_t st
atomic_inc_64(&zfs_free_range_recv_miss);
}
- for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
+ for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
@@ -1187,6 +1187,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
}
+ if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
+ dr->dr_accounted = db->db.db_size;
dr->dr_dbuf = db;
dr->dr_txg = tx->tx_txg;
dr->dr_next = *drp;
@@ -1270,7 +1272,10 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t
dbuf_rele(parent, FTAG);
mutex_enter(&db->db_mtx);
- /* possible race with dbuf_undirty() */
+ /*
+ * Since we've dropped the mutex, it's possible that
+ * dbuf_undirty() might have changed this out from under us.
+ */
if (db->db_last_dirty == dr ||
dn->dn_object == DMU_META_DNODE_OBJECT) {
mutex_enter(&di->dt.di.dr_mtx);
@@ -1340,7 +1345,11 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_
ASSERT(db->db.db_size != 0);
- /* XXX would be nice to fix up dn_towrite_space[] */
+ /*
+ * Any space we accounted for in dp_dirty_* will be cleaned up by
+ * dsl_pool_sync(). This is relatively rare so the discrepancy
+ * is not a big deal.
+ */
*drp = dr->dr_next;
@@ -1520,7 +1529,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, a
/*
* "Clear" the contents of this dbuf. This will mark the dbuf
- * EVICTING and clear *most* of its references. Unfortunetely,
+ * EVICTING and clear *most* of its references. Unfortunately,
* when we are not holding the dn_dbufs_mtx, we can't clear the
* entry in the dn_dbufs list. We have to wait until dbuf_destroy()
* in this case. For callers from the DMU we will usually see:
@@ -1707,7 +1716,7 @@ dbuf_create(dnode_t *dn, uint8_t level,
db->db.db_offset = 0;
} else {
int blocksize =
- db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
+ db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
db->db.db_size = blocksize;
db->db.db_offset = db->db_blkid * blocksize;
}
@@ -1816,7 +1825,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
}
void
-dbuf_prefetch(dnode_t *dn, uint64_t blkid)
+dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
{
dmu_buf_impl_t *db = NULL;
blkptr_t *bp = NULL;
@@ -1840,8 +1849,6 @@ dbuf_prefetch(dnode_t *dn, uint64_t blki
if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
if (bp && !BP_IS_HOLE(bp)) {
- int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
- ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
zbookmark_t zb;
@@ -1850,7 +1857,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blki
dn->dn_object, 0, blkid);
(void) arc_read(NULL, dn->dn_objset->os_spa,
- bp, NULL, NULL, priority,
+ bp, NULL, NULL, prio,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&aflags, &zb);
}
@@ -2535,6 +2542,38 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *
mutex_exit(&db->db_mtx);
}
+/*
+ * The SPA will call this callback several times for each zio - once
+ * for every physical child i/o (zio->io_phys_children times). This
+ * allows the DMU to monitor the progress of each logical i/o. For example,
+ * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
+ * block. There may be a long delay before all copies/fragments are completed,
+ * so this callback allows us to retire dirty space gradually, as the physical
+ * i/os complete.
+ */
+/* ARGSUSED */
+static void
+dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+ dmu_buf_impl_t *db = arg;
+ objset_t *os = db->db_objset;
+ dsl_pool_t *dp = dmu_objset_pool(os);
+ dbuf_dirty_record_t *dr;
+ int delta = 0;
+
+ dr = db->db_data_pending;
+ ASSERT3U(dr->dr_txg, ==, zio->io_txg);
+
+ /*
+ * The callback will be called io_phys_children times. Retire one
+ * portion of our dirty space each time we are called. Any rounding
+ * error will be cleaned up by dsl_pool_sync()'s call to
+ * dsl_pool_undirty_space().
+ */
+ delta = dr->dr_accounted / zio->io_phys_children;
+ dsl_pool_undirty_space(dp, delta, zio->io_txg);
+}
+
/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
@@ -2629,6 +2668,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *b
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
db->db_data_pending = NULL;
+
dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
}
@@ -2747,8 +2787,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_
ASSERT(db->db_state != DB_NOFILL);
dr->dr_zio = zio_write(zio, os->os_spa, txg,
db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
- dbuf_write_override_ready, dbuf_write_override_done, dr,
- ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ dbuf_write_override_ready, NULL, dbuf_write_override_done,
+ dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
mutex_enter(&db->db_mtx);
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
@@ -2758,7 +2798,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
dr->dr_zio = zio_write(zio, os->os_spa, txg,
db->db_blkptr, NULL, db->db.db_size, &zp,
- dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
+ dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
} else {
@@ -2766,7 +2806,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_
dr->dr_zio = arc_write(zio, os->os_spa, txg,
db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
- dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
- ZIO_FLAG_MUSTSUCCEED, &zb);
+ dbuf_write_physdone, dbuf_write_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
}
}
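
The new dbuf_write_physdone() callback retires a logical write's dirty space in equal slices, one slice per physical child i/o, leaving any rounding remainder to dsl_pool_sync(). A small arithmetic sketch with illustrative values:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint64_t dr_accounted = 131072;  /* 128K dirtied by this record */
        int io_phys_children = 3;        /* e.g. three copies/fragments */
        uint64_t undirtied = 0;

        for (int i = 0; i < io_phys_children; i++) {
                /* One dsl_pool_undirty_space(dp, delta, txg) per callback. */
                uint64_t delta = dr_accounted / io_phys_children;
                undirtied += delta;
        }

        /* 131072 / 3 = 43690 per child; 2 bytes of rounding remain. */
        printf("undirtied %llu of %llu bytes\n",
            (unsigned long long)undirtied, (unsigned long long)dr_accounted);
        return (0);
}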
Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c Thu Jan 16 15:57:39 2014 (r260763)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c Thu Jan 16 15:59:08 2014 (r260764)
@@ -374,13 +374,11 @@ static int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
{
- dsl_pool_t *dp = NULL;
dmu_buf_t **dbp;
uint64_t blkid, nblks, i;
uint32_t dbuf_flags;
int err;
zio_t *zio;
- hrtime_t start;
ASSERT(length <= DMU_MAX_ACCESS);
@@ -408,9 +406,6 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn,
}
dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
- if (dn->dn_objset->os_dsl_dataset)
- dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
- start = gethrtime();
zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
blkid = dbuf_whichblock(dn, offset);
for (i = 0; i < nblks; i++) {
@@ -434,9 +429,6 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn,
/* wait for async i/o */
err = zio_wait(zio);
- /* track read overhead when we are in sync context */
- if (dp && dsl_pool_sync_context(dp))
- dp->dp_read_overhead += gethrtime() - start;
if (err) {
dmu_buf_rele_array(dbp, nblks, tag);
return (err);
@@ -518,12 +510,22 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake,
kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
}
+/*
+ * Issue prefetch i/os for the given blocks.
+ *
+ * Note: The assumption is that we *know* these blocks will be needed
+ * almost immediately. Therefore, the prefetch i/os will be issued at
+ * ZIO_PRIORITY_SYNC_READ.
+ *
+ * Note: indirect blocks and other metadata will be read synchronously,
+ * causing this function to block if they are not already cached.
+ */
void
dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
{
dnode_t *dn;
uint64_t blkid;
- int nblks, i, err;
+ int nblks, err;
if (zfs_prefetch_disable)
return;
@@ -536,7 +538,7 @@ dmu_prefetch(objset_t *os, uint64_t obje
rw_enter(&dn->dn_struct_rwlock, RW_READER);
blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
- dbuf_prefetch(dn, blkid);
+ dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ);
rw_exit(&dn->dn_struct_rwlock);
return;
}
@@ -553,16 +555,16 @@ dmu_prefetch(objset_t *os, uint64_t obje
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_datablkshift) {
int blkshift = dn->dn_datablkshift;
- nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
- P2ALIGN(offset, 1<<blkshift)) >> blkshift;
+ nblks = (P2ROUNDUP(offset + len, 1 << blkshift) -
+ P2ALIGN(offset, 1 << blkshift)) >> blkshift;
} else {
nblks = (offset < dn->dn_datablksz);
}
if (nblks != 0) {
blkid = dbuf_whichblock(dn, offset);
- for (i = 0; i < nblks; i++)
- dbuf_prefetch(dn, blkid+i);
+ for (int i = 0; i < nblks; i++)
+ dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ);
}
rw_exit(&dn->dn_struct_rwlock);
@@ -1376,7 +1378,7 @@ dmu_sync_late_arrival(zio_t *pio, objset
zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
- dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
+ dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa,
ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
return (0);
@@ -1516,8 +1518,9 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_s
zio_nowait(arc_write(pio, os->os_spa, txg,
bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
- DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, dmu_sync_done,
- dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
+ DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready,
+ NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
+ ZIO_FLAG_CANFAIL, &zb));
return (0);
}
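
The reworked dmu_prefetch() loop derives its block count from the byte range before issuing one dbuf_prefetch() per block at ZIO_PRIORITY_SYNC_READ. A standalone sketch of that block math, assuming P2ALIGN/P2ROUNDUP carry their usual illumos definitions:

#include <stdint.h>
#include <stdio.h>

/* Assumed to match the illumos <sys/sysmacros.h> definitions. */
#define P2ALIGN(x, align)       ((x) & -(align))
#define P2ROUNDUP(x, align)     (-(-(x) & -(align)))

int
main(void)
{
        uint64_t offset = 5000, len = 300000;   /* illustrative byte range */
        int blkshift = 17;                      /* 128K data blocks */

        int nblks = (int)((P2ROUNDUP(offset + len, 1ULL << blkshift) -
            P2ALIGN(offset, 1ULL << blkshift)) >> blkshift);

        /* Rounds [5000, 305000) out to block boundaries: 3 blocks. */
        printf("would prefetch %d blocks\n", nblks);
        return (0);
}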
Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c Thu Jan 16 15:57:39 2014 (r260763)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c Thu Jan 16 15:59:08 2014 (r260764)
@@ -1028,7 +1028,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio
zio = arc_write(pio, os->os_spa, tx->tx_txg,
os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready,
- dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
+ NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED, &zb);
/*
Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c Thu Jan 16 15:57:39 2014 (r260763)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c Thu Jan 16 15:59:08 2014 (r260764)
@@ -54,6 +54,7 @@ dmu_tx_create_dd(dsl_dir_t *dd)
offsetof(dmu_tx_hold_t, txh_node));
list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
offsetof(dmu_tx_callback_t, dcb_node));
+ tx->tx_start = gethrtime();
#ifdef ZFS_DEBUG
refcount_create(&tx->tx_space_written);
refcount_create(&tx->tx_space_freed);
@@ -597,13 +598,13 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t
if (txh == NULL)
return;
dn = txh->txh_dnode;
+ dmu_tx_count_dnode(txh);
if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
return;
if (len == DMU_OBJECT_END)
len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
- dmu_tx_count_dnode(txh);
/*
* For i/o error checking, we read the first and last level-0
@@ -911,6 +912,162 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_i
}
#endif
+/*
+ * If we can't do 10 iops, something is wrong. Let us go ahead
+ * and hit zfs_dirty_data_max.
+ */
+hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
+int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
+
+/*
+ * We delay transactions when we've determined that the backend storage
+ * isn't able to accommodate the rate of incoming writes.
+ *
+ * If there is already a transaction waiting, we delay relative to when
+ * that transaction finishes waiting. This way the calculated min_time
+ * is independent of the number of threads concurrently executing
+ * transactions.
+ *
+ * If we are the only waiter, wait relative to when the transaction
+ * started, rather than the current time. This credits the transaction for
+ * "time already served", e.g. reading indirect blocks.
+ *
+ * The minimum time for a transaction to take is calculated as:
+ * min_time = scale * (dirty - min) / (max - dirty)
+ * min_time is then capped at zfs_delay_max_ns.
+ *
+ * The delay has two degrees of freedom that can be adjusted via tunables.
+ * The percentage of dirty data at which we start to delay is defined by
+ * zfs_delay_min_dirty_percent. This should typically be at or above
+ * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
+ * delay after writing at full speed has failed to keep up with the incoming
+ * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
+ * speaking, this variable determines the amount of delay at the midpoint of
+ * the curve.
+ *
+ * delay
+ * 10ms +-------------------------------------------------------------*+
+ * | *|
+ * 9ms + *+
+ * | *|
+ * 8ms + *+
+ * | * |
+ * 7ms + * +
+ * | * |
+ * 6ms + * +
+ * | * |
+ * 5ms + * +
+ * | * |
+ * 4ms + * +
+ * | * |
+ * 3ms + * +
+ * | * |
+ * 2ms + (midpoint) * +
+ * | | ** |
+ * 1ms + v *** +
+ * | zfs_delay_scale ----------> ******** |
+ * 0 +-------------------------------------*********----------------+
+ * 0% <- zfs_dirty_data_max -> 100%
+ *
+ * Note that since the delay is added to the outstanding time remaining on the
+ * most recent transaction, the delay is effectively the inverse of IOPS.
+ * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
+ * was chosen such that small changes in the amount of accumulated dirty data
+ * in the first 3/4 of the curve yield relatively small differences in the
+ * amount of delay.
+ *
+ * The effects can be easier to understand when the amount of delay is
+ * represented on a log scale:
+ *
+ * delay
+ * 100ms +-------------------------------------------------------------++
+ * + +
+ * | |
+ * + *+
+ * 10ms + *+
+ * + ** +
+ * | (midpoint) ** |
+ * + | ** +
+ * 1ms + v **** +
+ * + zfs_delay_scale ----------> ***** +
+ * | **** |
+ * + **** +
+ * 100us + ** +
+ * + * +
+ * | * |
+ * + * +
+ * 10us + * +
+ * + +
+ * | |
+ * + +
+ * +--------------------------------------------------------------+
+ * 0% <- zfs_dirty_data_max -> 100%
+ *
+ * Note here that only as the amount of dirty data approaches its limit does
+ * the delay start to increase rapidly. The goal of a properly tuned system
+ * should be to keep the amount of dirty data out of that range by first
+ * ensuring that the appropriate limits are set for the I/O scheduler to reach
+ * optimal throughput on the backend storage, and then by changing the value
+ * of zfs_delay_scale to increase the steepness of the curve.
+ */
+static void
+dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
+{
+ dsl_pool_t *dp = tx->tx_pool;
+ uint64_t delay_min_bytes =
+ zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+ hrtime_t wakeup, min_tx_time, now;
+
+ if (dirty <= delay_min_bytes)
+ return;
+
+ /*
+ * The caller has already waited until we are under the max.
+ * We make them pass us the amount of dirty data so we don't
+ * have to handle the case of it being >= the max, which could
+ * cause a divide-by-zero if it's == the max.
+ */
+ ASSERT3U(dirty, <, zfs_dirty_data_max);
+
+ now = gethrtime();
+ min_tx_time = zfs_delay_scale *
+ (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
+ if (now > tx->tx_start + min_tx_time)
+ return;
+
+ min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
+
+ DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
+ uint64_t, min_tx_time);
+
+ mutex_enter(&dp->dp_lock);
+ wakeup = MAX(tx->tx_start + min_tx_time,
+ dp->dp_last_wakeup + min_tx_time);
+ dp->dp_last_wakeup = wakeup;
+ mutex_exit(&dp->dp_lock);
+
+#ifdef _KERNEL
+#ifdef illumos
+ mutex_enter(&curthread->t_delay_lock);
+ while (cv_timedwait_hires(&curthread->t_delay_cv,
+ &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
+ CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
+ continue;
+ mutex_exit(&curthread->t_delay_lock);
+#else
+ /* XXX High resolution callouts are not available */
+ ASSERT(wakeup >= now);
+ pause("dmu_tx_delay", NSEC_TO_TICK(wakeup - now));
+#endif
+#else
+ hrtime_t delta = wakeup - gethrtime();
+ struct timespec ts;
+ ts.tv_sec = delta / NANOSEC;
+ ts.tv_nsec = delta % NANOSEC;
+ (void) nanosleep(&ts, NULL);
+#endif
+}
+
static int
dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
{
@@ -941,6 +1098,12 @@ dmu_tx_try_assign(dmu_tx_t *tx, txg_how_
return (SET_ERROR(ERESTART));
}
+ if (!tx->tx_waited &&
+ dsl_pool_need_dirty_delay(tx->tx_pool)) {
+ tx->tx_wait_dirty = B_TRUE;
+ return (SET_ERROR(ERESTART));
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
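
For readers following the dmu_tx_delay() comment above, here is a hedged worked example of min_tx_time = zfs_delay_scale * (dirty - min) / (max - dirty), capped at zfs_delay_max_ns. The 60% start point and the 500000 ns scale are inferred from the comment's 500us-midpoint / 2000 IOPS reading, and the 1 GiB zfs_dirty_data_max is an assumption of the sketch:

#include <stdint.h>
#include <stdio.h>

typedef long long hrtime_t;
#define MSEC2NSEC(m)    ((hrtime_t)(m) * 1000000LL)

static uint64_t zfs_dirty_data_max = 1ULL << 30;   /* 1 GiB, illustrative */
static uint64_t zfs_delay_min_dirty_percent = 60;  /* assumed default */
static uint64_t zfs_delay_scale = 500000;          /* ns at the midpoint */
static hrtime_t zfs_delay_max_ns = MSEC2NSEC(100); /* the 10-iops floor */

static uint64_t
delay_for(uint64_t dirty)
{
        uint64_t delay_min_bytes =
            zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
        uint64_t min_tx_time;

        if (dirty <= delay_min_bytes)
                return (0);
        /* The caller guarantees dirty < max, so no divide-by-zero here. */
        min_tx_time = zfs_delay_scale * (dirty - delay_min_bytes) /
            (zfs_dirty_data_max - dirty);
        if (min_tx_time > (uint64_t)zfs_delay_max_ns)
                min_tx_time = (uint64_t)zfs_delay_max_ns;
        return (min_tx_time);
}

int
main(void)
{
        /* At 90% dirty: 500000 * (90 - 60) / (100 - 90) = 1.5 ms per tx. */
        for (int pct = 55; pct <= 99; pct += 11)
                printf("%2d%% dirty -> %llu ns of delay\n", pct,
                    (unsigned long long)delay_for(
                    zfs_dirty_data_max * (uint64_t)pct / 100));
        return (0);
}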