svn commit: r280296 - in user/delphij/zfs-arc-rebase: cddl/contrib/opensolaris/cmd/ztest sys/cddl/contrib/opensolaris/uts/common/fs/zfs sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys
Xin LI
delphij at FreeBSD.org
Fri Mar 20 18:39:09 UTC 2015
Author: delphij
Date: Fri Mar 20 18:39:07 2015
New Revision: 280296
URL: https://svnweb.freebsd.org/changeset/base/280296
Log:
MFV r277426:
Reduce L2ARC memory footprint by extracting L1 and L2 only fields and only
allocate the fields when needed.
Illumos issue:
5408 managing ZFS cache devices requires lots of RAM
Modified:
user/delphij/zfs-arc-rebase/cddl/contrib/opensolaris/cmd/ztest/ztest.c
user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
Directory Properties:
user/delphij/zfs-arc-rebase/cddl/contrib/opensolaris/ (props changed)
user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/ (props changed)
Modified: user/delphij/zfs-arc-rebase/cddl/contrib/opensolaris/cmd/ztest/ztest.c
==============================================================================
--- user/delphij/zfs-arc-rebase/cddl/contrib/opensolaris/cmd/ztest/ztest.c Fri Mar 20 18:02:56 2015 (r280295)
+++ user/delphij/zfs-arc-rebase/cddl/contrib/opensolaris/cmd/ztest/ztest.c Fri Mar 20 18:39:07 2015 (r280296)
@@ -3907,7 +3907,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *z
* assign an arcbuf to a dbuf.
*/
for (j = 0; j < s; j++) {
- if (i != 5) {
+ if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
bigbuf_arcbufs[j] =
dmu_request_arcbuf(bonus_db, chunksize);
} else {
@@ -3931,7 +3931,8 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *z
umem_free(packbuf, packsize);
umem_free(bigbuf, bigsize);
for (j = 0; j < s; j++) {
- if (i != 5) {
+ if (i != 5 ||
+ chunksize < (SPA_MINBLOCKSIZE * 2)) {
dmu_return_arcbuf(bigbuf_arcbufs[j]);
} else {
dmu_return_arcbuf(
@@ -3975,7 +3976,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *z
}
for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
dmu_buf_t *dbt;
- if (i != 5) {
+ if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
bcopy((caddr_t)bigbuf + (off - bigoff),
bigbuf_arcbufs[j]->b_data, chunksize);
} else {
@@ -3992,7 +3993,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *z
VERIFY(dmu_buf_hold(os, bigobj, off,
FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
}
- if (i != 5) {
+ if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
dmu_assign_arcbuf(bonus_db, off,
bigbuf_arcbufs[j], tx);
} else {
Modified: user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Fri Mar 20 18:02:56 2015 (r280295)
+++ user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Fri Mar 20 18:39:07 2015 (r280296)
@@ -111,7 +111,7 @@
* Note that the majority of the performance stats are manipulated
* with atomic operations.
*
- * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
+ * The L2ARC uses the l2ad_mtx on each vdev for the following:
*
* - L2ARC buflist creation
* - L2ARC buflist eviction
@@ -390,6 +390,7 @@ typedef struct arc_stats {
kstat_named_t arcstat_l2_writes_hdr_miss;
kstat_named_t arcstat_l2_evict_lock_retry;
kstat_named_t arcstat_l2_evict_reading;
+ kstat_named_t arcstat_l2_evict_l1cached;
kstat_named_t arcstat_l2_free_on_write;
kstat_named_t arcstat_l2_abort_lowmem;
kstat_named_t arcstat_l2_cksum_bad;
@@ -471,6 +472,7 @@ static arc_stats_t arc_stats = {
{ "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
{ "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
{ "l2_evict_reading", KSTAT_DATA_UINT64 },
+ { "l2_evict_l1cached", KSTAT_DATA_UINT64 },
{ "l2_free_on_write", KSTAT_DATA_UINT64 },
{ "l2_abort_lowmem", KSTAT_DATA_UINT64 },
{ "l2_cksum_bad", KSTAT_DATA_UINT64 },
@@ -574,8 +576,6 @@ static int arc_no_grow; /* Don't try to
static uint64_t arc_tempreserve;
static uint64_t arc_loaned_bytes;
-typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
-
typedef struct arc_callback arc_callback_t;
struct arc_callback {
@@ -596,29 +596,53 @@ struct arc_write_callback {
arc_buf_t *awcb_buf;
};
-struct arc_buf_hdr {
- /* protected by hash lock */
- dva_t b_dva;
- uint64_t b_birth;
- uint64_t b_cksum0;
-
+/*
+ * ARC buffers are separated into multiple structs as a memory saving measure:
+ * - Common fields struct, always defined, and embedded within it:
+ * - L2-only fields, always allocated but undefined when not in L2ARC
+ * - L1-only fields, only allocated when in L1ARC
+ *
+ * Buffer in L1 Buffer only in L2
+ * +------------------------+ +------------------------+
+ * | arc_buf_hdr_t | | arc_buf_hdr_t |
+ * | | | |
+ * | | | |
+ * | | | |
+ * +------------------------+ +------------------------+
+ * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
+ * | (undefined if L1-only) | | |
+ * +------------------------+ +------------------------+
+ * | l1arc_buf_hdr_t |
+ * | |
+ * | |
+ * | |
+ * | |
+ * +------------------------+
+ *
+ * Because it's possible for the L2ARC to become extremely large, we can wind
+ * up eating a lot of memory in L2ARC buffer headers, so the size of a header
+ * is minimized by only allocating the fields necessary for an L1-cached buffer
+ * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
+ * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
+ * words in pointers. arc_hdr_realloc() is used to switch a header between
+ * these two allocation states.
+ */
+typedef struct l1arc_buf_hdr {
kmutex_t b_freeze_lock;
- zio_cksum_t *b_freeze_cksum;
+#ifdef ZFS_DEBUG
+ /*
+ * used for debugging wtih kmem_flags - by allocating and freeing
+ * b_thawed when the buffer is thawed, we get a record of the stack
+ * trace that thawed it.
+ */
void *b_thawed;
+#endif
- arc_buf_hdr_t *b_hash_next;
arc_buf_t *b_buf;
- arc_flags_t b_flags;
uint32_t b_datacnt;
-
- arc_callback_t *b_acb;
+ /* for waiting on writes to complete */
kcondvar_t b_cv;
- /* immutable */
- arc_buf_contents_t b_type;
- uint64_t b_size;
- uint64_t b_spa;
-
/* protected by arc state mutex */
arc_state_t *b_state;
list_node_t b_arc_node;
@@ -629,8 +653,46 @@ struct arc_buf_hdr {
/* self protecting */
refcount_t b_refcnt;
- l2arc_buf_hdr_t *b_l2hdr;
+ arc_callback_t *b_acb;
+ /* temporary buffer holder for in-flight compressed data */
+ void *b_tmp_cdata;
+} l1arc_buf_hdr_t;
+
+typedef struct l2arc_dev l2arc_dev_t;
+
+typedef struct l2arc_buf_hdr {
+ /* protected by arc_buf_hdr mutex */
+ l2arc_dev_t *b_dev; /* L2ARC device */
+ uint64_t b_daddr; /* disk address, offset byte */
+ /* real alloc'd buffer size depending on b_compress applied */
+ int32_t b_asize;
+
list_node_t b_l2node;
+} l2arc_buf_hdr_t;
+
+struct arc_buf_hdr {
+ /* protected by hash lock */
+ dva_t b_dva;
+ uint64_t b_birth;
+ /*
+ * Even though this checksum is only set/verified when a buffer is in
+ * the L1 cache, it needs to be in the set of common fields because it
+ * must be preserved from the time before a buffer is written out to
+ * L2ARC until after it is read back in.
+ */
+ zio_cksum_t *b_freeze_cksum;
+
+ arc_buf_hdr_t *b_hash_next;
+ arc_flags_t b_flags;
+
+ /* immutable */
+ int32_t b_size;
+ uint64_t b_spa;
+
+ /* L2ARC fields. Undefined when not in L2ARC. */
+ l2arc_buf_hdr_t b_l2hdr;
+ /* L1ARC fields. Undefined when in l2arc_only state */
+ l1arc_buf_hdr_t b_l1hdr;
};
#ifdef _KERNEL
@@ -667,22 +729,38 @@ static arc_buf_hdr_t arc_eviction_hdr;
#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
-#define HDR_FREE_IN_PROGRESS(hdr) \
- ((hdr)->b_flags & ARC_FLAG_FREE_IN_PROGRESS)
+
#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE)
+#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
#define HDR_L2_READING(hdr) \
- ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS && \
- (hdr)->b_l2hdr != NULL)
+ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
+ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
+#define HDR_ISTYPE_METADATA(hdr) \
+ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
+#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr))
+
+#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
+#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
+
+/* For storing compression mode in b_flags */
+#define HDR_COMPRESS_OFFSET 24
+#define HDR_COMPRESS_NBITS 7
+
+#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET(hdr->b_flags, \
+ HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS))
+#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \
+ HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp))
+
/*
* Other sizes
*/
-#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
-#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
+#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
+#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
/*
* Hash table routines
@@ -806,7 +884,7 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_onl
/*
* L2ARC Internals
*/
-typedef struct l2arc_dev {
+struct l2arc_dev {
vdev_t *l2ad_vdev; /* vdev */
spa_t *l2ad_spa; /* spa */
uint64_t l2ad_hand; /* next write location */
@@ -815,15 +893,15 @@ typedef struct l2arc_dev {
uint64_t l2ad_evict; /* last addr eviction reached */
boolean_t l2ad_first; /* first sweep through */
boolean_t l2ad_writing; /* currently writing */
- list_t *l2ad_buflist; /* buffer list */
+ kmutex_t l2ad_mtx; /* lock for buffer list */
+ list_t l2ad_buflist; /* buffer list */
list_node_t l2ad_node; /* device list node */
-} l2arc_dev_t;
+};
static list_t L2ARC_dev_list; /* device list */
static list_t *l2arc_dev_list; /* device list pointer */
static kmutex_t l2arc_dev_mtx; /* device list mutex */
static l2arc_dev_t *l2arc_dev_last; /* last device used */
-static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
static list_t L2ARC_free_on_write; /* free after write buf list */
static list_t *l2arc_free_on_write; /* free after write list ptr */
static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
@@ -843,18 +921,6 @@ typedef struct l2arc_write_callback {
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
} l2arc_write_callback_t;
-struct l2arc_buf_hdr {
- /* protected by arc_buf_hdr mutex */
- l2arc_dev_t *b_dev; /* L2ARC device */
- uint64_t b_daddr; /* disk address, offset byte */
- /* compression applied to buffer data */
- enum zio_compress b_compress;
- /* real alloc'd buffer size depending on b_compress applied */
- int b_asize;
- /* temporary buffer holder for in-flight compressed data */
- void *b_tmp_cdata;
-};
-
typedef struct l2arc_data_free {
/* protected by l2arc_free_on_write_mtx */
void *l2df_data;
@@ -873,12 +939,13 @@ static int arc_evict_needed(arc_buf_cont
static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t);
static void arc_buf_watch(arc_buf_t *);
+static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
+static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
+
static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);
-static void l2arc_hdr_stat_add(void);
-static void l2arc_hdr_stat_remove(void);
-static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *);
+static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
@@ -901,8 +968,7 @@ buf_hash(uint64_t spa, const dva_t *dva,
#define BUF_EMPTY(buf) \
((buf)->b_dva.dva_word[0] == 0 && \
- (buf)->b_dva.dva_word[1] == 0 && \
- (buf)->b_cksum0 == 0)
+ (buf)->b_dva.dva_word[1] == 0)
#define BUF_EQUAL(spa, dva, birth, buf) \
((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
@@ -915,7 +981,6 @@ buf_discard_identity(arc_buf_hdr_t *hdr)
hdr->b_dva.dva_word[0] = 0;
hdr->b_dva.dva_word[1] = 0;
hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
}
static arc_buf_hdr_t *
@@ -945,6 +1010,7 @@ buf_hash_find(uint64_t spa, const blkptr
* equal to elem in the hash table, then the already existing element
* will be returned and the new element will not be inserted.
* Otherwise returns NULL.
+ * If lockp == NULL, the caller is assumed to already hold the hash lock.
*/
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
@@ -957,8 +1023,14 @@ buf_hash_insert(arc_buf_hdr_t *hdr, kmut
ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
ASSERT(hdr->b_birth != 0);
ASSERT(!HDR_IN_HASH_TABLE(hdr));
- *lockp = hash_lock;
- mutex_enter(hash_lock);
+
+ if (lockp != NULL) {
+ *lockp = hash_lock;
+ mutex_enter(hash_lock);
+ } else {
+ ASSERT(MUTEX_HELD(hash_lock));
+ }
+
for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
fhdr = fhdr->b_hash_next, i++) {
if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
@@ -1013,7 +1085,8 @@ buf_hash_remove(arc_buf_hdr_t *hdr)
/*
* Global data structures and functions for the buf kmem cache.
*/
-static kmem_cache_t *hdr_cache;
+static kmem_cache_t *hdr_full_cache;
+static kmem_cache_t *hdr_l2only_cache;
static kmem_cache_t *buf_cache;
static void
@@ -1025,7 +1098,8 @@ buf_fini(void)
(buf_hash_table.ht_mask + 1) * sizeof (void *));
for (i = 0; i < BUF_LOCKS; i++)
mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
- kmem_cache_destroy(hdr_cache);
+ kmem_cache_destroy(hdr_full_cache);
+ kmem_cache_destroy(hdr_l2only_cache);
kmem_cache_destroy(buf_cache);
}
@@ -1035,15 +1109,27 @@ buf_fini(void)
*/
/* ARGSUSED */
static int
-hdr_cons(void *vbuf, void *unused, int kmflag)
+hdr_full_cons(void *vbuf, void *unused, int kmflag)
{
arc_buf_hdr_t *hdr = vbuf;
- bzero(hdr, sizeof (arc_buf_hdr_t));
- refcount_create(&hdr->b_refcnt);
- cv_init(&hdr->b_cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
- arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
+ bzero(hdr, HDR_FULL_SIZE);
+ cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
+ refcount_create(&hdr->b_l1hdr.b_refcnt);
+ mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+ arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ bzero(hdr, HDR_L2ONLY_SIZE);
+ arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
return (0);
}
@@ -1067,15 +1153,25 @@ buf_cons(void *vbuf, void *unused, int k
*/
/* ARGSUSED */
static void
-hdr_dest(void *vbuf, void *unused)
+hdr_full_dest(void *vbuf, void *unused)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ ASSERT(BUF_EMPTY(hdr));
+ cv_destroy(&hdr->b_l1hdr.b_cv);
+ refcount_destroy(&hdr->b_l1hdr.b_refcnt);
+ mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
+ arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
+}
+
+/* ARGSUSED */
+static void
+hdr_l2only_dest(void *vbuf, void *unused)
{
arc_buf_hdr_t *hdr = vbuf;
ASSERT(BUF_EMPTY(hdr));
- refcount_destroy(&hdr->b_refcnt);
- cv_destroy(&hdr->b_cv);
- mutex_destroy(&hdr->b_freeze_lock);
- arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
+ arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
}
/* ARGSUSED */
@@ -1129,8 +1225,11 @@ retry:
goto retry;
}
- hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
- 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
+ hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
+ 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
+ hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
+ HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
+ NULL, NULL, 0);
buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
@@ -1144,6 +1243,81 @@ retry:
}
}
+/*
+ * Transition between the two allocation states for the arc_buf_hdr struct.
+ * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
+ * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
+ * version is used when a cache buffer is only in the L2ARC in order to reduce
+ * memory usage.
+ */
+static arc_buf_hdr_t *
+arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
+{
+ ASSERT(HDR_HAS_L2HDR(hdr));
+
+ arc_buf_hdr_t *nhdr;
+ l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+
+ ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
+ (old == hdr_l2only_cache && new == hdr_full_cache));
+
+ nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
+
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
+ buf_hash_remove(hdr);
+
+ bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
+ if (new == hdr_full_cache) {
+ nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
+ /*
+ * arc_access and arc_change_state need to be aware that a
+ * header has just come out of L2ARC, so we set its state to
+ * l2c_only even though it's about to change.
+ */
+ nhdr->b_l1hdr.b_state = arc_l2c_only;
+ } else {
+ ASSERT(hdr->b_l1hdr.b_buf == NULL);
+ ASSERT0(hdr->b_l1hdr.b_datacnt);
+ ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+ /*
+ * We might be removing the L1hdr of a buffer which was just
+ * written out to L2ARC. If such a buffer is compressed then we
+ * need to free its b_tmp_cdata before destroying the header.
+ */
+ if (hdr->b_l1hdr.b_tmp_cdata != NULL &&
+ HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
+ l2arc_release_cdata_buf(hdr);
+ nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
+ }
+ /*
+ * The header has been reallocated so we need to re-insert it into any
+ * lists it was on.
+ */
+ (void) buf_hash_insert(nhdr, NULL);
+
+ ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
+
+ mutex_enter(&dev->l2ad_mtx);
+
+ /*
+ * We must place the realloc'ed header back into the list at
+ * the same spot. Otherwise, if it's placed earlier in the list,
+ * l2arc_write_buffers() could find it during the function's
+ * write phase, and try to write it out to the l2arc.
+ */
+ list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
+ list_remove(&dev->l2ad_buflist, hdr);
+
+ mutex_exit(&dev->l2ad_mtx);
+
+ buf_discard_identity(hdr);
+ hdr->b_freeze_cksum = NULL;
+ kmem_cache_free(old, hdr);
+
+ return (nhdr);
+}
+
+
#define ARC_MINTIME (hz>>4) /* 62 ms */
static void
@@ -1154,16 +1328,15 @@ arc_cksum_verify(arc_buf_t *buf)
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
- mutex_enter(&buf->b_hdr->b_freeze_lock);
- if (buf->b_hdr->b_freeze_cksum == NULL ||
- (buf->b_hdr->b_flags & ARC_FLAG_IO_ERROR)) {
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+ if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
+ mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
return;
}
fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
panic("buffer modified while frozen!");
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
}
static int
@@ -1172,10 +1345,10 @@ arc_cksum_equal(arc_buf_t *buf)
zio_cksum_t zc;
int equal;
- mutex_enter(&buf->b_hdr->b_freeze_lock);
+ mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
return (equal);
}
@@ -1186,15 +1359,15 @@ arc_cksum_compute(arc_buf_t *buf, boolea
if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
return;
- mutex_enter(&buf->b_hdr->b_freeze_lock);
+ mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
if (buf->b_hdr->b_freeze_cksum != NULL) {
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
return;
}
buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
buf->b_hdr->b_freeze_cksum);
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
#ifdef illumos
arc_buf_watch(buf);
#endif
@@ -1245,30 +1418,58 @@ arc_buf_watch(arc_buf_t *buf)
}
#endif /* illumos */
+static arc_buf_contents_t
+arc_buf_type(arc_buf_hdr_t *hdr)
+{
+ if (HDR_ISTYPE_METADATA(hdr)) {
+ return (ARC_BUFC_METADATA);
+ } else {
+ return (ARC_BUFC_DATA);
+ }
+}
+
+static uint32_t
+arc_bufc_to_flags(arc_buf_contents_t type)
+{
+ switch (type) {
+ case ARC_BUFC_DATA:
+ /* metadata field is 0 if buffer contains normal data */
+ return (0);
+ case ARC_BUFC_METADATA:
+ return (ARC_FLAG_BUFC_METADATA);
+ default:
+ break;
+ }
+ panic("undefined ARC buffer type!");
+ return ((uint32_t)-1);
+}
+
void
arc_buf_thaw(arc_buf_t *buf)
{
if (zfs_flags & ZFS_DEBUG_MODIFY) {
- if (buf->b_hdr->b_state != arc_anon)
+ if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
panic("modifying non-anon buffer!");
- if (buf->b_hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS)
+ if (HDR_IO_IN_PROGRESS(buf->b_hdr))
panic("modifying buffer while i/o in progress!");
arc_cksum_verify(buf);
}
- mutex_enter(&buf->b_hdr->b_freeze_lock);
+ mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
if (buf->b_hdr->b_freeze_cksum != NULL) {
kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
buf->b_hdr->b_freeze_cksum = NULL;
}
+#ifdef ZFS_DEBUG
if (zfs_flags & ZFS_DEBUG_MODIFY) {
- if (buf->b_hdr->b_thawed)
- kmem_free(buf->b_hdr->b_thawed, 1);
- buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
+ if (buf->b_hdr->b_l1hdr.b_thawed != NULL)
+ kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1);
+ buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
}
+#endif
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
#ifdef illumos
arc_buf_unwatch(buf);
@@ -1287,7 +1488,7 @@ arc_buf_freeze(arc_buf_t *buf)
mutex_enter(hash_lock);
ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
- buf->b_hdr->b_state == arc_anon);
+ buf->b_hdr->b_l1hdr.b_state == arc_anon);
arc_cksum_compute(buf, B_FALSE);
mutex_exit(hash_lock);
@@ -1296,30 +1497,34 @@ arc_buf_freeze(arc_buf_t *buf)
static void
add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
{
+ ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(MUTEX_HELD(hash_lock));
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+
+ if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
+ (state != arc_anon)) {
+ /* We don't use the L2-only state list. */
+ if (state != arc_l2c_only) {
+ uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
+ list_t *list = &state->arcs_list[arc_buf_type(hdr)];
+ uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
- if ((refcount_add(&hdr->b_refcnt, tag) == 1) &&
- (hdr->b_state != arc_anon)) {
- uint64_t delta = hdr->b_size * hdr->b_datacnt;
- list_t *list = &hdr->b_state->arcs_list[hdr->b_type];
- uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
-
- ASSERT(!MUTEX_HELD(&hdr->b_state->arcs_mtx));
- mutex_enter(&hdr->b_state->arcs_mtx);
- ASSERT(list_link_active(&hdr->b_arc_node));
- list_remove(list, hdr);
- if (GHOST_STATE(hdr->b_state)) {
- ASSERT0(hdr->b_datacnt);
- ASSERT3P(hdr->b_buf, ==, NULL);
- delta = hdr->b_size;
- }
- ASSERT(delta > 0);
- ASSERT3U(*size, >=, delta);
- atomic_add_64(size, -delta);
- mutex_exit(&hdr->b_state->arcs_mtx);
+ ASSERT(!MUTEX_HELD(&state->arcs_mtx));
+ mutex_enter(&state->arcs_mtx);
+ ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
+ list_remove(list, hdr);
+ if (GHOST_STATE(state)) {
+ ASSERT0(hdr->b_l1hdr.b_datacnt);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ delta = hdr->b_size;
+ }
+ ASSERT(delta > 0);
+ ASSERT3U(*size, >=, delta);
+ atomic_add_64(size, -delta);
+ mutex_exit(&state->arcs_mtx);
+ }
/* remove the prefetch flag if we get a reference */
- if (hdr->b_flags & ARC_FLAG_PREFETCH)
- hdr->b_flags &= ~ARC_FLAG_PREFETCH;
+ hdr->b_flags &= ~ARC_FLAG_PREFETCH;
}
}
@@ -1327,21 +1532,27 @@ static int
remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
{
int cnt;
- arc_state_t *state = hdr->b_state;
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
ASSERT(!GHOST_STATE(state));
- if (((cnt = refcount_remove(&hdr->b_refcnt, tag)) == 0) &&
+ /*
+ * arc_l2c_only counts as a ghost state so we don't need to explicitly
+ * check to prevent usage of the arc_l2c_only list.
+ */
+ if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
(state != arc_anon)) {
- uint64_t *size = &state->arcs_lsize[hdr->b_type];
+ uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
ASSERT(!MUTEX_HELD(&state->arcs_mtx));
mutex_enter(&state->arcs_mtx);
- ASSERT(!list_link_active(&hdr->b_arc_node));
- list_insert_head(&state->arcs_list[hdr->b_type], hdr);
- ASSERT(hdr->b_datacnt > 0);
- atomic_add_64(size, hdr->b_size * hdr->b_datacnt);
+ ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+ list_insert_head(&state->arcs_list[arc_buf_type(hdr)], hdr);
+ ASSERT(hdr->b_l1hdr.b_datacnt > 0);
+ atomic_add_64(size, hdr->b_size *
+ hdr->b_l1hdr.b_datacnt);
mutex_exit(&state->arcs_mtx);
}
return (cnt);
@@ -1355,40 +1566,60 @@ static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
kmutex_t *hash_lock)
{
- arc_state_t *old_state = hdr->b_state;
- int64_t refcnt = refcount_count(&hdr->b_refcnt);
+ arc_state_t *old_state;
+ int64_t refcnt;
+ uint32_t datacnt;
uint64_t from_delta, to_delta;
+ arc_buf_contents_t buftype = arc_buf_type(hdr);
+
+ /*
+ * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
+ * in arc_read() when bringing a buffer out of the L2ARC. However, the
+ * L1 hdr doesn't always exist when we change state to arc_anon before
+ * destroying a header, in which case reallocating to add the L1 hdr is
+ * pointless.
+ */
+ if (HDR_HAS_L1HDR(hdr)) {
+ old_state = hdr->b_l1hdr.b_state;
+ refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
+ datacnt = hdr->b_l1hdr.b_datacnt;
+ } else {
+ old_state = arc_l2c_only;
+ refcnt = 0;
+ datacnt = 0;
+ }
ASSERT(MUTEX_HELD(hash_lock));
ASSERT3P(new_state, !=, old_state);
- ASSERT(refcnt == 0 || hdr->b_datacnt > 0);
- ASSERT(hdr->b_datacnt == 0 || !GHOST_STATE(new_state));
- ASSERT(hdr->b_datacnt <= 1 || old_state != arc_anon);
+ ASSERT(refcnt == 0 || datacnt > 0);
+ ASSERT(!GHOST_STATE(new_state) || datacnt == 0);
+ ASSERT(old_state != arc_anon || datacnt <= 1);
- from_delta = to_delta = hdr->b_datacnt * hdr->b_size;
+ from_delta = to_delta = datacnt * hdr->b_size;
/*
* If this buffer is evictable, transfer it from the
* old state list to the new state list.
*/
if (refcnt == 0) {
- if (old_state != arc_anon) {
+ if (old_state != arc_anon && old_state != arc_l2c_only) {
int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
- uint64_t *size = &old_state->arcs_lsize[hdr->b_type];
+ uint64_t *size = &old_state->arcs_lsize[buftype];
if (use_mutex)
mutex_enter(&old_state->arcs_mtx);
- ASSERT(list_link_active(&hdr->b_arc_node));
- list_remove(&old_state->arcs_list[hdr->b_type], hdr);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
+ list_remove(&old_state->arcs_list[buftype], hdr);
/*
* If prefetching out of the ghost cache,
* we will have a non-zero datacnt.
*/
- if (GHOST_STATE(old_state) && hdr->b_datacnt == 0) {
+ if (GHOST_STATE(old_state) && datacnt == 0) {
/* ghost elements have a ghost size */
- ASSERT(hdr->b_buf == NULL);
+ ASSERT(hdr->b_l1hdr.b_buf == NULL);
from_delta = hdr->b_size;
}
ASSERT3U(*size, >=, from_delta);
@@ -1397,20 +1628,26 @@ arc_change_state(arc_state_t *new_state,
if (use_mutex)
mutex_exit(&old_state->arcs_mtx);
}
- if (new_state != arc_anon) {
+ if (new_state != arc_anon && new_state != arc_l2c_only) {
int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
- uint64_t *size = &new_state->arcs_lsize[hdr->b_type];
+ uint64_t *size = &new_state->arcs_lsize[buftype];
+ /*
+ * An L1 header always exists here, since if we're
+ * moving to some L1-cached state (i.e. not l2c_only or
+ * anonymous), we realloc the header to add an L1hdr
+ * beforehand.
+ */
+ ASSERT(HDR_HAS_L1HDR(hdr));
if (use_mutex)
mutex_enter(&new_state->arcs_mtx);
- list_insert_head(&new_state->arcs_list[hdr->b_type],
- hdr);
+ list_insert_head(&new_state->arcs_list[buftype], hdr);
/* ghost elements have a ghost size */
if (GHOST_STATE(new_state)) {
- ASSERT(hdr->b_datacnt == 0);
- ASSERT(hdr->b_buf == NULL);
+ ASSERT0(datacnt);
+ ASSERT(hdr->b_l1hdr.b_buf == NULL);
to_delta = hdr->b_size;
}
atomic_add_64(size, to_delta);
@@ -1424,20 +1661,22 @@ arc_change_state(arc_state_t *new_state,
if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
buf_hash_remove(hdr);
- /* adjust state sizes */
- if (to_delta)
+ /* adjust state sizes (ignore arc_l2c_only) */
+ if (to_delta && new_state != arc_l2c_only)
atomic_add_64(&new_state->arcs_size, to_delta);
- if (from_delta) {
+ if (from_delta && old_state != arc_l2c_only) {
ASSERT3U(old_state->arcs_size, >=, from_delta);
atomic_add_64(&old_state->arcs_size, -from_delta);
}
- hdr->b_state = new_state;
+ if (HDR_HAS_L1HDR(hdr))
+ hdr->b_l1hdr.b_state = new_state;
- /* adjust l2arc hdr stats */
- if (new_state == arc_l2c_only)
- l2arc_hdr_stat_add();
- else if (old_state == arc_l2c_only)
- l2arc_hdr_stat_remove();
+ /*
+ * L2 headers should never be on the L2 state list since they don't
+ * have L1 headers allocated.
+ */
+ ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
+ list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
}
void
@@ -1493,31 +1732,36 @@ arc_space_return(uint64_t space, arc_spa
}
arc_buf_t *
-arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
+arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type)
{
arc_buf_hdr_t *hdr;
arc_buf_t *buf;
ASSERT3U(size, >, 0);
- hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
+ hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
ASSERT(BUF_EMPTY(hdr));
+ ASSERT3P(hdr->b_freeze_cksum, ==, NULL);
hdr->b_size = size;
- hdr->b_type = type;
hdr->b_spa = spa_load_guid(spa);
- hdr->b_state = arc_anon;
- hdr->b_arc_access = 0;
+
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
buf->b_efunc = NULL;
buf->b_private = NULL;
buf->b_next = NULL;
- hdr->b_buf = buf;
+
+ hdr->b_flags = arc_bufc_to_flags(type);
+ hdr->b_flags |= ARC_FLAG_HAS_L1HDR;
+
+ hdr->b_l1hdr.b_buf = buf;
+ hdr->b_l1hdr.b_state = arc_anon;
+ hdr->b_l1hdr.b_arc_access = 0;
+ hdr->b_l1hdr.b_datacnt = 1;
+
arc_get_data_buf(buf);
- hdr->b_datacnt = 1;
- hdr->b_flags = 0;
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
- (void) refcount_add(&hdr->b_refcnt, tag);
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
return (buf);
}
@@ -1550,8 +1794,9 @@ arc_return_buf(arc_buf_t *buf, void *tag
arc_buf_hdr_t *hdr = buf->b_hdr;
ASSERT(buf->b_data != NULL);
- (void) refcount_add(&hdr->b_refcnt, tag);
- (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
+ (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
}
@@ -1560,12 +1805,12 @@ arc_return_buf(arc_buf_t *buf, void *tag
void
arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
{
- arc_buf_hdr_t *hdr;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
ASSERT(buf->b_data != NULL);
- hdr = buf->b_hdr;
- (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
- (void) refcount_remove(&hdr->b_refcnt, tag);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
+ (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
buf->b_efunc = NULL;
buf->b_private = NULL;
@@ -1579,15 +1824,16 @@ arc_buf_clone(arc_buf_t *from)
arc_buf_hdr_t *hdr = from->b_hdr;
uint64_t size = hdr->b_size;
- ASSERT(hdr->b_state != arc_anon);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(hdr->b_l1hdr.b_state != arc_anon);
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
buf->b_efunc = NULL;
buf->b_private = NULL;
- buf->b_next = hdr->b_buf;
- hdr->b_buf = buf;
+ buf->b_next = hdr->b_l1hdr.b_buf;
+ hdr->b_l1hdr.b_buf = buf;
arc_get_data_buf(buf);
bcopy(from->b_data, buf->b_data, size);
@@ -1597,11 +1843,11 @@ arc_buf_clone(arc_buf_t *from)
* then track the size and number of duplicates. These stats will be
* updated as duplicate buffers are created and destroyed.
*/
- if (hdr->b_type == ARC_BUFC_DATA) {
+ if (HDR_ISTYPE_DATA(hdr)) {
ARCSTAT_BUMP(arcstat_duplicate_buffers);
ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
}
- hdr->b_datacnt += 1;
+ hdr->b_l1hdr.b_datacnt += 1;
return (buf);
}
@@ -1624,17 +1870,20 @@ arc_buf_add_ref(arc_buf_t *buf, void* ta
hash_lock = HDR_LOCK(buf->b_hdr);
mutex_enter(hash_lock);
hdr = buf->b_hdr;
+ ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
mutex_exit(&buf->b_evict_lock);
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-user
mailing list