svn commit: r316894 - vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor/illumos/dist/cmd/zfs vendor/illumos/dist/cmd/zstreamdump vendor/illumos/dist/lib...
Author: avg
Date: Fri Apr 14 18:07:43 2017
New Revision: 316894
URL: https://svnweb.freebsd.org/changeset/base/316894
Log:
7252 7628 compressed zfs send / receive
illumos/illumos-gate at 5602294fda888d923d57a78bafdaf48ae6223dea
https://github.com/illumos/illumos-gate/commit/5602294fda888d923d57a78bafdaf48ae6223dea
https://www.illumos.org/issues/7252
This feature includes code to allow a system with compressed ARC enabled to
send data in its compressed form straight out of the ARC, and receive data in
its compressed form directly into the ARC.
https://www.illumos.org/issues/7628
We should have longer, more readable versions of the ZFS send / recv options.
7628 create long versions of ZFS send / receive options
Reviewed by: George Wilson <george.wilson at delphix.com>
Reviewed by: John Kennedy <john.kennedy at delphix.com>
Reviewed by: Matthew Ahrens <mahrens at delphix.com>
Reviewed by: Paul Dagnelie <pcd at delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov at delphix.com>
Reviewed by: Sebastien Roy <sebastien.roy at delphix.com>
Reviewed by: David Quigley <dpquigl at davequigley.com>
Reviewed by: Thomas Caputi <tcaputi at datto.com>
Approved by: Dan McDonald <danmcd at omniti.com>
Author: Dan Kimmel <dan.kimmel at delphix.com>
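For the stream-format side of 7252 (dmu_send.c, zstreamdump.c, zfs_ioctl.h, all below the diff truncation point in this mail): a WRITE record may now carry the block's on-disk compressed bytes rather than its logical bytes. A minimal sketch of the payload-sizing rule, assuming the post-7252 drr_write field names (treat them as assumptions, since that hunk is not shown here):

/*
 * Hedged sketch, not part of the diff below: with 7252 a DRR_WRITE
 * record can carry the block's compressed (physical) bytes.  The field
 * names mirror the post-7252 drr_write layout and are assumptions here.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t
drr_write_payload_size(uint64_t drr_logical_size, uint64_t drr_compressed_size)
{
	/* compressed records carry psize bytes; legacy records carry lsize */
	return (drr_compressed_size != 0 ?
	    drr_compressed_size : drr_logical_size);
}

int
main(void)
{
	/* e.g. a 128K logical block stored compressed at 32K */
	printf("payload: %llu bytes\n",
	    (unsigned long long)drr_write_payload_size(131072, 32768));
	return (0);
}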
Modified:
vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c
vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dataset.c
vendor-sys/illumos/dist/uts/common/fs/zfs/lz4.c
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/arc.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu_send.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_dataset.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/refcount.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_ioctl.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h
vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio_compress.h
vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ioctl.c
vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c
Changes in other areas also in this revision:
Modified:
vendor/illumos/dist/cmd/zfs/zfs_main.c
vendor/illumos/dist/cmd/zstreamdump/zstreamdump.c
vendor/illumos/dist/lib/libzfs/common/libzfs.h
vendor/illumos/dist/lib/libzfs/common/libzfs_sendrecv.c
vendor/illumos/dist/lib/libzfs_core/common/libzfs_core.c
vendor/illumos/dist/lib/libzfs_core/common/libzfs_core.h
vendor/illumos/dist/man/man1m/zfs.1m
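The userland half of 7628 pairs each "zfs send" / "zfs receive" short flag with a GNU-style long alias in zfs_main.c (also truncated from this mail). A hedged sketch of how such aliases are typically declared with getopt_long(3C); the exact option table in the commit may differ:

/*
 * Hedged sketch of 7628-style long aliases for "zfs send" flags.
 * The option names below are assumptions, not the commit's table.
 */
#include <getopt.h>
#include <stdio.h>

static const struct option send_long_opts[] = {
	{ "replicate",		no_argument,	NULL,	'R' },
	{ "props",		no_argument,	NULL,	'p' },
	{ "parsable",		no_argument,	NULL,	'P' },
	{ "dedup",		no_argument,	NULL,	'D' },
	{ "dryrun",		no_argument,	NULL,	'n' },
	{ "large-block",	no_argument,	NULL,	'L' },
	{ "embed",		no_argument,	NULL,	'e' },
	{ "compressed",		no_argument,	NULL,	'c' },	/* new in 7252 */
	{ 0, 0, 0, 0 }
};

int
main(int argc, char **argv)
{
	int c;

	while ((c = getopt_long(argc, argv, "RpPDnLec",
	    send_long_opts, NULL)) != -1) {
		if (c == 'c')
			printf("send blocks in their on-disk form\n");
	}
	return (0);
}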
Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c Fri Apr 14 18:05:20 2017 (r316893)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c Fri Apr 14 18:07:43 2017 (r316894)
@@ -77,10 +77,10 @@
* A new reference to a cache buffer can be obtained in two
* ways: 1) via a hash table lookup using the DVA as a key,
* or 2) via one of the ARC lists. The arc_read() interface
- * uses method 1, while the internal arc algorithms for
+ * uses method 1, while the internal ARC algorithms for
* adjusting the cache use method 2. We therefore provide two
* types of locks: 1) the hash table lock array, and 2) the
- * arc list locks.
+ * ARC list locks.
*
* Buffers do not have their own mutexes, rather they rely on the
* hash table mutexes for the bulk of their protection (i.e. most
@@ -93,21 +93,12 @@
* buf_hash_remove() expects the appropriate hash mutex to be
* already held before it is invoked.
*
- * Each arc state also has a mutex which is used to protect the
+ * Each ARC state also has a mutex which is used to protect the
* buffer list associated with the state. When attempting to
- * obtain a hash table lock while holding an arc list lock you
+ * obtain a hash table lock while holding an ARC list lock you
* must use: mutex_tryenter() to avoid deadlock. Also note that
* the active state mutex must be held before the ghost state mutex.
*
- * Arc buffers may have an associated eviction callback function.
- * This function will be invoked prior to removing the buffer (e.g.
- * in arc_do_user_evicts()). Note however that the data associated
- * with the buffer may be evicted prior to the callback. The callback
- * must be made with *no locks held* (to prevent deadlock). Additionally,
- * the users of callbacks must ensure that their private data is
- * protected from simultaneous callbacks from arc_clear_callback()
- * and arc_do_user_evicts().
- *
* Note that the majority of the performance stats are manipulated
* with atomic operations.
*
@@ -136,67 +127,81 @@
* are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
* the arc_buf_hdr_t that will point to the data block in memory. A block can
* only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
- * caches data in two ways -- in a list of arc buffers (arc_buf_t) and
+ * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
* also in the arc_buf_hdr_t's private physical data block pointer (b_pdata).
- * Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC
- * consumer, and always contains uncompressed data. The ARC will provide
- * references to this data and will keep it cached until it is no longer in
- * use. Typically, the arc will try to cache only the L1ARC's physical data
- * block and will aggressively evict any arc_buf_t that is no longer referenced.
- * The amount of memory consumed by the arc_buf_t's can be seen via the
- * "overhead_size" kstat.
- *
*
- * arc_buf_hdr_t
- * +-----------+
- * | |
- * | |
- * | |
- * +-----------+
- * l2arc_buf_hdr_t| |
- * | |
- * +-----------+
- * l1arc_buf_hdr_t| |
- * | | arc_buf_t
- * | b_buf +------------>+---------+ arc_buf_t
- * | | |b_next +---->+---------+
- * | b_pdata +-+ |---------| |b_next +-->NULL
- * +-----------+ | | | +---------+
- * | |b_data +-+ | |
- * | +---------+ | |b_data +-+
- * +->+------+ | +---------+ |
- * (potentially) | | | |
- * compressed | | | |
- * data +------+ | v
- * +->+------+ +------+
- * uncompressed | | | |
- * data | | | |
- * +------+ +------+
- *
- * The L1ARC's data pointer, however, may or may not be uncompressed. The
- * ARC has the ability to store the physical data (b_pdata) associated with
- * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk
- * physical block, it will match its on-disk compression characteristics.
- * If the block on-disk is compressed, then the physical data block
- * in the cache will also be compressed and vice-versa. This behavior
- * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
+ * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
+ * ability to store the physical data (b_pdata) associated with the DVA of the
+ * arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk physical block,
+ * it will match its on-disk compression characteristics. This behavior can be
+ * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
* compressed ARC functionality is disabled, the b_pdata will point to an
* uncompressed version of the on-disk data.
*
+ * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
+ * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
+ * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
+ * consumer. The ARC will provide references to this data and will keep it
+ * cached until it is no longer in use. The ARC caches only the L1ARC's physical
+ * data block and will evict any arc_buf_t that is no longer referenced. The
+ * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
+ * "overhead_size" kstat.
+ *
+ * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
+ * compressed form. The typical case is that consumers will want uncompressed
+ * data, and when that happens a new data buffer is allocated where the data is
+ * decompressed for them to use. Currently the only consumer who wants
+ * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
+ * exists on disk. When this happens, the arc_buf_t's data buffer is shared
+ * with the arc_buf_hdr_t.
+ *
+ * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
+ * first one is owned by a compressed send consumer (and therefore references
+ * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
+ * used by any other consumer (and has its own uncompressed copy of the data
+ * buffer).
+ *
+ * arc_buf_hdr_t
+ * +-----------+
+ * | fields |
+ * | common to |
+ * | L1- and |
+ * | L2ARC |
+ * +-----------+
+ * | l2arc_buf_hdr_t
+ * | |
+ * +-----------+
+ * | l1arc_buf_hdr_t
+ * | | arc_buf_t
+ * | b_buf +------------>+-----------+ arc_buf_t
+ * | b_pdata +-+ |b_next +---->+-----------+
+ * +-----------+ | |-----------| |b_next +-->NULL
+ * | |b_comp = T | +-----------+
+ * | |b_data +-+ |b_comp = F |
+ * | +-----------+ | |b_data +-+
+ * +->+------+ | +-----------+ |
+ * compressed | | | |
+ * data | |<--------------+ | uncompressed
+ * +------+ compressed, | data
+ * shared +-->+------+
+ * data | |
+ * | |
+ * +------+
+ *
* When a consumer reads a block, the ARC must first look to see if the
- * arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t,
- * then an additional arc_buf_t is allocated and the uncompressed data is
- * bcopied from the existing arc_buf_t. If the hdr is cached but does not
- * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses
- * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's
- * b_pdata is not compressed, then the block is shared with the newly
- * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t
- * in the arc buffer chain. Sharing the block reduces the memory overhead
- * required when the hdr is caching uncompressed blocks or the compressed
- * arc functionality has been disabled via 'zfs_compressed_arc_enabled'.
+ * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
+ * arc_buf_t and either copies uncompressed data into a new data buffer from an
+ * existing uncompressed arc_buf_t, decompresses the hdr's b_pdata buffer into a
+ * new data buffer, or shares the hdr's b_pdata buffer, depending on whether the
+ * hdr is compressed and the desired compression characteristics of the
+ * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
+ * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
+ * the last buffer in the hdr's b_buf list, however a shared compressed buf can
+ * be anywhere in the hdr's list.
*
* The diagram below shows an example of an uncompressed ARC hdr that is
- * sharing its data with an arc_buf_t:
+ * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
+ * the last element in the buf list):
*
* arc_buf_hdr_t
* +-----------+
@@ -225,20 +230,24 @@
* | +------+ |
* +---------------------------------+
*
- * Writing to the arc requires that the ARC first discard the b_pdata
+ * Writing to the ARC requires that the ARC first discard the hdr's b_pdata
* since the physical block is about to be rewritten. The new data contents
- * will be contained in the arc_buf_t (uncompressed). As the I/O pipeline
- * performs the write, it may compress the data before writing it to disk.
- * The ARC will be called with the transformed data and will bcopy the
- * transformed on-disk block into a newly allocated b_pdata.
+ * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
+ * it may compress the data before writing it to disk. The ARC will be called
+ * with the transformed data and will bcopy the transformed on-disk block into
+ * a newly allocated b_pdata. Writes are always done into buffers which have
+ * either been loaned (and hence are new and don't have other readers) or
+ * buffers which have been released (and hence have their own hdr, if there
+ * were originally other readers of the buf's original hdr). This ensures that
+ * the ARC only needs to update a single buf and its hdr after a write occurs.
*
* When the L2ARC is in use, it will also take advantage of the b_pdata. The
* L2ARC will always write the contents of b_pdata to the L2ARC. This means
- * that when compressed arc is enabled that the L2ARC blocks are identical
+ * that when compressed ARC is enabled that the L2ARC blocks are identical
* to the on-disk block in the main data pool. This provides a significant
* advantage since the ARC can leverage the bp's checksum when reading from the
* L2ARC to determine if the contents are valid. However, if the compressed
- * arc is disabled, then the L2ARC's block must be transformed to look
+ * ARC is disabled, then the L2ARC's block must be transformed to look
* like the physical block in the main data pool before comparing the
* checksum and determining its validity.
*/
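The rewritten comment above says a buf handed out by the ARC may hold either the on-disk (compressed) bytes or an uncompressed copy. A minimal sketch of how a consumer could tell the two apart with the accessors added later in this diff (arc_get_compression(), arc_buf_size(), arc_buf_lsize()); stream_compressed() and stream_raw() are hypothetical stand-ins for a consumer such as a compressed "zfs send":

/*
 * Hedged sketch using accessors added later in this diff; the
 * stream_*() helpers are hypothetical, for illustration only.
 */
#include <sys/arc.h>
#include <sys/zio.h>

extern void stream_compressed(void *data, int32_t psize, int32_t lsize);
extern void stream_raw(void *data, int32_t size);

static void
describe_buf(arc_buf_t *buf)
{
	if (arc_get_compression(buf) != ZIO_COMPRESS_OFF) {
		/* b_data holds psize bytes of on-disk (compressed) data */
		stream_compressed(buf->b_data, arc_buf_size(buf),
		    arc_buf_lsize(buf));
	} else {
		/* b_data holds lsize bytes of uncompressed data */
		stream_raw(buf->b_data, arc_buf_size(buf));
	}
}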
@@ -804,6 +813,7 @@ struct arc_callback {
void *acb_private;
arc_done_func_t *acb_done;
arc_buf_t *acb_buf;
+ boolean_t acb_compressed;
zio_t *acb_zio_dummy;
arc_callback_t *acb_next;
};
@@ -855,7 +865,7 @@ typedef struct l1arc_buf_hdr {
zio_cksum_t *b_freeze_cksum;
#ifdef ZFS_DEBUG
/*
- * used for debugging wtih kmem_flags - by allocating and freeing
+ * Used for debugging with kmem_flags - by allocating and freeing
* b_thawed when the buffer is thawed, we get a record of the stack
* trace that thawed it.
*/
@@ -970,6 +980,8 @@ struct arc_buf_hdr {
HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL)
+#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED)
+#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
/*
* Other sizes
@@ -1064,7 +1076,7 @@ static kmutex_t l2arc_free_on_write_mtx;
static uint64_t l2arc_ndev; /* number of devices */
typedef struct l2arc_read_callback {
- arc_buf_hdr_t *l2rcb_hdr; /* read buffer */
+ arc_buf_hdr_t *l2rcb_hdr; /* read header */
blkptr_t l2rcb_bp; /* original blkptr */
zbookmark_phys_t l2rcb_zb; /* original bookmark */
int l2rcb_flags; /* original flags */
@@ -1399,6 +1411,31 @@ retry:
}
}
+/*
+ * This is the size that the buf occupies in memory. If the buf is compressed,
+ * it will correspond to the compressed size. You should use this method of
+ * getting the buf size unless you explicitly need the logical size.
+ */
+int32_t
+arc_buf_size(arc_buf_t *buf)
+{
+ return (ARC_BUF_COMPRESSED(buf) ?
+ HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
+}
+
+int32_t
+arc_buf_lsize(arc_buf_t *buf)
+{
+ return (HDR_GET_LSIZE(buf->b_hdr));
+}
+
+enum zio_compress
+arc_get_compression(arc_buf_t *buf)
+{
+ return (ARC_BUF_COMPRESSED(buf) ?
+ HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
+}
+
#define ARC_MINTIME (hz>>4) /* 62 ms */
static inline boolean_t
@@ -1407,9 +1444,21 @@ arc_buf_is_shared(arc_buf_t *buf)
boolean_t shared = (buf->b_data != NULL &&
buf->b_data == buf->b_hdr->b_l1hdr.b_pdata);
IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
+ IMPLY(shared, ARC_BUF_SHARED(buf));
+ IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
+
+ /*
+ * It would be nice to assert arc_can_share() too, but the "hdr isn't
+ * already being shared" requirement prevents us from doing that.
+ */
+
return (shared);
}
+/*
+ * Free the checksum associated with this header. If there is no checksum, this
+ * is a no-op.
+ */
static inline void
arc_cksum_free(arc_buf_hdr_t *hdr)
{
@@ -1422,6 +1471,25 @@ arc_cksum_free(arc_buf_hdr_t *hdr)
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
}
+/*
+ * Return true iff at least one of the bufs on hdr is not compressed.
+ */
+static boolean_t
+arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
+{
+ for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
+ if (!ARC_BUF_COMPRESSED(b)) {
+ return (B_TRUE);
+ }
+ }
+ return (B_FALSE);
+}
+
+/*
+ * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
+ * matches the checksum that is stored in the hdr. If there is no checksum,
+ * or if the buf is compressed, this is a no-op.
+ */
static void
arc_cksum_verify(arc_buf_t *buf)
{
@@ -1431,6 +1499,12 @@ arc_cksum_verify(arc_buf_t *buf)
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
+ if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
+ arc_hdr_has_uncompressed_buf(hdr));
+ return;
+ }
+
ASSERT(HDR_HAS_L1HDR(hdr));
mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
@@ -1438,7 +1512,8 @@ arc_cksum_verify(arc_buf_t *buf)
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
return;
}
- fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, &zc);
+
+ fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
panic("buffer modified while frozen!");
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
@@ -1512,6 +1587,12 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, z
return (valid_cksum);
}
+/*
+ * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
+ * checksum and attaches it to the buf's hdr so that we can ensure that the buf
+ * isn't modified later on. If buf is compressed or there is already a checksum
+ * on the hdr, this is a no-op (we only checksum uncompressed bufs).
+ */
static void
arc_cksum_compute(arc_buf_t *buf)
{
@@ -1521,14 +1602,21 @@ arc_cksum_compute(arc_buf_t *buf)
return;
ASSERT(HDR_HAS_L1HDR(hdr));
+
mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
+ ASSERT(arc_hdr_has_uncompressed_buf(hdr));
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+ return;
+ } else if (ARC_BUF_COMPRESSED(buf)) {
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
return;
}
+
+ ASSERT(!ARC_BUF_COMPRESSED(buf));
hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
KM_SLEEP);
- fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL,
+ fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
hdr->b_l1hdr.b_freeze_cksum);
mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
arc_buf_watch(buf);
@@ -1569,7 +1657,7 @@ arc_buf_watch(arc_buf_t *buf)
procctl_t ctl;
ctl.cmd = PCWATCH;
ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
- ctl.prwatch.pr_size = HDR_GET_LSIZE(buf->b_hdr);
+ ctl.prwatch.pr_size = arc_buf_size(buf);
ctl.prwatch.pr_wflags = WA_WRITE;
result = write(arc_procfd, &ctl, sizeof (ctl));
ASSERT3U(result, ==, sizeof (ctl));
@@ -1590,6 +1678,12 @@ arc_buf_type(arc_buf_hdr_t *hdr)
return (type);
}
+boolean_t
+arc_is_metadata(arc_buf_t *buf)
+{
+ return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
+}
+
static uint32_t
arc_bufc_to_flags(arc_buf_contents_t type)
{
@@ -1611,12 +1705,19 @@ arc_buf_thaw(arc_buf_t *buf)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
- if (zfs_flags & ZFS_DEBUG_MODIFY) {
- if (hdr->b_l1hdr.b_state != arc_anon)
- panic("modifying non-anon buffer!");
- if (HDR_IO_IN_PROGRESS(hdr))
- panic("modifying buffer while i/o in progress!");
- arc_cksum_verify(buf);
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+
+ arc_cksum_verify(buf);
+
+ /*
+ * Compressed buffers do not manipulate the b_freeze_cksum or
+ * allocate b_thawed.
+ */
+ if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
+ arc_hdr_has_uncompressed_buf(hdr));
+ return;
}
ASSERT(HDR_HAS_L1HDR(hdr));
@@ -1645,6 +1746,12 @@ arc_buf_freeze(arc_buf_t *buf)
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
+ if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
+ arc_hdr_has_uncompressed_buf(hdr));
+ return;
+ }
+
hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
@@ -1653,7 +1760,6 @@ arc_buf_freeze(arc_buf_t *buf)
hdr->b_l1hdr.b_state == arc_anon);
arc_cksum_compute(buf);
mutex_exit(hash_lock);
-
}
/*
@@ -1710,47 +1816,157 @@ arc_hdr_set_compress(arc_buf_hdr_t *hdr,
}
}
+/*
+ * Looks for another buf on the same hdr which has the data decompressed, copies
+ * from it, and returns true. If no such buf exists, returns false.
+ */
+static boolean_t
+arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ boolean_t copied = B_FALSE;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT3P(buf->b_data, !=, NULL);
+ ASSERT(!ARC_BUF_COMPRESSED(buf));
+
+ for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
+ from = from->b_next) {
+ /* can't use our own data buffer */
+ if (from == buf) {
+ continue;
+ }
+
+ if (!ARC_BUF_COMPRESSED(from)) {
+ bcopy(from->b_data, buf->b_data, arc_buf_size(buf));
+ copied = B_TRUE;
+ break;
+ }
+ }
+
+ /*
+ * There were no decompressed bufs, so there should not be a
+ * checksum on the hdr either.
+ */
+ EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
+
+ return (copied);
+}
+
+/*
+ * Given a buf that has a data buffer attached to it, this function will
+ * efficiently fill the buf with data of the specified compression setting from
+ * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
+ * are already sharing a data buf, no copy is performed.
+ *
+ * If the buf is marked as compressed but uncompressed data was requested, this
+ * will allocate a new data buffer for the buf, remove that flag, and fill the
+ * buf with uncompressed data. You can't request a compressed buf on a hdr with
+ * uncompressed data, and (since we haven't added support for it yet) if you
+ * want compressed data your buf must already be marked as compressed and have
+ * the correct-sized data buffer.
+ */
static int
-arc_decompress(arc_buf_t *buf)
+arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
+ boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
- int error;
- if (arc_buf_is_shared(buf)) {
- ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
- } else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
- /*
- * The arc_buf_hdr_t is either not compressed or is
- * associated with an embedded block or a hole in which
- * case they remain anonymous.
- */
- IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == 0 ||
- HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr));
- ASSERT(!HDR_SHARED_DATA(hdr));
- bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_LSIZE(hdr));
+ ASSERT3P(buf->b_data, !=, NULL);
+ IMPLY(compressed, hdr_compressed);
+ IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
+
+ if (hdr_compressed == compressed) {
+ if (!arc_buf_is_shared(buf)) {
+ bcopy(hdr->b_l1hdr.b_pdata, buf->b_data,
+ arc_buf_size(buf));
+ }
} else {
- ASSERT(!HDR_SHARED_DATA(hdr));
+ ASSERT(hdr_compressed);
+ ASSERT(!compressed);
ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
- error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
- hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr),
- HDR_GET_LSIZE(hdr));
- if (error != 0) {
- zfs_dbgmsg("hdr %p, compress %d, psize %d, lsize %d",
- hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr),
- HDR_GET_LSIZE(hdr));
- return (SET_ERROR(EIO));
+
+ /*
+ * If the buf is sharing its data with the hdr, unlink it and
+ * allocate a new data buffer for the buf.
+ */
+ if (arc_buf_is_shared(buf)) {
+ ASSERT(ARC_BUF_COMPRESSED(buf));
+
+ /* We need to give the buf it's own b_data */
+ buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
+ buf->b_data =
+ arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+
+ /* Previously overhead was 0; just add new overhead */
+ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
+ } else if (ARC_BUF_COMPRESSED(buf)) {
+ /* We need to reallocate the buf's b_data */
+ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
+ buf);
+ buf->b_data =
+ arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
+
+ /* We increased the size of b_data; update overhead */
+ ARCSTAT_INCR(arcstat_overhead_size,
+ HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
+ }
+
+ /*
+ * Regardless of the buf's previous compression settings, it
+ * should not be compressed at the end of this function.
+ */
+ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
+
+ /*
+ * Try copying the data from another buf which already has a
+ * decompressed version. If that's not possible, it's time to
+ * bite the bullet and decompress the data from the hdr.
+ */
+ if (arc_buf_try_copy_decompressed_data(buf)) {
+ /* Skip byteswapping and checksumming (already done) */
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL);
+ return (0);
+ } else {
+ int error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
+ hdr->b_l1hdr.b_pdata, buf->b_data,
+ HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
+
+ /*
+ * Absent hardware errors or software bugs, this should
+ * be impossible, but log it anyway so we can debug it.
+ */
+ if (error != 0) {
+ zfs_dbgmsg(
+ "hdr %p, compress %d, psize %d, lsize %d",
+ hdr, HDR_GET_COMPRESS(hdr),
+ HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
+ return (SET_ERROR(EIO));
+ }
}
}
+
+ /* Byteswap the buf's data if necessary */
if (bswap != DMU_BSWAP_NUMFUNCS) {
ASSERT(!HDR_SHARED_DATA(hdr));
ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
}
+
+ /* Compute the hdr's checksum if necessary */
arc_cksum_compute(buf);
+
return (0);
}
+int
+arc_decompress(arc_buf_t *buf)
+{
+ return (arc_buf_fill(buf, B_FALSE));
+}
+
/*
* Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t.
*/
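To make the two ARCSTAT_INCR(arcstat_overhead_size, ...) adjustments in arc_buf_fill() above concrete, a worked example with assumed sizes (a 128K logical block compressed to 32K):

/*
 * Worked example of the overhead accounting in arc_buf_fill(),
 * with assumed sizes: lsize = 128K, psize = 32K.
 */
#include <stdio.h>

int
main(void)
{
	long lsize = 131072, psize = 32768;

	/*
	 * Case 1: the buf was sharing the hdr's compressed b_pdata
	 * (so its overhead was 0) and gets its own uncompressed copy.
	 */
	printf("unshared buf: overhead += %ld\n", lsize);

	/*
	 * Case 2: the buf owned a psize-long compressed copy and is
	 * reallocated to lsize to hold the uncompressed data.
	 */
	printf("reallocated buf: overhead += %ld\n", lsize - psize);
	return (0);
}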
@@ -1778,7 +1994,6 @@ static void
arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
{
arc_buf_contents_t type = arc_buf_type(hdr);
- uint64_t lsize = HDR_GET_LSIZE(hdr);
ASSERT(HDR_HAS_L1HDR(hdr));
@@ -1786,7 +2001,8 @@ arc_evictable_space_increment(arc_buf_hd
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
- (void) refcount_add_many(&state->arcs_esize[type], lsize, hdr);
+ (void) refcount_add_many(&state->arcs_esize[type],
+ HDR_GET_LSIZE(hdr), hdr);
return;
}
@@ -1797,11 +2013,10 @@ arc_evictable_space_increment(arc_buf_hd
}
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
- if (arc_buf_is_shared(buf)) {
- ASSERT(ARC_BUF_LAST(buf));
+ if (arc_buf_is_shared(buf))
continue;
- }
- (void) refcount_add_many(&state->arcs_esize[type], lsize, buf);
+ (void) refcount_add_many(&state->arcs_esize[type],
+ arc_buf_size(buf), buf);
}
}
@@ -1811,10 +2026,9 @@ arc_evictable_space_increment(arc_buf_hd
* so that we can add and remove them from the refcount individually.
*/
static void
-arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
+arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
{
arc_buf_contents_t type = arc_buf_type(hdr);
- uint64_t lsize = HDR_GET_LSIZE(hdr);
ASSERT(HDR_HAS_L1HDR(hdr));
@@ -1823,7 +2037,7 @@ arc_evitable_space_decrement(arc_buf_hdr
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
(void) refcount_remove_many(&state->arcs_esize[type],
- lsize, hdr);
+ HDR_GET_LSIZE(hdr), hdr);
return;
}
@@ -1834,12 +2048,10 @@ arc_evitable_space_decrement(arc_buf_hdr
}
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
- if (arc_buf_is_shared(buf)) {
- ASSERT(ARC_BUF_LAST(buf));
+ if (arc_buf_is_shared(buf))
continue;
- }
(void) refcount_remove_many(&state->arcs_esize[type],
- lsize, buf);
+ arc_buf_size(buf), buf);
}
}
@@ -1867,7 +2079,7 @@ add_reference(arc_buf_hdr_t *hdr, void *
if (state != arc_l2c_only) {
multilist_remove(&state->arcs_list[arc_buf_type(hdr)],
hdr);
- arc_evitable_space_decrement(hdr, state);
+ arc_evictable_space_decrement(hdr, state);
}
/* remove the prefetch flag if we get a reference */
arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
@@ -1955,7 +2167,7 @@ arc_change_state(arc_state_t *new_state,
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
update_old = B_TRUE;
}
- arc_evitable_space_decrement(hdr, old_state);
+ arc_evictable_space_decrement(hdr, old_state);
}
if (new_state != arc_anon && new_state != arc_l2c_only) {
@@ -2018,13 +2230,11 @@ arc_change_state(arc_state_t *new_state,
* add to the refcount if the arc_buf_t is
* not shared.
*/
- if (arc_buf_is_shared(buf)) {
- ASSERT(ARC_BUF_LAST(buf));
+ if (arc_buf_is_shared(buf))
continue;
- }
(void) refcount_add_many(&new_state->arcs_size,
- HDR_GET_LSIZE(hdr), buf);
+ arc_buf_size(buf), buf);
}
ASSERT3U(bufcnt, ==, buffers);
@@ -2041,6 +2251,7 @@ arc_change_state(arc_state_t *new_state,
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(old_state)) {
ASSERT0(bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
/*
* When moving a header off of a ghost state,
@@ -2052,7 +2263,6 @@ arc_change_state(arc_state_t *new_state,
(void) refcount_remove_many(&old_state->arcs_size,
HDR_GET_LSIZE(hdr), hdr);
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
} else {
uint32_t buffers = 0;
@@ -2063,7 +2273,7 @@ arc_change_state(arc_state_t *new_state,
*/
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
- ASSERT3P(bufcnt, !=, 0);
+ ASSERT3U(bufcnt, !=, 0);
buffers++;
/*
@@ -2073,13 +2283,11 @@ arc_change_state(arc_state_t *new_state,
* add to the refcount if the arc_buf_t is
* not shared.
*/
- if (arc_buf_is_shared(buf)) {
- ASSERT(ARC_BUF_LAST(buf));
+ if (arc_buf_is_shared(buf))
continue;
- }
(void) refcount_remove_many(
- &old_state->arcs_size, HDR_GET_LSIZE(hdr),
+ &old_state->arcs_size, arc_buf_size(buf),
buf);
}
ASSERT3U(bufcnt, ==, buffers);
@@ -2164,11 +2372,50 @@ arc_space_return(uint64_t space, arc_spa
}
/*
- * Allocate an initial buffer for this hdr, subsequent buffers will
- * use arc_buf_clone().
+ * Given a hdr and a buf, returns whether that buf can share its b_data buffer
+ * with the hdr's b_pdata.
*/
-static arc_buf_t *
-arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag)
+static boolean_t
+arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
+{
+ /*
+ * The criteria for sharing a hdr's data are:
+ * 1. the hdr's compression matches the buf's compression
+ * 2. the hdr doesn't need to be byteswapped
+ * 3. the hdr isn't already being shared
+ * 4. the buf is either compressed or it is the last buf in the hdr list
+ *
+ * Criterion #4 maintains the invariant that shared uncompressed
+ * bufs must be the final buf in the hdr's b_buf list. Reading this, you
+ * might ask, "if a compressed buf is allocated first, won't that be the
+ * last thing in the list?", but in that case it's impossible to create
+ * a shared uncompressed buf anyway (because the hdr must be compressed
+ * to have the compressed buf). You might also think that #3 is
+ * sufficient to make this guarantee, however it's possible
+ * (specifically in the rare L2ARC write race mentioned in
+ * arc_buf_alloc_impl()) there will be an existing uncompressed buf that
+ * is sharable, but wasn't at the time of its allocation. Rather than
+ * allow a new shared uncompressed buf to be created and then shuffle
+ * the list around to make it the last element, this simply disallows
+ * sharing if the new buf isn't the first to be added.
+ */
+ ASSERT3P(buf->b_hdr, ==, hdr);
+ boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF;
+ boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
+ return (buf_compressed == hdr_compressed &&
+ hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
+ !HDR_SHARED_DATA(hdr) &&
+ (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
+}
+
+/*
+ * Allocate a buf for this hdr. If you care about the data that's in the hdr,
+ * or if you want a compressed buffer, pass those flags in. Returns 0 if the
+ * copy was made successfully, or an error code otherwise.
+ */
+static int
+arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
+ boolean_t fill, arc_buf_t **ret)
{
arc_buf_t *buf;
@@ -2176,15 +2423,14 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, v
ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
VERIFY(hdr->b_type == ARC_BUFC_DATA ||
hdr->b_type == ARC_BUFC_METADATA);
+ ASSERT3P(ret, !=, NULL);
+ ASSERT3P(*ret, ==, NULL);
- ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- ASSERT0(hdr->b_l1hdr.b_bufcnt);
-
- buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
+ buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
- buf->b_next = NULL;
+ buf->b_next = hdr->b_l1hdr.b_buf;
+ buf->b_flags = 0;
add_reference(hdr, tag);
@@ -2195,58 +2441,63 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, v
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
/*
- * If the hdr's data can be shared (no byteswapping, hdr is
- * uncompressed, hdr's data is not currently being written to the
- * L2ARC write) then we share the data buffer and set the appropriate
- * bit in the hdr's b_flags to indicate the hdr is sharing it's
- * b_pdata with the arc_buf_t. Otherwise, we allocate a new buffer to
- * store the buf's data.
+ * Only honor requests for compressed bufs if the hdr is actually
+ * compressed.
*/
- if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
- HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF && !HDR_L2_WRITING(hdr)) {
+ if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
+ buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
+
+ /*
+ * If the hdr's data can be shared then we share the data buffer and
+ * set the appropriate bit in the hdr's b_flags to indicate the hdr is
+ * sharing it's b_pdata with the arc_buf_t. Otherwise, we allocate a new
+ * buffer to store the buf's data.
+ *
+ * There is one additional restriction here because we're sharing
+ * hdr -> buf instead of the usual buf -> hdr: the hdr can't be actively
+ * involved in an L2ARC write, because if this buf is used by an
+ * arc_write() then the hdr's data buffer will be released when the
+ * write completes, even though the L2ARC write might still be using it.
+ */
+ boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr);
+
+ /* Set up b_data and sharing */
+ if (can_share) {
buf->b_data = hdr->b_l1hdr.b_pdata;
+ buf->b_flags |= ARC_BUF_FLAG_SHARED;
arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
} else {
- buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
- ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
- arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+ buf->b_data =
+ arc_get_data_buf(hdr, arc_buf_size(buf), buf);
+ ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
}
VERIFY3P(buf->b_data, !=, NULL);
hdr->b_l1hdr.b_buf = buf;
hdr->b_l1hdr.b_bufcnt += 1;
- return (buf);
-}
+ /*
+ * If the user wants the data from the hdr, we need to either copy or
+ * decompress the data.
+ */
+ if (fill) {
+ return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0));
+ }
-/*
- * Used when allocating additional buffers.
- */
-static arc_buf_t *
-arc_buf_clone(arc_buf_t *from)
-{
- arc_buf_t *buf;
- arc_buf_hdr_t *hdr = from->b_hdr;
- uint64_t size = HDR_GET_LSIZE(hdr);
+ return (0);
+}
- ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(hdr->b_l1hdr.b_state != arc_anon);
+static char *arc_onloan_tag = "onloan";
- buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
- buf->b_hdr = hdr;
- buf->b_data = NULL;
- buf->b_next = hdr->b_l1hdr.b_buf;
- hdr->b_l1hdr.b_buf = buf;
- buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
- bcopy(from->b_data, buf->b_data, size);
- hdr->b_l1hdr.b_bufcnt += 1;
+static inline void
+arc_loaned_bytes_update(int64_t delta)
+{
+ atomic_add_64(&arc_loaned_bytes, delta);
- ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
- return (buf);
+ /* assert that it did not wrap around */
+ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
}
-static char *arc_onloan_tag = "onloan";
-
/*
* Loan out an anonymous arc buffer. Loaned buffers are not counted as in
* flight data by arc_tempreserve_space() until they are "returned". Loaned
@@ -2254,16 +2505,29 @@ static char *arc_onloan_tag = "onloan";
* freed.
*/
arc_buf_t *
-arc_loan_buf(spa_t *spa, int size)
+arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
{
- arc_buf_t *buf;
+ arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
+ is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
- buf = arc_alloc_buf(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
+ arc_loaned_bytes_update(size);
- atomic_add_64(&arc_loaned_bytes, size);
return (buf);
}
+arc_buf_t *
+arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type)
+{
+ arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
+ psize, lsize, compression_type);
+
+ arc_loaned_bytes_update(psize);
+
+ return (buf);
+}
+
+
/*
* Return a loaned arc buffer to the arc.
*/
@@ -2277,7 +2541,7 @@ arc_return_buf(arc_buf_t *buf, void *tag
(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
- atomic_add_64(&arc_loaned_bytes, -HDR_GET_LSIZE(hdr));
+ arc_loaned_bytes_update(-arc_buf_size(buf));
}
/* Detach an arc_buf from a dbuf (tag) */
@@ -2291,7 +2555,7 @@ arc_loan_inuse_buf(arc_buf_t *buf, void
(void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
- atomic_add_64(&arc_loaned_bytes, HDR_GET_LSIZE(hdr));
+ arc_loaned_bytes_update(arc_buf_size(buf));
}
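The loaned-buffer changes above serve the receive path. A hedged sketch of how a compressed receive might drive the new arc_loan_compressed_buf(); only that call is from this diff, and the surrounding names are illustrative rather than the commit's actual dmu_send.c code:

/*
 * Hedged sketch of a compressed-receive consumer of
 * arc_loan_compressed_buf(); everything but the arc_* call is
 * illustrative.
 */
#include <sys/arc.h>
#include <sys/systm.h>

static arc_buf_t *
receive_compressed_block(spa_t *spa, uint64_t psize, uint64_t lsize,
    enum zio_compress comp, const void *stream_data)
{
	arc_buf_t *buf = arc_loan_compressed_buf(spa, psize, lsize, comp);

	/* the loaned buf's b_data is psize bytes of compressed space */
	bcopy(stream_data, buf->b_data, psize);

	/* later assigned to a dbuf and written out via arc_write() */
	return (buf);
}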
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***