ZDB -Z?
Andrew Heybey
ath at niksun.com
Wed Dec 10 00:38:57 UTC 2014
On 11/24/14 1:49 PM, Zaphod Beeblebrox wrote:
> I'm reading about someone else's recovery of files from a damaged ZFS
> partition. He claims to have added (possibly to opensolaris or whatnot) an
> argument to zdb '-Z' ... which operates somewhat like -R, but which
> highlights what parts of the region are on what physical disks, and which
> are parity.
>
> Has anyone patched this into FreeBSD?
Sorry for the late reply, I am behind on my mailing list reading.
I assume you were looking at this post:
http://mbruning.blogspot.com/2009_12_01_archive.html
I was also recently trying to recover data in a ZFS pool. I made an ugly attempt at -Z for zdb. It will not work for anything but RAIDZ pools (I tried it on one containing two 6-disk raidz1 vdevs). The diff (against FreeBSD 10) is in this email.
I copy-pasted the function vdev_raidz_map() out of libzfs, since it is declared static there and therefore not callable externally. It is not very tasteful, but it worked for me.
andrew
commit 86ab9e2dab7e76dcdf527d2aa6b84a2fe429ee28
Author: Andrew Heybey <ath at niksun.com>
Date: Tue Nov 18 15:00:57 2014 -0500
zdb: Add -Z flag like http://mbruning.blogspot.com/2009/12/zfs-raidz-data-walk.html
diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
index c265c99..bf43ea1 100644
--- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c
+++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
@@ -59,6 +59,7 @@
#include <sys/ddt.h>
#include <sys/zfeature.h>
#include <zfs_comutil.h>
+#include <sys/vdev_raidz.h>
#undef ZFS_MAXNAMELEN
#undef verify
#include <libzfs.h>
@@ -2745,6 +2746,168 @@ zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
}
}
+
+/*
+ * One column of a RAIDZ stripe map: which child vdev (rc_devidx), at
+ * what byte offset on that child (rc_offset), and how much of the I/O
+ * (rc_size) lands there.  Apparently copied along with vdev_raidz_map()
+ * below from the ZFS sources, since the original type is not exported.
+ */
+typedef struct raidz_col {
+ uint64_t rc_devidx; /* child device index for I/O */
+ uint64_t rc_offset; /* device offset */
+ uint64_t rc_size; /* I/O size */
+ void *rc_data; /* I/O data */
+ void *rc_gdata; /* used to store the "good" version */
+ int rc_error; /* I/O error for this device */
+ uint8_t rc_tried; /* Did we attempt this I/O column? */
+ uint8_t rc_skipped; /* Did we skip this I/O column? */
+} raidz_col_t;
+
+/*
+ * Describes how one logical I/O maps onto the columns (children) of a
+ * RAIDZ vdev.  Variable-length: rm_col[1] is an old-style trailing
+ * array, so instances are allocated and freed with
+ * offsetof(raidz_map_t, rm_col[scols]) for the real column count.
+ * Local copy of the non-exported ZFS type, to match vdev_raidz_map()
+ * below.
+ */
+typedef struct raidz_map {
+ uint64_t rm_cols; /* Regular column count */
+ uint64_t rm_scols; /* Count including skipped columns */
+ uint64_t rm_bigcols; /* Number of oversized columns */
+ uint64_t rm_asize; /* Actual total I/O size */
+ uint64_t rm_missingdata; /* Count of missing data devices */
+ uint64_t rm_missingparity; /* Count of missing parity devices */
+ uint64_t rm_firstdatacol; /* First data column/parity count */
+ uint64_t rm_nskip; /* Skipped sectors for padding */
+ uint64_t rm_skipstart; /* Column index of padding start */
+ void *rm_datacopy; /* rm_asize-buffer of copied data */
+ uintptr_t rm_reports; /* # of referencing checksum reports */
+ uint8_t rm_freed; /* map no longer has referencing ZIO */
+ uint8_t rm_ecksuminjected; /* checksum error was injected */
+ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
+} raidz_map_t;
+
+/*
+ * Build a raidz_map_t describing how a logical I/O of `size' bytes at
+ * byte `offset' within a RAIDZ vdev is spread across its children:
+ * for each column, which child (rc_devidx), at what byte offset on
+ * that child (rc_offset), and how many bytes (rc_size; zero for
+ * skipped columns).  unit_shift is the vdev's ashift, dcols the child
+ * count, and nparity the parity level (1 for raidz1, etc.) -- see the
+ * call site in zdb_read_block().
+ *
+ * Divides the IO evenly across all child vdevs; usually, dcols is
+ * the number of children in the target vdev.
+ *
+ * Copy-pasted from vdev_raidz in the ZFS sources, where it is static
+ * and therefore not callable externally.  The caller owns the result
+ * and frees it with
+ * umem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])).
+ */
+raidz_map_t*
+vdev_raidz_map(uint64_t size, uint64_t offset, uint64_t unit_shift,
+ uint64_t dcols, uint64_t nparity)
+{
+ raidz_map_t* rm;
+ /* The starting RAIDZ (parent) vdev sector of the block. */
+ uint64_t b = offset >> unit_shift;
+ /* The zio's size in units of the vdev's minimum sector size. */
+ uint64_t s = size >> unit_shift;
+ /* The first column for this stripe. */
+ uint64_t f = b % dcols;
+ /* The starting byte offset on each child vdev. */
+ uint64_t o = (b / dcols) << unit_shift;
+ uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
+
+ /*
+ * "Quotient": The number of data sectors for this stripe on all but
+ * the "big column" child vdevs that also contain "remainder" data.
+ */
+ q = s / (dcols - nparity);
+
+ /*
+ * "Remainder": The number of partial stripe data sectors in this I/O.
+ * This will add a sector to some, but not all, child vdevs.
+ */
+ r = s - q * (dcols - nparity);
+
+ /* The number of "big columns" - those which contain remainder data. */
+ bc = (r == 0 ? 0 : r + nparity);
+
+ /*
+ * The total number of data and parity sectors associated with
+ * this I/O.
+ */
+ tot = s + nparity * (q + (r == 0 ? 0 : 1));
+
+ /* acols: The columns that will be accessed. */
+ /* scols: The columns that will be accessed or skipped. */
+ if (q == 0) {
+ /* Our I/O request doesn't span all child vdevs. */
+ acols = bc;
+ scols = MIN(dcols, roundup(bc, nparity + 1));
+ } else {
+ acols = dcols;
+ scols = dcols;
+ }
+
+ /* Variable-length allocation: scols trailing raidz_col_t slots. */
+ rm = umem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
+
+ rm->rm_cols = acols;
+ rm->rm_scols = scols;
+ rm->rm_bigcols = bc;
+ rm->rm_skipstart = bc;
+ rm->rm_missingdata = 0;
+ rm->rm_missingparity = 0;
+ rm->rm_firstdatacol = nparity;
+ rm->rm_datacopy = NULL;
+ rm->rm_reports = 0;
+ rm->rm_freed = 0;
+ rm->rm_ecksuminjected = 0;
+
+ asize = 0;
+
+ /* Fill in per-column device index, offset and size. */
+ for (c = 0; c < scols; c++) {
+ col = f + c;
+ coff = o;
+ if (col >= dcols) {
+ /* Wrapped past the last child: next row, one sector down. */
+ col -= dcols;
+ coff += 1ULL << unit_shift;
+ }
+ rm->rm_col[c].rc_devidx = col;
+ rm->rm_col[c].rc_offset = coff;
+ rm->rm_col[c].rc_data = NULL;
+ rm->rm_col[c].rc_gdata = NULL;
+ rm->rm_col[c].rc_error = 0;
+ rm->rm_col[c].rc_tried = 0;
+ rm->rm_col[c].rc_skipped = 0;
+
+ if (c >= acols)
+ rm->rm_col[c].rc_size = 0;
+ else if (c < bc)
+ rm->rm_col[c].rc_size = (q + 1) << unit_shift;
+ else
+ rm->rm_col[c].rc_size = q << unit_shift;
+
+ asize += rm->rm_col[c].rc_size;
+ }
+
+ rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
+ rm->rm_nskip = roundup(tot, nparity + 1) - tot;
+
+ /*
+ * If all data stored spans all columns, there's a danger that parity
+ * will always be on the same device and, since parity isn't read
+ * during normal operation, that that device's I/O bandwidth won't be
+ * used effectively. We therefore switch the parity every 1MB.
+ *
+ * ... at least that was, ostensibly, the theory. As a practical
+ * matter unless we juggle the parity between all devices evenly, we
+ * won't see any benefit. Further, occasional writes that aren't a
+ * multiple of the LCM of the number of children and the minimum
+ * stripe width are sufficient to avoid pessimal behavior.
+ * Unfortunately, this decision created an implicit on-disk format
+ * requirement that we need to support for all eternity, but only
+ * for single-parity RAID-Z.
+ *
+ * If we intend to skip a sector in the zeroth column for padding
+ * we must make sure to note this swap. We will never intend to
+ * skip the first column since at least one data and one parity
+ * column must appear in each row.
+ */
+ if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
+ devidx = rm->rm_col[0].rc_devidx;
+ o = rm->rm_col[0].rc_offset;
+ rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
+ rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
+ rm->rm_col[1].rc_devidx = devidx;
+ rm->rm_col[1].rc_offset = o;
+
+ if (rm->rm_skipstart == 0)
+ rm->rm_skipstart = 1;
+ }
+
+ return (rm);
+}
+
+
/*
* There are two acceptable formats:
* leaf_name - For example: c1t0d0 or /tmp/ztest.0a
@@ -2803,8 +2966,10 @@ name:
}
/*
- * Read a block from a pool and print it out. The syntax of the
- * block descriptor is:
+ * Read a block from a pool and print it out, or (if Zflag is true)
+ * print out where the block is found on the constituents of the vdev.
+ *
+ * The syntax of the block descriptor is:
*
* pool:vdev_specifier:offset:size[:flags]
*
@@ -2825,7 +2990,7 @@ name:
* * = not yet implemented
*/
static void
-zdb_read_block(char *thing, spa_t *spa)
+zdb_read_block(char *thing, spa_t *spa, boolean_t Zflag)
{
blkptr_t blk, *bp = &blk;
dva_t *dva = bp->blk_dva;
@@ -2904,6 +3069,22 @@ zdb_read_block(char *thing, spa_t *spa)
psize = size;
lsize = size;
+ if (Zflag) {
+ raidz_map_t* rm;
+ rm = vdev_raidz_map(psize, offset, vd->vdev_ashift,
+ vd->vdev_children, vd->vdev_nparity);
+ (void) printf("columns %lu bigcols %lu asize %lu firstdatacol %lu\n",
+ rm->rm_cols, rm->rm_bigcols, rm->rm_asize,
+ rm->rm_firstdatacol);
+ for (int c = 0; c < rm->rm_scols; ++c) {
+ raidz_col_t* rc = &rm->rm_col[c];
+ (void) printf("devidx %lu offset 0x%lx size 0x%lx\n",
+ rc->rc_devidx, rc->rc_offset, rc->rc_size);
+ }
+ umem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
+ return;
+ }
+
pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
@@ -3124,7 +3305,7 @@ main(int argc, char **argv)
dprintf_setup(&argc, argv);
- while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:P")) != -1) {
+ while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:PZ")) != -1) {
switch (c) {
case 'b':
case 'c':
@@ -3139,6 +3320,7 @@ main(int argc, char **argv)
case 'D':
case 'R':
case 'S':
+ case 'Z':
dump_opt[c]++;
dump_all = 0;
break;
@@ -3197,6 +3379,9 @@ main(int argc, char **argv)
if (dump_all)
verbose = MAX(verbose, 1);
+ if (dump_opt['Z'])
+ dump_opt['R'] = 1;
+
for (c = 0; c < 256; c++) {
if (dump_all && !strchr("elAFLRSXP", c))
dump_opt[c] = 1;
@@ -3325,7 +3510,7 @@ main(int argc, char **argv)
flagbits['r'] = ZDB_FLAG_RAW;
for (i = 0; i < argc; i++)
- zdb_read_block(argv[i], spa);
+ zdb_read_block(argv[i], spa, dump_opt['Z']);
}
(os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG);
More information about the freebsd-fs
mailing list