ZDB -Z?
Andrew Heybey
ath at niksun.com
Wed Dec 10 00:38:57 UTC 2014
On 11/24/14 1:49 PM, Zaphod Beeblebrox wrote:
> I'm reading about someone else's recovery of files from a damaged ZFS
> partition. He claims to have added (possibly to opensolaris or whatnot) an
> argument to zdb '-Z' ... which operates somewhat like -R, but which
> highlights what parts of the region are on what physical disks, and which
> are parity.
>
> Has anyone patched this into FreeBSD?
Sorry for the late reply, I am behind on my mailing list reading.
I assume you were looking at this post:
http://mbruning.blogspot.com/2009_12_01_archive.html
I was also recently trying to recover data in a ZFS pool. I made an ugly attempt at -Z for zdb. It will not work for anything but RAIDZ pools (I tried it on one containing two 6-disk raidz1 vdevs). The diff (against FreeBSD 10) is in this email.
I copy-pasted the function vdev_raidz_map() out of libzfs, since it is declared static there and therefore not callable externally. It is not very tasteful, but it worked for me.
andrew
commit 86ab9e2dab7e76dcdf527d2aa6b84a2fe429ee28
Author: Andrew Heybey <ath at niksun.com>
Date: Tue Nov 18 15:00:57 2014 -0500
zdb: Add -Z flag like http://mbruning.blogspot.com/2009/12/zfs-raidz-data-walk.html
diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
index c265c99..bf43ea1 100644
--- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c
+++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
@@ -59,6 +59,7 @@
#include <sys/ddt.h>
#include <sys/zfeature.h>
#include <zfs_comutil.h>
+#include <sys/vdev_raidz.h>
#undef ZFS_MAXNAMELEN
#undef verify
#include <libzfs.h>
@@ -2745,6 +2746,168 @@ zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
}
}
+
+/*
+ * One column of a RAIDZ stripe map: which child vdev (rc_devidx), at
+ * what byte offset on that child (rc_offset), and how much of the I/O
+ * (rc_size) lands there.  Apparently copied along with vdev_raidz_map()
+ * below from the ZFS sources, since the original type is not exported.
+ */
+typedef struct raidz_col {
+ uint64_t rc_devidx; /* child device index for I/O */
+ uint64_t rc_offset; /* device offset */
+ uint64_t rc_size; /* I/O size */
+ void *rc_data; /* I/O data */
+ void *rc_gdata; /* used to store the "good" version */
+ int rc_error; /* I/O error for this device */
+ uint8_t rc_tried; /* Did we attempt this I/O column? */
+ uint8_t rc_skipped; /* Did we skip this I/O column? */
+} raidz_col_t;
+
+/*
+ * Describes how one logical I/O maps onto the columns (children) of a
+ * RAIDZ vdev.  Variable-length: rm_col[1] is an old-style trailing
+ * array, so instances are allocated and freed with
+ * offsetof(raidz_map_t, rm_col[scols]) for the real column count.
+ * Local copy of the non-exported ZFS type, to match vdev_raidz_map()
+ * below.
+ */
+typedef struct raidz_map {
+ uint64_t rm_cols; /* Regular column count */
+ uint64_t rm_scols; /* Count including skipped columns */
+ uint64_t rm_bigcols; /* Number of oversized columns */
+ uint64_t rm_asize; /* Actual total I/O size */
+ uint64_t rm_missingdata; /* Count of missing data devices */
+ uint64_t rm_missingparity; /* Count of missing parity devices */
+ uint64_t rm_firstdatacol; /* First data column/parity count */
+ uint64_t rm_nskip; /* Skipped sectors for padding */
+ uint64_t rm_skipstart; /* Column index of padding start */
+ void *rm_datacopy; /* rm_asize-buffer of copied data */
+ uintptr_t rm_reports; /* # of referencing checksum reports */
+ uint8_t rm_freed; /* map no longer has referencing ZIO */
+ uint8_t rm_ecksuminjected; /* checksum error was injected */
+ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
+} raidz_map_t;
+
+/*
+ * Build a raidz_map_t describing how a logical I/O of `size' bytes at
+ * byte `offset' within a RAIDZ vdev is spread across its children:
+ * for each column, which child (rc_devidx), at what byte offset on
+ * that child (rc_offset), and how many bytes (rc_size; zero for
+ * skipped columns).  unit_shift is the vdev's ashift, dcols the child
+ * count, and nparity the parity level (1 for raidz1, etc.) -- see the
+ * call site in zdb_read_block().
+ *
+ * Divides the IO evenly across all child vdevs; usually, dcols is
+ * the number of children in the target vdev.
+ *
+ * Copy-pasted from vdev_raidz in the ZFS sources, where it is static
+ * and therefore not callable externally.  The caller owns the result
+ * and frees it with
+ * umem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])).
+ */
+raidz_map_t*
+vdev_raidz_map(uint64_t size, uint64_t offset, uint64_t unit_shift,
+ uint64_t dcols, uint64_t nparity)
+{
+ raidz_map_t* rm;
+ /* The starting RAIDZ (parent) vdev sector of the block. */
+ uint64_t b = offset >> unit_shift;
+ /* The zio's size in units of the vdev's minimum sector size. */
+ uint64_t s = size >> unit_shift;
+ /* The first column for this stripe. */
+ uint64_t f = b % dcols;
+ /* The starting byte offset on each child vdev. */
+ uint64_t o = (b / dcols) << unit_shift;
+ uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
+
+ /*
+ * "Quotient": The number of data sectors for this stripe on all but
+ * the "big column" child vdevs that also contain "remainder" data.
+ */
+ q = s / (dcols - nparity);
+
+ /*
+ * "Remainder": The number of partial stripe data sectors in this I/O.
+ * This will add a sector to some, but not all, child vdevs.
+ */
+ r = s - q * (dcols - nparity);
+
+ /* The number of "big columns" - those which contain remainder data. */
+ bc = (r == 0 ? 0 : r + nparity);
+
+ /*
+ * The total number of data and parity sectors associated with
+ * this I/O.
+ */
+ tot = s + nparity * (q + (r == 0 ? 0 : 1));
+
+ /* acols: The columns that will be accessed. */
+ /* scols: The columns that will be accessed or skipped. */
+ if (q == 0) {
+ /* Our I/O request doesn't span all child vdevs. */
+ acols = bc;
+ scols = MIN(dcols, roundup(bc, nparity + 1));
+ } else {
+ acols = dcols;
+ scols = dcols;
+ }
+
+ /* Variable-length allocation: scols trailing raidz_col_t slots. */
+ rm = umem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
+
+ rm->rm_cols = acols;
+ rm->rm_scols = scols;
+ rm->rm_bigcols = bc;
+ rm->rm_skipstart = bc;
+ rm->rm_missingdata = 0;
+ rm->rm_missingparity = 0;
+ rm->rm_firstdatacol = nparity;
+ rm->rm_datacopy = NULL;
+ rm->rm_reports = 0;
+ rm->rm_freed = 0;
+ rm->rm_ecksuminjected = 0;
+
+ asize = 0;
+
+ /* Fill in per-column device index, offset and size. */
+ for (c = 0; c < scols; c++) {
+ col = f + c;
+ coff = o;
+ if (col >= dcols) {
+ /* Wrapped past the last child: next row, one sector down. */
+ col -= dcols;
+ coff += 1ULL << unit_shift;
+ }
+ rm->rm_col[c].rc_devidx = col;
+ rm->rm_col[c].rc_offset = coff;
+ rm->rm_col[c].rc_data = NULL;
+ rm->rm_col[c].rc_gdata = NULL;
+ rm->rm_col[c].rc_error = 0;
+ rm->rm_col[c].rc_tried = 0;
+ rm->rm_col[c].rc_skipped = 0;
+
+ if (c >= acols)
+ rm->rm_col[c].rc_size = 0;
+ else if (c < bc)
+ rm->rm_col[c].rc_size = (q + 1) << unit_shift;
+ else
+ rm->rm_col[c].rc_size = q << unit_shift;
+
+ asize += rm->rm_col[c].rc_size;
+ }
+
+ rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
+ rm->rm_nskip = roundup(tot, nparity + 1) - tot;
+
+ /*
+ * If all data stored spans all columns, there's a danger that parity
+ * will always be on the same device and, since parity isn't read
+ * during normal operation, that that device's I/O bandwidth won't be
+ * used effectively. We therefore switch the parity every 1MB.
+ *
+ * ... at least that was, ostensibly, the theory. As a practical
+ * matter unless we juggle the parity between all devices evenly, we
+ * won't see any benefit. Further, occasional writes that aren't a
+ * multiple of the LCM of the number of children and the minimum
+ * stripe width are sufficient to avoid pessimal behavior.
+ * Unfortunately, this decision created an implicit on-disk format
+ * requirement that we need to support for all eternity, but only
+ * for single-parity RAID-Z.
+ *
+ * If we intend to skip a sector in the zeroth column for padding
+ * we must make sure to note this swap. We will never intend to
+ * skip the first column since at least one data and one parity
+ * column must appear in each row.
+ */
+ if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
+ devidx = rm->rm_col[0].rc_devidx;
+ o = rm->rm_col[0].rc_offset;
+ rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
+ rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
+ rm->rm_col[1].rc_devidx = devidx;
+ rm->rm_col[1].rc_offset = o;
+
+ if (rm->rm_skipstart == 0)
+ rm->rm_skipstart = 1;
+ }
+
+ return (rm);
+}
+
+
/*
* There are two acceptable formats:
* leaf_name - For example: c1t0d0 or /tmp/ztest.0a
@@ -2803,8 +2966,10 @@ name:
}
/*
- * Read a block from a pool and print it out. The syntax of the
- * block descriptor is:
+ * Read a block from a pool and print it out, or (if Zflag is true)
+ * print out where the block is found on the constituents of the vdev.
+ *
+ * The syntax of the block descriptor is:
*
* pool:vdev_specifier:offset:size[:flags]
*
@@ -2825,7 +2990,7 @@ name:
* * = not yet implemented
*/
static void
-zdb_read_block(char *thing, spa_t *spa)
+zdb_read_block(char *thing, spa_t *spa, boolean_t Zflag)
{
blkptr_t blk, *bp = &blk;
dva_t *dva = bp->blk_dva;
@@ -2904,6 +3069,22 @@ zdb_read_block(char *thing, spa_t *spa)
psize = size;
lsize = size;
+ if (Zflag) {
+ raidz_map_t* rm;
+ rm = vdev_raidz_map(psize, offset, vd->vdev_ashift,
+ vd->vdev_children, vd->vdev_nparity);
+ (void) printf("columns %lu bigcols %lu asize %lu firstdatacol %lu\n",
+ rm->rm_cols, rm->rm_bigcols, rm->rm_asize,
+ rm->rm_firstdatacol);
+ for (int c = 0; c < rm->rm_scols; ++c) {
+ raidz_col_t* rc = &rm->rm_col[c];
+ (void) printf("devidx %lu offset 0x%lx size 0x%lx\n",
+ rc->rc_devidx, rc->rc_offset, rc->rc_size);
+ }
+ umem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
+ return;
+ }
+
pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
@@ -3124,7 +3305,7 @@ main(int argc, char **argv)
dprintf_setup(&argc, argv);
- while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:P")) != -1) {
+ while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:PZ")) != -1) {
switch (c) {
case 'b':
case 'c':
@@ -3139,6 +3320,7 @@ main(int argc, char **argv)
case 'D':
case 'R':
case 'S':
+ case 'Z':
dump_opt[c]++;
dump_all = 0;
break;
@@ -3197,6 +3379,9 @@ main(int argc, char **argv)
if (dump_all)
verbose = MAX(verbose, 1);
+ if (dump_opt['Z'])
+ dump_opt['R'] = 1;
+
for (c = 0; c < 256; c++) {
if (dump_all && !strchr("elAFLRSXP", c))
dump_opt[c] = 1;
@@ -3325,7 +3510,7 @@ main(int argc, char **argv)
flagbits['r'] = ZDB_FLAG_RAW;
for (i = 0; i < argc; i++)
- zdb_read_block(argv[i], spa);
+ zdb_read_block(argv[i], spa, dump_opt['Z']);
}
(os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG);
More information about the freebsd-fs
mailing list