ZFS patches

Chuck Silvers chs at highcloudsecurity.com
Thu May 31 00:32:36 UTC 2012


We only have a few changes to ZFS itself, and now that I look, I see that
you've found one of them independently (r230256).

The other ones are:

 - Improve performance of booting from a ZFS root under ESXi.
   Previously, the boot loader would sit there for about 5 minutes before even
   starting to load the kernel.  The problem is that the ZFS pool-discovery
   code opens every possible GPT partition looking for pools, and it rereads
   the GPT each time, one sector at a time.  We changed the GPT code to
   read the whole GPT in one shot, which reduced the delay to almost nothing.
   I remember seeing some discussion about a PR on this topic some time back,
   but I don't know if any fix was ever applied, and I don't see the PR now.
   As I recall, the proposal in that discussion was to improve the boot code's
   caching so that it wouldn't reread the GPT at all, which I imagine would
   work just as well as what we did.
   (Hmm, this isn't actually a change to ZFS either.)

 - Make zfs_resilver_delay and zfs_resilver_min_time_ms tunable via sysctl.


Patches for both of these are attached.

-Chuck
-------------- next part --------------
Index: sys/boot/i386/libi386/biosdisk.c
===================================================================
RCS file: /home/chs/freebsd/cvs/src/sys/boot/i386/libi386/biosdisk.c,v
retrieving revision 1.62.2.2.4.1
diff -u -p -r1.62.2.2.4.1 biosdisk.c
--- sys/boot/i386/libi386/biosdisk.c	21 Dec 2010 17:09:25 -0000	1.62.2.2.4.1
+++ sys/boot/i386/libi386/biosdisk.c	6 Jul 2011 20:44:31 -0000
@@ -853,9 +853,10 @@ bd_open_gpt(struct open_disk *od, struct
     struct gpt_hdr *hdr;
     struct gpt_ent *ent;
     struct gpt_part *gp;
-    int	entries_per_sec, error, i, part;
+    int	entries_per_sec, error, i, part, nsec;
     daddr_t lba, elba;
     char gpt[BIOSDISK_SECSIZE], tbl[BIOSDISK_SECSIZE];
+    char *buf, *tblp;
 
     /*
      * Following calculations attempt to determine the correct value
@@ -900,22 +901,31 @@ bd_open_gpt(struct open_disk *od, struct
 	return (EINVAL);
     }
 
+    entries_per_sec = BIOSDISK_SECSIZE / hdr->hdr_entsz;
+    nsec = hdr->hdr_entries / entries_per_sec;
+    if (nsec > 128) {
+	DEBUG("too many GPT table sectors %d", nsec);
+	return (EINVAL);
+    }
+    buf = alloca(nsec * BIOSDISK_SECSIZE);
+    if (bd_read(od, hdr->hdr_lba_table, nsec, buf)) {
+	DEBUG("error reading GPT table");
+	return (EIO);
+    }
+
     /* Now walk the partition table to count the number of valid partitions. */
     part = 0;
-    entries_per_sec = BIOSDISK_SECSIZE / hdr->hdr_entsz;
+    tblp = buf;
     elba = hdr->hdr_lba_table + hdr->hdr_entries / entries_per_sec;
     for (lba = hdr->hdr_lba_table; lba < elba; lba++) {
-	if (bd_read(od, lba, 1, tbl)) {
-	    DEBUG("error reading GPT table");
-	    return (EIO);
-	}
 	for (i = 0; i < entries_per_sec; i++) {
-	    ent = (struct gpt_ent *)(tbl + i * hdr->hdr_entsz);
+	    ent = (struct gpt_ent *)(tblp + i * hdr->hdr_entsz);
 	    if (uuid_is_nil(&ent->ent_type, NULL) || ent->ent_lba_start == 0 ||
 		ent->ent_lba_end < ent->ent_lba_start)
 		continue;
 	    part++;
 	}
+	tblp += BIOSDISK_SECSIZE;
     }
 
     /* Save the important information about all the valid partitions. */
@@ -923,14 +933,10 @@ bd_open_gpt(struct open_disk *od, struct
     if (part != 0) {
 	od->od_partitions = malloc(part * sizeof(struct gpt_part));
 	part = 0;	
+	tblp = buf;
 	for (lba = hdr->hdr_lba_table; lba < elba; lba++) {
-	    if (bd_read(od, lba, 1, tbl)) {
-		DEBUG("error reading GPT table");
-		error = EIO;
-		goto out;
-	    }
 	    for (i = 0; i < entries_per_sec; i++) {
-		ent = (struct gpt_ent *)(tbl + i * hdr->hdr_entsz);
+		ent = (struct gpt_ent *)(tblp + i * hdr->hdr_entsz);
 		if (uuid_is_nil(&ent->ent_type, NULL) ||
 		    ent->ent_lba_start == 0 ||
 		    ent->ent_lba_end < ent->ent_lba_start)
@@ -942,6 +948,7 @@ bd_open_gpt(struct open_disk *od, struct
 		od->od_partitions[part].gp_end = ent->ent_lba_end;
 		part++;
 	    }
+	    tblp += BIOSDISK_SECSIZE;
 	}
     }
     od->od_flags |= BD_GPTOK;
-------------- next part --------------
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c.ORG	2011-10-12 14:27:36.000000000 +0530
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c	2011-10-12 14:38:44.000000000 +0530
@@ -69,6 +69,14 @@
 enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
 int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
 
+SYSCTL_DECL(_vfs_zfs);
+TUNABLE_INT("vfs.zfs.zfs_resilver_delay", &zfs_resilver_delay);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, zfs_resilver_delay, CTLFLAG_RW,
+    &zfs_resilver_delay, 0, "Resilver delay");
+TUNABLE_INT("vfs.zfs.zfs_resilver_min_time_ms", &zfs_resilver_min_time_ms);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, zfs_resilver_min_time_ms, CTLFLAG_RW,
+    &zfs_resilver_min_time_ms, 0, "Resilver min time");
+
 #define	DSL_SCAN_IS_SCRUB_RESILVER(scn) \
 	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
 	(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)


More information about the zfs-devel mailing list