svn commit: r216984 - projects/graid/head/sys/geom/raid

Wed Jan 5 00:19:41 UTC 2011

Author: imp
Date: Wed Jan  5 00:19:40 2011
New Revision: 216984
URL: http://svn.freebsd.org/changeset/base/216984

Log:
  First pass at error recovery: if the first disk that we get errors on
  has a problem, try from the second one.  Note info about possible bad
  sector remap attempt through write, and some ideas on when to eject
  the subdisk from the disk.

Modified:
  projects/graid/head/sys/geom/raid/g_raid.h
  projects/graid/head/sys/geom/raid/tr_raid1.c

Modified: projects/graid/head/sys/geom/raid/g_raid.h
==============================================================================

--- projects/graid/head/sys/geom/raid/g_raid.h	Tue Jan  4 23:00:54 2011	(r216983)
+++ projects/graid/head/sys/geom/raid/g_raid.h	Wed Jan  5 00:19:40 2011	(r216984)
@@ -137,6 +137,7 @@ struct g_raid_subdisk {
 	off_t			 sd_size;	/* Size on the disk. */
 	u_int			 sd_pos;	/* Position in volume. */
 	u_int			 sd_state;	/* Subdisk state. */
+	int			 sd_read_errs;  /* Count of the read errors */
 	LIST_ENTRY(g_raid_subdisk)	 sd_next; /* Next subdisk on disk. */
 };
 

Modified: projects/graid/head/sys/geom/raid/tr_raid1.c
==============================================================================
--- projects/graid/head/sys/geom/raid/tr_raid1.c	Tue Jan  4 23:00:54 2011	(r216983)
+++ projects/graid/head/sys/geom/raid/tr_raid1.c	Wed Jan  5 00:19:40 2011	(r216984)
@@ -164,24 +164,34 @@ g_raid_tr_stop_raid1(struct g_raid_tr_ob
 	return (0);
 }
 
+/*
+ * Select the disk to do the reads to.  For now, we just pick the
+ * first one in the list that's active always.  This ensures we favor
+ * one disk on boot, and have more deterministic recovery from the
+ * weird edge cases of power failure.  In the future, we can imagine
+ * policies that go for the least loaded disk to improve performance,
+ * or we need to limit reads to a disk during some kind of error
+ * recovery with that disk.
+ */
+static struct g_raid_subdisk *
+g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol)
+{
+	int i;
+
+	for (i = 0; i < vol->v_disks_count; i++)
+		if (vol->v_subdisks[i].sd_state == G_RAID_SUBDISK_S_ACTIVE)
+			return (&vol->v_subdisks[i]);
+	return (NULL);
+}
+
 static void
 g_raid_tr_iostart_raid1_read(struct g_raid_tr_object *tr, struct bio *bp)
 {
-	struct g_raid_softc *sc;
-	struct g_raid_volume *vol;
 	struct g_raid_subdisk *sd;
 	struct bio *cbp;
-	int i;
 
-	vol = tr->tro_volume;
-	sc = vol->v_softc;
-	sd = NULL;
-	for (i = 0; i < vol->v_disks_count; i++) {
-		if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_ACTIVE)
-			continue;
-		sd = &vol->v_subdisks[i];
-	}
-	KASSERT(sd != NULL, ("No active disks in volume %s.", vol->v_name));
+	sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume);
+	KASSERT(sd != NULL, ("No active disks in volume %s.", tr->tro_volume->v_name));
 
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
@@ -283,6 +293,53 @@ g_raid_tr_iodone_raid1(struct g_raid_tr_
 
 	pbp = bp->bio_parent;
 	pbp->bio_inbed++;
+	if ((pbp->bio_flags & BIO_ERROR) && pbp->bio_cmd == BIO_READ &&
+	    pbp->bio_children == 1) {
+		struct bio *cbp;
+		struct g_raid_subdisk *nsd;
+		struct g_raid_volume *vol;
+		int i;
+
+		/*
+		 * Retry the error on the other disk drive, if available,
+		 * before erroring out the read.  Do we need to mark the
+		 * 'sd' disk as degraded somehow?
+		 */
+		vol = tr->tro_volume;
+		sd->sd_read_errs++;
+		/*
+		 * XXX Check threshold of sd_read_errs here to declare 
+		 * this subdisk bad?
+		 */
+		for (nsd = NULL, i = 0; i < vol->v_disks_count; i++) {
+			nsd = &vol->v_subdisks[i];
+			if (sd == nsd)
+				continue;
+			if (nsd->sd_state != G_RAID_SUBDISK_S_ACTIVE)
+				continue;
+			cbp = g_clone_bio(pbp);
+			if (cbp == NULL)
+				break;
+			g_raid_subdisk_iostart(nsd, cbp);
+			return;
+		}
+		/*
+		 * something happened, so we can't retry.  Return the
+		 * original error by falling through.
+		 */
+	}
+	/*
+	 * If it was a read, and bio_children is 2, then we just
+	 * recovered the data from the second drive.  We should try to
+	 * write that data to the first drive if sector remapping is
+	 * enabled.  A write should put the data in a new place on the
+	 * disk, remapping the bad sector.  Do we need to do that by
+	 * queueing a request to the main worker thread?  It doesn't
+	 * affect the return code of this current read, and can be
+	 * done at our liesure.
+	 *
+	 * XXX TODO
+	 */
 	if (pbp->bio_children == pbp->bio_inbed) {
 		pbp->bio_completed = pbp->bio_length;
 		g_raid_iodone(pbp, bp->bio_error);