svn commit: r218459 - projects/graid/head/sys/geom/raid

Tue Feb 8 22:09:00 UTC 2011

Author: mav
Date: Tue Feb  8 22:09:00 2011
New Revision: 218459
URL: http://svn.freebsd.org/changeset/base/218459

Log:
  Fix bad sector recovery:
   - Lock region on first recovery read, not ot write. Locking on write is
  too late, as some other write can slip in before it.
   - Add optional ignore BIO argument to g_raid_lock_range(). It is needed to
  not block on request that caused this recovery. This fixes dead lock on
  bad sector recovery.
   - Prepare some ground to do multiple recovery reads for 3+ disks RAID0s.

Modified:
  projects/graid/head/sys/geom/raid/g_raid.c
  projects/graid/head/sys/geom/raid/g_raid.h
  projects/graid/head/sys/geom/raid/tr_raid1.c

Modified: projects/graid/head/sys/geom/raid/g_raid.c
==============================================================================

--- projects/graid/head/sys/geom/raid/g_raid.c	Tue Feb  8 22:08:00 2011	(r218458)
+++ projects/graid/head/sys/geom/raid/g_raid.c	Tue Feb  8 22:09:00 2011	(r218459)
@@ -1001,7 +1001,8 @@ g_raid_iodone(struct bio *bp, int error)
 }
 
 int
-g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len, void *argp)
+g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len,
+    struct bio *ignore, void *argp)
 {
 	struct g_raid_softc *sc;
 	struct g_raid_lock *lp;
@@ -1016,7 +1017,7 @@ g_raid_lock_range(struct g_raid_volume *
 
 	lp->l_pending = 0;
 	TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) {
-		if (g_raid_bio_overlaps(bp, off, len))
+		if (bp != ignore && g_raid_bio_overlaps(bp, off, len))
 			lp->l_pending++;
 	}	
 

Modified: projects/graid/head/sys/geom/raid/g_raid.h
==============================================================================
--- projects/graid/head/sys/geom/raid/g_raid.h	Tue Feb  8 22:08:00 2011	(r218458)
+++ projects/graid/head/sys/geom/raid/g_raid.h	Tue Feb  8 22:09:00 2011	(r218459)
@@ -377,7 +377,8 @@ struct g_raid_subdisk * g_raid_get_subdi
 #define	G_RAID_DESTROY_HARD		2
 int g_raid_destroy(struct g_raid_softc *sc, int how);
 int g_raid_event_send(void *arg, int event, int flags);
-int g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len, void *argp);
+int g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len,
+    struct bio *ignore, void *argp);
 int g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len);
 
 g_ctl_req_t g_raid_ctl;

Modified: projects/graid/head/sys/geom/raid/tr_raid1.c
==============================================================================
--- projects/graid/head/sys/geom/raid/tr_raid1.c	Tue Feb  8 22:08:00 2011	(r218458)
+++ projects/graid/head/sys/geom/raid/tr_raid1.c	Tue Feb  8 22:09:00 2011	(r218459)
@@ -255,7 +255,7 @@ g_raid_tr_raid1_rebuild_some(struct g_ra
 	trs->trso_flags |= TR_RAID1_F_DOING_SOME;
 	trs->trso_flags |= TR_RAID1_F_LOCKED;
 	g_raid_lock_range(sd->sd_volume,	/* Lock callback starts I/O */
-	   bp->bio_offset, bp->bio_length, bp);
+	   bp->bio_offset, bp->bio_length, NULL, bp);
 }
 
 static void
@@ -737,7 +737,7 @@ g_raid_tr_iodone_raid1(struct g_raid_tr_
 				/* Lock callback starts I/O */
 				trs->trso_flags |= TR_RAID1_F_LOCKED;
 				g_raid_lock_range(sd->sd_volume,
-				    bp->bio_offset, bp->bio_length, bp);
+				    bp->bio_offset, bp->bio_length, NULL, bp);
 			}
 		} else if (trs->trso_type == TR_RAID1_RESYNC) {
 			/*
@@ -749,8 +749,8 @@ g_raid_tr_iodone_raid1(struct g_raid_tr_
 		}
 		return;
 	}
-	if (bp->bio_error != 0 && bp->bio_cmd == BIO_READ &&
-	    pbp->bio_children == 1 && bp->bio_cflags == 0) {
+	pbp->bio_inbed++;
+	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
 		/*
 		 * Read failed on first drive.  Retry the read error on
 		 * another disk drive, if available, before erroring out the
@@ -758,9 +758,9 @@ g_raid_tr_iodone_raid1(struct g_raid_tr_
 		 */
 		vol = tr->tro_volume;
 		sd->sd_read_errs++;
-		G_RAID_LOGREQ(3, bp,
-		    "Read failure, attempting recovery. %d total read errs",
-		    sd->sd_read_errs);
+		G_RAID_LOGREQ(0, bp,
+		    "Read error (%d), %d read errors total",
+		    bp->bio_error, sd->sd_read_errs);
 
 		/*
 		 * If there are too many read errors, we move to degraded.
@@ -768,13 +768,18 @@ g_raid_tr_iodone_raid1(struct g_raid_tr_
 		 * everything to get it back in sync), or just degrade the
 		 * drive, which kicks off a resync?
 		 */
-		if (sd->sd_read_errs > g_raid1_read_err_thresh)
+		if (sd->sd_read_errs > g_raid1_read_err_thresh) {
 			g_raid_fail_disk(sd->sd_softc, sd, sd->sd_disk);
+			if (pbp->bio_children == 1)
+				goto remapdone;
+		}
 
 		/*
 		 * Find the other disk, and try to do the I/O to it.
 		 */
 		for (nsd = NULL, i = 0; i < vol->v_disks_count; i++) {
+			if (pbp->bio_children > 1)
+				break;
 			nsd = &vol->v_subdisks[i];
 			if (sd == nsd)
 				continue;
@@ -784,8 +789,12 @@ g_raid_tr_iodone_raid1(struct g_raid_tr_
 			if (cbp == NULL)
 				break;
 			G_RAID_LOGREQ(2, cbp, "Retrying read");
-			g_raid_subdisk_iostart(nsd, cbp);
-			pbp->bio_inbed++;
+			pbp->bio_driver1 = sd; /* Save original subdisk. */
+			cbp->bio_caller1 = nsd;
+			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
+			/* Lock callback starts I/O */
+			g_raid_lock_range(sd->sd_volume,
+			    cbp->bio_offset, cbp->bio_length, pbp, cbp);
 			return;
 		}
 		/*
@@ -796,9 +805,8 @@ g_raid_tr_iodone_raid1(struct g_raid_tr_
 		 */
 		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
 	}
-	pbp->bio_inbed++;
-	if (pbp->bio_cmd == BIO_READ && pbp->bio_children == 2 &&
-	    bp->bio_cflags == 0) {
+	if (bp->bio_cmd == BIO_READ && bp->bio_error == 0 &&
+	    pbp->bio_children > 1) {
 		/*
 		 * If it was a read, and bio_children is 2, then we just
 		 * recovered the data from the second drive.  We should try to
@@ -813,17 +821,15 @@ g_raid_tr_iodone_raid1(struct g_raid_tr_
 		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
 		cbp = g_clone_bio(pbp);
 		if (cbp != NULL) {
-			nsd = bp->bio_caller1;
 			cbp->bio_cmd = BIO_WRITE;
 			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
-			cbp->bio_caller1 = nsd;
-			G_RAID_LOGREQ(3, bp,
+			G_RAID_LOGREQ(3, cbp,
 			    "Attempting bad sector remap on failing drive.");
-			/* Lock callback starts I/O */
-			g_raid_lock_range(sd->sd_volume,
-			    cbp->bio_offset, cbp->bio_length, cbp);
+			g_raid_subdisk_iostart(pbp->bio_driver1, cbp);
+			return;
 		}
 	}
+remapdone:
 	if (bp->bio_cflags & G_RAID_BIO_FLAG_REMAP) {
 		/*
 		 * We're done with a remap write, mark the range as unlocked.
@@ -834,14 +840,15 @@ g_raid_tr_iodone_raid1(struct g_raid_tr_
 		 * it now.  However, we need to reset error to 0 in that case
 		 * because we're not failing the original I/O which succeeded.
 		 */
-		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
-		g_raid_unlock_range(sd->sd_volume, bp->bio_offset,
-		    bp->bio_length);
-		if (bp->bio_error) {
-			G_RAID_LOGREQ(3, bp, "Error on remap: mark subdisk bad.");
+		if (pbp->bio_cmd == BIO_WRITE && bp->bio_error) {
+			G_RAID_LOGREQ(0, bp, "Remap write failed: "
+			    "failing subdisk.");
 			g_raid_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 			bp->bio_error = 0;
 		}
+		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
+		g_raid_unlock_range(sd->sd_volume, bp->bio_offset,
+		    bp->bio_length);
 	}
 	error = bp->bio_error;
 	g_destroy_bio(bp);