svn commit: r218486 - projects/graid/head/sys/geom/raid

Alexander Motin mav at FreeBSD.org
Wed Feb 9 15:40:13 UTC 2011


Author: mav
Date: Wed Feb  9 15:40:13 2011
New Revision: 218486
URL: http://svn.freebsd.org/changeset/base/218486

Log:
  Implement a more advanced algorithm for choosing the disk to read from
  in RAID1.  The general idea is the same as in the gmirror balance
  algorithm.  Take into account: subdisk state, running error recovery,
  average disk load, head position and possible cache hits.
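
To give a sense of the weighting chosen below (numbers taken from the diff):
each pending recovery on a subdisk costs min(count, 255) << 22, each state
step below ACTIVE costs 1 << 16, average load is scaled by 256, and a head
sitting exactly at the request offset earns a bonus of 2 * 256 (1 * 256 if
merely within 1MB).  Recovery thus dominates state, state dominates load,
while load and head locality compete on the same scale, so an exact head
match outweighs up to two requests of steady average load.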

Modified:
  projects/graid/head/sys/geom/raid/g_raid.c
  projects/graid/head/sys/geom/raid/g_raid.h
  projects/graid/head/sys/geom/raid/tr_raid1.c

Modified: projects/graid/head/sys/geom/raid/g_raid.c
==============================================================================
--- projects/graid/head/sys/geom/raid/g_raid.c	Wed Feb  9 15:33:13 2011	(r218485)
+++ projects/graid/head/sys/geom/raid/g_raid.c	Wed Feb  9 15:40:13 2011	(r218486)
@@ -1074,7 +1074,7 @@ void
 g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp)
 {
 	struct g_consumer *cp;
-	struct g_raid_disk *disk;
+	struct g_raid_disk *disk, *tdisk;
 
 	bp->bio_caller1 = sd;
 
@@ -1104,6 +1104,17 @@ nodisk:
 	bp->bio_from = cp;
 	bp->bio_to = cp->provider;
 	cp->index++;
+
+	/* Update average disks load. */
+	TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) {
+		if (tdisk->d_consumer == NULL)
+			tdisk->d_load = 0;
+		else
+			tdisk->d_load = (tdisk->d_consumer->index *
+			    G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8;
+	}
+
+	disk->d_last_offset = bp->bio_offset + bp->bio_length;
 	if (dumping) {
 		G_RAID_LOGREQ(3, bp, "Sending dumping request.");
 		if (bp->bio_cmd == BIO_WRITE) {
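
The d_load update above is an exponentially weighted moving average of the
consumer's in-flight request count (d_consumer->index), scaled by
G_RAID_SUBDISK_LOAD_SCALE (256) and decayed by 7/8 on every I/O start.  A
minimal user-space sketch of the same filter, assuming a constant one
request in flight (stand-alone illustration, not the kernel code):

	#include <stdio.h>

	#define LOAD_SCALE	256	/* G_RAID_SUBDISK_LOAD_SCALE */

	int
	main(void)
	{
		int load = 0;		/* d_load */
		int inflight = 1;	/* d_consumer->index */
		int i;

		for (i = 0; i < 20; i++) {
			/* Same filter as in g_raid_subdisk_iostart(). */
			load = (inflight * LOAD_SCALE + load * 7) / 8;
			printf("step %2d: load = %3d\n", i, load);
		}
		/* load climbs 32, 60, 84, ... toward inflight * 256. */
		return (0);
	}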

Modified: projects/graid/head/sys/geom/raid/g_raid.h
==============================================================================
--- projects/graid/head/sys/geom/raid/g_raid.h	Wed Feb  9 15:33:13 2011	(r218485)
+++ projects/graid/head/sys/geom/raid/g_raid.h	Wed Feb  9 15:40:13 2011	(r218486)
@@ -147,8 +147,8 @@ struct g_raid_disk {
 	struct g_consumer	*d_consumer;	/* GEOM disk consumer. */
 	void			*d_md_data;	/* Disk's metadata storage. */
 	struct g_kerneldump	 d_kd;		/* Kernel dumping method/args. */
-	u_int			 d_state;	/* Disk state. */
 	uint64_t		 d_flags;	/* Additional flags. */
+	u_int			 d_state;	/* Disk state. */
 	u_int			 d_load;	/* Disk average load. */
 	off_t			 d_last_offset;	/* Last head offset. */
 	TAILQ_HEAD(, g_raid_subdisk)	 d_subdisks; /* List of subdisks. */
@@ -169,6 +169,13 @@ struct g_raid_disk {
 #define G_RAID_SUBDISK_E_DISCONNECTED	0x03	/* A subdisk removed from volume. */
 #define G_RAID_SUBDISK_E_FIRST_TR_PRIVATE 0x80	/* translation private events */
 
+#define G_RAID_SUBDISK_POS(sd)						\
+    ((sd)->sd_disk ? ((sd)->sd_disk->d_last_offset - (sd)->sd_offset) : 0)
+#define G_RAID_SUBDISK_TRACK_SIZE	(1 * 1024 * 1024)
+#define G_RAID_SUBDISK_LOAD(sd)						\
+    ((sd)->sd_disk ? ((sd)->sd_disk->d_load) : 0)
+#define G_RAID_SUBDISK_LOAD_SCALE	256
+
 struct g_raid_subdisk {
 	struct g_raid_softc	*sd_softc;	/* Back-pointer to softc. */
 	struct g_raid_disk	*sd_disk;	/* Where this subdisk lives. */
@@ -179,6 +186,7 @@ struct g_raid_subdisk {
 	u_int			 sd_state;	/* Subdisk state. */
 	off_t			 sd_rebuild_pos; /* Rebuild position. */
 	int			 sd_read_errs;  /* Count of the read errors */
+	int			 sd_recovery;	/* Count of recovery reqs. */
 	TAILQ_ENTRY(g_raid_subdisk)	 sd_next; /* Next subdisk on disk. */
 };
 

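The new macros convert the per-disk head position (d_last_offset) into a
subdisk-relative offset for comparison against a request's bio_offset, and
define a 1MB window treated as "close to the head".  A hypothetical
stand-alone example of the position math (all values invented):

	#include <sys/types.h>
	#include <stdio.h>

	#define TRACK_SIZE	(1 * 1024 * 1024) /* G_RAID_SUBDISK_TRACK_SIZE */

	int
	main(void)
	{
		off_t sd_offset = 2LL << 30;	/* subdisk starts 2GB into disk */
		off_t d_last_offset = (2LL << 30) + 65536; /* head after last I/O */
		off_t pos = d_last_offset - sd_offset;	/* G_RAID_SUBDISK_POS() */
		off_t bio_offset = 65536;	/* next read lands right there */
		off_t dist = (pos > bio_offset) ? pos - bio_offset :
		    bio_offset - pos;

		if (pos == bio_offset)
			printf("exact match: likely cache hit\n");
		else if (dist < TRACK_SIZE)
			printf("near head: short seek\n");
		else
			printf("far: full seek\n");
		return (0);
	}
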
Modified: projects/graid/head/sys/geom/raid/tr_raid1.c
==============================================================================
--- projects/graid/head/sys/geom/raid/tr_raid1.c	Wed Feb  9 15:33:13 2011	(r218485)
+++ projects/graid/head/sys/geom/raid/tr_raid1.c	Wed Feb  9 15:40:13 2011	(r218486)
@@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/endian.h>
 #include <sys/kernel.h>
 #include <sys/kobj.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
@@ -490,22 +491,43 @@ g_raid_tr_stop_raid1(struct g_raid_tr_ob
 }
 
 /*
- * Select the disk to do the reads to.  For now, we just pick the first one in
- * the list that's active always.  This ensures we favor one disk on boot, and
- * have more deterministic recovery from the weird edge cases of power
- * failure.  In the future, we can imagine policies that go for the least
- * loaded disk to improve performance, or we need to limit reads to a disk
- * during some kind of error recovery with that disk.
+ * Select the disk to read from.  Take into account: subdisk state, running
+ * error recovery, average disk load, head position and possible cache hits.
  */
+#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
 static struct g_raid_subdisk *
-g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol)
+g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol, struct bio *bp)
 {
-	int i;
+	struct g_raid_subdisk *sd, *best;
+	int i, prio, bestprio;
 
-	for (i = 0; i < vol->v_disks_count; i++)
-		if (vol->v_subdisks[i].sd_state == G_RAID_SUBDISK_S_ACTIVE)
-			return (&vol->v_subdisks[i]);
-	return (NULL);
+	best = NULL;
+	bestprio = INT_MAX;
+	for (i = 0; i < vol->v_disks_count; i++) {
+		sd = &vol->v_subdisks[i];
+		if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE &&
+		    !((sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
+		       sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && 
+		      bp->bio_offset + bp->bio_length <
+		       sd->sd_rebuild_pos))
+			continue;
+		prio = G_RAID_SUBDISK_LOAD(sd);
+		prio += min(sd->sd_recovery, 255) << 22;
+		prio += (G_RAID_SUBDISK_S_ACTIVE - sd->sd_state) << 16;
+		/* If disk head is precisely in position - highly prefer it. */
+		if (G_RAID_SUBDISK_POS(sd) == bp->bio_offset)
+			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
+		else
+		/* If disk head is close to position - prefer it. */
+		if (ABS(G_RAID_SUBDISK_POS(sd) - bp->bio_offset) <
+		    G_RAID_SUBDISK_TRACK_SIZE)
+			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
+		if (prio < bestprio) {
+			best = sd;
+			bestprio = prio;
+		}
+	}
+	return (best);
 }
 
 static void
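
Note how the loop above gates REBUILD/RESYNC subdisks: they may serve a read
only if the entire request ends below sd_rebuild_pos.  A hypothetical
stand-alone version of just that eligibility test (names invented):

	#include <sys/types.h>
	#include <stdio.h>

	/* Whole request must lie inside the already-synchronized region. */
	static int
	rebuild_readable(off_t offset, off_t length, off_t rebuild_pos)
	{

		return (offset + length < rebuild_pos);
	}

	int
	main(void)
	{
		off_t pos = 100LL * 1024 * 1024;	/* 100MB rebuilt so far */

		printf("%d\n", rebuild_readable(0, 65536, pos));    /* 1: ok */
		printf("%d\n", rebuild_readable(pos, 65536, pos));  /* 0: not yet */
		return (0);
	}
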
@@ -514,7 +536,7 @@ g_raid_tr_iostart_raid1_read(struct g_ra
 	struct g_raid_subdisk *sd;
 	struct bio *cbp;
 
-	sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume);
+	sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume, bp);
 	KASSERT(sd != NULL, ("No active disks in volume %s.",
 		tr->tro_volume->v_name));
 
@@ -832,6 +854,7 @@ rebuild_round_done:
 				break;
 			G_RAID_LOGREQ(2, cbp, "Retrying read");
 			if (pbp->bio_children == 2 && do_write) {
+				sd->sd_recovery++;
 				cbp->bio_caller1 = nsd;
 				pbp->bio_pflags = G_RAID_BIO_FLAG_LOCKED;
 				/* Lock callback starts I/O */
@@ -892,6 +915,10 @@ rebuild_round_done:
 			g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
 			bp->bio_error = 0;
 		}
+		if (pbp->bio_driver1 != NULL) {
+			((struct g_raid_subdisk *)pbp->bio_driver1)
+			    ->sd_recovery--;
+		}
 		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
 		g_raid_unlock_range(sd->sd_volume, bp->bio_offset,
 		    bp->bio_length);
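
The sd_recovery counter pairs the earlier increment (a remap write was
started after a read error) with the decrement above (the remap finished).
While it is nonzero, the selection in tr_raid1.c charges that subdisk

	min(sd_recovery, 255) << 22	/* >= 4194304 per pending recovery */

which dwarfs both the state term (1 << 16 per step below ACTIVE) and the
load/locality terms (scale 256), so reads avoid a recovering disk unless no
other subdisk is eligible.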

