svn commit: r218486 - projects/graid/head/sys/geom/raid
Alexander Motin
mav at FreeBSD.org
Wed Feb 9 15:40:13 UTC 2011
Author: mav
Date: Wed Feb 9 15:40:13 2011
New Revision: 218486
URL: http://svn.freebsd.org/changeset/base/218486
Log:
Implement a more advanced algorithm for choosing the disk to read from in
RAID1. The general idea is the same as in the gmirror balance algorithm:
take into account subdisk state, running error recovery, average disk
load, head position and possible cache hits.
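
The priority terms are scaled to different magnitudes, so each criterion
effectively only breaks ties left by the more important ones above it.
With G_RAID_SUBDISK_LOAD_SCALE == 256 the score for each candidate
subdisk works out to approximately:

    prio = d_load                        (256 per in-flight request, smoothed)
         + min(sd_recovery, 255) << 22   (running recovery dominates all)
         + (S_ACTIVE - sd_state) << 16   (prefer healthier subdisk states)
         - 2 * 256 if the head is exactly at bio_offset (likely cache hit)
         - 1 * 256 if the head is within 1MB of it (same-track heuristic)

The subdisk with the lowest prio is selected.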
Modified:
projects/graid/head/sys/geom/raid/g_raid.c
projects/graid/head/sys/geom/raid/g_raid.h
projects/graid/head/sys/geom/raid/tr_raid1.c
Modified: projects/graid/head/sys/geom/raid/g_raid.c
==============================================================================
--- projects/graid/head/sys/geom/raid/g_raid.c Wed Feb 9 15:33:13 2011 (r218485)
+++ projects/graid/head/sys/geom/raid/g_raid.c Wed Feb 9 15:40:13 2011 (r218486)
@@ -1074,7 +1074,7 @@ void
g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp)
{
struct g_consumer *cp;
- struct g_raid_disk *disk;
+ struct g_raid_disk *disk, *tdisk;
bp->bio_caller1 = sd;
@@ -1104,6 +1104,17 @@ nodisk:
bp->bio_from = cp;
bp->bio_to = cp->provider;
cp->index++;
+
+ /* Update average disks load. */
+ TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) {
+ if (tdisk->d_consumer == NULL)
+ tdisk->d_load = 0;
+ else
+ tdisk->d_load = (tdisk->d_consumer->index *
+ G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8;
+ }
+
+ disk->d_last_offset = bp->bio_offset + bp->bio_length;
if (dumping) {
G_RAID_LOGREQ(3, bp, "Sending dumping request.");
if (bp->bio_cmd == BIO_WRITE) {
Modified: projects/graid/head/sys/geom/raid/g_raid.h
==============================================================================
--- projects/graid/head/sys/geom/raid/g_raid.h Wed Feb 9 15:33:13 2011 (r218485)
+++ projects/graid/head/sys/geom/raid/g_raid.h Wed Feb 9 15:40:13 2011 (r218486)
@@ -147,8 +147,8 @@ struct g_raid_disk {
struct g_consumer *d_consumer; /* GEOM disk consumer. */
void *d_md_data; /* Disk's metadata storage. */
struct g_kerneldump d_kd; /* Kernel dumping method/args. */
- u_int d_state; /* Disk state. */
uint64_t d_flags; /* Additional flags. */
+ u_int d_state; /* Disk state. */
u_int d_load; /* Disk average load. */
off_t d_last_offset; /* Last head offset. */
TAILQ_HEAD(, g_raid_subdisk) d_subdisks; /* List of subdisks. */
@@ -169,6 +169,13 @@ struct g_raid_disk {
#define G_RAID_SUBDISK_E_DISCONNECTED 0x03 /* A subdisk removed from volume. */
#define G_RAID_SUBDISK_E_FIRST_TR_PRIVATE 0x80 /* translation private events */
+#define G_RAID_SUBDISK_POS(sd) \
+ ((sd)->sd_disk ? ((sd)->sd_disk->d_last_offset - (sd)->sd_offset) : 0)
+#define G_RAID_SUBDISK_TRACK_SIZE (1 * 1024 * 1024)
+#define G_RAID_SUBDISK_LOAD(sd) \
+ ((sd)->sd_disk ? ((sd)->sd_disk->d_load) : 0)
+#define G_RAID_SUBDISK_LOAD_SCALE 256
+
struct g_raid_subdisk {
struct g_raid_softc *sd_softc; /* Back-pointer to softc. */
struct g_raid_disk *sd_disk; /* Where this subdisk lives. */
@@ -179,6 +186,7 @@ struct g_raid_subdisk {
u_int sd_state; /* Subdisk state. */
off_t sd_rebuild_pos; /* Rebuild position. */
int sd_read_errs; /* Count of the read errors */
+ int sd_recovery; /* Count of recovery reqs. */
TAILQ_ENTRY(g_raid_subdisk) sd_next; /* Next subdisk on disk. */
};
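
The new macros turn per-disk state into per-subdisk terms for the balance
code. G_RAID_SUBDISK_POS() translates the disk's last head offset into the
subdisk's own address space by subtracting the subdisk's start offset on
the disk, so it can be compared directly with a bio's bio_offset, and
G_RAID_SUBDISK_TRACK_SIZE is a 1MB proximity window used as a cheap
locality heuristic, since the driver does not know the real track
geometry. A worked example (the numbers are illustrative only):

    sd_offset = 4G, d_last_offset = 4G + 1M
    => G_RAID_SUBDISK_POS(sd) = (4G + 1M) - 4G = 1M

A read at offset 1M of this subdisk then gets the full head-position bonus
(2 * G_RAID_SUBDISK_LOAD_SCALE subtracted from prio), and a read anywhere
within 1MB of that point still gets half of it.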
Modified: projects/graid/head/sys/geom/raid/tr_raid1.c
==============================================================================
--- projects/graid/head/sys/geom/raid/tr_raid1.c Wed Feb 9 15:33:13 2011 (r218485)
+++ projects/graid/head/sys/geom/raid/tr_raid1.c Wed Feb 9 15:40:13 2011 (r218486)
@@ -32,6 +32,7 @@ __FBSDID("$FreeBSD$");
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
+#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
@@ -490,22 +491,43 @@ g_raid_tr_stop_raid1(struct g_raid_tr_ob
}
/*
- * Select the disk to do the reads to. For now, we just pick the first one in
- * the list that's active always. This ensures we favor one disk on boot, and
- * have more deterministic recovery from the weird edge cases of power
- * failure. In the future, we can imagine policies that go for the least
- * loaded disk to improve performance, or we need to limit reads to a disk
- * during some kind of error recovery with that disk.
+ * Select the disk to read from. Take into account: subdisk state, running
+ * error recovery, average disk load, head position and possible cache hits.
*/
+#define ABS(x) (((x) >= 0) ? (x) : (-(x)))
static struct g_raid_subdisk *
-g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol)
+g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol, struct bio *bp)
{
- int i;
+ struct g_raid_subdisk *sd, *best;
+ int i, prio, bestprio;
- for (i = 0; i < vol->v_disks_count; i++)
- if (vol->v_subdisks[i].sd_state == G_RAID_SUBDISK_S_ACTIVE)
- return (&vol->v_subdisks[i]);
- return (NULL);
+ best = NULL;
+ bestprio = INT_MAX;
+ for (i = 0; i < vol->v_disks_count; i++) {
+ sd = &vol->v_subdisks[i];
+ if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE &&
+ !((sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
+ sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
+ bp->bio_offset + bp->bio_length <
+ sd->sd_rebuild_pos))
+ continue;
+ prio = G_RAID_SUBDISK_LOAD(sd);
+ prio += min(sd->sd_recovery, 255) << 22;
+ prio += (G_RAID_SUBDISK_S_ACTIVE - sd->sd_state) << 16;
+ /* If disk head is precisely in position - highly prefer it. */
+ if (G_RAID_SUBDISK_POS(sd) == bp->bio_offset)
+ prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
+ else
+ /* If disk head is close to position - prefer it. */
+ if (ABS(G_RAID_SUBDISK_POS(sd) - bp->bio_offset) <
+ G_RAID_SUBDISK_TRACK_SIZE)
+ prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
+ if (prio < bestprio) {
+ best = sd;
+ bestprio = prio;
+ }
+ }
+ return (best);
}
static void
@@ -514,7 +536,7 @@ g_raid_tr_iostart_raid1_read(struct g_ra
struct g_raid_subdisk *sd;
struct bio *cbp;
- sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume);
+ sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume, bp);
KASSERT(sd != NULL, ("No active disks in volume %s.",
tr->tro_volume->v_name));
@@ -832,6 +854,7 @@ rebuild_round_done:
break;
G_RAID_LOGREQ(2, cbp, "Retrying read");
if (pbp->bio_children == 2 && do_write) {
+ sd->sd_recovery++;
cbp->bio_caller1 = nsd;
pbp->bio_pflags = G_RAID_BIO_FLAG_LOCKED;
/* Lock callback starts I/O */
@@ -892,6 +915,10 @@ rebuild_round_done:
g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
bp->bio_error = 0;
}
+ if (pbp->bio_driver1 != NULL) {
+ ((struct g_raid_subdisk *)pbp->bio_driver1)
+ ->sd_recovery--;
+ }
G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
g_raid_unlock_range(sd->sd_volume, bp->bio_offset,
bp->bio_length);