git: 22054f88914b - main - Report I/O stats from the CAM_IOSCHED_DYNAMIC extension
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Mon, 28 Feb 2022 17:50:40 UTC
The branch main has been updated by imp: URL: https://cgit.FreeBSD.org/src/commit/?id=22054f88914b51113f77f6eccc11353a891f9f3e commit 22054f88914b51113f77f6eccc11353a891f9f3e Author: Warner Losh <imp@FreeBSD.org> AuthorDate: 2022-02-28 17:17:06 +0000 Commit: Warner Losh <imp@FreeBSD.org> CommitDate: 2022-02-28 17:44:47 +0000 Report I/O stats from the CAM_IOSCHED_DYNAMIC extension Report, on a periodic basis, the I/O latencies the CAM I/O scheduler computes. These times are only for the hardware portion of the I/O as measured from the time the operation is scheduled with the SIM using xpt_action() until the SIM reports it has completed with xpt_dine(). Any time the I/O operation spends in a software queue is no included. The P50 (median), P90, P99 and P99.9 statistics about the latency of each of the read, write and trim operations that completed during the polling interval are reported. If there are fewer than 2, 10, 100 or 1000 operations during the polling interval, no statistic is reported and a single dash '-' is displayed. The read, write and trim commands (either on the command line or at run time) toggle display of these operations. The color command toggles color (it defaults to on, like gstat). When color is enabled, unknown statistics are reported in blue, high latency for a statistics is reported in red, medium in magenta and low in green (as with gstat). The med= and hi= commands can set these latency thresholds. Limitations: The entire sysctl space for all the devices is walked for each polling period. This should be optimized to remember the OIDs and only do such polling with the xpt generation changes. There is also no way to filter devices displayed. This command only works on physical devies that are connected to SCSI, ATA or NVME sims as those are the only ones that are instrumented in the CAM I/O scheduler (the CAM_IOSCHED_DYNAMIC option must be in the kernel, and the dynamic scheduler can't be disabled). MFC After: 1 month Relnotes: yes Sponsored by: Netflix Reviewed by: pauamma_gundo.com, chs Differential Revision: https://reviews.freebsd.org/D34259 --- usr.bin/systat/Makefile | 2 +- usr.bin/systat/cmdtab.c | 3 + usr.bin/systat/extern.h | 2 + usr.bin/systat/iolat.c | 520 ++++++++++++++++++++++++++++++++++++++++++++++++ usr.bin/systat/main.c | 8 + usr.bin/systat/systat.1 | 60 ++++++ 6 files changed, 594 insertions(+), 1 deletion(-) diff --git a/usr.bin/systat/Makefile b/usr.bin/systat/Makefile index e0575ff0d15a..a0abf22177a8 100644 --- a/usr.bin/systat/Makefile +++ b/usr.bin/systat/Makefile @@ -4,7 +4,7 @@ .include <src.opts.mk> PROG= systat -SRCS= cmds.c cmdtab.c devs.c fetch.c iostat.c keyboard.c main.c sysput.c \ +SRCS= cmds.c cmdtab.c devs.c fetch.c iolat.c iostat.c keyboard.c main.c sysput.c \ netcmds.c netstat.c pigs.c proc.c swap.c icmp.c \ mode.c ip.c sctp.c tcp.c zarc.c \ vmstat.c convtbl.c ifcmds.c ifstat.c diff --git a/usr.bin/systat/cmdtab.c b/usr.bin/systat/cmdtab.c index 52329948748b..068823c73874 100644 --- a/usr.bin/systat/cmdtab.c +++ b/usr.bin/systat/cmdtab.c @@ -83,6 +83,9 @@ struct cmdtab cmdtab[] = { { "zarc", showzarc, fetchzarc, labelzarc, initzarc, openzarc, closezarc, 0, resetzarc, CF_ZFSARC }, + { "iolat", showiolat, fetchiolat, labeliolat, + initiolat, openiolat, closeiolat, cmdiolat, + 0, CF_LOADAV }, { NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, 0, 0 } }; struct cmdtab *curcmd = &cmdtab[0]; diff --git a/usr.bin/systat/extern.h b/usr.bin/systat/extern.h index 3b5597598b8b..fc58b7a2af9c 100644 --- a/usr.bin/systat/extern.h +++ b/usr.bin/systat/extern.h @@ -88,6 +88,7 @@ void closepigs(WINDOW *); void closeswap(WINDOW *); void closetcp(WINDOW *); int cmdifstat(const char *, const char *); +int cmdiolat(const char *, const char *); int cmdiostat(const char *, const char *); int cmdkre(const char *, const char *); int cmdnetstat(const char *, const char *); @@ -188,3 +189,4 @@ void sysputwuint64(WINDOW* , int, int, int, uint64_t, int); SYSTAT_CMD( zarc ); SYSTAT_CMD( sctp ); +SYSTAT_CMD( iolat ); diff --git a/usr.bin/systat/iolat.c b/usr.bin/systat/iolat.c new file mode 100644 index 000000000000..9535297df3ee --- /dev/null +++ b/usr.bin/systat/iolat.c @@ -0,0 +1,520 @@ +/* + * Copyright (c) 2021 Netflix, Inc + * + * SPDX-License-Identifier: BSD-2-Clause + */ + + +#include <sys/param.h> +#include <sys/sysctl.h> +#include <sys/resource.h> + +#include <devstat.h> +#include <err.h> +#include <errno.h> +#include <math.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> + +#include <sys/queue.h> +#include <sys/sysctl.h> + +#include "systat.h" +#include "extern.h" +#include "devs.h" + +#define CAM_BASE "kern.cam" +#define LATENCY ".latencies" +#define CAM_IOSCHED_BASE "kern.cam.iosched.bucket_base_us" + +#define DEV_NAMSIZE 32 +#define OP_NAMSIZE 16 +#define MAX_LATS 32 + +static double high_thresh = 500; +static double med_thresh = 300; +static bool docolor = true; + +static int ndevs; +static SLIST_HEAD(, iosched_stat) curlist; + +struct iosched_op_stat { + int nlats; + uint64_t lats[MAX_LATS]; + uint64_t prev_lats[MAX_LATS]; +}; + +enum { OP_READ = 0, OP_WRITE, OP_TRIM, NUM_OPS }; +static const char *ops[NUM_OPS] = { "read", "write", "trim" }; +#define OP_READ_MASK (1 << OP_READ) +#define OP_WRITE_MASK (1 << OP_WRITE) +#define OP_TRIM_MASK (1 << OP_TRIM) + +static uint32_t flags = OP_READ_MASK | OP_WRITE_MASK | OP_TRIM_MASK; + +struct iosched_stat { + SLIST_ENTRY(iosched_stat) link; + char dev_name[DEV_NAMSIZE]; + int unit; + struct iosched_op_stat op_stats[NUM_OPS]; +}; + +static int name2oid(const char *, int *); +static int walk_sysctl(int *, size_t); + +static int +name2oid(const char *name, int *oidp) +{ + int oid[2]; + int i; + size_t j; + + oid[0] = CTL_SYSCTL; + oid[1] = CTL_SYSCTL_NAME2OID; + + j = CTL_MAXNAME * sizeof(int); + i = sysctl(oid, 2, oidp, &j, name, strlen(name)); + if (i < 0) + return (i); + j /= sizeof(int); + return (j); +} + +static size_t /* Includes the trailing NUL */ +oid2name(int *oid, size_t nlen, char *name, size_t namlen) +{ + int qoid[CTL_MAXNAME + 2]; + int i; + size_t j; + + bzero(name, namlen); + qoid[0] = CTL_SYSCTL; + qoid[1] = CTL_SYSCTL_NAME; + memcpy(qoid + 2, oid, nlen * sizeof(int)); + j = namlen; + i = sysctl(qoid, nlen + 2, name, &j, 0, 0); + if (i || !j) + err(1, "sysctl name %d %zu %d", i, j, errno); + return (j); +} + +static int +oidfmt(int *oid, int len, u_int *kind) +{ + int qoid[CTL_MAXNAME+2]; + u_char buf[BUFSIZ]; + int i; + size_t j; + + qoid[0] = CTL_SYSCTL; + qoid[1] = CTL_SYSCTL_OIDFMT; + memcpy(qoid + 2, oid, len * sizeof(int)); + + j = sizeof(buf); + i = sysctl(qoid, len + 2, buf, &j, 0, 0); + if (i) + err(1, "sysctl fmt %d %zu %d", i, j, errno); + *kind = *(u_int *)buf; + return (0); +} + +static int +split_u64(char *str, const char *delim, uint64_t *buckets, int *nbuckets) +{ + int n = *nbuckets, i; + char *v; + + memset(buckets, 0, n * sizeof(buckets[0])); + for (i = 0; (v = strsep(&str, delim)) != NULL && i < n; i++) { + buckets[i] = strtoull(v, NULL, 10); + } + if (i < n) + *nbuckets = i; + return (i < n); +} + +static double baselat = 0.000020; + +static float +pest(int permill, uint64_t *lats, int nlat) +{ + uint64_t tot, samp; + int i; + float b1, b2; + + for (tot = 0, i = 0; i < nlat; i++) + tot += lats[i]; + if (tot == 0) + return -nanf(""); + if (tot < (uint64_t)2000 / (1000 - permill)) + return nanf(""); + samp = tot * permill / 1000; + if (samp < lats[0]) + return baselat * (float)samp / lats[0]; /* linear interpolation 0 and baselat */ + for (tot = 0, i = 0; samp >= tot && i < nlat; i++) + tot += lats[i]; + i--; + b1 = baselat * (1 << (i - 1)); + b2 = baselat * (1 << i); + /* Should expoentially interpolate between buckets -- doing linear instead */ + return b1 + (b2 - b1) * (float)(lats[i] - (tot - samp)) / lats[i]; +} + +static int +op2num(const char *op) +{ + for (int i = 0; i < NUM_OPS; i++) + if (strcmp(op, ops[i]) == 0) + return i; + return -1; +} + +static struct iosched_op_stat * +find_dev(const char *dev, int unit, int op) +{ + struct iosched_stat *isp; + struct iosched_op_stat *iosp; + + SLIST_FOREACH(isp, &curlist, link) { + if (strcmp(isp->dev_name, dev) != 0 || isp->unit != unit) + continue; + iosp = &isp->op_stats[op]; + return iosp; + } + return NULL; +} + +static struct iosched_op_stat * +alloc_dev(const char *dev, int unit, int op) +{ + struct iosched_stat *isp; + struct iosched_op_stat *iosp; + + isp = malloc(sizeof(*isp)); + if (isp == NULL) + return NULL; + strlcpy(isp->dev_name, dev, sizeof(isp->dev_name)); + isp->unit = unit; + SLIST_INSERT_HEAD(&curlist, isp, link); + ndevs++; + iosp = &isp->op_stats[op]; + return iosp; +} + +#define E3 1000.0 +static void +update_dev(const char *dev, int unit, int op, uint64_t *lats, int nlat) +{ + struct iosched_op_stat *iosp; + + iosp = find_dev(dev, unit, op); + if (iosp == NULL) + iosp = alloc_dev(dev, unit, op); + if (iosp == NULL) + return; + iosp->nlats = nlat; + memcpy(iosp->prev_lats, iosp->lats, iosp->nlats * sizeof(uint64_t)); + memcpy(iosp->lats, lats, iosp->nlats * sizeof(uint64_t)); +// printf("%s%d: %-6s %.3f %.3f %.3f %.3f\r\n", +// dev, unit, operation, E3 * pest(500, lats, nlat), E3 * pest(900, lats, nlat), +// E3 * pest(990, lats, nlat), E3 * pest(999, lats, nlat)); +} + +static int +walk_sysctl(int *base_oid, size_t len) +{ + int qoid[CTL_MAXNAME + 2], oid[CTL_MAXNAME]; + size_t l1, l2; + char name[BUFSIZ]; + + if (len > CTL_MAXNAME) + err(1, "Length %zd too long", len); + + qoid[0] = CTL_SYSCTL; + qoid[1] = CTL_SYSCTL_NEXT; + l1 = 2; + memcpy(qoid + 2, base_oid, len * sizeof(int)); + l1 += len; + for (;;) { + /* + * Get the next one or return when we get to the end of the + * sysctls in the kernel. + */ + l2 = sizeof(oid); + if (sysctl(qoid, l1, oid, &l2, 0, 0) != 0) { + if (errno == ENOENT) + return (0); + err(1, "sysctl(getnext) %zu", l2); + } + + l2 /= sizeof(int); + + /* + * Bail if we're seeing OIDs that don't have the + * same prefix or can't have the same prefix. + */ + if (l2 < len || + memcmp(oid, base_oid, len * sizeof(int)) != 0) + return (0); + + /* + * Get the name, validate it's one we're looking for, + * parse the latency and add to list. + */ + do { + int nlat; + size_t l3; + char val[BUFSIZ]; + char *walker, *dev, *opstr; + uint64_t latvals[MAX_LATS]; + u_int kind; + int unit, op; + + l1 = oid2name(oid, l2, name, sizeof(name)); + if (strcmp(name + l1 - strlen(LATENCY) - 1, LATENCY) != 0) + break; + if (oidfmt(oid, l2, &kind) != 0) + err(1, "oidfmt"); + if ((kind & CTLTYPE) != CTLTYPE_STRING) + errx(1, "string"); + l3 = sizeof(val); + if (sysctl(oid, l2, val, &l3, 0, 0) != 0) + err(1, "sysctl"); + val[l3] = '\0'; + nlat = nitems(latvals); + if (split_u64(val, ",", latvals, &nlat) == 0) + break; + walker = name + strlen(CAM_BASE) + 1; + dev = strsep(&walker, "."); + unit = (int)strtol(strsep(&walker, "."), NULL, 10); + strsep(&walker, "."); + opstr = strsep(&walker, "."); + op = op2num(opstr); + if (op < 0) + break; + update_dev(dev, unit, op, latvals, nlat); + } while (false); + + memcpy(qoid + 2, oid, l2 * sizeof(int)); + l1 = 2 + l2; + } +} + +void +closeiolat(WINDOW *w) +{ + if (w == NULL) + return; + wclear(w); + wrefresh(w); + delwin(w); +} + +static void +doublecmd(const char *cmd, double *v) +{ + const char *p; + double tv; + + p = strchr(cmd, '='); + if (p == NULL) + return; /* XXX Tell the user something? */ + if (sscanf(p + 1, "%lf", &tv) != 1) + return; /* XXX Tell the user something? */ + *v = tv; +} + +int +cmdiolat(const char *cmd __unused, const char *args __unused) +{ + fprintf(stderr, "CMD IS '%s'\n\n", cmd); + if (prefix(cmd, "trim")) + flags ^= OP_TRIM_MASK; + else if (prefix(cmd, "read")) + flags ^= OP_READ_MASK; + else if (prefix(cmd, "write")) + flags ^= OP_WRITE_MASK; + else if (prefix(cmd, "color")) + docolor = !docolor; + else if (prefix("high", cmd)) + doublecmd(cmd, &high_thresh); + else if (prefix("med", cmd)) + doublecmd(cmd, &med_thresh); + else + return (0); + wclear(wnd); + labeliolat(); + refresh(); + return (1); +} + +int +initiolat(void) +{ + int cam[CTL_MAXNAME]; + uint64_t sbt_base; + size_t len = sizeof(sbt_base); + + SLIST_INIT(&curlist); + + baselat = 1e-3; /* old default */ + if (sysctlbyname(CAM_IOSCHED_BASE, &sbt_base, &len, NULL, 0) == 0) + baselat = sbt_base * 1e-6; /* Convert to microseconds */ + + name2oid(CAM_BASE, cam); + walk_sysctl(cam, 2); + return (1); +} + +void +fetchiolat(void) +{ + int cam[CTL_MAXNAME]; + + name2oid(CAM_BASE, cam); + walk_sysctl(cam, 2); +} + +#define INSET 10 + +void +labeliolat(void) +{ + int _col, ndrives, lpr, row, j; + int regions __unused; + struct iosched_stat *isp; + char tmpstr[32]; +#define COLWIDTH 29 +#define DRIVESPERLINE ((getmaxx(wnd) - 1 - INSET) / COLWIDTH) + ndrives = ndevs; // XXX FILTER XXX + regions = howmany(ndrives, DRIVESPERLINE); + lpr = 2; /* for headers */ + for (int i = 0; i < NUM_OPS; i++) { + if (flags & (1 << i)) + lpr++; + } + row = 0; + _col = INSET; + j = 2; + if (flags & OP_READ_MASK) + mvwaddstr(wnd, row + j++, 1, "read"); + if (flags & OP_WRITE_MASK) + mvwaddstr(wnd, row + j++, 1, "write"); + if (flags & OP_TRIM_MASK) + mvwaddstr(wnd, row + j++, 1, "trim"); + SLIST_FOREACH(isp, &curlist, link) { + if (_col + COLWIDTH >= getmaxx(wnd) - 1 - INSET) { + _col = INSET; + row += lpr + 1; + if (row > getmaxy(wnd) - 1 - (lpr + 1)) + break; + j = 2; + if (flags & OP_READ_MASK) + mvwaddstr(wnd, row + j++, 1, "read"); + if (flags & OP_WRITE_MASK) + mvwaddstr(wnd, row + j++, 1, "write"); + if (flags & OP_TRIM_MASK) + mvwaddstr(wnd, row + j++, 1, "trim"); + } + snprintf(tmpstr, sizeof(tmpstr), "%s%d", isp->dev_name, isp->unit); + mvwaddstr(wnd, row, _col + (COLWIDTH - strlen(tmpstr)) / 2, tmpstr); + mvwaddstr(wnd, row + 1, _col, " p50 p90 p99 p99.9"); + _col += COLWIDTH; + } +} + +WINDOW * +openiolat(void) +{ + return (subwin(stdscr, LINES-3-1, 0, MAINWIN_ROW, 0)); +} + +static void +fmt(float f, char *buf, size_t len) +{ + if (isnan(f)) + strlcpy(buf, " - ", len); + else if (f >= 1000.0) + snprintf(buf, len, "%6d", (int)f); + else if (f >= 100.0) + snprintf(buf, len, "%6.1f", f); + else if (f >= 10.0) + snprintf(buf, len, "%6.2f", f); + else + snprintf(buf, len, "%6.3f", f); +} + +static void +latout(double lat, int y, int x) +{ + int i; + char tmpstr[32]; + + fmt(lat, tmpstr, sizeof(tmpstr)); + if (isnan(lat)) + i = 4; + else if (lat > high_thresh) + i = 3; + else if (lat > med_thresh) + i = 2; + else + i = 1; + if (docolor) + wattron(wnd, COLOR_PAIR(i)); + mvwaddstr(wnd, y, x, tmpstr); + if (docolor) + wattroff(wnd, COLOR_PAIR(i)); +} + +void +showiolat(void) +{ + int _col, ndrives, lpr, row, k; + int regions __unused; + struct iosched_stat *isp; + struct iosched_op_stat *iosp; +#define COLWIDTH 29 +#define DRIVESPERLINE ((getmaxx(wnd) - 1 - INSET) / COLWIDTH) + ndrives = ndevs; // XXX FILTER XXX + regions = howmany(ndrives, DRIVESPERLINE); + lpr = 2; /* XXX */ + for (int i = 0; i < NUM_OPS; i++) { + if (flags & (1 << i)) + lpr++; + } + row = 0; + _col = INSET; + SLIST_FOREACH(isp, &curlist, link) { + if (_col + COLWIDTH >= getmaxx(wnd) - 1 - INSET) { + _col = INSET; + row += lpr + 1; + if (row > getmaxy(wnd) - 1 - (lpr + 1)) + break; + } + k = 2; + for (int i = 0; i < NUM_OPS; i++) { + uint64_t lats[MAX_LATS]; + int nlats; + float p50, p90, p99, p999; + + if ((flags & (1 << i)) == 0) + continue; + iosp = &isp->op_stats[i]; + nlats = iosp->nlats; + memset(lats, 0, sizeof(lats)); + for (int j = 0; j < iosp->nlats; j++) + lats[j] = iosp->lats[j] - iosp->prev_lats[j]; + p50 = pest(500, lats, nlats) * E3; + p90 = pest(900, lats, nlats) * E3; + p99 = pest(990, lats, nlats) * E3; + p999 = pest(999, lats, nlats) * E3; + latout(p50, row + k, _col); + latout(p90, row + k, _col + 7); + latout(p99, row + k, _col + 14); + latout(p999, row + k, _col + 21); + k++; + } + _col += COLWIDTH; + } +} diff --git a/usr.bin/systat/main.c b/usr.bin/systat/main.c index 3266fa187714..a52fbd7ebc4d 100644 --- a/usr.bin/systat/main.c +++ b/usr.bin/systat/main.c @@ -155,6 +155,7 @@ main(int argc, char **argv) char errbuf[_POSIX2_LINE_MAX], dummy; size_t size; struct cmdentry *cmd = NULL; + short cf, cb; (void) setlocale(LC_ALL, ""); @@ -214,6 +215,13 @@ main(int argc, char **argv) * routines to minimize update work by curses. */ initscr(); + start_color(); + use_default_colors(); + pair_content(0, &cf, &cb); + init_pair(1, COLOR_GREEN, cb); + init_pair(2, COLOR_MAGENTA, cb); + init_pair(3, COLOR_RED, cb); + init_pair(4, COLOR_BLUE, cb); CMDLINE = LINES - 1; wnd = (*curcmd->c_open)(); if (wnd == NULL) { diff --git a/usr.bin/systat/systat.1 b/usr.bin/systat/systat.1 index 1f6ddaeafb1a..9f866727bd89 100644 --- a/usr.bin/systat/systat.1 +++ b/usr.bin/systat/systat.1 @@ -88,6 +88,7 @@ to be one of: .Ic icmp , .Ic icmp6 , .Ic ifstat , +.Ic iolat , .Ic iostat , .Ic ip , .Ic ip6 , @@ -244,6 +245,59 @@ but with SCTP statistics. Like .Ic icmp , but with TCP statistics. +.It Ic iolat +Display statistics describing the hardware latencies of I/O operations as +computed by the +.Va CAM_IOSCHED_DYNAMIC +option. +This option must be in the kernel config file of the running kernel for this +display to work. +All devices are displayed as there is currently no way to filter them. +The statistics displayed for the I/O latencies are the percentiles with +sufficient data during the polling interval to compute. +If a value cannot be estimated ``-'' is displayed. +The P50 (also known as the median), P90, P99 and P99.9 values are computed if +more than 2, 10, 100 or 1000 operations occurred during the polling interval. +The latency is the hardware latency values, and does not include any software +queueing time. +The latencies are estimated based on histogram data computed by the CAM I/O +scheduler and represent estimates of the actual value that are only good to +two or three significant digits. +The display of latency changes based on the scale of the latency to reflect +the precision of the estimates and to fit on the available screen space. +All latencies are reported in milliseconds. +When color is enabled +.Bl -bullet +.It Values below the medium latency threshold are displayed in green. +.It Values between the minimum latency and high latency thresholds are displayed +in magenta. +.It Values above the high latency thresholds are displayed in red. +.Pp +When color is disabled, the default foreground and background colors are always +used. +.Pp +The following commands are specific to the +.Ic iolat +display; the minimum unambiguous prefix may be supplied. +.Pp +.Bl -tag -width Fl -compact +.It Cm color +Toggle the use of color in the display. +The default is on. +.It Cm hi=XXX +Set the high latency threshold to XXX milliseconds. +.It Cm med=XXX +Set the medium latency threshold to XXX milliseconds. +.It Cm read +Toggle the display of statistics about read operations. +The default is on. +.It Cm write +Toggle the display of statistics about write operations. +The default is on. +.It Cm trim +Toggle the display of statistics about trim operations. +The default is on. +.El .It Ic iostat Display, in the lower window, statistics about processor use and disk throughput. @@ -682,3 +736,9 @@ The .Ic vmstat display looks out of place because it is (it was added in as a separate display rather than created as a new program). +The +.Ic iolat +command doesn't implement the common device commands including +filtering, as it doesn't use the +.Xr devstat 3 +mechanism to obtain its statistics.