git: e82644e59ece - main - cam/iosched: Add a counter of I/Os that take too long

From: Warner Losh <imp_at_FreeBSD.org>
Date: Sat, 20 Jul 2024 02:59:06 UTC
The branch main has been updated by imp:

URL: https://cgit.FreeBSD.org/src/commit/?id=e82644e59ece5cdc67250262508e81fa22deea90

commit e82644e59ece5cdc67250262508e81fa22deea90
Author:     Warner Losh <imp@FreeBSD.org>
AuthorDate: 2024-07-20 02:52:40 +0000
Commit:     Warner Losh <imp@FreeBSD.org>
CommitDate: 2024-07-20 02:53:37 +0000

    cam/iosched: Add a counter of I/Os that take too long
    
    Add kern.cam.DEV.UNIT.iosched.too_long (to count I/Os taking too long)
    and kern.cam.DEV.UNIT.iosched.bad_latency (to set this threshold,
    defaults to 500ms). Each class of I/O (read, write, trim) has its own
    counters and thresholds.
    
    Sponsored by:           Netflix
    Reviewed by:            jhb
    Differential Revision:  https://reviews.freebsd.org/D46033
---
 sys/cam/cam_iosched.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/sys/cam/cam_iosched.c b/sys/cam/cam_iosched.c
index 471e6c355d69..022eb23cb621 100644
--- a/sys/cam/cam_iosched.c
+++ b/sys/cam/cam_iosched.c
@@ -271,6 +271,9 @@ struct iop_stats {
 	sbintime_t      emvar;
 	sbintime_t      sd;		/* Last computed sd */
 
+	uint64_t	too_long;	/* Number of I/Os greater than bad lat threshold */
+	sbintime_t	bad_latency;	/* Latency threshold */
+
 	uint32_t	state_flags;
 #define IOP_RATE_LIMITED		1u
 
@@ -856,6 +859,7 @@ cam_iosched_iop_stats_init(struct cam_iosched_softc *isc, struct iop_stats *ios)
 	ios->total = 0;
 	ios->ema = 0;
 	ios->emvar = 0;
+	ios->bad_latency = SBT_1S / 2;	/* Default to 500ms */
 	ios->softc = isc;
 	cam_iosched_limiter_init(ios);
 }
@@ -1046,6 +1050,15 @@ cam_iosched_iop_stats_sysctl_init(struct cam_iosched_softc *isc, struct iop_stat
 	    OID_AUTO, "errs", CTLFLAG_RD,
 	    &ios->errs, 0,
 	    "# of transactions completed with an error");
+	SYSCTL_ADD_U64(ctx, n,
+	    OID_AUTO, "too_long", CTLFLAG_RD,
+	    &ios->too_long, 0,
+	    "# of transactions completed took too long");
+	SYSCTL_ADD_PROC(ctx, n,
+	    OID_AUTO, "bad_latency",
+	    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE,
+	    &ios->bad_latency, 0, cam_iosched_sbintime_sysctl, "A",
+	    "Threshold for counting transactions that took too long (in us)");
 
 	SYSCTL_ADD_PROC(ctx, n,
 	    OID_AUTO, "limiter",
@@ -1916,6 +1929,14 @@ cam_iosched_update(struct iop_stats *iop, sbintime_t sim_latency)
 	sbintime_t y, deltasq, delta;
 	int i;
 
+	/*
+	 * Simple threshold: count the number of events that exceed the
+	 * configured threshold.
+	 */
+	if (sim_latency > iop->bad_latency) {
+		iop->too_long++;
+	}
+
 	/*
 	 * Keep counts for latency. We do it by power of two buckets.
 	 * This helps us spot outlier behavior obscured by averages.