git: 2760658b211c - main - Improve UMA cache reclamation.

Sun May 2 23:45:29 UTC 2021

The branch main has been updated by mav:

URL: https://cgit.FreeBSD.org/src/commit/?id=2760658b211c654bce1dbde597bc52b49fde5d7e

commit 2760658b211c654bce1dbde597bc52b49fde5d7e
Author:     Alexander Motin <mav at FreeBSD.org>
AuthorDate: 2021-05-02 23:35:28 +0000
Commit:     Alexander Motin <mav at FreeBSD.org>
CommitDate: 2021-05-02 23:45:23 +0000

    Improve UMA cache reclamation.
    
    When estimating working set size, measure only allocation batches, not free
    batches.  Allocation and free patterns can be very different.  For example,
    on a vm_lowmem event ZFS can free a few gigabytes of memory to UMA in one
    call, but that does not mean it will request the same amount back just as
    fast; in fact, it won't.
    
    Update working set size on every reclamation call, shrinking caches faster
    under pressure.  The lack of this caused repeated vm_lowmem events to
    squeeze more and more memory out of real consumers, only to leave it stuck
    in UMA caches.  I saw ZFS cut its ARC size in half before the previous
    algorithm, after a periodic WSS update, decided to reclaim UMA caches.
    
    Introduce voluntary reclamation of UMA caches not used for a long time.
    For each zdom, track a long-term minimum cache size watermark, freeing some
    unused items every UMA_TIMEOUT after the first 15 minutes without cache
    misses.  Freed memory can be put to better use by other consumers.  For
    example, ZFS won't grow its ARC unless it sees free memory, since it does
    not know that the cached memory is not really used.  And even if the memory
    is not really needed, periodic freeing during inactivity periods should
    reduce its fragmentation.
    
    Reviewed by:    markj, jeff (previous version)
    MFC after:      2 weeks
    Sponsored by:   iXsystems, Inc.
    Differential Revision:  https://reviews.freebsd.org/D29790
---
 sys/vm/uma_core.c | 179 +++++++++++++++++++++++++++++++++++++-----------------
 sys/vm/uma_int.h  |   3 +
 2 files changed, 126 insertions(+), 56 deletions(-)

diff --git a/sys/vm/uma_core.c b/sys/vm/uma_core.c
index 6b0add6b6b07..a85b88b24110 100644
--- a/sys/vm/uma_core.c
+++ b/sys/vm/uma_core.c
@@ -293,8 +293,10 @@ static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int);
 static void cache_drain(uma_zone_t);
 static void bucket_drain(uma_zone_t, uma_bucket_t);
 static void bucket_cache_reclaim(uma_zone_t zone, bool, int);
+static bool bucket_cache_reclaim_domain(uma_zone_t, bool, bool, int);
 static int keg_ctor(void *, int, void *, int);
 static void keg_dtor(void *, int, void *);
+static void keg_drain(uma_keg_t keg, int domain);
 static int zone_ctor(void *, int, void *, int);
 static void zone_dtor(void *, int, void *);
 static inline void item_dtor(uma_zone_t zone, void *item, int size,
@@ -700,24 +702,6 @@ zone_domain_highest(uma_zone_t zone, int pref)
 	return (domain);
 }
 
-/*
- * Safely subtract cnt from imax.
- */
-static void
-zone_domain_imax_sub(uma_zone_domain_t zdom, int cnt)
-{
-	long new;
-	long old;
-
-	old = zdom->uzd_imax;
-	do {
-		if (old <= cnt)
-			new = 0;
-		else
-			new = old - cnt;
-	} while (atomic_fcmpset_long(&zdom->uzd_imax, &old, new) == 0);
-}
-
 /*
  * Set the maximum imax value.
  */
@@ -729,8 +713,16 @@ zone_domain_imax_set(uma_zone_domain_t zdom, int nitems)
 	old = zdom->uzd_imax;
 	do {
 		if (old >= nitems)
-			break;
+			return;
 	} while (atomic_fcmpset_long(&zdom->uzd_imax, &old, nitems) == 0);
+
+	/*
+	 * We are at new maximum, so do the last WSS update for the old
+	 * bimin and prepare to measure next allocation batch.
+	 */
+	if (zdom->uzd_wss < old - zdom->uzd_bimin)
+		zdom->uzd_wss = old - zdom->uzd_bimin;
+	zdom->uzd_bimin = nitems;
 }
 
 /*
@@ -741,6 +733,7 @@ static uma_bucket_t
 zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, bool reclaim)
 {
 	uma_bucket_t bucket;
+	long cnt;
 	int i;
 	bool dtor = false;
 
@@ -768,15 +761,26 @@ zone_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, bool reclaim)
 	    ("%s: empty bucket in bucket cache", __func__));
 	zdom->uzd_nitems -= bucket->ub_cnt;
 
-	/*
-	 * Shift the bounds of the current WSS interval to avoid
-	 * perturbing the estimate.
-	 */
 	if (reclaim) {
+		/*
+		 * Shift the bounds of the current WSS interval to avoid
+		 * perturbing the estimates.
+		 */
+		cnt = lmin(zdom->uzd_bimin, bucket->ub_cnt);
+		atomic_subtract_long(&zdom->uzd_imax, cnt);
+		zdom->uzd_bimin -= cnt;
 		zdom->uzd_imin -= lmin(zdom->uzd_imin, bucket->ub_cnt);
-		zone_domain_imax_sub(zdom, bucket->ub_cnt);
-	} else if (zdom->uzd_imin > zdom->uzd_nitems)
-		zdom->uzd_imin = zdom->uzd_nitems;
+		if (zdom->uzd_limin >= bucket->ub_cnt) {
+			zdom->uzd_limin -= bucket->ub_cnt;
+		} else {
+			zdom->uzd_limin = 0;
+			zdom->uzd_timin = 0;
+		}
+	} else if (zdom->uzd_bimin > zdom->uzd_nitems) {
+		zdom->uzd_bimin = zdom->uzd_nitems;
+		if (zdom->uzd_imin > zdom->uzd_nitems)
+			zdom->uzd_imin = zdom->uzd_nitems;
+	}
 
 	ZDOM_UNLOCK(zdom);
 	if (dtor)
@@ -808,8 +812,18 @@ zone_put_bucket(uma_zone_t zone, int domain, uma_bucket_t bucket, void *udata,
 	 */
 	zdom->uzd_nitems += bucket->ub_cnt;
 	if (__predict_true(zdom->uzd_nitems < zone->uz_bucket_max)) {
-		if (ws)
+		if (ws) {
 			zone_domain_imax_set(zdom, zdom->uzd_nitems);
+		} else {
+			/*
+			 * Shift the bounds of the current WSS interval to
+			 * avoid perturbing the estimates.
+			 */
+			atomic_add_long(&zdom->uzd_imax, bucket->ub_cnt);
+			zdom->uzd_imin += bucket->ub_cnt;
+			zdom->uzd_bimin += bucket->ub_cnt;
+			zdom->uzd_limin += bucket->ub_cnt;
+		}
 		if (STAILQ_EMPTY(&zdom->uzd_buckets))
 			zdom->uzd_seq = bucket->ub_seq;
 
@@ -1041,22 +1055,49 @@ uma_timeout(void *unused)
 }
 
 /*
- * Update the working set size estimate for the zone's bucket cache.
- * The constants chosen here are somewhat arbitrary.  With an update period of
- * 20s (UMA_TIMEOUT), this estimate is dominated by zone activity over the
- * last 100s.
+ * Update the working set size estimates for the zone's bucket cache.
+ * The constants chosen here are somewhat arbitrary.
  */
 static void
 zone_domain_update_wss(uma_zone_domain_t zdom)
 {
-	long wss;
+	long m;
 
-	ZDOM_LOCK(zdom);
-	MPASS(zdom->uzd_imax >= zdom->uzd_imin);
-	wss = zdom->uzd_imax - zdom->uzd_imin;
-	zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems;
-	zdom->uzd_wss = (4 * wss + zdom->uzd_wss) / 5;
-	ZDOM_UNLOCK(zdom);
+	ZDOM_LOCK_ASSERT(zdom);
+	MPASS(zdom->uzd_imax >= zdom->uzd_nitems);
+	MPASS(zdom->uzd_nitems >= zdom->uzd_bimin);
+	MPASS(zdom->uzd_bimin >= zdom->uzd_imin);
+
+	/*
+	 * Estimate WSS as modified moving average of biggest allocation
+	 * batches for each period over few minutes (UMA_TIMEOUT of 20s).
+	 */
+	zdom->uzd_wss = lmax(zdom->uzd_wss * 3 / 4,
+	    zdom->uzd_imax - zdom->uzd_bimin);
+
+	/*
+	 * Estimate longtime minimum item count as a combination of recent
+	 * minimum item count, adjusted by WSS for safety, and the modified
+	 * moving average over the last several hours (UMA_TIMEOUT of 20s).
+	 * timin measures time since limin tried to go negative, that means
+	 * we were dangerously close to or got out of cache.
+	 */
+	m = zdom->uzd_imin - zdom->uzd_wss;
+	if (m >= 0) {
+		if (zdom->uzd_limin >= m)
+			zdom->uzd_limin = m;
+		else
+			zdom->uzd_limin = (m + zdom->uzd_limin * 255) / 256;
+		zdom->uzd_timin++;
+	} else {
+		zdom->uzd_limin = 0;
+		zdom->uzd_timin = 0;
+	}
+
+	/* To reduce period edge effects on WSS keep half of the imax. */
+	atomic_subtract_long(&zdom->uzd_imax,
+	    (zdom->uzd_imax - zdom->uzd_nitems + 1) / 2);
+	zdom->uzd_imin = zdom->uzd_bimin = zdom->uzd_nitems;
 }
 
 /*
@@ -1072,7 +1113,7 @@ zone_timeout(uma_zone_t zone, void *unused)
 	u_int slabs, pages;
 
 	if ((zone->uz_flags & UMA_ZFLAG_HASH) == 0)
-		goto update_wss;
+		goto trim;
 
 	keg = zone->uz_keg;
 
@@ -1113,14 +1154,18 @@ zone_timeout(uma_zone_t zone, void *unused)
 
 			KEG_UNLOCK(keg, 0);
 			hash_free(&oldhash);
-			goto update_wss;
+			goto trim;
 		}
 	}
 	KEG_UNLOCK(keg, 0);
 
-update_wss:
-	for (int i = 0; i < vm_ndomains; i++)
-		zone_domain_update_wss(ZDOM_GET(zone, i));
+trim:
+	/* Trim caches not used for a long time. */
+	for (int i = 0; i < vm_ndomains; i++) {
+		if (bucket_cache_reclaim_domain(zone, false, false, i) &&
+		    (zone->uz_flags & UMA_ZFLAG_CACHE) == 0)
+			keg_drain(zone->uz_keg, i);
+	}
 }
 
 /*
@@ -1405,12 +1450,13 @@ pcpu_cache_drain_safe(uma_zone_t zone)
  * requested a drain, otherwise the per-domain caches are trimmed to either
  * estimated working set size.
  */
-static void
-bucket_cache_reclaim_domain(uma_zone_t zone, bool drain, int domain)
+static bool
+bucket_cache_reclaim_domain(uma_zone_t zone, bool drain, bool trim, int domain)
 {
 	uma_zone_domain_t zdom;
 	uma_bucket_t bucket;
 	long target;
+	bool done = false;
 
 	/*
 	 * The cross bucket is partially filled and not part of
@@ -1428,23 +1474,35 @@ bucket_cache_reclaim_domain(uma_zone_t zone, bool drain, int domain)
 
 	/*
 	 * If we were asked to drain the zone, we are done only once
-	 * this bucket cache is empty.  Otherwise, we reclaim items in
-	 * excess of the zone's estimated working set size.  If the
-	 * difference nitems - imin is larger than the WSS estimate,
-	 * then the estimate will grow at the end of this interval and
-	 * we ignore the historical average.
+	 * this bucket cache is empty.  If trim, we reclaim items in
+	 * excess of the zone's estimated working set size.  Multiple
+	 * consecutive calls will shrink the WSS and so reclaim more.
+	 * If neither drain nor trim, then voluntarily reclaim 1/4
+	 * (to reduce first spike) of items not used for a long time.
 	 */
 	ZDOM_LOCK(zdom);
-	target = drain ? 0 : lmax(zdom->uzd_wss, zdom->uzd_nitems -
-	    zdom->uzd_imin);
-	while (zdom->uzd_nitems > target) {
+	zone_domain_update_wss(zdom);
+	if (drain)
+		target = 0;
+	else if (trim)
+		target = zdom->uzd_wss;
+	else if (zdom->uzd_timin > 900 / UMA_TIMEOUT)
+		target = zdom->uzd_nitems - zdom->uzd_limin / 4;
+	else {
+		ZDOM_UNLOCK(zdom);
+		return (done);
+	}
+	while ((bucket = STAILQ_FIRST(&zdom->uzd_buckets)) != NULL &&
+	    zdom->uzd_nitems >= target + bucket->ub_cnt) {
 		bucket = zone_fetch_bucket(zone, zdom, true);
 		if (bucket == NULL)
 			break;
 		bucket_free(zone, bucket, NULL);
+		done = true;
 		ZDOM_LOCK(zdom);
 	}
 	ZDOM_UNLOCK(zdom);
+	return (done);
 }
 
 static void
@@ -1461,10 +1519,10 @@ bucket_cache_reclaim(uma_zone_t zone, bool drain, int domain)
 
 	if (domain != UMA_ANYDOMAIN &&
 	    (zone->uz_flags & UMA_ZONE_ROUNDROBIN) == 0) {
-		bucket_cache_reclaim_domain(zone, drain, domain);
+		bucket_cache_reclaim_domain(zone, drain, true, domain);
 	} else {
 		for (i = 0; i < vm_ndomains; i++)
-			bucket_cache_reclaim_domain(zone, drain, i);
+			bucket_cache_reclaim_domain(zone, drain, true, i);
 	}
 }
 
@@ -2611,9 +2669,18 @@ zone_alloc_sysctl(uma_zone_t zone, void *unused)
 		SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    "imin", CTLFLAG_RD, &zdom->uzd_imin,
 		    "minimum item count in this period");
+		SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+		    "bimin", CTLFLAG_RD, &zdom->uzd_bimin,
+		    "Minimum item count in this batch");
 		SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    "wss", CTLFLAG_RD, &zdom->uzd_wss,
 		    "Working set size");
+		SYSCTL_ADD_LONG(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+		    "limin", CTLFLAG_RD, &zdom->uzd_limin,
+		    "Long time minimum item count");
+		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO,
+		    "timin", CTLFLAG_RD, &zdom->uzd_timin, 0,
+		    "Time since zero long time minimum item count");
 	}
 
 	/*
@@ -3642,7 +3709,7 @@ cache_alloc(uma_zone_t zone, uma_cache_t cache, void *udata, int flags)
 	 * We lost the race, release this bucket and start over.
 	 */
 	critical_exit();
-	zone_put_bucket(zone, domain, bucket, udata, false);
+	zone_put_bucket(zone, domain, bucket, udata, !new);
 	critical_enter();
 
 	return (true);
diff --git a/sys/vm/uma_int.h b/sys/vm/uma_int.h
index 93910e78165b..d4b43a61f29e 100644
--- a/sys/vm/uma_int.h
+++ b/sys/vm/uma_int.h
@@ -445,7 +445,10 @@ struct uma_zone_domain {
 	long		uzd_nitems;	/* total item count */
 	long		uzd_imax;	/* maximum item count this period */
 	long		uzd_imin;	/* minimum item count this period */
+	long		uzd_bimin;	/* Minimum item count this batch. */
 	long		uzd_wss;	/* working set size estimate */
+	long		uzd_limin;	/* Longtime minimum item count. */
+	u_int		uzd_timin;	/* Time since uzd_limin == 0. */
 	smr_seq_t	uzd_seq;	/* Lowest queued seq. */
 	struct mtx	uzd_lock;	/* Lock for the domain */
 } __aligned(CACHE_LINE_SIZE);