svn commit: r297633 - in head: sys/cddl/contrib/opensolaris/uts/common/fs/zfs sys/fs/ext2fs sys/kern sys/sys sys/ufs/ffs sys/ufs/ufs sys/vm usr.bin/rctl

Edward Tomasz Napierala trasz at FreeBSD.org
Thu Apr 7 04:23:28 UTC 2016


Author: trasz
Date: Thu Apr  7 04:23:25 2016
New Revision: 297633
URL: https://svnweb.freebsd.org/changeset/base/297633

Log:
  Add four new RCTL resources - readbps, readiops, writebps and writeiops,
  for limiting disk (actually filesystem) IO.
  
  Note that in some cases these limits are not quite precise.  That is ok,
  as long as the error stays within some reasonable bounds.
  
  Testing - and review of the code, in particular the VFS and VM parts - is
  very welcome.
  
  MFC after:	1 month
  Relnotes:	yes
  Sponsored by:	The FreeBSD Foundation
  Differential Revision:	https://reviews.freebsd.org/D5080

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
  head/sys/fs/ext2fs/ext2_bmap.c
  head/sys/kern/kern_physio.c
  head/sys/kern/kern_racct.c
  head/sys/kern/kern_rctl.c
  head/sys/kern/subr_trap.c
  head/sys/kern/vfs_bio.c
  head/sys/kern/vfs_cluster.c
  head/sys/sys/proc.h
  head/sys/sys/racct.h
  head/sys/sys/rctl.h
  head/sys/ufs/ffs/ffs_inode.c
  head/sys/ufs/ffs/ffs_softdep.c
  head/sys/ufs/ufs/ufs_bmap.c
  head/sys/vm/vm_fault.c
  head/usr.bin/rctl/rctl.8

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Thu Apr  7 01:42:09 2016	(r297632)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Thu Apr  7 04:23:25 2016	(r297633)
@@ -132,6 +132,7 @@
 #include <sys/multilist.h>
 #ifdef _KERNEL
 #include <sys/dnlc.h>
+#include <sys/racct.h>
 #endif
 #include <sys/callb.h>
 #include <sys/kstat.h>
@@ -4503,6 +4504,14 @@ top:
 		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
 		    data, metadata, misses);
 #ifdef _KERNEL
+#ifdef RACCT
+		if (racct_enable) {
+			PROC_LOCK(curproc);
+			racct_add_force(curproc, RACCT_READBPS, size);
+			racct_add_force(curproc, RACCT_READIOPS, 1);
+			PROC_UNLOCK(curproc);
+		}
+#endif /* RACCT */
 		curthread->td_ru.ru_inblock++;
 #endif
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	Thu Apr  7 01:42:09 2016	(r297632)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	Thu Apr  7 04:23:25 2016	(r297633)
@@ -47,6 +47,7 @@
 #include <sys/sa.h>
 #include <sys/zfeature.h>
 #ifdef _KERNEL
+#include <sys/racct.h>
 #include <sys/vm.h>
 #include <sys/zfs_znode.h>
 #endif
@@ -427,6 +428,15 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn,
 	}
 	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 
+#if defined(_KERNEL) && defined(RACCT)
+	if (racct_enable && !read) {
+		PROC_LOCK(curproc);
+		racct_add_force(curproc, RACCT_WRITEBPS, length);
+		racct_add_force(curproc, RACCT_WRITEIOPS, nblks);
+		PROC_UNLOCK(curproc);
+	}
+#endif
+
 	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	blkid = dbuf_whichblock(dn, 0, offset);
 	for (i = 0; i < nblks; i++) {
@@ -1422,7 +1432,15 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uin
 	    DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA) {
 #ifdef _KERNEL
 		curthread->td_ru.ru_oublock++;
-#endif
+#ifdef RACCT
+		if (racct_enable) {
+			PROC_LOCK(curproc);
+			racct_add_force(curproc, RACCT_WRITEBPS, blksz);
+			racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+			PROC_UNLOCK(curproc);
+		}
+#endif /* RACCT */
+#endif /* _KERNEL */
 		dbuf_assign_arcbuf(db, buf, tx);
 		dbuf_rele(db, FTAG);
 	} else {

Modified: head/sys/fs/ext2fs/ext2_bmap.c
==============================================================================
--- head/sys/fs/ext2fs/ext2_bmap.c	Thu Apr  7 01:42:09 2016	(r297632)
+++ head/sys/fs/ext2fs/ext2_bmap.c	Thu Apr  7 04:23:25 2016	(r297633)
@@ -42,6 +42,7 @@
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
+#include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/stat.h>
 
@@ -247,6 +248,13 @@ ext2_bmaparray(struct vnode *vp, daddr_t
 			vfs_busy_pages(bp, 0);
 			bp->b_iooffset = dbtob(bp->b_blkno);
 			bstrategy(bp);
+#ifdef RACCT
+			if (racct_enable) {
+				PROC_LOCK(curproc);
+				racct_add_buf(curproc, bp, 0);
+				PROC_UNLOCK(curproc);
+			}
+#endif
 			curthread->td_ru.ru_inblock++;
 			error = bufwait(bp);
 			if (error) {

Modified: head/sys/kern/kern_physio.c
==============================================================================
--- head/sys/kern/kern_physio.c	Thu Apr  7 01:42:09 2016	(r297632)
+++ head/sys/kern/kern_physio.c	Thu Apr  7 04:23:25 2016	(r297633)
@@ -27,6 +27,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/conf.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
+#include <sys/racct.h>
 #include <sys/uio.h>
 #include <geom/geom.h>
 
@@ -109,6 +110,22 @@ physio(struct cdev *dev, struct uio *uio
 		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
 	error = 0;
 	for (i = 0; i < uio->uio_iovcnt; i++) {
+#ifdef RACCT
+		if (racct_enable) {
+			PROC_LOCK(curproc);
+			if (uio->uio_rw == UIO_READ) {
+				racct_add_force(curproc, RACCT_READBPS,
+				    uio->uio_iov[i].iov_len);
+				racct_add_force(curproc, RACCT_READIOPS, 1);
+			} else {
+				racct_add_force(curproc, RACCT_WRITEBPS,
+				    uio->uio_iov[i].iov_len);
+				racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+			}
+			PROC_UNLOCK(curproc);
+		}
+#endif /* RACCT */
+
 		while (uio->uio_iov[i].iov_len) {
 			g_reset_bio(bp);
 			if (uio->uio_rw == UIO_READ) {

Modified: head/sys/kern/kern_racct.c
==============================================================================
--- head/sys/kern/kern_racct.c	Thu Apr  7 01:42:09 2016	(r297632)
+++ head/sys/kern/kern_racct.c	Thu Apr  7 04:23:25 2016	(r297633)
@@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$");
 #include "opt_sched.h"
 
 #include <sys/param.h>
+#include <sys/buf.h>
 #include <sys/systm.h>
 #include <sys/eventhandler.h>
 #include <sys/jail.h>
@@ -177,7 +178,15 @@ int racct_types[] = {
 	[RACCT_WALLCLOCK] =
 		RACCT_IN_MILLIONS,
 	[RACCT_PCTCPU] =
-		RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS };
+		RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS,
+	[RACCT_READBPS] =
+		RACCT_DECAYING,
+	[RACCT_WRITEBPS] =
+		RACCT_DECAYING,
+	[RACCT_READIOPS] =
+		RACCT_DECAYING,
+	[RACCT_WRITEIOPS] =
+		RACCT_DECAYING };
 
 static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;
 
@@ -634,6 +643,28 @@ racct_add_cred(struct ucred *cred, int r
 	RACCT_UNLOCK();
 }
 
+/*
+ * Account for disk IO done on behalf of process 'p': one operation of
+ * bp->b_bcount bytes, a write if 'is_write' is non-zero, else a read.
+ * Checks for limits, but never fails, due to disk limits being undeniable.
+ */
+void
+racct_add_buf(struct proc *p, const struct buf *bp, int is_write)
+{
+	ASSERT_RACCT_ENABLED();
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	RACCT_LOCK();
+	if (is_write) {
+		racct_add_locked(p, RACCT_WRITEBPS, bp->b_bcount, 1);
+		racct_add_locked(p, RACCT_WRITEIOPS, 1, 1);
+	} else {
+		racct_add_locked(p, RACCT_READBPS, bp->b_bcount, 1);
+		racct_add_locked(p, RACCT_READIOPS, 1, 1);
+	}
+	RACCT_UNLOCK();
+}
+
 static int
 racct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
 {
@@ -655,7 +686,7 @@ racct_set_locked(struct proc *p, int res
 	 * The diffs may be negative.
 	 */
 	diff_proc = amount - old_amount;
-	if (RACCT_IS_DECAYING(resource)) {
+	if (resource == RACCT_PCTCPU) {
 		/*
 		 * Resources in per-credential racct containers may decay.
 		 * If this is the case, we need to calculate the difference
@@ -1043,14 +1074,19 @@ racct_move(struct racct *dest, struct ra
 	RACCT_UNLOCK();
 }
 
-static void
-racct_proc_throttle(struct proc *p)
+/*
+ * Make the process sleep in userret() for 'timeout' ticks.  Setting
+ * timeout to -1 makes it sleep until woken up by racct_proc_wakeup().
+ */
+void
+racct_proc_throttle(struct proc *p, int timeout)
 {
 	struct thread *td;
 #ifdef SMP
 	int cpuid;
 #endif
 
+	KASSERT(timeout != 0, ("timeout %d", timeout));
 	ASSERT_RACCT_ENABLED();
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
@@ -1058,10 +1094,13 @@ racct_proc_throttle(struct proc *p)
 	 * Do not block kernel processes.  Also do not block processes with
 	 * low %cpu utilization to improve interactivity.
 	 */
-	if (((p->p_flag & (P_SYSTEM | P_KPROC)) != 0) ||
-	    (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold))
+	if ((p->p_flag & (P_SYSTEM | P_KPROC)) != 0)
 		return;
-	p->p_throttled = 1;
+
+	if (p->p_throttled < 0 || (timeout > 0 && p->p_throttled > timeout))
+		return;
+
+	p->p_throttled = timeout;
 
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
@@ -1102,7 +1141,7 @@ racct_proc_wakeup(struct proc *p)
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
-	if (p->p_throttled) {
+	if (p->p_throttled != 0) {
 		p->p_throttled = 0;
 		wakeup(p->p_racct);
 	}
@@ -1116,6 +1155,13 @@ racct_decay_callback(struct racct *racct
 	ASSERT_RACCT_ENABLED();
 	RACCT_LOCK_ASSERT();
 
+#ifdef RCTL
+	rctl_throttle_decay(racct, RACCT_READBPS);
+	rctl_throttle_decay(racct, RACCT_WRITEBPS);
+	rctl_throttle_decay(racct, RACCT_READIOPS);
+	rctl_throttle_decay(racct, RACCT_WRITEIOPS);
+#endif
+
 	r_old = racct->r_resources[RACCT_PCTCPU];
 
 	/* If there is nothing to decay, just exit. */
@@ -1206,6 +1252,12 @@ racctd(void)
 				pct_estimate = 0;
 			pct = racct_getpcpu(p, pct_estimate);
 			RACCT_LOCK();
+#ifdef RCTL
+			rctl_throttle_decay(p->p_racct, RACCT_READBPS);
+			rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS);
+			rctl_throttle_decay(p->p_racct, RACCT_READIOPS);
+			rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS);
+#endif
 			racct_set_locked(p, RACCT_PCTCPU, pct, 1);
 			racct_set_locked(p, RACCT_CPU, runtime, 0);
 			racct_set_locked(p, RACCT_WALLCLOCK,
@@ -1228,10 +1280,13 @@ racctd(void)
 				continue;
 			}
 
-			if (racct_pcpu_available(p) <= 0)
-				racct_proc_throttle(p);
-			else if (p->p_throttled)
+			if (racct_pcpu_available(p) <= 0) {
+				if (p->p_racct->r_resources[RACCT_PCTCPU] >
+				    pcpu_threshold)
+					racct_proc_throttle(p, -1);
+			} else if (p->p_throttled == -1) {
 				racct_proc_wakeup(p);
+			}
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);

Modified: head/sys/kern/kern_rctl.c
==============================================================================
--- head/sys/kern/kern_rctl.c	Thu Apr  7 01:42:09 2016	(r297632)
+++ head/sys/kern/kern_rctl.c	Thu Apr  7 04:23:25 2016	(r297633)
@@ -77,9 +77,13 @@ FEATURE(rctl, "Resource Limits");
 
 #define	RCTL_PCPU_SHIFT		(10 * 1000000)
 
-unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
+static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
 static int rctl_log_rate_limit = 10;
 static int rctl_devctl_rate_limit = 10;
+static unsigned int rctl_throttle_min = 0;
+static unsigned int rctl_throttle_max = 0;
+static unsigned int rctl_throttle_pct = 0;
+static unsigned int rctl_throttle_pct2 = 0;
 
 SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits");
 SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
@@ -88,6 +92,16 @@ SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, 
     &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
 SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RW,
     &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_min, CTLFLAG_RDTUN,
+    &rctl_throttle_min, 0, "Shortest throttling duration, in hz");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_max, CTLFLAG_RDTUN,
+    &rctl_throttle_max, 0, "Longest throttling duration, in hz");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_pct, CTLFLAG_RDTUN,
+    &rctl_throttle_pct, 0,
+    "Throttling penalty for process consumption, in percent");
+SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, throttle_pct2, CTLFLAG_RDTUN,
+    &rctl_throttle_pct2, 0,
+    "Throttling penalty for container consumption, in percent");
 
 /*
  * 'rctl_rule_link' connects a rule with every racct it's related to.
@@ -134,6 +148,10 @@ static struct dict resourcenames[] = {
 	{ "shmsize", RACCT_SHMSIZE },
 	{ "wallclock", RACCT_WALLCLOCK },
 	{ "pcpu", RACCT_PCTCPU },
+	{ "readbps", RACCT_READBPS },
+	{ "writebps", RACCT_WRITEBPS },
+	{ "readiops", RACCT_READIOPS },
+	{ "writeiops", RACCT_WRITEIOPS },
 	{ NULL, -1 }};
 
 static struct dict actionnames[] = {
@@ -171,6 +189,7 @@ static struct dict actionnames[] = {
 	{ "deny", RCTL_ACTION_DENY },
 	{ "log", RCTL_ACTION_LOG },
 	{ "devctl", RCTL_ACTION_DEVCTL },
+	{ "throttle", RCTL_ACTION_THROTTLE },
 	{ NULL, -1 }};
 
 static void rctl_init(void);
@@ -274,23 +293,53 @@ rctl_available_resource(const struct pro
 }
 
 /*
- * Return non-zero if allocating 'amount' by proc 'p' would exceed
- * resource limit specified by 'rule'.
+ * Called every second for proc, uidinfo, loginclass, and jail containers.
+ * If the limit isn't exceeded, it decreases the usage amount to zero.
+ * Otherwise, it decreases it by the value of the limit.  This way
+ * resource consumption exceeding the limit "carries over" to the next
+ * period.
  */
-static int
-rctl_would_exceed(const struct proc *p, const struct rctl_rule *rule,
-    int64_t amount)
+void
+rctl_throttle_decay(struct racct *racct, int resource)
 {
-	int64_t available;
+	struct rctl_rule *rule;
+	struct rctl_rule_link *link;
+	int64_t minavailable;
 
 	ASSERT_RACCT_ENABLED();
-	RCTL_LOCK_ASSERT();
 
-	available = rctl_available_resource(p, rule);
-	if (available >= amount)
-		return (0);
+	minavailable = INT64_MAX;
 
-	return (1);
+	RCTL_RLOCK();
+
+	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
+		rule = link->rrl_rule;
+
+		if (rule->rr_resource != resource)
+			continue;
+		if (rule->rr_action != RCTL_ACTION_THROTTLE)
+			continue;
+
+		if (rule->rr_amount < minavailable)
+			minavailable = rule->rr_amount;
+	}
+
+	RCTL_RUNLOCK();
+
+	if (racct->r_resources[resource] < minavailable) {
+		racct->r_resources[resource] = 0;
+	} else {
+		/*
+		 * Cap utilization counter at ten times the limit.  Otherwise,
+		 * if we changed the rule lowering the allowed amount, it could
+		 * take an unreasonably long time for the accumulated resource
+		 * usage to drop.
+		 */
+		if (racct->r_resources[resource] > minavailable * 10)
+			racct->r_resources[resource] = minavailable * 10;
+
+		racct->r_resources[resource] -= minavailable;
+	}
 }
 
 /*
@@ -340,6 +389,38 @@ rctl_pcpu_available(const struct proc *p
 	return (minavailable);
 }
 
+/*
+ * Add two unsigned 64-bit values, saturating at UINT64_MAX
+ * instead of wrapping around.
+ */
+static uint64_t
+xadd(uint64_t a, uint64_t b)
+{
+	uint64_t room;
+
+	/* How much can still be added to 'a' before the sum wraps. */
+	room = UINT64_MAX - a;
+	if (b > room)
+		return (UINT64_MAX);
+	return (a + b);
+}
+
+/* Multiply two unsigned 64-bit values, saturating at UINT64_MAX. */
+static uint64_t
+xmul(uint64_t a, uint64_t b)
+{
+	if (a == 0 || b == 0)
+		return (0);
+
+	/*
+	 * Check before multiplying: a wrapped product can still be
+	 * larger than both operands, so comparing afterwards misses it.
+	 */
+	if (a > UINT64_MAX / b)
+		return (UINT64_MAX);
+	return (a * b);
+}
+
 /*
  * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
  * to what it keeps allocated now.  Returns non-zero if the allocation should
@@ -353,9 +434,12 @@ rctl_enforce(struct proc *p, int resourc
 	struct rctl_rule *rule;
 	struct rctl_rule_link *link;
 	struct sbuf sb;
+	int64_t available;
+	uint64_t sleep_ms, sleep_ratio;
 	int should_deny = 0;
 	char *buf;
 
+
 	ASSERT_RACCT_ENABLED();
 
 	RCTL_RLOCK();
@@ -368,7 +452,9 @@ rctl_enforce(struct proc *p, int resourc
 		rule = link->rrl_rule;
 		if (rule->rr_resource != resource)
 			continue;
-		if (!rctl_would_exceed(p, rule, amount)) {
+
+		available = rctl_available_resource(p, rule);
+		if (available >= (int64_t)amount) {
 			link->rrl_exceeded = 0;
 			continue;
 		}
@@ -421,7 +507,7 @@ rctl_enforce(struct proc *p, int resourc
 
 			if (p->p_state != PRS_NORMAL)
 				continue;
-	
+
 			if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
 			    rctl_devctl_rate_limit))
 				continue;
@@ -444,6 +530,69 @@ rctl_enforce(struct proc *p, int resourc
 			free(buf, M_RCTL);
 			link->rrl_exceeded = 1;
 			continue;
+		case RCTL_ACTION_THROTTLE:
+			if (p->p_state != PRS_NORMAL)
+				continue;
+
+			/*
+			 * Make the process sleep for a fraction of a second
+			 * proportional to the ratio of process' resource
+			 * utilization compared to the limit.  The point is
+			 * to penalize resource hogs: processes that consume
+			 * more of the available resources sleep for longer.
+			 *
+			 * We're trying to defer division until the very end,
+			 * to minimize the rounding effects.  The following
+			 * calculation could have been written in a clearer
+			 * way like this:
+			 *
+			 * sleep_ms = hz * p->p_racct->r_resources[resource] /
+			 *     rule->rr_amount;
+			 * sleep_ms *= rctl_throttle_pct / 100;
+			 * if (sleep_ms < rctl_throttle_min)
+			 *         sleep_ms = rctl_throttle_min;
+			 *
+			 */
+			sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
+			sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
+			if (sleep_ms < rctl_throttle_min * rule->rr_amount)
+				sleep_ms = rctl_throttle_min * rule->rr_amount;
+
+			/*
+			 * Multiply that by the ratio of the resource
+			 * consumption for the container compared to the limit,
+			 * squared.  In other words, a process in a container
+			 * that is two times over the limit will be throttled
+			 * four times as much for hitting the same rule.  The
+			 * point is to penalize processes more if the container
+			 * itself (eg certain UID or jail) is above the limit.
+			 */
+			if (available < 0)
+				sleep_ratio = -available / rule->rr_amount;
+			else
+				sleep_ratio = 0;
+			sleep_ratio = xmul(sleep_ratio, sleep_ratio);
+			sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
+			sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
+
+			/*
+			 * Finally the division.
+			 */
+			sleep_ms /= rule->rr_amount;
+
+			if (sleep_ms > rctl_throttle_max)
+				sleep_ms = rctl_throttle_max;
+#if 0
+			printf("%s: pid %d (%s), %jd of %jd, will sleep for %ld ms (ratio %ld, available %ld)\n",
+			   __func__, p->p_pid, p->p_comm,
+			   p->p_racct->r_resources[resource],
+			   rule->rr_amount, sleep_ms, sleep_ratio, available);
+#endif
+
+			KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
+			    __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
+			racct_proc_throttle(p, sleep_ms);
+			continue;
 		default:
 			if (link->rrl_exceeded != 0)
 				continue;
@@ -1073,20 +1222,32 @@ rctl_rule_add(struct rctl_rule *rule)
 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
 
 	/*
-	 * Some rules just don't make sense.  Note that the one below
-	 * cannot be rewritten using RACCT_IS_DENIABLE(); the RACCT_PCTCPU,
-	 * for example, is not deniable in the racct sense, but the
-	 * limit is enforced in a different way, so "deny" rules for %CPU
-	 * do make sense.
+	 * Some rules just don't make sense, like "deny" rule for an undeniable
+	 * resource.  The exceptions are the RSS and %CPU resources - they are
+	 * not deniable in the racct sense, but the limit is enforced in
+	 * a different way.
 	 */
 	if (rule->rr_action == RCTL_ACTION_DENY &&
-	    (rule->rr_resource == RACCT_CPU ||
-	    rule->rr_resource == RACCT_WALLCLOCK))
+	    !RACCT_IS_DENIABLE(rule->rr_resource) &&
+	    rule->rr_resource != RACCT_RSS &&
+	    rule->rr_resource != RACCT_PCTCPU) {
 		return (EOPNOTSUPP);
+	}
+
+	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
+	    !RACCT_IS_DECAYING(rule->rr_resource)) {
+		return (EOPNOTSUPP);
+	}
+
+	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
+	    rule->rr_resource == RACCT_PCTCPU) {
+		return (EOPNOTSUPP);
+	}
 
 	if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
-	    RACCT_IS_SLOPPY(rule->rr_resource))
+	    RACCT_IS_SLOPPY(rule->rr_resource)) {
 		return (EOPNOTSUPP);
+	}
 
 	/*
 	 * Make sure there are no duplicated rules.  Also, for the "deny"
@@ -1960,6 +2121,15 @@ rctl_init(void)
 	    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+
+	if (rctl_throttle_min <= 0)
+		rctl_throttle_min = 1;
+	if (rctl_throttle_max <= 0)
+		rctl_throttle_max = 2 * hz;
+	if (rctl_throttle_pct <= 0)
+		rctl_throttle_pct = 100;
+	if (rctl_throttle_pct2 <= 0)
+		rctl_throttle_pct2 = 100;
 }
 
 #else /* !RCTL */

Modified: head/sys/kern/subr_trap.c
==============================================================================
--- head/sys/kern/subr_trap.c	Thu Apr  7 01:42:09 2016	(r297632)
+++ head/sys/kern/subr_trap.c	Thu Apr  7 04:23:25 2016	(r297633)
@@ -172,10 +172,14 @@ userret(struct thread *td, struct trapfr
 	    (td->td_vnet_lpush != NULL) ? td->td_vnet_lpush : "N/A"));
 #endif
 #ifdef RACCT
-	if (racct_enable && p->p_throttled == 1) {
+	if (racct_enable && p->p_throttled != 0) {
 		PROC_LOCK(p);
-		while (p->p_throttled == 1)
-			msleep(p->p_racct, &p->p_mtx, 0, "racct", 0);
+		while (p->p_throttled != 0) {
+			msleep(p->p_racct, &p->p_mtx, 0, "racct",
+			    p->p_throttled < 0 ? 0 : p->p_throttled);
+			if (p->p_throttled > 0)
+				p->p_throttled = 0;
+		}
 		PROC_UNLOCK(p);
 	}
 #endif

Modified: head/sys/kern/vfs_bio.c
==============================================================================
--- head/sys/kern/vfs_bio.c	Thu Apr  7 01:42:09 2016	(r297632)
+++ head/sys/kern/vfs_bio.c	Thu Apr  7 04:23:25 2016	(r297633)
@@ -61,6 +61,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/proc.h>
+#include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/smp.h>
@@ -1784,8 +1785,16 @@ breada(struct vnode * vp, daddr_t * rabl
 		rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
 
 		if ((rabp->b_flags & B_CACHE) == 0) {
-			if (!TD_IS_IDLETHREAD(curthread))
+			if (!TD_IS_IDLETHREAD(curthread)) {
+#ifdef RACCT
+				if (racct_enable) {
+					PROC_LOCK(curproc);
+					racct_add_buf(curproc, rabp, 0);
+					PROC_UNLOCK(curproc);
+				}
+#endif /* RACCT */
 				curthread->td_ru.ru_inblock++;
+			}
 			rabp->b_flags |= B_ASYNC;
 			rabp->b_flags &= ~B_INVAL;
 			rabp->b_ioflags &= ~BIO_ERROR;
@@ -1829,8 +1838,16 @@ breadn_flags(struct vnode *vp, daddr_t b
 
 	/* if not found in cache, do some I/O */
 	if ((bp->b_flags & B_CACHE) == 0) {
-		if (!TD_IS_IDLETHREAD(curthread))
+		if (!TD_IS_IDLETHREAD(curthread)) {
+#ifdef RACCT
+			if (racct_enable) {
+				PROC_LOCK(curproc);
+				racct_add_buf(curproc, bp, 0);
+				PROC_UNLOCK(curproc);
+			}
+#endif /* RACCT */
 			curthread->td_ru.ru_inblock++;
+		}
 		bp->b_iocmd = BIO_READ;
 		bp->b_flags &= ~B_INVAL;
 		bp->b_ioflags &= ~BIO_ERROR;
@@ -1926,8 +1943,16 @@ bufwrite(struct buf *bp)
 	bp->b_runningbufspace = bp->b_bufsize;
 	space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
 
-	if (!TD_IS_IDLETHREAD(curthread))
+	if (!TD_IS_IDLETHREAD(curthread)) {
+#ifdef RACCT
+		if (racct_enable) {
+			PROC_LOCK(curproc);
+			racct_add_buf(curproc, bp, 1);
+			PROC_UNLOCK(curproc);
+		}
+#endif /* RACCT */
 		curthread->td_ru.ru_oublock++;
+	}
 	if (oldflags & B_ASYNC)
 		BUF_KERNPROC(bp);
 	bp->b_iooffset = dbtob(bp->b_blkno);

Modified: head/sys/kern/vfs_cluster.c
==============================================================================
--- head/sys/kern/vfs_cluster.c	Thu Apr  7 01:42:09 2016	(r297632)
+++ head/sys/kern/vfs_cluster.c	Thu Apr  7 04:23:25 2016	(r297633)
@@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/vnode.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
+#include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/vmmeter.h>
@@ -241,6 +242,13 @@ cluster_read(struct vnode *vp, u_quad_t 
 			BUF_KERNPROC(bp);
 		bp->b_iooffset = dbtob(bp->b_blkno);
 		bstrategy(bp);
+#ifdef RACCT
+		if (racct_enable) {
+			PROC_LOCK(curproc);
+			racct_add_buf(curproc, bp, 0);
+			PROC_UNLOCK(curproc);
+		}
+#endif /* RACCT */
 		curthread->td_ru.ru_inblock++;
 	}
 
@@ -294,6 +302,13 @@ cluster_read(struct vnode *vp, u_quad_t 
 			BUF_KERNPROC(rbp);
 		rbp->b_iooffset = dbtob(rbp->b_blkno);
 		bstrategy(rbp);
+#ifdef RACCT
+		if (racct_enable) {
+			PROC_LOCK(curproc);
+			racct_add_buf(curproc, rbp, 0);
+			PROC_UNLOCK(curproc);
+		}
+#endif /* RACCT */
 		curthread->td_ru.ru_inblock++;
 	}
 

Modified: head/sys/sys/proc.h
==============================================================================
--- head/sys/sys/proc.h	Thu Apr  7 01:42:09 2016	(r297632)
+++ head/sys/sys/proc.h	Thu Apr  7 04:23:25 2016	(r297633)
@@ -623,7 +623,7 @@ struct proc {
 					   after fork. */
 	uint64_t	p_prev_runtime;	/* (c) Resource usage accounting. */
 	struct racct	*p_racct;	/* (b) Resource accounting. */
-	u_char		p_throttled;	/* (c) Flag for racct pcpu throttling */
+	int		p_throttled;	/* (c) Flag for racct pcpu throttling */
 	struct vm_domain_policy p_vm_dom_policy;	/* (c) process default VM domain, or -1 */
 	/*
 	 * An orphan is the child that has beed re-parented to the

Modified: head/sys/sys/racct.h
==============================================================================
--- head/sys/sys/racct.h	Thu Apr  7 01:42:09 2016	(r297632)
+++ head/sys/sys/racct.h	Thu Apr  7 04:23:25 2016	(r297633)
@@ -42,6 +42,7 @@
 #include <sys/stdint.h>
 #include <sys/sysctl.h>
 
+struct buf;
 struct proc;
 struct rctl_rule_link;
 struct ucred;
@@ -71,7 +72,11 @@ struct ucred;
 #define	RACCT_SHMSIZE		18
 #define	RACCT_WALLCLOCK		19
 #define	RACCT_PCTCPU		20
-#define	RACCT_MAX		RACCT_PCTCPU
+#define	RACCT_READBPS		21
+#define	RACCT_WRITEBPS		22
+#define	RACCT_READIOPS		23
+#define	RACCT_WRITEIOPS		24
+#define	RACCT_MAX		RACCT_WRITEIOPS
 
 /*
  * Resource properties.
@@ -153,6 +158,7 @@ SYSCTL_DECL(_kern_racct);
 int	racct_add(struct proc *p, int resource, uint64_t amount);
 void	racct_add_cred(struct ucred *cred, int resource, uint64_t amount);
 void	racct_add_force(struct proc *p, int resource, uint64_t amount);
+void	racct_add_buf(struct proc *p, const struct buf *bufp, int is_write);
 int	racct_set(struct proc *p, int resource, uint64_t amount);
 void	racct_set_force(struct proc *p, int resource, uint64_t amount);
 void	racct_sub(struct proc *p, int resource, uint64_t amount);
@@ -170,6 +176,7 @@ void	racct_proc_exit(struct proc *p);
 void	racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
 	    struct ucred *newcred);
 void	racct_move(struct racct *dest, struct racct *src);
+void	racct_proc_throttle(struct proc *p, int timeout);
 
 #else
 

Modified: head/sys/sys/rctl.h
==============================================================================
--- head/sys/sys/rctl.h	Thu Apr  7 01:42:09 2016	(r297632)
+++ head/sys/sys/rctl.h	Thu Apr  7 04:23:25 2016	(r297633)
@@ -129,7 +129,8 @@ struct rctl_rule {
 #define	RCTL_ACTION_DENY		(RCTL_ACTION_SIGNAL_MAX + 1)
 #define	RCTL_ACTION_LOG			(RCTL_ACTION_SIGNAL_MAX + 2)
 #define	RCTL_ACTION_DEVCTL		(RCTL_ACTION_SIGNAL_MAX + 3)
-#define	RCTL_ACTION_MAX			RCTL_ACTION_DEVCTL
+#define	RCTL_ACTION_THROTTLE		(RCTL_ACTION_SIGNAL_MAX + 4)
+#define	RCTL_ACTION_MAX			RCTL_ACTION_THROTTLE
 
 #define	RCTL_AMOUNT_UNDEFINED		-1
 
@@ -140,6 +141,7 @@ void	rctl_rule_release(struct rctl_rule 
 int	rctl_rule_add(struct rctl_rule *rule);
 int	rctl_rule_remove(struct rctl_rule *filter);
 int	rctl_enforce(struct proc *p, int resource, uint64_t amount);
+void	rctl_throttle_decay(struct racct *racct, int resource);
 int64_t	rctl_pcpu_available(const struct proc *p);
 uint64_t rctl_get_limit(struct proc *p, int resource);
 uint64_t rctl_get_available(struct proc *p, int resource);

Modified: head/sys/ufs/ffs/ffs_inode.c
==============================================================================
--- head/sys/ufs/ffs/ffs_inode.c	Thu Apr  7 01:42:09 2016	(r297632)
+++ head/sys/ufs/ffs/ffs_inode.c	Thu Apr  7 04:23:25 2016	(r297633)
@@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/proc.h>
+#include <sys/racct.h>
 #include <sys/random.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
@@ -659,6 +660,13 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, lev
 	vp = ITOV(ip);
 	bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0, 0);
 	if ((bp->b_flags & B_CACHE) == 0) {
+#ifdef RACCT
+		if (racct_enable) {
+			PROC_LOCK(curproc);
+			racct_add_buf(curproc, bp, 0);
+			PROC_UNLOCK(curproc);
+		}
+#endif /* RACCT */
 		curthread->td_ru.ru_inblock++;	/* pay for read */
 		bp->b_iocmd = BIO_READ;
 		bp->b_flags &= ~B_INVAL;

Modified: head/sys/ufs/ffs/ffs_softdep.c
==============================================================================
--- head/sys/ufs/ffs/ffs_softdep.c	Thu Apr  7 01:42:09 2016	(r297632)
+++ head/sys/ufs/ffs/ffs_softdep.c	Thu Apr  7 04:23:25 2016	(r297633)
@@ -69,6 +69,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
+#include <sys/racct.h>
 #include <sys/rwlock.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
@@ -6229,6 +6230,13 @@ setup_trunc_indir(freeblks, ip, lbn, las
 		vfs_busy_pages(bp, 0);
 		bp->b_iooffset = dbtob(bp->b_blkno);
 		bstrategy(bp);
+#ifdef RACCT
+		if (racct_enable) {
+			PROC_LOCK(curproc);
+			racct_add_buf(curproc, bp, 0);
+			PROC_UNLOCK(curproc);
+		}
+#endif /* RACCT */
 		curthread->td_ru.ru_inblock++;
 		error = bufwait(bp);
 		if (error) {

Modified: head/sys/ufs/ufs/ufs_bmap.c
==============================================================================
--- head/sys/ufs/ufs/ufs_bmap.c	Thu Apr  7 01:42:09 2016	(r297632)
+++ head/sys/ufs/ufs/ufs_bmap.c	Thu Apr  7 04:23:25 2016	(r297633)
@@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
+#include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/stat.h>
 
@@ -223,6 +224,13 @@ ufs_bmaparray(vp, bn, bnp, nbp, runp, ru
 			vfs_busy_pages(bp, 0);
 			bp->b_iooffset = dbtob(bp->b_blkno);
 			bstrategy(bp);
+#ifdef RACCT
+			if (racct_enable) {
+				PROC_LOCK(curproc);
+				racct_add_buf(curproc, bp, 0);
+				PROC_UNLOCK(curproc);
+			}
+#endif /* RACCT */
 			curthread->td_ru.ru_inblock++;
 			error = bufwait(bp);
 			if (error) {

Modified: head/sys/vm/vm_fault.c
==============================================================================
--- head/sys/vm/vm_fault.c	Thu Apr  7 01:42:09 2016	(r297632)
+++ head/sys/vm/vm_fault.c	Thu Apr  7 04:23:25 2016	(r297633)
@@ -83,6 +83,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/proc.h>
+#include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sysctl.h>
@@ -994,6 +995,21 @@ vnode_locked:
 	if (hardfault) {
 		PCPU_INC(cnt.v_io_faults);
 		curthread->td_ru.ru_majflt++;
+#ifdef RACCT
+		if (racct_enable && fs.object->type == OBJT_VNODE) {
+			PROC_LOCK(curproc);
+			if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
+				racct_add_force(curproc, RACCT_WRITEBPS,
+				    PAGE_SIZE + behind * PAGE_SIZE);
+				racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+			} else {
+				racct_add_force(curproc, RACCT_READBPS,
+				    PAGE_SIZE + ahead * PAGE_SIZE);
+				racct_add_force(curproc, RACCT_READIOPS, 1);
+			}
+			PROC_UNLOCK(curproc);
+		}
+#endif
 	} else 
 		curthread->td_ru.ru_minflt++;
 

Modified: head/usr.bin/rctl/rctl.8
==============================================================================
--- head/usr.bin/rctl/rctl.8	Thu Apr  7 01:42:09 2016	(r297632)
+++ head/usr.bin/rctl/rctl.8	Thu Apr  7 04:23:25 2016	(r297633)
@@ -25,7 +25,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd November 29, 2015
+.Dd January 30, 2016
 .Dt RCTL 8
 .Os
 .Sh NAME
@@ -204,14 +204,22 @@ resource would be
 .It Sy shmsize Ta "SysV shared memory size, in bytes"
 .It Sy wallclock Ta "wallclock time, in seconds"
 .It Sy pcpu Ta "%CPU, in percents of a single CPU core"
+.It Sy readbps Ta "filesystem reads, in bytes per second"
+.It Sy writebps Ta "filesystem writes, in bytes per second"
+.It Sy readiops Ta "filesystem reads, in operations per second"
+.It Sy writeiops Ta "filesystem writes, in operations per second"
 .El
 .Sh ACTIONS
 .Bl -column -offset 3n "pseudoterminals"
 .It Em action
 .It Sy deny Ta deny the allocation; not supported for
-.Sy cputime
+.Sy cputime ,
+.Sy wallclock ,
+.Sy readbps ,
+.Sy writebps ,
+.Sy readiops ,
 and
-.Sy wallclock
+.Sy writeiops
 .It Sy log Ta "log a warning to the console"
 .It Sy devctl Ta "send notification to"
 .Xr devd 8
@@ -228,6 +236,12 @@ send a signal to the offending process.
 See
 .Xr signal 3
 for a list of supported signals
+.It Sy throttle Ta "slow down process execution"; only supported for
+.Sy readbps ,
+.Sy writebps ,
+.Sy readiops ,
+and
+.Sy writeiops .
 .El
 .Pp
 Not all actions are supported for all resources.
@@ -287,3 +301,22 @@ under sponsorship from the FreeBSD Found
 Limiting
 .Sy memoryuse
 may kill the machine due to thrashing.
+.Pp
+The
+.Sy readiops
+and
+.Sy writeiops
+counters are only approximations.
+Like
+.Sy readbps
+and
+.Sy writebps ,
+they are calculated in the filesystem layer, where it is difficult
+or even impossible to observe actual disk device operations.
+.Pp
+The
+.Sy writebps
+and
+.Sy writeiops
+resources generally account for writes to the filesystem cache,
+not to actual devices.


More information about the svn-src-head mailing list