svn commit: r349029 - head/sys/kern
Bruce Evans
brde at optusnet.com.au
Fri Jun 14 12:58:46 UTC 2019
On Fri, 14 Jun 2019, Alexander Motin wrote:
> Log:
> Update td_runtime of running thread on each statclock().
>
> Normally td_runtime is updated on context switch, but there are some kernel
> threads that due to high absolute priority may run for many seconds without
> context switches (yes, that is bad, but that is true), which means their
> td_runtime was not updated all that time, that made them invisible for top
> other then as some general CPU usage.
>
> MFC after: 1 week
> Sponsored by: iXsystems, Inc.
This and more is supposed to be done in calcru(). It is also necessary to
adjust for the current timeslice.
I thought that calcru() was fixed, but the fix seems to be only in my
version of FreeBSD-5.2.
The bug seems to be in fill_kinfo_proc_only(). calcru() should update
td_runtime for all threads in the proc (and sum into rux_rutime), but
fill_kinfo_proc_only() uses rux_runtime to initialize ki_runtime before
calling calcru(), so sees a stale value. This was broken in r132855
(the initialization of ki_runtime was moved too early as part of a fix
for zombies).
Using calcru() for its side effect is a bit magic. get_thread_cputime()
does this less hackishly for curthread (by adding the timeslice without
updating td_runtime), but is broken for other threads (by using the
stale td_runtime).
td_runtime is unavailable in userland. top and ps can only see ki_runtime
which is per-process.
Actually, calcru() seems to have the same bug as get_thread_cputime().
This is fixed in my version of FreeBSD-5-2, but is clearly broken in
plain FreeBSD-FreeBSD-5.2, and -current seems to be even more broken
than plain 5.2 -- it is missing the large comment about this bug that
I added in r130858. -current first updates td_runtime for curthread.
Then it loops over all threads in the process, but does nothing unless
td_incruntime != 0, and when td_incruntime != 0 it updates rux_runtime
using the value in td_incruntime, and this value is stale except for
curthread.
td_runtime is updated in one other place: in rufetchtd(), but this function
has the same bug eas everywhere else -- it only updates the runtimes for
curthread.
Here is my fix for this in FreeBSD-5.2:
XX Index: kern_resource.c
XX ===================================================================
XX RCS file: /home/ncvs/src/sys/kern/kern_resource.c,v
XX retrieving revision 1.140
XX diff -u -2 -r1.140 kern_resource.c
XX --- kern_resource.c 21 Jun 2004 17:46:27 -0000 1.140
XX @@ -702,18 +720,80 @@
XX struct timeval *ip;
XX {
XX - struct bintime bt, rt;
XX - struct timeval tv;
XX + struct bintime bt;
XX + struct rusage_ext rux;
-current already has most of the infrastructure for this -- I invented
rusage_ext partly to fix this problem, and -current has a more sophisticated
version of rusage_ext.
-current also uses cputicks instead of bintimes.
XX struct thread *td;
XX - /* {user, system, interrupt, total} {ticks, usec}; previous tu: */
XX - u_int64_t ut, uu, st, su, it, iu, tt, tu, ptu;
XX - int problemcase;
XX + int bt_valid, proc_locked, sched_locked;
XX
XX - mtx_assert(&sched_lock, MA_OWNED);
XX - /* XXX: why spl-protect ? worst case is an off-by-one report */
XX + proc_locked = mtx_owned(&p->p_mtx);
XX + sched_locked = mtx_owned(&sched_lock);
XX + if (!proc_locked && !sched_locked)
XX + PROC_LOCK(p);
XX + if (!sched_locked)
XX + mtx_lock_spin(&sched_lock);
This also has some restructuring that -current already has in a more
sophisticated way.
XX + rux = p->p_rux;
XX + bt_valid = 0;
XX + FOREACH_THREAD_IN_PROC(p, td) {
XX + if (TD_IS_RUNNING(td)) {
XX + /*
XX + * Adjust for the current time slice. This is
XX + * important since the adjustment is on the order
XX + * of a time quantum, which is much greater than
XX + * the precision of binuptime().
XX + */
XX + KASSERT(td->td_oncpu != NOCPU,
XX + ("calcru: running thread has no CPU"));
XX + if (!bt_valid) {
XX + binuptime(&bt);
XX + bt_valid = 1;
XX + }
XX + bintime_sub(&bt,
XX + &pcpu_find(td->td_oncpu)->pc_switchtime);
XX + if (bt.sec < 0) {
XX + printf(
XX + "calcru: negative delta-runtime of %jd sec + %jd wsec for pid %d (%s)\n",
XX + (intmax_t)bt.sec, bt.frac,
XX + p->p_pid, p->p_comm);
XX + bt.sec = 0;
XX + bt.frac = 0;
XX + }
XX + bintime_add(&rux.rux_runtime, &bt);
XX + }
XX + }
XX + if (!sched_locked)
XX + mtx_unlock_spin(&sched_lock);
That is the main part of the fix. It is necessary to loop over all threads
in the process.
XX + calcru1(p, &rux, up, sp, ip);
XX + p->p_rux.rux_uu = rux.rux_uu;
XX + p->p_rux.rux_su = rux.rux_su;
XX + p->p_rux.rux_iu = rux.rux_iu;
XX + if (!proc_locked && !sched_locked)
XX + PROC_UNLOCK(p);
XX +}
XX +
XX +void
XX +calccru(p, up, sp)
XX + struct proc *p;
XX + struct timeval *up;
XX + struct timeval *sp;
XX +{
XX
XX - ut = p->p_uticks;
XX - st = p->p_sticks;
XX - it = p->p_iticks;
XX + PROC_LOCK_ASSERT(p, MA_OWNED);
XX + calcru1(p, &p->p_crux, up, sp, NULL);
XX +}
XX
XX +static void
XX +calcru1(p, ruxp, up, sp, ip)
XX + struct proc *p;
XX + struct rusage_ext *ruxp;
XX + struct timeval *up;
XX + struct timeval *sp;
XX + struct timeval *ip;
XX +{
XX + struct timeval tv;
XX + /* {user, system, interrupt, total} {ticks, usec}; previous tu: */
XX + u_int64_t ut, uu, st, su, it, iu, tt, tu, ptu;
XX +
XX + ut = ruxp->rux_uticks;
XX + st = ruxp->rux_sticks;
XX + it = ruxp->rux_iticks;
XX tt = ut + st + it;
XX if (tt == 0) {
XX @@ -721,42 +801,20 @@
XX tt = 1;
XX }
XX - rt = p->p_runtime;
XX - problemcase = 0;
XX - FOREACH_THREAD_IN_PROC(p, td) {
XX - /*
XX - * Adjust for the current time slice. This is actually fairly
XX - * important since the error here is on the order of a time
XX - * quantum, which is much greater than the sampling error.
XX - */
XX - if (td == curthread) {
XX - binuptime(&bt);
XX - bintime_sub(&bt, PCPU_PTR(switchtime));
XX - bintime_add(&rt, &bt);
XX - } else if (TD_IS_RUNNING(td)) {
XX - /*
XX - * XXX: this case should add the difference between
XX - * the current time and the switch time as above,
XX - * but the switch time is inaccessible, so we can't
XX - * do the adjustment and will end up with a wrong
XX - * runtime. A previous call with a different
XX - * curthread may have obtained a (right or wrong)
XX - * runtime that is in advance of ours. Just set a
XX - * flag to avoid warning about this known problem.
XX - */
XX - problemcase = 1;
This is the large comment about the bug. It was broken by removing it in
r136152. r136152 was mostly by me to introduce rusage_ext, but it seems
to have been incomplete -- it even removed the adjustment of td_runtime
for curthread. The latter wasn't restored until 7 years later in r244188.
XX - }
XX - }
XX - bintime2timeval(&rt, &tv);
XX + bintime2timeval(&ruxp->rux_runtime, &tv);
XX tu = (u_int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
XX - ptu = p->p_uu + p->p_su + p->p_iu;
XX + ptu = ruxp->rux_uu + ruxp->rux_su + ruxp->rux_iu;
XX if (tu < ptu) {
XX - if (!problemcase)
XX - printf(
XX -"calcru: runtime went backwards from %ju usec to %ju usec for pid %d (%s)\n",
XX - (uintmax_t)ptu, (uintmax_t)tu, p->p_pid, p->p_comm);
XX + printf(
XX +"calcru1: runtime went backwards from %ju usec (uu = %ju, su = %ju, iu = %ju) to %ju usec for pid %d (%s)\n",
XX + (uintmax_t)ptu,
XX + (uintmax_t)ruxp->rux_uu, (uintmax_t)ruxp->rux_su,
XX + (uintmax_t)ruxp->rux_iu,
XX + (uintmax_t)tu, p->p_pid, p->p_comm);
XX tu = ptu;
XX }
XX if ((int64_t)tu < 0) {
XX - printf("calcru: negative runtime of %jd usec for pid %d (%s)\n",
XX + /* XXX p is only needed to print these diagnostics. */
XX + printf(
XX + "calcru1: negative runtime of %jd usec for pid %d (%s)\n",
XX (intmax_t)tu, p->p_pid, p->p_comm);
XX tu = ptu;
-current still has many bugs that let the times go backwards. I don't
remember if 'problemcase' gave negative times, but it still occurs and
gives garbage times. Userland could do a related negative times check:
assert(ki_runtime >= sum of user, sys and interrupt times) often fails
because ki_runtime uses a stale rux_runtime time while the sum equals
the non-stale rux_runtime (after scaling).
Bruce
More information about the svn-src-all
mailing list