[RFC/RFT] calloutng
Luigi Rizzo
rizzo at iet.unipi.it
Sun Feb 17 02:42:08 UTC 2013
On Thu, Feb 14, 2013 at 11:56:54PM -0800, Alfred Perlstein wrote:
> [ added Luigi Rizzo to thread ]
>
>
> On 2/11/13 12:26 PM, Davide Italiano wrote:
> > [trimmed old mails]
> >
> > Here's a new version of the patch:
> > http://people.freebsd.org/~davide/patches/calloutng-11022012.diff
> >
> > Significant bits changed (after wider discussion and suggestion by phk@):
> > - Introduction of the new sbintime_t type (32.32 fixed point) with the
> > respective conversion (sbintime2bintime, sbintime2timeval etc...)
> > - switch from 64.64 (struct bintime) format to measure time to sbintime_t
> > - Use sbintime_t to represent expected sleep time instead of measuring
> > it in microseconds (cpu_idle_acpi(), cpu_idle_spin() ...).
>
>
> Luigi and I discussed this at BAFUG tonight and he expressed an interest
> in shepherding the code in at this point.
>
> Luigi, can you reiterate any points that still remain before we
> integrate this code?
I am mostly ok with the patch in the current state.
I have few comments/suggestions below but they are mostly
on documenting/style/trivial bugfixes.
So now I would really like to see this code go in HEAD,
to give people a chance to see how it works and possibly
figure out whether the new API needs modifications.
To recall, my major concerns were the following:
- API explosion, with multiple versions of the callout routines.
This seems to be solved now, with only one version (the *_sbt()
functions) and macros remapping the old functions to the new ones.
- the use of struct bintime for timeouts and precision.
This is also solved thanks to the introduction of sbintime_t values
(fixed-point 32.32 times)
- Some measurements also indicated that the default "precision"
could give large (absolute) errors on the timeouts, especially
with large timeouts.
I am not sure what is the situation with this new version, but i believe
this a relatively minor issue because it surely simple to customize, e.g.
having a couple of sysctl setting the default precision (percentage)
and maximum error (absolute time). So users could e.g. set
a 5% precision and a maximum error of 100us on a desktop,
and 10% and 10ms on a laptop.
- Another thing that we should do (but after the patch is in) is to
see if any existing code is converting times to ticks just to
call the timeout routines -- right now the macros convert back
to times, clearly it would be best to avoid the double conversion.
comments inline below, search for XXX-lr
thanks again for your work on this, and for following up with changes
after the previous discussion
cheers
luigi
Index: sys/dev/acpica/acpi_cpu.c
===================================================================
--- sys/dev/acpica/acpi_cpu.c (.../head) (revision 246685)
+++ sys/dev/acpica/acpi_cpu.c (.../projects/calloutng) (revision 246685)
@@ -980,13 +980,16 @@
}
/* Find the lowest state that has small enough latency. */
+ us = sc->cpu_prev_sleep;
+ if (sbt >= 0 && us > sbt / SBT_1US) XXX-lr can we have us2sbt() , ms2sbt() and the like ?
+ us = sbt / SBT_1US;
cx_next_idx = 0;
if (cpu_disable_deep_sleep)
i = min(sc->cpu_cx_lowest, sc->cpu_non_c3);
else
i = sc->cpu_cx_lowest;
for (; i >= 0; i--) {
- if (sc->cpu_cx_states[i].trans_lat * 3 <= sc->cpu_prev_sleep) {
+ if (sc->cpu_cx_states[i].trans_lat * 3 <= us) {
cx_next_idx = i;
break;
}
Index: sys/kern/kern_synch.c
===================================================================
--- sys/kern/kern_synch.c (.../head) (revision 246685)
+++ sys/kern/kern_synch.c (.../projects/calloutng) (revision 246685)
@@ -146,12 +146,12 @@
*/
int
_sleep(void *ident, struct lock_object *lock, int priority,
- const char *wmesg, int timo)
+ const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags)
{
struct thread *td;
struct proc *p;
struct lock_class *class;
- int catch, flags, lock_state, pri, rval;
+ int catch, sleepq_flags, lock_state, pri, rval; XXX-lr keep flags, use _sleep_flags as function argument ?
WITNESS_SAVE_DECL(lock_witness);
td = curthread;
@@ -348,28 +349,30 @@
* to a "timo" value of one.
*/
int
-pause(const char *wmesg, int timo)
+pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags)
{
- KASSERT(timo >= 0, ("pause: timo must be >= 0"));
+ int sbt_sec; XXX-lr localize to the cold block
+ sbt_sec = sbintime_getsec(sbt);
+ KASSERT(sbt_sec >= 0, ("pause: timo must be >= 0"));
+
/* silently convert invalid timeouts */
- if (timo < 1)
- timo = 1;
+ if (sbt == 0)
+ sbt = tick_sbt;
if (cold) {
/*
- * We delay one HZ at a time to avoid overflowing the
+ * We delay one second at a time to avoid overflowing the
* system specific DELAY() function(s):
*/
- while (timo >= hz) {
+ while (sbt_sec > 0) {
DELAY(1000000);
- timo -= hz;
+ sbt_sec--;
}
- if (timo > 0)
- DELAY(timo * tick);
+ DELAY((sbt & 0xffffffff) / SBT_1US); XXX-lr sbt2us() ?
return (0);
}
- return (tsleep(&pause_wchan, 0, wmesg, timo));
+ return (_sleep(&pause_wchan, NULL, 0, wmesg, sbt, pr, flags));
}
/*
@@ -560,8 +563,9 @@
* random variation to avoid synchronisation with processes that
* run at regular intervals.
*/
- callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)),
- loadav, NULL);
+ callout_reset_sbt(&loadav_callout, XXX-lr we have better resolution than HZ here ?
+ tick_sbt * (hz * 4 + (int)(random() % (hz * 2 + 1))), 0,
+ loadav, NULL, C_DIRECT_EXEC | C_HARDCLOCK);
}
/* ARGSUSED */
Index: sys/kern/sys_generic.c
===================================================================
--- sys/kern/sys_generic.c (.../head) (revision 246685)
+++ sys/kern/sys_generic.c (.../projects/calloutng) (revision 246685)
@@ -995,35 +996,29 @@
if (nbufbytes != 0)
bzero(selbits, nbufbytes / 2);
+ precision = 0;
if (tvp != NULL) {
- atv = *tvp;
- if (itimerfix(&atv)) {
+ rtv = *tvp;
+ if (rtv.tv_sec < 0 || rtv.tv_usec < 0 || XXX-lr itimerfix or some check routine ? several places
+ rtv.tv_usec >= 1000000) {
error = EINVAL;
goto done;
}
- getmicrouptime(&rtv);
- timevaladd(&atv, &rtv);
- } else {
- atv.tv_sec = 0;
- atv.tv_usec = 0;
- }
- timo = 0;
+ rsbt = timeval2sbintime(rtv);
+ precision = rsbt;
+ precision >>= tc_precexp;
+ if (TIMESEL(&asbt, rsbt))
+ asbt += tc_tick_sbt;
+ asbt += rsbt;
+ } else
+ asbt = -1;
seltdinit(td);
/* Iterate until the timeout expires or descriptors become ready. */
for (;;) {
error = selscan(td, ibits, obits, nd);
if (error || td->td_retval[0] != 0)
break;
- if (atv.tv_sec || atv.tv_usec) {
- getmicrouptime(&rtv);
- if (timevalcmp(&rtv, &atv, >=))
- break;
- ttv = atv;
- timevalsub(&ttv, &rtv);
- timo = ttv.tv_sec > 24 * 60 * 60 ?
- 24 * 60 * 60 * hz : tvtohz(&ttv);
- }
- error = seltdwait(td, timo);
+ error = seltdwait(td, asbt, precision);
if (error)
break;
error = selrescan(td, ibits, obits);
Index: sys/kern/kern_tc.c
===================================================================
--- sys/kern/kern_tc.c (.../head) (revision 246685)
+++ sys/kern/kern_tc.c (.../projects/calloutng) (revision 246685)
@@ -119,6 +120,21 @@
SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW,
×tepwarnings, 0, "Log time steps");
+struct bintime bt_timethreshold; XXX-lr document these variables
+struct bintime bt_tickthreshold;
+sbintime_t sbt_timethreshold;
+sbintime_t sbt_tickthreshold;
+struct bintime tc_tick_bt;
+sbintime_t tc_tick_sbt;
+int tc_precexp;
+int tc_timepercentage = TC_DEFAULTPERC;
+TUNABLE_INT("kern.timecounter.alloweddeviation", &tc_timepercentage);
+static int sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, alloweddeviation,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
+ sysctl_kern_timecounter_adjprecision, "I",
+ "Allowed time interval deviation in percents");
+
static void tc_windup(void);
static void cpu_tick_calibrate(int);
@@ -883,6 +919,16 @@
}
void
+sbinuptime(sbintime_t sbt) XXX-lr wrong argument ? not compile-tested ?
Index: sys/kern/kern_timeout.c
===================================================================
--- sys/kern/kern_timeout.c (.../head) (revision 246685)
+++ sys/kern/kern_timeout.c (.../projects/calloutng) (revision 246685)
@@ -87,58 +105,62 @@
int callwheelsize, callwheelmask;
/*
- * The callout cpu migration entity represents informations necessary for
- * describing the migrating callout to the new callout cpu.
+ * The callout cpu exec entities represent informations necessary for
+ * describing the state of callouts currently running on the CPU and the ones
+ * necessary for migrating callouts to the new callout cpu. In particular,
+ * the first entry of the array cc_exec_entity holds informations for callout
+ * running in SWI thread context, while the second one holds informations
+ * for callout running directly from hardware interrupt context.
* The cached informations are very important for deferring migration when
* the migrating callout is already running.
*/
-struct cc_mig_ent {
+struct cc_exec {
+ struct callout *cc_next;
+ struct callout *cc_curr;
#ifdef SMP
- void (*ce_migration_func)(void *);
- void *ce_migration_arg;
- int ce_migration_cpu;
- int ce_migration_ticks;
+ void (*ce_migration_func)(void *);
+ void *ce_migration_arg;
+ int ce_migration_cpu;
+ sbintime_t ce_migration_time;
#endif
+ int cc_cancel;
+ int cc_waiting;
};
-
+
/*
- * There is one struct callout_cpu per cpu, holding all relevant
+ * There is one struct callou_cpu per cpu, holding all relevant
* state for the callout processing thread on the individual CPU.
- * In particular:
- * cc_ticks is incremented once per tick in callout_cpu().
- * It tracks the global 'ticks' but in a way that the individual
- * threads should not worry about races in the order in which
- * hardclock() and hardclock_cpu() run on the various CPUs.
- * cc_softclock is advanced in callout_cpu() to point to the
- * first entry in cc_callwheel that may need handling. In turn,
- * a softclock() is scheduled so it can serve the various entries i
- * such that cc_softclock <= i <= cc_ticks .
- * XXX maybe cc_softclock and cc_ticks should be volatile ?
- *
- * cc_ticks is also used in callout_reset_cpu() to determine
- * when the callout should be served.
*/
struct callout_cpu {
struct mtx_padalign cc_lock;
- struct cc_mig_ent cc_migrating_entity;
+ struct cc_exec cc_exec_entity[2]; XXX-lr repeat (short) explanation on why 2 entities
struct callout *cc_callout;
struct callout_tailq *cc_callwheel;
+ struct callout_tailq cc_expireq;
struct callout_list cc_callfree;
- struct callout *cc_next;
- struct callout *cc_curr;
+ sbintime_t cc_firstevent;
+ sbintime_t cc_lastscan;
void *cc_cookie;
- int cc_ticks;
- int cc_softticks;
- int cc_cancel;
- int cc_waiting;
- int cc_firsttick;
};
+#define cc_exec_curr cc_exec_entity[0].cc_curr
+#define cc_exec_next cc_exec_entity[0].cc_next
+#define cc_exec_cancel cc_exec_entity[0].cc_cancel
+#define cc_exec_waiting cc_exec_entity[0].cc_waiting
+#define cc_exec_curr_dir cc_exec_entity[1].cc_curr
+#define cc_exec_next_dir cc_exec_entity[1].cc_next
+#define cc_exec_cancel_dir cc_exec_entity[1].cc_cancel
+#define cc_exec_waiting_dir cc_exec_entity[1].cc_waiting
+
#ifdef SMP
-#define cc_migration_func cc_migrating_entity.ce_migration_func
-#define cc_migration_arg cc_migrating_entity.ce_migration_arg
-#define cc_migration_cpu cc_migrating_entity.ce_migration_cpu
-#define cc_migration_ticks cc_migrating_entity.ce_migration_ticks
+#define cc_migration_func cc_exec_entity[0].ce_migration_func
+#define cc_migration_arg cc_exec_entity[0].ce_migration_arg
+#define cc_migration_cpu cc_exec_entity[0].ce_migration_cpu
+#define cc_migration_time cc_exec_entity[0].ce_migration_time
+#define cc_migration_func_dir cc_exec_entity[1].ce_migration_func
+#define cc_migration_arg_dir cc_exec_entity[1].ce_migration_arg
+#define cc_migration_cpu_dir cc_exec_entity[1].ce_migration_cpu
+#define cc_migration_time_dir cc_exec_entity[1].ce_migration_time
struct callout_cpu cc_cpu[MAXCPU];
#define CPUBLOCK MAXCPU
Index: sys/ia64/ia64/machdep.c
===================================================================
--- sys/ia64/ia64/machdep.c (.../head) (revision 246685)
+++ sys/ia64/ia64/machdep.c (.../projects/calloutng) (revision 246685)
@@ -404,7 +405,7 @@
if (sched_runnable())
ia64_enable_intr();
else if (cpu_idle_hook != NULL) {
- (*cpu_idle_hook)(); XXX-lr style ? just use cpu_idle_hook(sbt);
+ (*cpu_idle_hook)(sbt);
/* The hook must enable interrupts! */
} else {
ia64_call_pal_static(PAL_HALT_LIGHT, 0, 0, 0);
Index: sys/ofed/include/linux/timer.h
===================================================================
--- sys/ofed/include/linux/timer.h (.../head) (revision 246685)
+++ sys/ofed/include/linux/timer.h (.../projects/calloutng) (revision 246685)
@@ -65,13 +64,16 @@
callout_init(&(timer)->timer_callout, CALLOUT_MPSAFE); \
} while (0)
-#define mod_timer(timer, expire) \ XXX-lr gratuitous rename
- callout_reset(&(timer)->timer_callout, (expire) - jiffies, \
- _timer_fn, (timer))
+#define mod_timer(timer, exp) \
+do { \
+ (timer)->expires = exp; \
+ callout_reset(&(timer)->timer_callout, (exp) - jiffies, \
+ _timer_fn, (timer)); \
+} while (0)
#define add_timer(timer) \
callout_reset(&(timer)->timer_callout, \
- (timer)->timer_callout.c_time - jiffies, _timer_fn, (timer))
+ (timer)->expires - jiffies, _timer_fn, (timer))
#define del_timer(timer) callout_stop(&(timer)->timer_callout)
#define del_timer_sync(timer) callout_drain(&(timer)->timer_callout)
Index: sys/sys/callout.h
===================================================================
--- sys/sys/callout.h (.../head) (revision 246685)
+++ sys/sys/callout.h (.../projects/calloutng) (revision 246685)
@@ -47,7 +47,17 @@
#define CALLOUT_RETURNUNLOCKED 0x0010 /* handler returns with mtx unlocked */
#define CALLOUT_SHAREDLOCK 0x0020 /* callout lock held in shared mode */
#define CALLOUT_DFRMIGRATION 0x0040 /* callout in deferred migration mode */
+#define CALLOUT_PROCESSED 0x0080 /* callout in wheel or processing list? */
+#define CALLOUT_DIRECT 0x0100 /* allow exec from hw int context */
+#define C_DIRECT_EXEC 0x0001 /* direct execution of callout */
+#define C_PRELBITS 7 XXX-lr document all
+#define C_PRELRANGE ((1 << C_PRELBITS) - 1)
+#define C_PREL(x) (((x) + 1) << 1)
+#define C_PRELGET(x) (int)((((x) >> 1) & C_PRELRANGE) - 1)
+#define C_HARDCLOCK 0x0100 /* align to hardclock() calls */
+#define C_ABSOLUTE 0x0200 /* event time is absolute. */
+
struct callout_handle {
struct callout *callout;
};
Index: sys/sys/time.h
===================================================================
--- sys/sys/time.h (.../head) (revision 246685)
+++ sys/sys/time.h (.../projects/calloutng) (revision 246685)
@@ -109,6 +124,45 @@
((a)->frac cmp (b)->frac) : \
((a)->sec cmp (b)->sec))
+typedef int64_t sbintime_t; XXX-lr add function ns2sbt(), us2sbt(), ms2sbt()
+#define SBT_1S ((sbintime_t)1 << 32)
+#define SBT_1M (SBT_1S * 60)
+#define SBT_1MS (SBT_1S / 1000)
+#define SBT_1US (SBT_1S / 1000000)
+#define SBT_1NS (SBT_1S / 1000000000)
+
+static __inline int
+sbintime_getsec(sbintime_t sbt)
+{
+
+ return (int)(sbt >> 32);
+}
+
+static __inline sbintime_t
+bintime2sbintime(const struct bintime bt)
+{
+
+ return (((sbintime_t)bt.sec << 32) + (bt.frac >> 32));
+}
+
+static __inline struct bintime
+sbintime2bintime(sbintime_t sbt)
+{
+ struct bintime bt;
+
+ bt.sec = sbt >> 32;
+ bt.frac = sbt << 32;
+ return (bt);
+
+}
+
+#ifdef _KERNEL
+
+extern struct bintime tick_bt;
+extern sbintime_t tick_sbt;
+
+#endif /* KERNEL */
+
/*-
* Background information:
*
@@ -290,7 +381,15 @@
extern volatile time_t time_second;
extern volatile time_t time_uptime;
extern struct bintime boottimebin;
+extern struct bintime tc_tick_bt;
+extern sbintime_t tc_tick_sbt; XXX-lr comment the new vars
extern struct timeval boottime;
+extern int tc_precexp; XXX-lr comment
+extern int tc_timepercentage;
+extern struct bintime bt_timethreshold;
+extern struct bintime bt_tickthreshold;
+extern sbintime_t sbt_timethreshold;
+extern sbintime_t sbt_tickthreshold;
/*
* Functions for looking at our clock: [get]{bin,nano,micro}[up]time()
@@ -337,6 +438,23 @@
void timevaladd(struct timeval *t1, const struct timeval *t2);
void timevalsub(struct timeval *t1, const struct timeval *t2);
int tvtohz(struct timeval *tv);
+
+#define TC_DEFAULTPERC 5 XXX-lr comment
+
+#define BT2FREQ(bt) \
+ (((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) / \
+ ((bt)->frac >> 1))
+
+#define FREQ2BT(freq, bt) \
+{ \
+ (bt)->sec = 0; \
+ (bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1; \
+}
+
+#define TIMESEL(sbt, sbt2) \
+ (((sbt2) >= sbt_timethreshold) ? \
+ (getsbinuptime(sbt), 1) : (sbinuptime(sbt), 0))
+
#else /* !_KERNEL */
#include <time.h>
Index: sys/netinet/tcp_timer.c
===================================================================
--- sys/netinet/tcp_timer.c (.../head) (revision 246685)
+++ sys/netinet/tcp_timer.c (.../projects/calloutng) (revision 246685)
@@ -719,20 +719,24 @@
#define ticks_to_msecs(t) (1000*(t) / hz)
void
-tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, struct xtcp_timer *xtimer)
+tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
+ struct xtcp_timer *xtimer)
{
- bzero(xtimer, sizeof(struct xtcp_timer));
+ sbintime_t now;
+
+ bzero(xtimer, sizeof(*xtimer)); XXX-lr use sbd2ms()
if (timer == NULL)
return;
+ getsbinuptime(&now);
if (callout_active(&timer->tt_delack))
- xtimer->tt_delack = ticks_to_msecs(timer->tt_delack.c_time - ticks);
+ xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS;
if (callout_active(&timer->tt_rexmt))
- xtimer->tt_rexmt = ticks_to_msecs(timer->tt_rexmt.c_time - ticks);
+ xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS;
if (callout_active(&timer->tt_persist))
- xtimer->tt_persist = ticks_to_msecs(timer->tt_persist.c_time - ticks);
+ xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS;
if (callout_active(&timer->tt_keep))
- xtimer->tt_keep = ticks_to_msecs(timer->tt_keep.c_time - ticks);
+ xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS;
if (callout_active(&timer->tt_2msl))
- xtimer->tt_2msl = ticks_to_msecs(timer->tt_2msl.c_time - ticks);
+ xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS;
xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
}
More information about the freebsd-arch
mailing list