author    | cheloha <cheloha@cvs.openbsd.org> | 2020-10-15 20:03:45 +0000
committer | cheloha <cheloha@cvs.openbsd.org> | 2020-10-15 20:03:45 +0000
commit    | ed4942372fd0d307a78a972ef6d74ebbb6554d77 (patch)
tree      | b9f56c5cec9eb762c4cf19dddc58d8df89eb5a13 /sys/kern/kern_timeout.c
parent    | b09a19ebcaca9db3055b2d4097fbfc0636121c91 (diff)
timeout(9): basic support for kclock timeouts
A kclock timeout is a timeout that expires at an absolute time on one
of the kernel's clocks. A timeout's absolute expiration time is kept
in a new member of the timeout struct, to_abstime. The timeout's
kclock is set at initialization and is kept in another new member of
the timeout struct, to_kclock.
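For reference, the two new members are used throughout the diff below
roughly as follows (a sketch of just the relevant fields; the actual
declaration lives in sys/sys/timeout.h, which is not part of this diff):

    struct timeout {
            ...
            struct timespec to_abstime;     /* absolute expiration time */
            int to_kclock;                  /* kclock to schedule against */
            ...
    };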
Kclock timeouts are desirable because they have nanosecond
resolution, regardless of the value of hz(9). The timecounter
subsystem is also inherently NTP-sensitive, so timeouts scheduled
against the subsystem are NTP-sensitive. These two qualities
guarantee that a kclock timeout will never expire early.
Currently there is support for one kclock, KCLOCK_UPTIME (the uptime
clock). Support for KCLOCK_RUNTIME (the runtime clock) and KCLOCK_UTC
(the UTC clock) is planned for the future.
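For reference, the kclock identifiers referenced by the diff below are
declared in sys/sys/timeout.h along these lines (the values shown are
assumptions inferred from how KCLOCK_NONE, KCLOCK_UPTIME, and
KCLOCK_MAX are used in this commit):

    #define KCLOCK_NONE     (-1)    /* not a kclock timeout */
    #define KCLOCK_UPTIME   0       /* the uptime clock */
    #define KCLOCK_MAX      1       /* sizes timeout_kclock[] */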
Support for these additional kclocks will allow us to implement some
of the POSIX interfaces OpenBSD is missing, e.g. clock_nanosleep() and
timer_create(). We could also use it to provide proper absolute
timeouts for e.g. pthread_mutex_timedlock(3).
Kclock timeouts are initialized with timeout_set_kclock(). They can
be scheduled with either timeout_in_nsec() (relative timeout) or
timeout_at_ts() (absolute timeout). They are incompatible with
timeout_add(9), timeout_add_sec(9), timeout_add_msec(9),
timeout_add_usec(9), timeout_add_nsec(9), and timeout_add_tv(9).
They can be cancelled with timeout_del(9) or timeout_del_barrier(9).
Documentation for the new interfaces is a work in progress.
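In the meantime, here is a minimal sketch of how the pieces fit
together; the callback, argument, and 100ms interval are invented for
illustration, while the signatures are taken from the diff below:

    struct timeout to;
    struct timespec deadline;       /* hypothetical absolute uptime */

    /* Initialize a kclock timeout against the uptime clock. */
    timeout_set_kclock(&to, my_callback, NULL, 0, KCLOCK_UPTIME);

    /* Schedule it to fire ~100ms from now (relative)... */
    timeout_in_nsec(&to, 100000000);

    /* ...or (re)schedule it for a specific uptime (absolute). */
    timeout_at_ts(&to, &deadline);

    /* Cancel it like any other timeout. */
    timeout_del(&to);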
For now, tick-based timeouts remain supported alongside kclock
timeouts. They will remain supported until we are certain we don't
need them anymore. It is possible we will never remove them. I would
rather not keep them around forever, but I cannot predict what
difficulties we will encounter while converting tick-based timeouts to
kclock timeouts. There are a *lot* of timeouts in the kernel.
Kclock timeouts are more costly than tick-based timeouts:
- Calling timeout_in_nsec() incurs a call to nanouptime(9). Reading
the hardware timecounter is too expensive in some contexts, so care
must be taken when converting existing timeouts.
We may add a flag in the future to cause timeout_in_nsec() to use
getnanouptime(9), which is much cheaper, instead of nanouptime(9).
This may be appropriate for certain classes of timeouts; TCP/IP
session timeouts come to mind.
- Kclock timeout expirations are kept in a timespec. Timespec
arithmetic has more overhead than 32-bit tick arithmetic, so
processing kclock timeouts during softclock() is more expensive.
On my machine the overhead for processing a tick-based timeout is
~125 cycles. The overhead for a kclock timeout is ~500 cycles.
The overhead difference on 32-bit platforms is unknown. If it
proves too large we may need to use a 64-bit value to store the
expiration time. More measurement is needed.
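To make the difference concrete, the due-now checks in the two
softclock paths reduce to the following (an adapted excerpt from
softclock_process_tick_timeout() and softclock_process_kclock_timeout()
in the diff below, not standalone code):

    /* Tick-based: one 32-bit subtraction against "ticks". */
    int delta = to->to_time - ticks;
    if (delta > 0)
            return;         /* not yet due, reinsert into the wheel */

    /* Kclock: a full timespec comparison (64-bit seconds plus
     * nanoseconds) against the kclock's last wheel-scan time. */
    if (timespeccmp(&to->to_abstime, &kc->kc_lastscan, >))
            return;         /* not yet due, reinsert into the wheel */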
Priority targets for conversion are setitimer(2), *sleep_nsec(9), and
the kevent(2) EVFILT_TIMER timers. Others will follow.
With input from mpi@, visa@, kettenis@, dlg@, guenther@, claudio@,
deraadt@, probably many others. Older version tested by visa@.
Problems found in older version by bluhm@. Current version tested by
Yuichiro Naito.
"wait until after unlock" deraadt@, ok kettenis@
Diffstat (limited to 'sys/kern/kern_timeout.c')
-rw-r--r-- | sys/kern/kern_timeout.c | 398
1 file changed, 338 insertions, 60 deletions
diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c
index 530ecb7f670..3c15605f287 100644
--- a/sys/kern/kern_timeout.c
+++ b/sys/kern/kern_timeout.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: kern_timeout.c,v 1.80 2020/09/22 13:43:28 cheloha Exp $	*/
+/*	$OpenBSD: kern_timeout.c,v 1.81 2020/10/15 20:03:43 cheloha Exp $	*/
 /*
  * Copyright (c) 2001 Thomas Nordin <nordin@openbsd.org>
  * Copyright (c) 2000-2001 Artur Grabowski <art@openbsd.org>
@@ -64,16 +64,27 @@ struct timeoutstat tostat;		/* [T] statistics and totals */
  * of the global variable "ticks" when the timeout should be called. There are
  * four levels with 256 buckets each.
  */
-#define BUCKETS 1024
+#define WHEELCOUNT 4
 #define WHEELSIZE 256
 #define WHEELMASK 255
 #define WHEELBITS 8
+#define BUCKETS (WHEELCOUNT * WHEELSIZE)
 
-struct circq timeout_wheel[BUCKETS];	/* [T] Queues of timeouts */
+struct circq timeout_wheel[BUCKETS];	/* [T] Tick-based timeouts */
+struct circq timeout_wheel_kc[BUCKETS];	/* [T] Clock-based timeouts */
 struct circq timeout_new;		/* [T] New, unscheduled timeouts */
 struct circq timeout_todo;		/* [T] Due or needs rescheduling */
 struct circq timeout_proc;		/* [T] Due + needs process context */
 
+time_t timeout_level_width[WHEELCOUNT];	/* [I] Wheel level width (seconds) */
+struct timespec tick_ts;		/* [I] Length of a tick (1/hz secs) */
+
+struct kclock {
+	struct timespec kc_lastscan;	/* [T] Clock time at last wheel scan */
+	struct timespec kc_late;	/* [T] Late if due prior */
+	struct timespec kc_offset;	/* [T] Offset from primary kclock */
+} timeout_kclock[KCLOCK_MAX];
+
 #define MASKWHEEL(wheel, time) (((time) >> ((wheel)*WHEELBITS)) & WHEELMASK)
 
 #define BUCKET(rel, abs)						\
@@ -155,9 +166,15 @@ struct lock_type timeout_spinlock_type = {
 	((needsproc) ? &timeout_sleeplock_obj : &timeout_spinlock_obj)
 #endif
 
+void kclock_nanotime(int, struct timespec *);
 void softclock(void *);
 void softclock_create_thread(void *);
+void softclock_process_kclock_timeout(struct timeout *, int);
+void softclock_process_tick_timeout(struct timeout *, int);
 void softclock_thread(void *);
+uint32_t timeout_bucket(const struct timeout *);
+uint32_t timeout_maskwheel(uint32_t, const struct timespec *);
+void timeout_run(struct timeout *);
 void timeout_proc_barrier(void *);
 
 /*
@@ -207,13 +224,19 @@ timeout_sync_leave(int needsproc)
 void
 timeout_startup(void)
 {
-	int b;
+	int b, level;
 
 	CIRCQ_INIT(&timeout_new);
 	CIRCQ_INIT(&timeout_todo);
 	CIRCQ_INIT(&timeout_proc);
 	for (b = 0; b < nitems(timeout_wheel); b++)
 		CIRCQ_INIT(&timeout_wheel[b]);
+	for (b = 0; b < nitems(timeout_wheel_kc); b++)
+		CIRCQ_INIT(&timeout_wheel_kc[b]);
+
+	for (level = 0; level < nitems(timeout_level_width); level++)
+		timeout_level_width[level] = 2 << (level * WHEELBITS);
+	NSEC_TO_TIMESPEC(tick_nsec, &tick_ts);
 }
 
 void
@@ -229,25 +252,39 @@ timeout_proc_init(void)
 	kthread_create_deferred(softclock_create_thread, NULL);
 }
 
+static inline void
+_timeout_set(struct timeout *to, void (*fn)(void *), void *arg, int flags,
+    int kclock)
+{
+	to->to_func = fn;
+	to->to_arg = arg;
+	to->to_flags = flags | TIMEOUT_INITIALIZED;
+	to->to_kclock = kclock;
+}
+
 void
 timeout_set(struct timeout *new, void (*fn)(void *), void *arg)
 {
-	timeout_set_flags(new, fn, arg, 0);
+	_timeout_set(new, fn, arg, 0, KCLOCK_NONE);
 }
 
 void
 timeout_set_flags(struct timeout *to, void (*fn)(void *), void *arg, int flags)
 {
-	to->to_func = fn;
-	to->to_arg = arg;
-	to->to_process = NULL;
-	to->to_flags = flags | TIMEOUT_INITIALIZED;
+	_timeout_set(to, fn, arg, flags, KCLOCK_NONE);
 }
 
 void
 timeout_set_proc(struct timeout *new, void (*fn)(void *), void *arg)
 {
-	timeout_set_flags(new, fn, arg, TIMEOUT_PROC);
+	_timeout_set(new, fn, arg, TIMEOUT_PROC, KCLOCK_NONE);
+}
+
+void
+timeout_set_kclock(struct timeout *to, void (*fn)(void *), void *arg,
+    int flags, int kclock)
+{
+	_timeout_set(to, fn, arg, flags | TIMEOUT_KCLOCK, kclock);
 }
 
 int
@@ -257,6 +294,8 @@ timeout_add(struct timeout *new, int to_ticks)
 	int ret = 1;
 
 	KASSERT(ISSET(new->to_flags, TIMEOUT_INITIALIZED));
+	KASSERT(!ISSET(new->to_flags, TIMEOUT_KCLOCK));
+	KASSERT(new->to_kclock == KCLOCK_NONE);
 	KASSERT(to_ticks >= 0);
 
 	mtx_enter(&timeout_mutex);
@@ -356,6 +395,66 @@ timeout_add_nsec(struct timeout *to, int nsecs)
 }
 
 int
+timeout_at_ts(struct timeout *to, const struct timespec *abstime)
+{
+	struct timespec old_abstime;
+	int ret = 1;
+
+	mtx_enter(&timeout_mutex);
+
+	KASSERT(ISSET(to->to_flags, TIMEOUT_INITIALIZED | TIMEOUT_KCLOCK));
+	KASSERT(to->to_kclock != KCLOCK_NONE);
+
+	old_abstime = to->to_abstime;
+	to->to_abstime = *abstime;
+	CLR(to->to_flags, TIMEOUT_TRIGGERED);
+
+	if (ISSET(to->to_flags, TIMEOUT_ONQUEUE)) {
+		if (timespeccmp(abstime, &old_abstime, <)) {
+			CIRCQ_REMOVE(&to->to_list);
+			CIRCQ_INSERT_TAIL(&timeout_new, &to->to_list);
+		}
+		tostat.tos_readded++;
+		ret = 0;
+	} else {
+		SET(to->to_flags, TIMEOUT_ONQUEUE);
+		CIRCQ_INSERT_TAIL(&timeout_new, &to->to_list);
+	}
+#if NKCOV > 0
+	to->to_process = curproc->p_p;
+#endif
+	tostat.tos_added++;
+
+	mtx_leave(&timeout_mutex);
+
+	return ret;
+}
+
+int
+timeout_in_nsec(struct timeout *to, uint64_t nsecs)
+{
+	struct timespec abstime, interval, now;
+
+	kclock_nanotime(to->to_kclock, &now);
+	NSEC_TO_TIMESPEC(nsecs, &interval);
+	timespecadd(&now, &interval, &abstime);
+
+	return timeout_at_ts(to, &abstime);
+}
+
+void
+kclock_nanotime(int kclock, struct timespec *now)
+{
+	switch (kclock) {
+	case KCLOCK_UPTIME:
+		nanouptime(now);
+		break;
+	default:
+		panic("invalid kclock: 0x%x", kclock);
+	}
+}
+
+int
 timeout_del(struct timeout *to)
 {
 	int ret = 0;
@@ -425,6 +524,48 @@ timeout_proc_barrier(void *arg)
 	cond_signal(c);
 }
 
+uint32_t
+timeout_bucket(const struct timeout *to)
+{
+	struct kclock *kc = &timeout_kclock[to->to_kclock];
+	struct timespec diff, shifted_abstime;
+	uint32_t level;
+
+	KASSERT(ISSET(to->to_flags, TIMEOUT_KCLOCK));
+	KASSERT(timespeccmp(&kc->kc_lastscan, &to->to_abstime, <));
+
+	timespecsub(&to->to_abstime, &kc->kc_lastscan, &diff);
+	for (level = 0; level < nitems(timeout_level_width) - 1; level++) {
+		if (diff.tv_sec < timeout_level_width[level])
+			break;
+	}
+	timespecadd(&to->to_abstime, &kc->kc_offset, &shifted_abstime);
+	return level * WHEELSIZE + timeout_maskwheel(level, &shifted_abstime);
+}
+
+/*
+ * Hash the absolute time into a bucket on a given level of the wheel.
+ *
+ * The complete hash is 32 bits.  The upper 25 bits are seconds, the
+ * lower 7 bits are nanoseconds.  tv_nsec is a positive value less
+ * than one billion so we need to divide it to isolate the desired
+ * bits.  We can't just shift it.
+ *
+ * The level is used to isolate an 8-bit portion of the hash.  The
+ * resulting number indicates which bucket the absolute time belongs
+ * in on the given level of the wheel.
+ */
+uint32_t
+timeout_maskwheel(uint32_t level, const struct timespec *abstime)
+{
+	uint32_t hi, lo;
+
+	hi = abstime->tv_sec << 7;
+	lo = abstime->tv_nsec / 7812500;
+
+	return ((hi | lo) >> (level * WHEELBITS)) & WHEELMASK;
+}
+
 /*
  * This is called from hardclock() on the primary CPU at the start of
  * every tick.
@@ -432,7 +573,15 @@ timeout_proc_barrier(void *arg)
 void
 timeout_hardclock_update(void)
 {
-	int need_softclock = 1;
+	struct timespec elapsed, now;
+	struct kclock *kc;
+	struct timespec *lastscan;
+	int b, done, first, i, last, level, need_softclock, off;
+
+	nanouptime(&now);
+	lastscan = &timeout_kclock[KCLOCK_UPTIME].kc_lastscan;
+	timespecsub(&now, lastscan, &elapsed);
+	need_softclock = 1;
 
 	mtx_enter(&timeout_mutex);
@@ -446,6 +595,47 @@ timeout_hardclock_update(void)
 		}
 	}
 
+	/*
+	 * Dump the buckets that expired while we were away.
+	 *
+	 * If the elapsed time has exceeded a level's limit then we need
+	 * to dump every bucket in the level.  We have necessarily completed
+	 * a lap of that level, too, so we need to process buckets in the
+	 * next level.
+	 *
+	 * Otherwise we need to compare indices: if the index of the first
+	 * expired bucket is greater than that of the last then we have
+	 * completed a lap of the level and need to process buckets in the
+	 * next level.
+	 */
+	for (level = 0; level < nitems(timeout_level_width); level++) {
+		first = timeout_maskwheel(level, lastscan);
+		if (elapsed.tv_sec >= timeout_level_width[level]) {
+			last = (first == 0) ? WHEELSIZE - 1 : first - 1;
+			done = 0;
+		} else {
+			last = timeout_maskwheel(level, &now);
+			done = first <= last;
+		}
+		off = level * WHEELSIZE;
+		for (b = first;; b = (b + 1) % WHEELSIZE) {
+			CIRCQ_CONCAT(&timeout_todo, &timeout_wheel_kc[off + b]);
+			if (b == last)
+				break;
+		}
+		if (done)
+			break;
+	}
+
+	/*
+	 * Update the cached state for each kclock.
+	 */
+	for (i = 0; i < nitems(timeout_kclock); i++) {
+		kc = &timeout_kclock[i];
+		timespecadd(&now, &kc->kc_offset, &kc->kc_lastscan);
+		timespecsub(&kc->kc_lastscan, &tick_ts, &kc->kc_late);
+	}
+
 	if (CIRCQ_EMPTY(&timeout_new) && CIRCQ_EMPTY(&timeout_todo))
 		need_softclock = 0;
@@ -487,6 +677,51 @@ timeout_run(struct timeout *to)
 	mtx_enter(&timeout_mutex);
 }
 
+void
+softclock_process_kclock_timeout(struct timeout *to, int new)
+{
+	struct kclock *kc = &timeout_kclock[to->to_kclock];
+
+	if (timespeccmp(&to->to_abstime, &kc->kc_lastscan, >)) {
+		tostat.tos_scheduled++;
+		if (!new)
+			tostat.tos_rescheduled++;
+		CIRCQ_INSERT_TAIL(&timeout_wheel_kc[timeout_bucket(to)],
+		    &to->to_list);
+		return;
+	}
+	if (!new && timespeccmp(&to->to_abstime, &kc->kc_late, <=))
+		tostat.tos_late++;
+	if (ISSET(to->to_flags, TIMEOUT_PROC)) {
+		CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
+		return;
+	}
+	timeout_run(to);
+	tostat.tos_run_softclock++;
+}
+
+void
+softclock_process_tick_timeout(struct timeout *to, int new)
+{
+	int delta = to->to_time - ticks;
+
+	if (delta > 0) {
+		tostat.tos_scheduled++;
+		if (!new)
+			tostat.tos_rescheduled++;
+		CIRCQ_INSERT_TAIL(&BUCKET(delta, to->to_time), &to->to_list);
+		return;
+	}
+	if (!new && delta < 0)
+		tostat.tos_late++;
+	if (ISSET(to->to_flags, TIMEOUT_PROC)) {
+		CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
+		return;
+	}
+	timeout_run(to);
+	tostat.tos_run_softclock++;
+}
+
 /*
  * Timeouts are processed here instead of timeout_hardclock_update()
  * to avoid doing any more work at IPL_CLOCK than absolutely necessary.
@@ -496,9 +731,8 @@ timeout_run(struct timeout *to)
 void
 softclock(void *arg)
 {
-	struct circq *bucket;
 	struct timeout *first_new, *to;
-	int delta, needsproc, new;
+	int needsproc, new;
 
 	first_new = NULL;
 	new = 0;
@@ -512,28 +746,10 @@ softclock(void *arg)
 		CIRCQ_REMOVE(&to->to_list);
 		if (to == first_new)
 			new = 1;
-
-		/*
-		 * If due run it or defer execution to the thread,
-		 * otherwise insert it into the right bucket.
-		 */
-		delta = to->to_time - ticks;
-		if (delta > 0) {
-			bucket = &BUCKET(delta, to->to_time);
-			CIRCQ_INSERT_TAIL(bucket, &to->to_list);
-			tostat.tos_scheduled++;
-			if (!new)
-				tostat.tos_rescheduled++;
-			continue;
-		}
-		if (!new && delta < 0)
-			tostat.tos_late++;
-		if (ISSET(to->to_flags, TIMEOUT_PROC)) {
-			CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
-			continue;
-		}
-		timeout_run(to);
-		tostat.tos_run_softclock++;
+		if (ISSET(to->to_flags, TIMEOUT_KCLOCK))
+			softclock_process_kclock_timeout(to, new);
+		else
+			softclock_process_tick_timeout(to, new);
 	}
 	tostat.tos_softclocks++;
 	needsproc = !CIRCQ_EMPTY(&timeout_proc);
@@ -632,52 +848,114 @@ timeout_sysctl(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
 }
 
 #ifdef DDB
+const char *db_kclock(int);
 void db_show_callout_bucket(struct circq *);
+void db_show_timeout(struct timeout *, struct circq *);
+const char *db_timespec(const struct timespec *);
+
+const char *
+db_kclock(int kclock)
+{
+	switch (kclock) {
+	case KCLOCK_UPTIME:
+		return "uptime";
+	default:
+		return "invalid";
+	}
+}
+
+const char *
+db_timespec(const struct timespec *ts)
+{
+	static char buf[32];
+	struct timespec tmp, zero;
+
+	if (ts->tv_sec >= 0) {
+		snprintf(buf, sizeof(buf), "%lld.%09ld",
+		    ts->tv_sec, ts->tv_nsec);
+		return buf;
+	}
+
+	timespecclear(&zero);
+	timespecsub(&zero, ts, &tmp);
+	snprintf(buf, sizeof(buf), "-%lld.%09ld", tmp.tv_sec, tmp.tv_nsec);
+	return buf;
+}
 
 void
 db_show_callout_bucket(struct circq *bucket)
 {
-	char buf[8];
-	struct timeout *to;
 	struct circq *p;
+
+	CIRCQ_FOREACH(p, bucket)
+		db_show_timeout(timeout_from_circq(p), bucket);
+}
+
+void
+db_show_timeout(struct timeout *to, struct circq *bucket)
+{
+	struct timespec remaining;
+	struct kclock *kc;
+	char buf[8];
 	db_expr_t offset;
+	struct circq *wheel;
 	char *name, *where;
 	int width = sizeof(long) * 2;
 
-	CIRCQ_FOREACH(p, bucket) {
-		to = timeout_from_circq(p);
-		db_find_sym_and_offset((vaddr_t)to->to_func, &name, &offset);
-		name = name ? name : "?";
-		if (bucket == &timeout_todo)
-			where = "softint";
-		else if (bucket == &timeout_proc)
-			where = "thread";
-		else if (bucket == &timeout_new)
-			where = "new";
-		else {
-			snprintf(buf, sizeof(buf), "%3ld/%1ld",
-			    (bucket - timeout_wheel) % WHEELSIZE,
-			    (bucket - timeout_wheel) / WHEELSIZE);
-			where = buf;
-		}
-		db_printf("%9d %7s 0x%0*lx %s\n",
-		    to->to_time - ticks, where, width, (ulong)to->to_arg, name);
+	db_find_sym_and_offset((vaddr_t)to->to_func, &name, &offset);
+	name = name ? name : "?";
+	if (bucket == &timeout_new)
+		where = "new";
+	else if (bucket == &timeout_todo)
+		where = "softint";
+	else if (bucket == &timeout_proc)
+		where = "thread";
+	else {
+		if (ISSET(to->to_flags, TIMEOUT_KCLOCK))
+			wheel = timeout_wheel_kc;
+		else
+			wheel = timeout_wheel;
+		snprintf(buf, sizeof(buf), "%3ld/%1ld",
+		    (bucket - wheel) % WHEELSIZE,
+		    (bucket - wheel) / WHEELSIZE);
+		where = buf;
+	}
+	if (ISSET(to->to_flags, TIMEOUT_KCLOCK)) {
+		kc = &timeout_kclock[to->to_kclock];
+		timespecsub(&to->to_abstime, &kc->kc_lastscan, &remaining);
+		db_printf("%20s %8s %7s 0x%0*lx %s\n",
+		    db_timespec(&remaining), db_kclock(to->to_kclock), where,
+		    width, (ulong)to->to_arg, name);
+	} else {
+		db_printf("%20d %8s %7s 0x%0*lx %s\n",
+		    to->to_time - ticks, "ticks", where,
+		    width, (ulong)to->to_arg, name);
 	}
 }
 
 void
 db_show_callout(db_expr_t addr, int haddr, db_expr_t count, char *modif)
 {
+	struct kclock *kc;
 	int width = sizeof(long) * 2 + 2;
-	int b;
-
-	db_printf("ticks now: %d\n", ticks);
-	db_printf("%9s %7s %*s func\n", "ticks", "wheel", width, "arg");
-
+	int b, i;
+
+	db_printf("%20s %8s\n", "lastscan", "clock");
+	db_printf("%20d %8s\n", ticks, "ticks");
+	for (i = 0; i < nitems(timeout_kclock); i++) {
+		kc = &timeout_kclock[i];
+		db_printf("%20s %8s\n",
+		    db_timespec(&kc->kc_lastscan), db_kclock(i));
+	}
+	db_printf("\n");
+	db_printf("%20s %8s %7s %*s %s\n",
+	    "remaining", "clock", "wheel", width, "arg", "func");
 	db_show_callout_bucket(&timeout_new);
 	db_show_callout_bucket(&timeout_todo);
 	db_show_callout_bucket(&timeout_proc);
 	for (b = 0; b < nitems(timeout_wheel); b++)
 		db_show_callout_bucket(&timeout_wheel[b]);
+	for (b = 0; b < nitems(timeout_wheel_kc); b++)
+		db_show_callout_bucket(&timeout_wheel_kc[b]);
 }
 #endif