author	cheloha <cheloha@cvs.openbsd.org>	2020-10-15 20:03:45 +0000
committer	cheloha <cheloha@cvs.openbsd.org>	2020-10-15 20:03:45 +0000
commit	ed4942372fd0d307a78a972ef6d74ebbb6554d77 (patch)
tree	b9f56c5cec9eb762c4cf19dddc58d8df89eb5a13 /sys/kern/kern_timeout.c
parent	b09a19ebcaca9db3055b2d4097fbfc0636121c91 (diff)
timeout(9): basic support for kclock timeouts
A kclock timeout is a timeout that expires at an absolute time on one of
the kernel's clocks.  A timeout's absolute expiration time is kept in a
new member of the timeout struct, to_abstime.  The timeout's kclock is
set at initialization and is kept in another new member of the timeout
struct, to_kclock.

Kclock timeouts are desirable because they have nanosecond resolution,
regardless of the value of hz(9).  The timecounter subsystem is also
inherently NTP-sensitive, so timeouts scheduled against the subsystem
are NTP-sensitive.  These two qualities guarantee that a kclock timeout
will never expire early.

Currently there is support for one kclock, KCLOCK_UPTIME (the uptime
clock).  Support for KCLOCK_RUNTIME (the runtime clock) and KCLOCK_UTC
(the UTC clock) is planned for the future.  Support for these additional
kclocks will allow us to implement some of the POSIX interfaces OpenBSD
is missing, e.g. clock_nanosleep() and timer_create().  We could also
use it to provide proper absolute timeouts for e.g.
pthread_mutex_timedlock(3).

Kclock timeouts are initialized with timeout_set_kclock().  They can be
scheduled with either timeout_in_nsec() (relative timeout) or
timeout_at_ts() (absolute timeout).  They are incompatible with
timeout_add(9), timeout_add_sec(9), timeout_add_msec(9),
timeout_add_usec(9), timeout_add_nsec(9), and timeout_add_tv(9).  They
can be cancelled with timeout_del(9) or timeout_del_barrier(9).
Documentation for the new interfaces is a work in progress.

For now, tick-based timeouts remain supported alongside kclock timeouts.
They will remain supported until we are certain we don't need them
anymore.  It is possible we will never remove them.  I would rather not
keep them around forever, but I cannot predict what difficulties we will
encounter while converting tick-based timeouts to kclock timeouts.
There are a *lot* of timeouts in the kernel.

Kclock timeouts are more costly than tick-based timeouts:

- Calling timeout_in_nsec() incurs a call to nanouptime(9).  Reading the
  hardware timecounter is too expensive in some contexts, so care must
  be taken when converting existing timeouts.  We may add a flag in the
  future to cause timeout_in_nsec() to use getnanouptime(9) instead of
  nanouptime(9), which is much cheaper.  This may be appropriate for
  certain classes of timeouts.  TCP/IP session timeouts come to mind.

- Kclock timeout expirations are kept in a timespec.  Timespec
  arithmetic has more overhead than 32-bit tick arithmetic, so
  processing kclock timeouts during softclock() is more expensive.  On
  my machine the overhead for processing a tick-based timeout is ~125
  cycles.  The overhead for a kclock timeout is ~500 cycles.  The
  overhead difference on 32-bit platforms is unknown.  If it proves too
  large we may need to use a 64-bit value to store the expiration time.
  More measurement is needed.

Priority targets for conversion are setitimer(2), *sleep_nsec(9), and
the kevent(2) EVFILT_TIMER timers.  Others will follow.

With input from mpi@, visa@, kettenis@, dlg@, guenther@, claudio@,
deraadt@, probably many others.  Older version tested by visa@.
Problems found in older version by bluhm@.  Current version tested by
Yuichiro Naito.  "wait until after unlock" deraadt@, ok kettenis@
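As a rough sketch of how the new interfaces fit together (mydev_tick(),
mydev_to, and the mydev_* functions below are hypothetical
illustrations; only the timeout(9) calls are from this commit):

	/*
	 * Sketch only: mydev_tick() and "sc" are hypothetical.
	 * timeout_set_kclock(), timeout_in_nsec(), timeout_at_ts(),
	 * and timeout_del(9) are the interfaces from this commit.
	 */
	#include <sys/types.h>
	#include <sys/time.h>
	#include <sys/timeout.h>

	struct timeout mydev_to;
	void mydev_tick(void *);

	void
	mydev_start(void *sc)
	{
		/* Bind the timeout to the uptime clock at initialization. */
		timeout_set_kclock(&mydev_to, mydev_tick, sc, 0, KCLOCK_UPTIME);

		/* Relative: expire 1500000 nsec (1.5 ms) from now. */
		timeout_in_nsec(&mydev_to, 1500000);
	}

	void
	mydev_rearm(const struct timespec *deadline)
	{
		/* Absolute: expire when the uptime clock reaches *deadline. */
		timeout_at_ts(&mydev_to, deadline);
	}

	void
	mydev_stop(void)
	{
		/* Cancellation is unchanged from tick-based timeouts. */
		timeout_del(&mydev_to);
	}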
Diffstat (limited to 'sys/kern/kern_timeout.c')
-rw-r--r--	sys/kern/kern_timeout.c	398
1 file changed, 338 insertions, 60 deletions
diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c
index 530ecb7f670..3c15605f287 100644
--- a/sys/kern/kern_timeout.c
+++ b/sys/kern/kern_timeout.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: kern_timeout.c,v 1.80 2020/09/22 13:43:28 cheloha Exp $ */
+/* $OpenBSD: kern_timeout.c,v 1.81 2020/10/15 20:03:43 cheloha Exp $ */
/*
* Copyright (c) 2001 Thomas Nordin <nordin@openbsd.org>
* Copyright (c) 2000-2001 Artur Grabowski <art@openbsd.org>
@@ -64,16 +64,27 @@ struct timeoutstat tostat; /* [T] statistics and totals */
* of the global variable "ticks" when the timeout should be called. There are
* four levels with 256 buckets each.
*/
-#define BUCKETS 1024
+#define WHEELCOUNT 4
#define WHEELSIZE 256
#define WHEELMASK 255
#define WHEELBITS 8
+#define BUCKETS (WHEELCOUNT * WHEELSIZE)
-struct circq timeout_wheel[BUCKETS]; /* [T] Queues of timeouts */
+struct circq timeout_wheel[BUCKETS]; /* [T] Tick-based timeouts */
+struct circq timeout_wheel_kc[BUCKETS]; /* [T] Clock-based timeouts */
struct circq timeout_new; /* [T] New, unscheduled timeouts */
struct circq timeout_todo; /* [T] Due or needs rescheduling */
struct circq timeout_proc; /* [T] Due + needs process context */
+time_t timeout_level_width[WHEELCOUNT]; /* [I] Wheel level width (seconds) */
+struct timespec tick_ts; /* [I] Length of a tick (1/hz secs) */
+
+struct kclock {
+ struct timespec kc_lastscan; /* [T] Clock time at last wheel scan */
+ struct timespec kc_late; /* [T] Late if due prior */
+ struct timespec kc_offset; /* [T] Offset from primary kclock */
+} timeout_kclock[KCLOCK_MAX];
+
#define MASKWHEEL(wheel, time) (((time) >> ((wheel)*WHEELBITS)) & WHEELMASK)
#define BUCKET(rel, abs) \
@@ -155,9 +166,15 @@ struct lock_type timeout_spinlock_type = {
((needsproc) ? &timeout_sleeplock_obj : &timeout_spinlock_obj)
#endif
+void kclock_nanotime(int, struct timespec *);
void softclock(void *);
void softclock_create_thread(void *);
+void softclock_process_kclock_timeout(struct timeout *, int);
+void softclock_process_tick_timeout(struct timeout *, int);
void softclock_thread(void *);
+uint32_t timeout_bucket(const struct timeout *);
+uint32_t timeout_maskwheel(uint32_t, const struct timespec *);
+void timeout_run(struct timeout *);
void timeout_proc_barrier(void *);
/*
@@ -207,13 +224,19 @@ timeout_sync_leave(int needsproc)
void
timeout_startup(void)
{
- int b;
+ int b, level;
CIRCQ_INIT(&timeout_new);
CIRCQ_INIT(&timeout_todo);
CIRCQ_INIT(&timeout_proc);
for (b = 0; b < nitems(timeout_wheel); b++)
CIRCQ_INIT(&timeout_wheel[b]);
+ for (b = 0; b < nitems(timeout_wheel_kc); b++)
+ CIRCQ_INIT(&timeout_wheel_kc[b]);
+
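+ /*
+  * timeout_level_width[] thresholds are 2 << (level * WHEELBITS)
+  * seconds: with WHEELBITS = 8 that is 2s, 512s, 131072s (~36 hours),
+  * and 33554432s (~388 days).
+  */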
+ for (level = 0; level < nitems(timeout_level_width); level++)
+ timeout_level_width[level] = 2 << (level * WHEELBITS);
+ NSEC_TO_TIMESPEC(tick_nsec, &tick_ts);
}
void
@@ -229,25 +252,39 @@ timeout_proc_init(void)
kthread_create_deferred(softclock_create_thread, NULL);
}
+static inline void
+_timeout_set(struct timeout *to, void (*fn)(void *), void *arg, int flags,
+ int kclock)
+{
+ to->to_func = fn;
+ to->to_arg = arg;
+ to->to_flags = flags | TIMEOUT_INITIALIZED;
+ to->to_kclock = kclock;
+}
+
void
timeout_set(struct timeout *new, void (*fn)(void *), void *arg)
{
- timeout_set_flags(new, fn, arg, 0);
+ _timeout_set(new, fn, arg, 0, KCLOCK_NONE);
}
void
timeout_set_flags(struct timeout *to, void (*fn)(void *), void *arg, int flags)
{
- to->to_func = fn;
- to->to_arg = arg;
- to->to_process = NULL;
- to->to_flags = flags | TIMEOUT_INITIALIZED;
+ _timeout_set(to, fn, arg, flags, KCLOCK_NONE);
}
void
timeout_set_proc(struct timeout *new, void (*fn)(void *), void *arg)
{
- timeout_set_flags(new, fn, arg, TIMEOUT_PROC);
+ _timeout_set(new, fn, arg, TIMEOUT_PROC, KCLOCK_NONE);
+}
+
+void
+timeout_set_kclock(struct timeout *to, void (*fn)(void *), void *arg,
+ int flags, int kclock)
+{
+ _timeout_set(to, fn, arg, flags | TIMEOUT_KCLOCK, kclock);
}
int
@@ -257,6 +294,8 @@ timeout_add(struct timeout *new, int to_ticks)
int ret = 1;
KASSERT(ISSET(new->to_flags, TIMEOUT_INITIALIZED));
+ KASSERT(!ISSET(new->to_flags, TIMEOUT_KCLOCK));
+ KASSERT(new->to_kclock == KCLOCK_NONE);
KASSERT(to_ticks >= 0);
mtx_enter(&timeout_mutex);
@@ -356,6 +395,66 @@ timeout_add_nsec(struct timeout *to, int nsecs)
}
int
+timeout_at_ts(struct timeout *to, const struct timespec *abstime)
+{
+ struct timespec old_abstime;
+ int ret = 1;
+
+ mtx_enter(&timeout_mutex);
+
+ KASSERT(ISSET(to->to_flags, TIMEOUT_INITIALIZED | TIMEOUT_KCLOCK));
+ KASSERT(to->to_kclock != KCLOCK_NONE);
+
+ old_abstime = to->to_abstime;
+ to->to_abstime = *abstime;
+ CLR(to->to_flags, TIMEOUT_TRIGGERED);
+
+ if (ISSET(to->to_flags, TIMEOUT_ONQUEUE)) {
+ if (timespeccmp(abstime, &old_abstime, <)) {
+ CIRCQ_REMOVE(&to->to_list);
+ CIRCQ_INSERT_TAIL(&timeout_new, &to->to_list);
+ }
+ tostat.tos_readded++;
+ ret = 0;
+ } else {
+ SET(to->to_flags, TIMEOUT_ONQUEUE);
+ CIRCQ_INSERT_TAIL(&timeout_new, &to->to_list);
+ }
+#if NKCOV > 0
+ to->to_process = curproc->p_p;
+#endif
+ tostat.tos_added++;
+
+ mtx_leave(&timeout_mutex);
+
+ return ret;
+}
+
+int
+timeout_in_nsec(struct timeout *to, uint64_t nsecs)
+{
+ struct timespec abstime, interval, now;
+
+ kclock_nanotime(to->to_kclock, &now);
+ NSEC_TO_TIMESPEC(nsecs, &interval);
+ timespecadd(&now, &interval, &abstime);
+
+ return timeout_at_ts(to, &abstime);
+}
+
+void
+kclock_nanotime(int kclock, struct timespec *now)
+{
+ switch (kclock) {
+ case KCLOCK_UPTIME:
+ nanouptime(now);
+ break;
+ default:
+ panic("invalid kclock: 0x%x", kclock);
+ }
+}
+
+int
timeout_del(struct timeout *to)
{
int ret = 0;
@@ -425,6 +524,48 @@ timeout_proc_barrier(void *arg)
cond_signal(c);
}
+uint32_t
+timeout_bucket(const struct timeout *to)
+{
+ struct kclock *kc = &timeout_kclock[to->to_kclock];
+ struct timespec diff, shifted_abstime;
+ uint32_t level;
+
+ KASSERT(ISSET(to->to_flags, TIMEOUT_KCLOCK));
+ KASSERT(timespeccmp(&kc->kc_lastscan, &to->to_abstime, <));
+
+ timespecsub(&to->to_abstime, &kc->kc_lastscan, &diff);
+ for (level = 0; level < nitems(timeout_level_width) - 1; level++) {
+ if (diff.tv_sec < timeout_level_width[level])
+ break;
+ }
+ timespecadd(&to->to_abstime, &kc->kc_offset, &shifted_abstime);
+ return level * WHEELSIZE + timeout_maskwheel(level, &shifted_abstime);
+}
+
+/*
+ * Hash the absolute time into a bucket on a given level of the wheel.
+ *
+ * The complete hash is 32 bits. The upper 25 bits are seconds, the
+ * lower 7 bits are nanoseconds. tv_nsec is a positive value less
+ * than one billion so we need to divide it to isolate the desired
+ * bits. We can't just shift it.
+ *
+ * The level is used to isolate an 8-bit portion of the hash. The
+ * resulting number indicates which bucket the absolute time belongs
+ * in on the given level of the wheel.
+ */
+uint32_t
+timeout_maskwheel(uint32_t level, const struct timespec *abstime)
+{
+ uint32_t hi, lo;
+
+ hi = abstime->tv_sec << 7;
+ lo = abstime->tv_nsec / 7812500;
+
+ return ((hi | lo) >> (level * WHEELBITS)) & WHEELMASK;
+}
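+
+/*
+ * Worked example: abstime = 300.5 seconds on level 1.  hi = 300 << 7
+ * = 38400 and lo = 500000000 / 7812500 = 64, so the hash is 38464.
+ * (38464 >> 8) & 255 = 150: bucket 150 on level 1, i.e. index
+ * 1 * WHEELSIZE + 150 = 406 in timeout_wheel_kc[].
+ */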
+
/*
* This is called from hardclock() on the primary CPU at the start of
* every tick.
@@ -432,7 +573,15 @@ timeout_proc_barrier(void *arg)
void
timeout_hardclock_update(void)
{
- int need_softclock = 1;
+ struct timespec elapsed, now;
+ struct kclock *kc;
+ struct timespec *lastscan;
+ int b, done, first, i, last, level, need_softclock, off;
+
+ nanouptime(&now);
+ lastscan = &timeout_kclock[KCLOCK_UPTIME].kc_lastscan;
+ timespecsub(&now, lastscan, &elapsed);
+ need_softclock = 1;
mtx_enter(&timeout_mutex);
@@ -446,6 +595,47 @@ timeout_hardclock_update(void)
}
}
+ /*
+ * Dump the buckets that expired while we were away.
+ *
+ * If the elapsed time has exceeded a level's limit then we need
+ * to dump every bucket in the level. We have necessarily completed
+ * a lap of that level, too, so we need to process buckets in the
+ * next level.
+ *
+ * Otherwise we need to compare indices: if the index of the first
+ * expired bucket is greater than that of the last then we have
+ * completed a lap of the level and need to process buckets in the
+ * next level.
+ */
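+ /*
+  * Worked example: at level 0, suppose lastscan hashes to bucket 250
+  * and now hashes to bucket 3.  first (250) > last (3), so buckets
+  * 250 through 255 and 0 through 3 are dumped and, having completed a
+  * lap, we continue with level 1.
+  */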
+ for (level = 0; level < nitems(timeout_level_width); level++) {
+ first = timeout_maskwheel(level, lastscan);
+ if (elapsed.tv_sec >= timeout_level_width[level]) {
+ last = (first == 0) ? WHEELSIZE - 1 : first - 1;
+ done = 0;
+ } else {
+ last = timeout_maskwheel(level, &now);
+ done = first <= last;
+ }
+ off = level * WHEELSIZE;
+ for (b = first;; b = (b + 1) % WHEELSIZE) {
+ CIRCQ_CONCAT(&timeout_todo, &timeout_wheel_kc[off + b]);
+ if (b == last)
+ break;
+ }
+ if (done)
+ break;
+ }
+
+ /*
+ * Update the cached state for each kclock.
+ */
+ for (i = 0; i < nitems(timeout_kclock); i++) {
+ kc = &timeout_kclock[i];
+ timespecadd(&now, &kc->kc_offset, &kc->kc_lastscan);
+ timespecsub(&kc->kc_lastscan, &tick_ts, &kc->kc_late);
+ }
+
if (CIRCQ_EMPTY(&timeout_new) && CIRCQ_EMPTY(&timeout_todo))
need_softclock = 0;
@@ -487,6 +677,51 @@ timeout_run(struct timeout *to)
mtx_enter(&timeout_mutex);
}
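+/*
+ * If the timeout is not yet due, hash it into a bucket on the kclock
+ * wheel.  Otherwise run it now, deferring to the softclock thread if
+ * it needs process context.
+ */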
+void
+softclock_process_kclock_timeout(struct timeout *to, int new)
+{
+ struct kclock *kc = &timeout_kclock[to->to_kclock];
+
+ if (timespeccmp(&to->to_abstime, &kc->kc_lastscan, >)) {
+ tostat.tos_scheduled++;
+ if (!new)
+ tostat.tos_rescheduled++;
+ CIRCQ_INSERT_TAIL(&timeout_wheel_kc[timeout_bucket(to)],
+ &to->to_list);
+ return;
+ }
+ if (!new && timespeccmp(&to->to_abstime, &kc->kc_late, <=))
+ tostat.tos_late++;
+ if (ISSET(to->to_flags, TIMEOUT_PROC)) {
+ CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
+ return;
+ }
+ timeout_run(to);
+ tostat.tos_run_softclock++;
+}
+
+void
+softclock_process_tick_timeout(struct timeout *to, int new)
+{
+ int delta = to->to_time - ticks;
+
+ if (delta > 0) {
+ tostat.tos_scheduled++;
+ if (!new)
+ tostat.tos_rescheduled++;
+ CIRCQ_INSERT_TAIL(&BUCKET(delta, to->to_time), &to->to_list);
+ return;
+ }
+ if (!new && delta < 0)
+ tostat.tos_late++;
+ if (ISSET(to->to_flags, TIMEOUT_PROC)) {
+ CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
+ return;
+ }
+ timeout_run(to);
+ tostat.tos_run_softclock++;
+}
+
/*
* Timeouts are processed here instead of timeout_hardclock_update()
* to avoid doing any more work at IPL_CLOCK than absolutely necessary.
@@ -496,9 +731,8 @@ timeout_run(struct timeout *to)
void
softclock(void *arg)
{
- struct circq *bucket;
struct timeout *first_new, *to;
- int delta, needsproc, new;
+ int needsproc, new;
first_new = NULL;
new = 0;
@@ -512,28 +746,10 @@ softclock(void *arg)
CIRCQ_REMOVE(&to->to_list);
if (to == first_new)
new = 1;
-
- /*
- * If due run it or defer execution to the thread,
- * otherwise insert it into the right bucket.
- */
- delta = to->to_time - ticks;
- if (delta > 0) {
- bucket = &BUCKET(delta, to->to_time);
- CIRCQ_INSERT_TAIL(bucket, &to->to_list);
- tostat.tos_scheduled++;
- if (!new)
- tostat.tos_rescheduled++;
- continue;
- }
- if (!new && delta < 0)
- tostat.tos_late++;
- if (ISSET(to->to_flags, TIMEOUT_PROC)) {
- CIRCQ_INSERT_TAIL(&timeout_proc, &to->to_list);
- continue;
- }
- timeout_run(to);
- tostat.tos_run_softclock++;
+ if (ISSET(to->to_flags, TIMEOUT_KCLOCK))
+ softclock_process_kclock_timeout(to, new);
+ else
+ softclock_process_tick_timeout(to, new);
}
tostat.tos_softclocks++;
needsproc = !CIRCQ_EMPTY(&timeout_proc);
@@ -632,52 +848,114 @@ timeout_sysctl(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
}
#ifdef DDB
+const char *db_kclock(int);
void db_show_callout_bucket(struct circq *);
+void db_show_timeout(struct timeout *, struct circq *);
+const char *db_timespec(const struct timespec *);
+
+const char *
+db_kclock(int kclock)
+{
+ switch (kclock) {
+ case KCLOCK_UPTIME:
+ return "uptime";
+ default:
+ return "invalid";
+ }
+}
+
+const char *
+db_timespec(const struct timespec *ts)
+{
+ static char buf[32];
+ struct timespec tmp, zero;
+
+ if (ts->tv_sec >= 0) {
+ snprintf(buf, sizeof(buf), "%lld.%09ld",
+ ts->tv_sec, ts->tv_nsec);
+ return buf;
+ }
+
+ timespecclear(&zero);
+ timespecsub(&zero, ts, &tmp);
+ snprintf(buf, sizeof(buf), "-%lld.%09ld", tmp.tv_sec, tmp.tv_nsec);
+ return buf;
+}
void
db_show_callout_bucket(struct circq *bucket)
{
- char buf[8];
- struct timeout *to;
struct circq *p;
+
+ CIRCQ_FOREACH(p, bucket)
+ db_show_timeout(timeout_from_circq(p), bucket);
+}
+
+void
+db_show_timeout(struct timeout *to, struct circq *bucket)
+{
+ struct timespec remaining;
+ struct kclock *kc;
+ char buf[8];
db_expr_t offset;
+ struct circq *wheel;
char *name, *where;
int width = sizeof(long) * 2;
- CIRCQ_FOREACH(p, bucket) {
- to = timeout_from_circq(p);
- db_find_sym_and_offset((vaddr_t)to->to_func, &name, &offset);
- name = name ? name : "?";
- if (bucket == &timeout_todo)
- where = "softint";
- else if (bucket == &timeout_proc)
- where = "thread";
- else if (bucket == &timeout_new)
- where = "new";
- else {
- snprintf(buf, sizeof(buf), "%3ld/%1ld",
- (bucket - timeout_wheel) % WHEELSIZE,
- (bucket - timeout_wheel) / WHEELSIZE);
- where = buf;
- }
- db_printf("%9d %7s 0x%0*lx %s\n",
- to->to_time - ticks, where, width, (ulong)to->to_arg, name);
+ db_find_sym_and_offset((vaddr_t)to->to_func, &name, &offset);
+ name = name ? name : "?";
+ if (bucket == &timeout_new)
+ where = "new";
+ else if (bucket == &timeout_todo)
+ where = "softint";
+ else if (bucket == &timeout_proc)
+ where = "thread";
+ else {
+ if (ISSET(to->to_flags, TIMEOUT_KCLOCK))
+ wheel = timeout_wheel_kc;
+ else
+ wheel = timeout_wheel;
+ snprintf(buf, sizeof(buf), "%3ld/%1ld",
+ (bucket - wheel) % WHEELSIZE,
+ (bucket - wheel) / WHEELSIZE);
+ where = buf;
+ }
+ if (ISSET(to->to_flags, TIMEOUT_KCLOCK)) {
+ kc = &timeout_kclock[to->to_kclock];
+ timespecsub(&to->to_abstime, &kc->kc_lastscan, &remaining);
+ db_printf("%20s %8s %7s 0x%0*lx %s\n",
+ db_timespec(&remaining), db_kclock(to->to_kclock), where,
+ width, (ulong)to->to_arg, name);
+ } else {
+ db_printf("%20d %8s %7s 0x%0*lx %s\n",
+ to->to_time - ticks, "ticks", where,
+ width, (ulong)to->to_arg, name);
}
}
void
db_show_callout(db_expr_t addr, int haddr, db_expr_t count, char *modif)
{
+ struct kclock *kc;
int width = sizeof(long) * 2 + 2;
- int b;
-
- db_printf("ticks now: %d\n", ticks);
- db_printf("%9s %7s %*s func\n", "ticks", "wheel", width, "arg");
-
+ int b, i;
+
+ db_printf("%20s %8s\n", "lastscan", "clock");
+ db_printf("%20d %8s\n", ticks, "ticks");
+ for (i = 0; i < nitems(timeout_kclock); i++) {
+ kc = &timeout_kclock[i];
+ db_printf("%20s %8s\n",
+ db_timespec(&kc->kc_lastscan), db_kclock(i));
+ }
+ db_printf("\n");
+ db_printf("%20s %8s %7s %*s %s\n",
+ "remaining", "clock", "wheel", width, "arg", "func");
db_show_callout_bucket(&timeout_new);
db_show_callout_bucket(&timeout_todo);
db_show_callout_bucket(&timeout_proc);
for (b = 0; b < nitems(timeout_wheel); b++)
db_show_callout_bucket(&timeout_wheel[b]);
+ for (b = 0; b < nitems(timeout_wheel_kc); b++)
+ db_show_callout_bucket(&timeout_wheel_kc[b]);
}
#endif