author    Scott Soule Cheloha <cheloha@cvs.openbsd.org>  2022-11-05 19:29:47 +0000
committer Scott Soule Cheloha <cheloha@cvs.openbsd.org>  2022-11-05 19:29:47 +0000
commit    02dc962cf694b58ab04d3ec0483b539051ebe369 (patch)
tree      bca3746ada93fd48e98b668683df00c57f875a63 /sys
parent    ad7e72bf7a6d51934c476167fdfaf12af2d75e30 (diff)
clockintr(9): initial commit
clockintr(9) is a machine-independent clock interrupt scheduler.  It
emulates most of what the machine-dependent clock interrupt code is
doing on every platform.  Every CPU has a work schedule based on the
system uptime clock.  For now, every CPU has a hardclock(9) and a
statclock().  If schedhz is set, every CPU has a schedclock(), too.

This commit only contains the MI pieces.  All code is conditionally
compiled with __HAVE_CLOCKINTR.  This commit changes no behavior yet.

At a high level, clockintr(9) is configured and used as follows:

1. During boot, the primary CPU calls clockintr_init(9).  Global state
   is initialized.
2. Primary CPU calls clockintr_cpu_init(9).  Local, per-CPU state is
   initialized.  An "intrclock" struct may be installed, too.
3. Secondary CPUs call clockintr_cpu_init(9) to initialize their local
   state.
4. All CPUs repeatedly call clockintr_dispatch(9) from the MD clock
   interrupt handler.  The CPUs complete work and rearm their local
   interrupt clock, if any, during the dispatch.
5. Repeat step (4) until the system shuts down, suspends, or
   hibernates.
6. During resume, the primary CPU calls inittodr(9) and advances the
   system uptime.
7. Go to step (2).  This time around, clockintr_cpu_init(9) also
   advances the work schedule on the calling CPU to skip events that
   expired during suspend.  This prevents a "thundering herd" of
   useless work during the first clock interrupt.

In the long term, we need an MI clock interrupt scheduler in order to
(1) provide control over the clock interrupt to MI subsystems like
timeout(9) and dt(4) to improve their accuracy, (2) provide drivers
like acpicpu(4) a means for slowing or stopping the clock interrupt on
idle CPUs to conserve power, and (3) reduce the amount of duplicated
code in the MD clock interrupt code.

Before we can do any of that, though, we need to switch every platform
over to using clockintr(9) and do some cleanup.

Prompted by "the vmm(4) time bug," among other problems, and a
discussion at a2k19 on the subject.  Lots of design input from
kettenis@.  Early versions reviewed by kettenis@ and mlarkin@.
Platform-specific help and testing from kettenis@, gkoehler@,
mlarkin@, miod@, aoyama@, visa@, and dv@.  Babysitting and spiritual
guidance from mlarkin@ and kettenis@.

Link: https://marc.info/?l=openbsd-tech&m=166697497302283&w=2

ok kettenis@ mlarkin@
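To make steps 1-4 above concrete, here is a minimal sketch of the MD glue a
platform might provide once it opts in with __HAVE_CLOCKINTR.  The mdtimer_*
driver names and entry points are hypothetical; only the clockintr_*
functions, the CL_RNDSTAT flag, and struct intrclock come from this commit,
and hz/stathz/profhz are assumed to already be configured before
clockintr_init() runs.

/*
 * Hypothetical MD glue, for illustration only.  The mdtimer_* names
 * below are made up; the clockintr(9) API is as added by this commit.
 */
#include <sys/clockintr.h>

void	mdtimer_rearm(void *, uint64_t);	/* program next timer interrupt in N ns (made up) */
void	mdtimer_trigger(void *);		/* raise a timer interrupt immediately (made up) */

const struct intrclock mdtimer_intrclock = {
	.ic_cookie = NULL,
	.ic_rearm = mdtimer_rearm,
	.ic_trigger = mdtimer_trigger,
};

void
mdtimer_startclocks(void)			/* steps 1-2: primary CPU, during boot */
{
	clockintr_init(CL_RNDSTAT);		/* global state + behavior flags */
	clockintr_cpu_init(&mdtimer_intrclock);	/* per-CPU state, install intrclock */
	clockintr_trigger();			/* kick off the first dispatch */
}

void
mdtimer_cpu_startclock(void)			/* step 3: each secondary CPU */
{
	clockintr_cpu_init(&mdtimer_intrclock);
	clockintr_trigger();
}

int
mdtimer_intr(void *frame)			/* step 4: MD clock interrupt handler */
{
	return clockintr_dispatch(frame);	/* run expired events, rearm intrclock */
}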
Diffstat (limited to 'sys')
-rw-r--r--  sys/conf/files             |    3
-rw-r--r--  sys/ddb/db_command.c       |    5
-rw-r--r--  sys/ddb/db_interface.h     |    7
-rw-r--r--  sys/kern/kern_clockintr.c  |  458
-rw-r--r--  sys/kern/kern_sysctl.c     |    8
-rw-r--r--  sys/kern/subr_suspend.c    |    7
-rw-r--r--  sys/sys/clockintr.h        |  105
-rw-r--r--  sys/sys/sysctl.h           |   17
8 files changed, 603 insertions, 7 deletions
diff --git a/sys/conf/files b/sys/conf/files
index 3bd32d693b8..18ad40c52cb 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1,4 +1,4 @@
-# $OpenBSD: files,v 1.716 2022/07/29 17:47:11 semarie Exp $
+# $OpenBSD: files,v 1.717 2022/11/05 19:29:45 cheloha Exp $
# $NetBSD: files,v 1.87 1996/05/19 17:17:50 jonathan Exp $
# @(#)files.newconf 7.5 (Berkeley) 5/10/93
@@ -676,6 +676,7 @@ file kern/init_sysent.c
file kern/kern_acct.c accounting
file kern/kern_bufq.c
file kern/kern_clock.c
+file kern/kern_clockintr.c
file kern/kern_descrip.c
file kern/kern_event.c
file kern/kern_exec.c
diff --git a/sys/ddb/db_command.c b/sys/ddb/db_command.c
index bb369832baa..a8f60c9249b 100644
--- a/sys/ddb/db_command.c
+++ b/sys/ddb/db_command.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: db_command.c,v 1.96 2022/07/29 17:47:11 semarie Exp $ */
+/* $OpenBSD: db_command.c,v 1.97 2022/11/05 19:29:45 cheloha Exp $ */
/* $NetBSD: db_command.c,v 1.20 1996/03/30 22:30:05 christos Exp $ */
/*
@@ -597,6 +597,9 @@ db_bcstats_print_cmd(db_expr_t addr, int have_addr, db_expr_t count, char *modif
const struct db_command db_show_all_cmds[] = {
{ "procs", db_show_all_procs, 0, NULL },
{ "callout", db_show_callout, 0, NULL },
+#ifdef __HAVE_CLOCKINTR
+ { "clockintr", db_show_all_clockintr, 0, NULL },
+#endif
{ "pools", db_show_all_pools, 0, NULL },
{ "mounts", db_show_all_mounts, 0, NULL },
{ "vnodes", db_show_all_vnodes, 0, NULL },
diff --git a/sys/ddb/db_interface.h b/sys/ddb/db_interface.h
index 50a337dc9f7..fabc8fd1879 100644
--- a/sys/ddb/db_interface.h
+++ b/sys/ddb/db_interface.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: db_interface.h,v 1.24 2022/07/29 17:47:11 semarie Exp $ */
+/* $OpenBSD: db_interface.h,v 1.25 2022/11/05 19:29:45 cheloha Exp $ */
/* $NetBSD: db_interface.h,v 1.1 1996/02/05 01:57:03 christos Exp $ */
/*
@@ -43,6 +43,11 @@ vaddr_t db_disasm(vaddr_t, int);
void db_kill_cmd(db_expr_t, int, db_expr_t, char *);
void db_show_all_procs(db_expr_t, int, db_expr_t, char *);
+/* kern/kern_clockintr.c */
+#ifdef __HAVE_CLOCKINTR
+void db_show_all_clockintr(db_expr_t, int, db_expr_t, char *);
+#endif
+
/* kern/kern_timeout.c */
void db_show_callout(db_expr_t, int, db_expr_t, char *);
diff --git a/sys/kern/kern_clockintr.c b/sys/kern/kern_clockintr.c
new file mode 100644
index 00000000000..b6bcbf45904
--- /dev/null
+++ b/sys/kern/kern_clockintr.c
@@ -0,0 +1,458 @@
+/* $OpenBSD: kern_clockintr.c,v 1.1 2022/11/05 19:29:46 cheloha Exp $ */
+/*
+ * Copyright (c) 2003 Dale Rahn <drahn@openbsd.org>
+ * Copyright (c) 2020 Mark Kettenis <kettenis@openbsd.org>
+ * Copyright (c) 2020-2022 Scott Cheloha <cheloha@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/atomic.h>
+#include <sys/clockintr.h>
+#include <sys/kernel.h>
+#include <sys/mutex.h>
+#include <sys/stdint.h>
+#include <sys/sysctl.h>
+#include <sys/time.h>
+
+#ifdef __HAVE_CLOCKINTR
+
+/*
+ * Protection for global variables in this file:
+ *
+ * C Global clockintr configuration mutex (clockintr_mtx).
+ * I Immutable after initialization.
+ */
+struct mutex clockintr_mtx = MUTEX_INITIALIZER(IPL_CLOCK);
+
+u_int clockintr_flags; /* [I] global state + behavior flags */
+uint32_t hardclock_period; /* [I] hardclock period (ns) */
+uint32_t schedclock_period; /* [I] schedclock period (ns) */
+volatile u_int statclock_gen = 1; /* [C] statclock update generation */
+volatile uint32_t statclock_avg; /* [C] average statclock period (ns) */
+uint32_t statclock_min; /* [C] minimum statclock period (ns) */
+uint32_t statclock_mask; /* [C] set of allowed offsets */
+uint32_t stat_avg; /* [I] average stathz period (ns) */
+uint32_t stat_min; /* [I] minimum stathz period (ns) */
+uint32_t stat_mask; /* [I] set of allowed offsets */
+uint32_t prof_avg; /* [I] average profhz period (ns) */
+uint32_t prof_min; /* [I] minimum profhz period (ns) */
+uint32_t prof_mask; /* [I] set of allowed offsets */
+
+void clockintr_statvar_init(int, uint32_t *, uint32_t *, uint32_t *);
+uint64_t nsec_advance(uint64_t *, uint64_t, uint64_t);
+
+/*
+ * Initialize global state. Set flags and compute intervals.
+ */
+void
+clockintr_init(u_int flags)
+{
+ KASSERT(CPU_IS_PRIMARY(curcpu()));
+ KASSERT(clockintr_flags == 0);
+ KASSERT(!ISSET(flags, ~CL_FLAG_MASK));
+
+ KASSERT(hz > 0 && hz <= 1000000000);
+ hardclock_period = 1000000000 / hz;
+
+ KASSERT(stathz >= 1 && stathz <= 1000000000);
+ KASSERT(profhz >= stathz && profhz <= 1000000000);
+ KASSERT(profhz % stathz == 0);
+ clockintr_statvar_init(stathz, &stat_avg, &stat_min, &stat_mask);
+ clockintr_statvar_init(profhz, &prof_avg, &prof_min, &prof_mask);
+ SET(clockintr_flags, CL_STATCLOCK);
+ clockintr_setstatclockrate(stathz);
+
+ KASSERT(schedhz >= 0 && schedhz <= 1000000000);
+ if (schedhz != 0) {
+ schedclock_period = 1000000000 / schedhz;
+ SET(clockintr_flags, CL_SCHEDCLOCK);
+ }
+
+ SET(clockintr_flags, flags | CL_INIT);
+}
+
+/*
+ * Ready the calling CPU for clockintr_dispatch(). If this is our
+ * first time here, install the intrclock, if any, and set necessary
+ * flags. Advance the schedule as needed.
+ */
+void
+clockintr_cpu_init(const struct intrclock *ic)
+{
+ uint64_t multiplier, now;
+ struct cpu_info *ci = curcpu();
+ struct clockintr_queue *cq = &ci->ci_queue;
+
+ KASSERT(ISSET(clockintr_flags, CL_INIT));
+
+ if (!ISSET(cq->cq_flags, CL_CPU_INIT)) {
+ if (ic != NULL) {
+ cq->cq_intrclock = *ic;
+ SET(cq->cq_flags, CL_CPU_INTRCLOCK);
+ }
+ cq->cq_gen = 1;
+ }
+
+ /*
+ * Until we understand scheduler lock contention better, stagger
+ * the hardclock and statclock so they don't all happen at once.
+ * If we have no intrclock it doesn't matter, we have no control
+ * anyway. The primary CPU's starting offset is always zero, so
+ * set multiplier to zero.
+ */
+ if (!CPU_IS_PRIMARY(ci) && ISSET(cq->cq_flags, CL_CPU_INTRCLOCK))
+ multiplier = CPU_INFO_UNIT(ci);
+ else
+ multiplier = 0;
+
+ now = nsecuptime();
+
+ /*
+ * The first time we do this, the primary CPU cannot skip any
+ * hardclocks. We can skip hardclocks on subsequent calls because
+ * the global tick value is advanced during inittodr(9) on our
+ * behalf.
+ */
+ if (!CPU_IS_PRIMARY(ci) || ISSET(cq->cq_flags, CL_CPU_INIT)) {
+ cq->cq_next_hardclock = hardclock_period / ncpus * multiplier;
+ nsec_advance(&cq->cq_next_hardclock, hardclock_period, now);
+ }
+
+ /*
+ * We can always advance the statclock and schedclock.
+ */
+ cq->cq_next_statclock = stat_avg / ncpus * multiplier;
+ nsec_advance(&cq->cq_next_statclock, stat_avg, now);
+ if (ISSET(clockintr_flags, CL_SCHEDCLOCK)) {
+ cq->cq_next_schedclock = schedclock_period / ncpus * multiplier;
+ nsec_advance(&cq->cq_next_schedclock, schedclock_period, now);
+ }
+
+ SET(cq->cq_flags, CL_CPU_INIT);
+}
+
+/*
+ * If we have an intrclock, trigger it to start the dispatch cycle.
+ */
+void
+clockintr_trigger(void)
+{
+ struct clockintr_queue *cq = &curcpu()->ci_queue;
+
+ KASSERT(ISSET(cq->cq_flags, CL_CPU_INIT));
+
+ if (ISSET(cq->cq_flags, CL_CPU_INTRCLOCK))
+ intrclock_trigger(&cq->cq_intrclock);
+}
+
+/*
+ * Run all expired events scheduled on the calling CPU.
+ */
+int
+clockintr_dispatch(void *frame)
+{
+ uint64_t count, i, lateness, now, run = 0, start;
+ struct cpu_info *ci = curcpu();
+ struct clockintr_queue *cq = &ci->ci_queue;
+ struct proc *p = curproc;
+ uint32_t mask, min, off;
+ u_int gen, ogen;
+
+ if (cq->cq_dispatch != 0)
+ panic("%s: recursive dispatch", __func__);
+ cq->cq_dispatch = 1;
+
+ splassert(IPL_CLOCK);
+ KASSERT(ISSET(cq->cq_flags, CL_CPU_INIT));
+
+ /*
+ * If we arrived too early we have nothing to do.
+ */
+ start = nsecuptime();
+ now = start;
+ if (now < cq->cq_next)
+ goto done;
+ lateness = now - cq->cq_next;
+
+ /*
+ * Dispatch expired events.
+ */
+again:
+ /* hardclock */
+ count = nsec_advance(&cq->cq_next_hardclock, hardclock_period, now);
+ for (i = 0; i < count; i++)
+ hardclock(frame);
+ run += count;
+
+ /* statclock */
+ if (ISSET(clockintr_flags, CL_RNDSTAT)) {
+ do {
+ gen = statclock_gen;
+ membar_consumer();
+ min = statclock_min;
+ mask = statclock_mask;
+ membar_consumer();
+ } while (gen == 0 || gen != statclock_gen);
+ count = 0;
+ while (cq->cq_next_statclock <= now) {
+ while ((off = (random() & mask)) == 0)
+ continue;
+ cq->cq_next_statclock += min + off;
+ count++;
+ }
+ } else {
+ count = nsec_advance(&cq->cq_next_statclock, statclock_avg,
+ now);
+ }
+ for (i = 0; i < count; i++)
+ statclock(frame);
+ run += count;
+
+ /* schedclock */
+ if (ISSET(clockintr_flags, CL_SCHEDCLOCK)) {
+ count = nsec_advance(&cq->cq_next_schedclock,
+ schedclock_period, now);
+ if (p != NULL) {
+ for (i = 0; i < count; i++)
+ schedclock(p);
+ }
+ run += count;
+ }
+
+ /* Run the dispatch again if the next event has already expired. */
+ cq->cq_next = cq->cq_next_hardclock;
+ if (cq->cq_next_statclock < cq->cq_next)
+ cq->cq_next = cq->cq_next_statclock;
+ if (ISSET(clockintr_flags, CL_SCHEDCLOCK)) {
+ if (cq->cq_next_schedclock < cq->cq_next)
+ cq->cq_next = cq->cq_next_schedclock;
+ }
+ now = nsecuptime();
+ if (cq->cq_next <= now)
+ goto again;
+
+ /*
+ * Dispatch complete.
+ */
+done:
+ /* Rearm the interrupt clock if we have one. */
+ if (ISSET(cq->cq_flags, CL_CPU_INTRCLOCK))
+ intrclock_rearm(&cq->cq_intrclock, cq->cq_next - now);
+
+ /* Update our stats. */
+ ogen = cq->cq_gen;
+ cq->cq_gen = 0;
+ membar_producer();
+ cq->cq_stat.cs_dispatched += now - start;
+ if (run > 0) {
+ cq->cq_stat.cs_lateness += lateness;
+ cq->cq_stat.cs_prompt++;
+ cq->cq_stat.cs_run += run;
+ } else {
+ cq->cq_stat.cs_early++;
+ cq->cq_stat.cs_earliness += cq->cq_next - now;
+ }
+ membar_producer();
+ cq->cq_gen = MAX(1, ogen + 1);
+
+ if (cq->cq_dispatch != 1)
+ panic("%s: unexpected value: %u", __func__, cq->cq_dispatch);
+ cq->cq_dispatch = 0;
+
+ return run > 0;
+}
+
+/*
+ * Compute the period (avg) for the given frequency and a range around
+ * that period. The range is [min + 1, min + mask]. The range is used
+ * during dispatch to choose a new pseudorandom deadline for each statclock
+ * event.
+ */
+void
+clockintr_statvar_init(int freq, uint32_t *avg, uint32_t *min, uint32_t *mask)
+{
+ uint32_t half_avg, var;
+
+ KASSERT(!ISSET(clockintr_flags, CL_INIT | CL_STATCLOCK));
+ KASSERT(freq > 0 && freq <= 1000000000);
+
+ /* Compute avg, the average period. */
+ *avg = 1000000000 / freq;
+
+ /* Find var, the largest power of two such that var <= avg / 2. */
+ half_avg = *avg / 2;
+ for (var = 1U << 31; var > half_avg; var /= 2)
+ continue;
+
+ /* Using avg and var, set a lower bound for the range. */
+ *min = *avg - (var / 2);
+
+ /* The mask is just (var - 1). */
+ *mask = var - 1;
+}
+
+/*
+ * Update the statclock_* variables according to the given frequency.
+ * Must only be called after clockintr_statvar_init() initializes both
+ * the stat_* and prof_* variables.
+ */
+void
+clockintr_setstatclockrate(int freq)
+{
+ u_int ogen;
+
+ KASSERT(ISSET(clockintr_flags, CL_STATCLOCK));
+
+ mtx_enter(&clockintr_mtx);
+
+ ogen = statclock_gen;
+ statclock_gen = 0;
+ membar_producer();
+ if (freq == stathz) {
+ statclock_avg = stat_avg;
+ statclock_min = stat_min;
+ statclock_mask = stat_mask;
+ } else if (freq == profhz) {
+ statclock_avg = prof_avg;
+ statclock_min = prof_min;
+ statclock_mask = prof_mask;
+ } else {
+ panic("%s: frequency is not stathz (%d) or profhz (%d): %d",
+ __func__, stathz, profhz, freq);
+ }
+ membar_producer();
+ statclock_gen = MAX(1, ogen + 1);
+
+ mtx_leave(&clockintr_mtx);
+}
+
+/*
+ * Advance *next in increments of period until it exceeds now.
+ * Returns the number of increments *next was advanced.
+ *
+ * We check the common cases first to avoid division if possible.
+ * This does no overflow checking.
+ */
+uint64_t
+nsec_advance(uint64_t *next, uint64_t period, uint64_t now)
+{
+ uint64_t elapsed;
+
+ if (now < *next)
+ return 0;
+
+ if (now < *next + period) {
+ *next += period;
+ return 1;
+ }
+
+ elapsed = (now - *next) / period + 1;
+ *next += period * elapsed;
+ return elapsed;
+}
+
+int
+sysctl_clockintr(int *name, u_int namelen, void *oldp, size_t *oldlenp,
+ void *newp, size_t newlen)
+{
+ struct clockintr_stat sum = { 0 }, tmp;
+ struct clockintr_queue *cq;
+ struct cpu_info *ci;
+ CPU_INFO_ITERATOR cii;
+ u_int gen;
+
+ if (namelen != 1)
+ return ENOTDIR;
+
+ switch (name[0]) {
+ case KERN_CLOCKINTR_STATS:
+ CPU_INFO_FOREACH(cii, ci) {
+ cq = &ci->ci_queue;
+ if (!ISSET(cq->cq_flags, CL_CPU_INIT))
+ continue;
+ do {
+ gen = cq->cq_gen;
+ membar_consumer();
+ tmp = cq->cq_stat;
+ membar_consumer();
+ } while (gen == 0 || gen != cq->cq_gen);
+ sum.cs_dispatched += tmp.cs_dispatched;
+ sum.cs_early += tmp.cs_early;
+ sum.cs_earliness += tmp.cs_earliness;
+ sum.cs_lateness += tmp.cs_lateness;
+ sum.cs_prompt += tmp.cs_prompt;
+ sum.cs_run += tmp.cs_run;
+ }
+ return sysctl_rdstruct(oldp, oldlenp, newp, &sum, sizeof sum);
+ default:
+ break;
+ }
+
+ return EINVAL;
+}
+
+#ifdef DDB
+
+#include <machine/db_machdep.h>
+
+#include <ddb/db_interface.h>
+#include <ddb/db_output.h>
+#include <ddb/db_sym.h>
+
+void db_show_clockintr(uint64_t, u_int, const char *);
+void db_show_clockintr_cpu(struct cpu_info *);
+
+void
+db_show_all_clockintr(db_expr_t addr, int haddr, db_expr_t count, char *modif)
+{
+ struct timespec now;
+ struct cpu_info *ci;
+ CPU_INFO_ITERATOR cii;
+
+ nanouptime(&now);
+ db_printf("%20s\n", "UPTIME");
+ db_printf("%10lld.%09ld\n", now.tv_sec, now.tv_nsec);
+ db_printf("\n");
+ db_printf("%20s %3s %s\n", "EXPIRATION", "CPU", "NAME");
+ CPU_INFO_FOREACH(cii, ci) {
+ if (ISSET(ci->ci_queue.cq_flags, CL_CPU_INIT))
+ db_show_clockintr_cpu(ci);
+ }
+}
+
+void
+db_show_clockintr_cpu(struct cpu_info *ci)
+{
+ struct clockintr_queue *cq = &ci->ci_queue;
+ u_int cpu = CPU_INFO_UNIT(ci);
+
+ db_show_clockintr(cq->cq_next_hardclock, cpu, "hardclock");
+ db_show_clockintr(cq->cq_next_statclock, cpu, "statclock");
+ if (ISSET(clockintr_flags, CL_SCHEDCLOCK))
+ db_show_clockintr(cq->cq_next_schedclock, cpu, "schedclock");
+}
+
+void
+db_show_clockintr(uint64_t expiration, u_int cpu, const char *name)
+{
+ struct timespec ts;
+
+ NSEC_TO_TIMESPEC(expiration, &ts);
+ db_printf("%10lld.%09ld %3u %s\n", ts.tv_sec, ts.tv_nsec, cpu, name);
+}
+
+#endif /* DDB */
+#endif /*__HAVE_CLOCKINTR */
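A worked example may help make the statclock randomization above concrete
(assuming stathz = 100 purely for illustration; this commit does not fix the
rate).  clockintr_statvar_init() computes avg = 10^9 / 100 = 10,000,000 ns,
var = 4,194,304 (the largest power of two not exceeding avg/2 = 5,000,000),
min = avg - var/2 = 7,902,848 ns, and mask = var - 1 = 4,194,303.  With
CL_RNDSTAT set, each statclock period is min plus a nonzero offset drawn from
[1, mask], i.e. uniformly distributed over [7,902,849, 12,097,151] ns, whose
mean is exactly the 10 ms average period.  Without CL_RNDSTAT, the dispatch
simply steps cq_next_statclock in fixed statclock_avg increments via
nsec_advance(); for example, with *next = 5, period = 10, and now = 27,
nsec_advance() moves *next to 35 and returns 3.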
diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c
index 1580409efeb..abf9cf0ded7 100644
--- a/sys/kern/kern_sysctl.c
+++ b/sys/kern/kern_sysctl.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: kern_sysctl.c,v 1.406 2022/08/16 13:29:52 visa Exp $ */
+/* $OpenBSD: kern_sysctl.c,v 1.407 2022/11/05 19:29:46 cheloha Exp $ */
/* $NetBSD: kern_sysctl.c,v 1.17 1996/05/20 17:49:05 mrg Exp $ */
/*-
@@ -53,6 +53,7 @@
#include <sys/vnode.h>
#include <sys/unistd.h>
#include <sys/buf.h>
+#include <sys/clockintr.h>
#include <sys/tty.h>
#include <sys/disklabel.h>
#include <sys/disk.h>
@@ -426,6 +427,11 @@ kern_sysctl_dirs(int top_name, int *name, u_int namelen,
case KERN_CPUSTATS:
return (sysctl_cpustats(name, namelen, oldp, oldlenp,
newp, newlen));
+#ifdef __HAVE_CLOCKINTR
+ case KERN_CLOCKINTR:
+ return sysctl_clockintr(name, namelen, oldp, oldlenp, newp,
+ newlen);
+#endif
default:
return (ENOTDIR); /* overloaded */
}
diff --git a/sys/kern/subr_suspend.c b/sys/kern/subr_suspend.c
index 59bc9dca07a..58f8ef5be19 100644
--- a/sys/kern/subr_suspend.c
+++ b/sys/kern/subr_suspend.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: subr_suspend.c,v 1.12 2022/09/03 18:05:10 kettenis Exp $ */
+/* $OpenBSD: subr_suspend.c,v 1.13 2022/11/05 19:29:46 cheloha Exp $ */
/*
* Copyright (c) 2005 Thorsten Lockert <tholo@sigmasoft.com>
* Copyright (c) 2005 Jordan Hargrave <jordan@openbsd.org>
@@ -19,6 +19,7 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
+#include <sys/clockintr.h>
#include <sys/reboot.h>
#include <sys/sensors.h>
#include <sys/sysctl.h>
@@ -161,6 +162,10 @@ fail_suspend:
splx(s);
inittodr(gettime());
+#ifdef __HAVE_CLOCKINTR
+ clockintr_cpu_init(NULL);
+ clockintr_trigger();
+#endif
sleep_resume(v);
resume_randomness(rndbuf, rndbuflen);
#ifdef MULTIPROCESSOR
diff --git a/sys/sys/clockintr.h b/sys/sys/clockintr.h
new file mode 100644
index 00000000000..8021d6c0afd
--- /dev/null
+++ b/sys/sys/clockintr.h
@@ -0,0 +1,105 @@
+/* $OpenBSD: clockintr.h,v 1.1 2022/11/05 19:29:46 cheloha Exp $ */
+/*
+ * Copyright (c) 2020-2022 Scott Cheloha <cheloha@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef _SYS_CLOCKINTR_H_
+#define _SYS_CLOCKINTR_H_
+
+#include <sys/stdint.h>
+
+struct clockintr_stat {
+ uint64_t cs_dispatched; /* total time in dispatch (ns) */
+ uint64_t cs_early; /* number of early dispatch calls */
+ uint64_t cs_earliness; /* total earliness (ns) */
+ uint64_t cs_lateness; /* total lateness (ns) */
+ uint64_t cs_prompt; /* number of prompt dispatch calls */
+ uint64_t cs_run; /* number of events dispatched */
+};
+
+#ifdef _KERNEL
+
+/*
+ * Platform API
+ */
+
+struct intrclock {
+ void *ic_cookie;
+ void (*ic_rearm)(void *, uint64_t);
+ void (*ic_trigger)(void *);
+};
+
+static inline void
+intrclock_rearm(struct intrclock *ic, uint64_t nsecs)
+{
+ ic->ic_rearm(ic->ic_cookie, nsecs);
+}
+
+static inline void
+intrclock_trigger(struct intrclock *ic)
+{
+ ic->ic_trigger(ic->ic_cookie);
+}
+
+/*
+ * Per-CPU clock interrupt state.
+ *
+ * Struct member protections:
+ *
+ * I Immutable after initialization.
+ * o Owned by a single CPU.
+ */
+struct clockintr_queue {
+ uint64_t cq_next; /* [o] next event expiration */
+ uint64_t cq_next_hardclock; /* [o] next hardclock expiration */
+ uint64_t cq_next_schedclock; /* [o] next schedclock expiration */
+ uint64_t cq_next_statclock; /* [o] next statclock expiration */
+ struct intrclock cq_intrclock; /* [I] local interrupt clock */
+ struct clockintr_stat cq_stat; /* [o] dispatch statistics */
+ volatile u_int cq_gen; /* [o] cq_stat update generation */
+ volatile u_int cq_dispatch; /* [o] dispatch is running */
+ u_int cq_flags; /* [I] local state flags */
+};
+
+/* Global state flags. */
+#define CL_INIT 0x00000001 /* global init done */
+#define CL_STATCLOCK 0x00000002 /* statclock variables set */
+#define CL_SCHEDCLOCK 0x00000004 /* run separate schedclock */
+#define CL_STATE_MASK 0x00000007
+
+/* Global behavior flags. */
+#define CL_RNDSTAT 0x80000000 /* randomized statclock */
+#define CL_FLAG_MASK 0x80000000
+
+/* Per-CPU state flags. */
+#define CL_CPU_INIT 0x00000001 /* CPU is ready for dispatch */
+#define CL_CPU_INTRCLOCK 0x00000002 /* CPU has intrclock */
+#define CL_CPU_STATE_MASK 0x00000003
+
+void clockintr_cpu_init(const struct intrclock *);
+int clockintr_dispatch(void *);
+void clockintr_init(u_int);
+void clockintr_setstatclockrate(int);
+void clockintr_trigger(void);
+
+/*
+ * Kernel API
+ */
+
+int sysctl_clockintr(int *, u_int, void *, size_t *, void *, size_t);
+
+#endif /* _KERNEL */
+
+#endif /* !_SYS_CLOCKINTR_H_ */
diff --git a/sys/sys/sysctl.h b/sys/sys/sysctl.h
index 174629da4b1..84564782931 100644
--- a/sys/sys/sysctl.h
+++ b/sys/sys/sysctl.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: sysctl.h,v 1.229 2022/08/16 13:29:53 visa Exp $ */
+/* $OpenBSD: sysctl.h,v 1.230 2022/11/05 19:29:46 cheloha Exp $ */
/* $NetBSD: sysctl.h,v 1.16 1996/04/09 20:55:36 cgd Exp $ */
/*
@@ -191,7 +191,8 @@ struct ctlname {
#define KERN_TIMEOUT_STATS 87 /* struct: timeout status and stats */
#define KERN_UTC_OFFSET 88 /* int: adjust RTC time to UTC */
#define KERN_VIDEO 89 /* struct: video properties */
-#define KERN_MAXID 90 /* number of valid kern ids */
+#define KERN_CLOCKINTR 90 /* node: clockintr */
+#define KERN_MAXID 91 /* number of valid kern ids */
#define CTL_KERN_NAMES { \
{ 0, 0 }, \
@@ -284,6 +285,7 @@ struct ctlname {
{ "timeout_stats", CTLTYPE_STRUCT }, \
{ "utc_offset", CTLTYPE_INT }, \
{ "video", CTLTYPE_STRUCT }, \
+ { "clockintr", CTLTYPE_NODE }, \
}
/*
@@ -881,6 +883,17 @@ struct kinfo_file {
}
/*
+ * KERN_CLOCKINTR
+ */
+#define KERN_CLOCKINTR_STATS 1 /* struct: stats */
+#define KERN_CLOCKINTR_MAXID 2
+
+#define CTL_KERN_CLOCKINTR_NAMES { \
+ { 0, 0 }, \
+ { "stats", CTLTYPE_STRUCT }, \
+}
+
+/*
* CTL_FS identifiers
*/
#define FS_POSIX 1 /* POSIX flags */
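For completeness, here is a hedged userland sketch of how the new sysctl
identifiers could be consumed.  The program is illustrative only and is not
part of this commit; it returns data only once the running platform defines
__HAVE_CLOCKINTR and wires clockintr(9) up, which no platform does yet at
this point.

/*
 * Illustrative userland reader for the new KERN_CLOCKINTR_STATS sysctl.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/clockintr.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
	int mib[3] = { CTL_KERN, KERN_CLOCKINTR, KERN_CLOCKINTR_STATS };
	struct clockintr_stat cs;
	size_t len = sizeof(cs);

	/* Aggregated across all CPUs by sysctl_clockintr(). */
	if (sysctl(mib, 3, &cs, &len, NULL, 0) == -1)
		err(1, "sysctl");

	printf("time in dispatch: %llu ns over %llu prompt + %llu early calls\n",
	    (unsigned long long)cs.cs_dispatched,
	    (unsigned long long)cs.cs_prompt,
	    (unsigned long long)cs.cs_early);
	printf("events run: %llu (lateness %llu ns, earliness %llu ns)\n",
	    (unsigned long long)cs.cs_run,
	    (unsigned long long)cs.cs_lateness,
	    (unsigned long long)cs.cs_earliness);
	return 0;
}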