/* $OpenBSD: kern_clockintr.c,v 1.71 2024/11/07 16:02:29 miod Exp $ */
/*
 * Copyright (c) 2003 Dale Rahn
 * Copyright (c) 2020 Mark Kettenis
 * Copyright (c) 2020-2024 Scott Cheloha
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * The original header names were lost in extraction; the list below is
 * reconstructed from the symbols this file uses.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/clockintr.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/time.h>

void clockintr_cancel_locked(struct clockintr *);
void clockintr_hardclock(struct clockrequest *, void *, void *);
void clockintr_schedule_locked(struct clockintr *, uint64_t);
void clockqueue_intrclock_install(struct clockqueue *,
    const struct intrclock *);
void clockqueue_intrclock_reprogram(struct clockqueue *);
uint64_t clockqueue_next(const struct clockqueue *);
void clockqueue_pend_delete(struct clockqueue *, struct clockintr *);
void clockqueue_pend_insert(struct clockqueue *, struct clockintr *,
    uint64_t);
void intrclock_rearm(struct intrclock *, uint64_t);
void intrclock_trigger(struct intrclock *);
uint64_t nsec_advance(uint64_t *, uint64_t, uint64_t);

/*
 * Ready the calling CPU for clockintr_dispatch(). If this is our
 * first time here, install the intrclock, if any, and set necessary
 * flags. Advance the schedule as needed.
 */
void
clockintr_cpu_init(const struct intrclock *ic)
{
	uint64_t multiplier = 0;
	struct cpu_info *ci = curcpu();
	struct clockqueue *cq = &ci->ci_queue;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int reset_cq_intrclock = 0;

	if (ic != NULL)
		clockqueue_intrclock_install(cq, ic);

	/* TODO: Remove this from struct clockqueue. */
	if (CPU_IS_PRIMARY(ci) && cq->cq_hardclock.cl_expiration == 0) {
		clockintr_bind(&cq->cq_hardclock, ci, clockintr_hardclock,
		    NULL);
	}

	/*
	 * Mask CQ_INTRCLOCK while we're advancing the internal clock
	 * interrupts. We don't want the intrclock to fire until this
	 * thread reaches clockintr_trigger().
	 */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		CLR(cq->cq_flags, CQ_INTRCLOCK);
		reset_cq_intrclock = 1;
	}

	/*
	 * Until we understand scheduler lock contention better, stagger
	 * the hardclock and statclock so they don't all happen at once.
	 * If we have no intrclock it doesn't matter, we have no control
	 * anyway. The primary CPU's starting offset is always zero, so
	 * leave the multiplier zero.
	 */
	if (!CPU_IS_PRIMARY(ci) && reset_cq_intrclock)
		multiplier = CPU_INFO_UNIT(ci);

	/*
	 * The first time we do this, the primary CPU cannot skip any
	 * hardclocks. We can skip hardclocks on subsequent calls because
	 * the global tick value is advanced during inittodr(9) on our
	 * behalf.
	 */
	if (CPU_IS_PRIMARY(ci)) {
		if (cq->cq_hardclock.cl_expiration == 0)
			clockintr_schedule(&cq->cq_hardclock, 0);
		else
			clockintr_advance(&cq->cq_hardclock, hardclock_period);
	}

	/*
	 * We can always advance the statclock. There is no reason to
	 * stagger a randomized statclock.
	 */
	if (!statclock_is_randomized) {
		if (spc->spc_statclock.cl_expiration == 0) {
			clockintr_stagger(&spc->spc_statclock, statclock_avg,
			    multiplier, MAXCPUS);
		}
	}
	clockintr_advance(&spc->spc_statclock, statclock_avg);

	/*
	 * XXX Need to find a better place to do this. We can't do it in
	 * sched_init_cpu() because initclocks() runs after it.
	 */
	if (spc->spc_itimer.cl_expiration == 0) {
		clockintr_stagger(&spc->spc_itimer, hardclock_period,
		    multiplier, MAXCPUS);
	}
	if (spc->spc_profclock.cl_expiration == 0) {
		clockintr_stagger(&spc->spc_profclock, profclock_period,
		    multiplier, MAXCPUS);
	}
	if (spc->spc_roundrobin.cl_expiration == 0) {
		clockintr_stagger(&spc->spc_roundrobin, hardclock_period,
		    multiplier, MAXCPUS);
	}
	clockintr_advance(&spc->spc_roundrobin, roundrobin_period);

	if (reset_cq_intrclock)
		SET(cq->cq_flags, CQ_INTRCLOCK);
}
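
/*
 * For illustration only: a minimal sketch of how a machine-dependent
 * one-shot timer driver hands its clock to this layer.  The mytimer_*
 * names are hypothetical; real drivers follow this pattern from their
 * cpu_startclock() path with their own rearm/trigger routines and
 * cookie.
 *
 *	const struct intrclock mytimer_intrclock = {
 *		.ic_cookie = NULL,		(driver cookie, if any)
 *		.ic_rearm = mytimer_rearm,	(hypothetical)
 *		.ic_trigger = mytimer_trigger,	(hypothetical)
 *	};
 *
 *	void
 *	mytimer_startclock(void)
 *	{
 *		clockintr_cpu_init(&mytimer_intrclock);
 *		clockintr_trigger();
 *	}
 */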

/*
 * If we have an intrclock, trigger it to start the dispatch cycle.
 */
void
clockintr_trigger(void)
{
	struct clockqueue *cq = &curcpu()->ci_queue;

	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	if (ISSET(cq->cq_flags, CQ_INTRCLOCK))
		intrclock_trigger(&cq->cq_intrclock);
}

/*
 * Run all expired events scheduled on the calling CPU.
 */
int
clockintr_dispatch(void *frame)
{
	uint64_t lateness, run = 0, start;
	struct cpu_info *ci = curcpu();
	struct clockintr *cl;
	struct clockqueue *cq = &ci->ci_queue;
	struct clockrequest *request = &cq->cq_request;
	void *arg;
	void (*func)(struct clockrequest *, void *, void *);
	uint32_t ogen;

	if (cq->cq_dispatch != 0)
		panic("%s: recursive dispatch", __func__);
	cq->cq_dispatch = 1;

	splassert(IPL_CLOCK);
	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	mtx_enter(&cq->cq_mtx);

	/*
	 * If nothing is scheduled or we arrived too early, we have
	 * nothing to do.
	 */
	start = nsecuptime();
	cq->cq_uptime = start;
	if (TAILQ_EMPTY(&cq->cq_pend))
		goto stats;
	if (cq->cq_uptime < clockqueue_next(cq))
		goto rearm;
	lateness = start - clockqueue_next(cq);

	/*
	 * Dispatch expired events.
	 */
	for (;;) {
		cl = TAILQ_FIRST(&cq->cq_pend);
		if (cl == NULL)
			break;
		if (cq->cq_uptime < cl->cl_expiration) {
			/* Double-check the time before giving up. */
			cq->cq_uptime = nsecuptime();
			if (cq->cq_uptime < cl->cl_expiration)
				break;
		}

		/*
		 * This clockintr has expired. Execute it.
		 */
		clockqueue_pend_delete(cq, cl);
		request->cr_expiration = cl->cl_expiration;
		arg = cl->cl_arg;
		func = cl->cl_func;
		cq->cq_running = cl;
		mtx_leave(&cq->cq_mtx);

		func(request, frame, arg);

		mtx_enter(&cq->cq_mtx);
		cq->cq_running = NULL;
		if (ISSET(cq->cq_flags, CQ_IGNORE_REQUEST)) {
			CLR(cq->cq_flags, CQ_IGNORE_REQUEST);
			CLR(request->cr_flags, CR_RESCHEDULE);
		}
		if (ISSET(request->cr_flags, CR_RESCHEDULE)) {
			CLR(request->cr_flags, CR_RESCHEDULE);
			clockqueue_pend_insert(cq, cl, request->cr_expiration);
		}
		if (ISSET(cq->cq_flags, CQ_NEED_WAKEUP)) {
			CLR(cq->cq_flags, CQ_NEED_WAKEUP);
			mtx_leave(&cq->cq_mtx);
			wakeup(&cq->cq_running);
			mtx_enter(&cq->cq_mtx);
		}
		run++;
	}

	/*
	 * Dispatch complete.
	 */
rearm:
	/* Rearm the interrupt clock if we have one. */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (!TAILQ_EMPTY(&cq->cq_pend)) {
			intrclock_rearm(&cq->cq_intrclock,
			    clockqueue_next(cq) - cq->cq_uptime);
		}
	}
stats:
	/* Update our stats. */
	ogen = cq->cq_gen;
	cq->cq_gen = 0;
	membar_producer();
	cq->cq_stat.cs_dispatched += cq->cq_uptime - start;
	if (run > 0) {
		cq->cq_stat.cs_lateness += lateness;
		cq->cq_stat.cs_prompt++;
		cq->cq_stat.cs_run += run;
	} else if (!TAILQ_EMPTY(&cq->cq_pend)) {
		cq->cq_stat.cs_early++;
		cq->cq_stat.cs_earliness +=
		    clockqueue_next(cq) - cq->cq_uptime;
	} else
		cq->cq_stat.cs_spurious++;
	membar_producer();
	cq->cq_gen = MAX(1, ogen + 1);

	mtx_leave(&cq->cq_mtx);

	if (cq->cq_dispatch != 1)
		panic("%s: unexpected value: %u", __func__, cq->cq_dispatch);
	cq->cq_dispatch = 0;

	return run > 0;
}
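
/*
 * For illustration only: a sketch (with a hypothetical mytimer_intr
 * name) of how a machine-dependent clock interrupt handler, running at
 * IPL_CLOCK, would feed its interrupt frame to clockintr_dispatch()
 * above:
 *
 *	int
 *	mytimer_intr(void *frame)
 *	{
 *		(acknowledge the hardware interrupt here)
 *		return clockintr_dispatch(frame);
 *	}
 *
 * The return value reports whether any expired events actually ran.
 */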

uint64_t
clockintr_advance(struct clockintr *cl, uint64_t period)
{
	uint64_t count, expiration;
	struct clockqueue *cq = cl->cl_queue;

	mtx_enter(&cq->cq_mtx);
	expiration = cl->cl_expiration;
	count = nsec_advance(&expiration, period, nsecuptime());
	clockintr_schedule_locked(cl, expiration);
	mtx_leave(&cq->cq_mtx);

	return count;
}

uint64_t
clockrequest_advance(struct clockrequest *cr, uint64_t period)
{
	struct clockqueue *cq = cr->cr_queue;

	KASSERT(cr == &cq->cq_request);

	SET(cr->cr_flags, CR_RESCHEDULE);
	return nsec_advance(&cr->cr_expiration, period, cq->cq_uptime);
}

uint64_t
clockrequest_advance_random(struct clockrequest *cr, uint64_t min,
    uint32_t mask)
{
	uint64_t count = 0;
	struct clockqueue *cq = cr->cr_queue;
	uint32_t off;

	KASSERT(cr == &cq->cq_request);

	while (cr->cr_expiration <= cq->cq_uptime) {
		while ((off = (random() & mask)) == 0)
			continue;
		cr->cr_expiration += min + off;
		count++;
	}
	SET(cr->cr_flags, CR_RESCHEDULE);
	return count;
}

void
clockintr_cancel(struct clockintr *cl)
{
	struct clockqueue *cq = cl->cl_queue;

	mtx_enter(&cq->cq_mtx);
	clockintr_cancel_locked(cl);
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_cancel_locked(struct clockintr *cl)
{
	struct clockqueue *cq = cl->cl_queue;
	int was_next;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);

	if (ISSET(cl->cl_flags, CLST_PENDING)) {
		was_next = cl == TAILQ_FIRST(&cq->cq_pend);
		clockqueue_pend_delete(cq, cl);
		if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
			if (was_next && !TAILQ_EMPTY(&cq->cq_pend)) {
				if (cq == &curcpu()->ci_queue)
					clockqueue_intrclock_reprogram(cq);
			}
		}
	}
	if (cl == cq->cq_running)
		SET(cq->cq_flags, CQ_IGNORE_REQUEST);
}

void
clockintr_bind(struct clockintr *cl, struct cpu_info *ci,
    void (*func)(struct clockrequest *, void *, void *), void *arg)
{
	struct clockqueue *cq = &ci->ci_queue;

	splassert(IPL_NONE);
	KASSERT(cl->cl_queue == NULL);

	mtx_enter(&cq->cq_mtx);
	cl->cl_arg = arg;
	cl->cl_func = func;
	cl->cl_queue = cq;
	TAILQ_INSERT_TAIL(&cq->cq_all, cl, cl_alink);
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_unbind(struct clockintr *cl, uint32_t flags)
{
	struct clockqueue *cq = cl->cl_queue;

	KASSERT(!ISSET(flags, ~CL_FLAG_MASK));

	mtx_enter(&cq->cq_mtx);

	clockintr_cancel_locked(cl);

	cl->cl_arg = NULL;
	cl->cl_func = NULL;
	cl->cl_queue = NULL;
	TAILQ_REMOVE(&cq->cq_all, cl, cl_alink);

	if (ISSET(flags, CL_BARRIER) && cl == cq->cq_running) {
		SET(cq->cq_flags, CQ_NEED_WAKEUP);
		msleep_nsec(&cq->cq_running, &cq->cq_mtx, PWAIT | PNORELOCK,
		    "clkbar", INFSLP);
	} else
		mtx_leave(&cq->cq_mtx);
}

void
clockintr_schedule(struct clockintr *cl, uint64_t expiration)
{
	struct clockqueue *cq = cl->cl_queue;

	mtx_enter(&cq->cq_mtx);
	clockintr_schedule_locked(cl, expiration);
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_schedule_locked(struct clockintr *cl, uint64_t expiration)
{
	struct clockqueue *cq = cl->cl_queue;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);

	if (ISSET(cl->cl_flags, CLST_PENDING))
		clockqueue_pend_delete(cq, cl);
	clockqueue_pend_insert(cq, cl, expiration);

	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (cl == TAILQ_FIRST(&cq->cq_pend)) {
			if (cq == &curcpu()->ci_queue)
				clockqueue_intrclock_reprogram(cq);
		}
	}
	if (cl == cq->cq_running)
		SET(cq->cq_flags, CQ_IGNORE_REQUEST);
}
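
/*
 * Offset cl's first expiration to (period / denom) * numer so that the
 * same periodic event bound on different CPUs does not fire at the same
 * instant.  For illustration: with a 10000000 ns period, numer = 3, and
 * denom = 64, the initial expiration is 156250 * 3 = 468750 ns.  The
 * clockintr must not already be pending.
 */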
void
clockintr_stagger(struct clockintr *cl, uint64_t period, uint32_t numer,
    uint32_t denom)
{
	struct clockqueue *cq = cl->cl_queue;

	KASSERT(numer < denom);

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING))
		panic("%s: clock interrupt pending", __func__);
	cl->cl_expiration = period / denom * numer;
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_hardclock(struct clockrequest *cr, void *frame, void *arg)
{
	uint64_t count, i;

	count = clockrequest_advance(cr, hardclock_period);
	for (i = 0; i < count; i++)
		hardclock(frame);
}

void
clockqueue_init(struct clockqueue *cq)
{
	if (ISSET(cq->cq_flags, CQ_INIT))
		return;

	cq->cq_request.cr_queue = cq;
	mtx_init(&cq->cq_mtx, IPL_CLOCK);
	TAILQ_INIT(&cq->cq_all);
	TAILQ_INIT(&cq->cq_pend);
	cq->cq_gen = 1;
	SET(cq->cq_flags, CQ_INIT);
}

void
clockqueue_intrclock_install(struct clockqueue *cq,
    const struct intrclock *ic)
{
	mtx_enter(&cq->cq_mtx);
	if (!ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		cq->cq_intrclock = *ic;
		SET(cq->cq_flags, CQ_INTRCLOCK);
	}
	mtx_leave(&cq->cq_mtx);
}

uint64_t
clockqueue_next(const struct clockqueue *cq)
{
	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	return TAILQ_FIRST(&cq->cq_pend)->cl_expiration;
}

void
clockqueue_pend_delete(struct clockqueue *cq, struct clockintr *cl)
{
	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cl->cl_flags, CLST_PENDING));

	TAILQ_REMOVE(&cq->cq_pend, cl, cl_plink);
	CLR(cl->cl_flags, CLST_PENDING);
}

void
clockqueue_pend_insert(struct clockqueue *cq, struct clockintr *cl,
    uint64_t expiration)
{
	struct clockintr *elm;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(!ISSET(cl->cl_flags, CLST_PENDING));

	cl->cl_expiration = expiration;
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink) {
		if (cl->cl_expiration < elm->cl_expiration)
			break;
	}
	if (elm == NULL)
		TAILQ_INSERT_TAIL(&cq->cq_pend, cl, cl_plink);
	else
		TAILQ_INSERT_BEFORE(elm, cl, cl_plink);
	SET(cl->cl_flags, CLST_PENDING);
}

void
clockqueue_intrclock_reprogram(struct clockqueue *cq)
{
	uint64_t exp, now;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cq->cq_flags, CQ_INTRCLOCK));

	exp = clockqueue_next(cq);
	now = nsecuptime();
	if (now < exp)
		intrclock_rearm(&cq->cq_intrclock, exp - now);
	else
		intrclock_trigger(&cq->cq_intrclock);
}

void
intrclock_rearm(struct intrclock *ic, uint64_t nsecs)
{
	ic->ic_rearm(ic->ic_cookie, nsecs);
}

void
intrclock_trigger(struct intrclock *ic)
{
	ic->ic_trigger(ic->ic_cookie);
}

/*
 * Advance *next in increments of period until it exceeds now.
 * Returns the number of increments *next was advanced.
 *
 * We check the common cases first to avoid division if possible.
 * This does no overflow checking.
 */
uint64_t
nsec_advance(uint64_t *next, uint64_t period, uint64_t now)
{
	uint64_t elapsed;

	if (now < *next)
		return 0;

	if (now < *next + period) {
		*next += period;
		return 1;
	}

	elapsed = (now - *next) / period + 1;
	*next += period * elapsed;
	return elapsed;
}
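
/*
 * Worked example (illustration only): with *next = 100, period = 30,
 * and now = 175, neither fast path applies, so nsec_advance() computes
 * elapsed = (175 - 100) / 30 + 1 = 3 and leaves *next = 100 + 3 * 30 =
 * 190, the first increment of period past now.
 */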

int
sysctl_clockintr(int *name, u_int namelen, void *oldp, size_t *oldlenp,
    void *newp, size_t newlen)
{
	struct clockintr_stat sum, tmp;
	struct clockqueue *cq;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
	uint32_t gen;

	if (namelen != 1)
		return ENOTDIR;
	switch (name[0]) {
	case KERN_CLOCKINTR_STATS:
		memset(&sum, 0, sizeof sum);
		CPU_INFO_FOREACH(cii, ci) {
			cq = &ci->ci_queue;
			if (!ISSET(cq->cq_flags, CQ_INIT))
				continue;
			do {
				gen = cq->cq_gen;
				membar_consumer();
				tmp = cq->cq_stat;
				membar_consumer();
			} while (gen == 0 || gen != cq->cq_gen);
			sum.cs_dispatched += tmp.cs_dispatched;
			sum.cs_early += tmp.cs_early;
			sum.cs_earliness += tmp.cs_earliness;
			sum.cs_lateness += tmp.cs_lateness;
			sum.cs_prompt += tmp.cs_prompt;
			sum.cs_run += tmp.cs_run;
			sum.cs_spurious += tmp.cs_spurious;
		}
		return sysctl_rdstruct(oldp, oldlenp, newp, &sum, sizeof sum);
	default:
		break;
	}
	return EINVAL;
}

#ifdef DDB

/*
 * The original DDB header names were lost in extraction; the list below
 * is reconstructed from the symbols used.
 */
#include <machine/db_machdep.h>

#include <ddb/db_interface.h>
#include <ddb/db_output.h>
#include <ddb/db_sym.h>

void db_show_clockintr(const struct clockintr *, const char *, u_int);
void db_show_clockintr_cpu(struct cpu_info *);

void
db_show_all_clockintr(db_expr_t addr, int haddr, db_expr_t count, char *modif)
{
	struct timespec now;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
	int width = sizeof(long) * 2 + 2;	/* +2 for "0x" prefix */

	nanouptime(&now);
	db_printf("%20s\n", "UPTIME");
	db_printf("%10lld.%09ld\n", now.tv_sec, now.tv_nsec);
	db_printf("\n");
	db_printf("%20s %5s %3s %*s %s\n",
	    "EXPIRATION", "STATE", "CPU", width, "ARG", "NAME");
	CPU_INFO_FOREACH(cii, ci) {
		if (ISSET(ci->ci_queue.cq_flags, CQ_INIT))
			db_show_clockintr_cpu(ci);
	}
}

void
db_show_clockintr_cpu(struct cpu_info *ci)
{
	struct clockintr *elm;
	struct clockqueue *cq = &ci->ci_queue;
	u_int cpu = CPU_INFO_UNIT(ci);

	if (cq->cq_running != NULL)
		db_show_clockintr(cq->cq_running, "run", cpu);
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink)
		db_show_clockintr(elm, "pend", cpu);
	TAILQ_FOREACH(elm, &cq->cq_all, cl_alink) {
		if (!ISSET(elm->cl_flags, CLST_PENDING))
			db_show_clockintr(elm, "idle", cpu);
	}
}

void
db_show_clockintr(const struct clockintr *cl, const char *state, u_int cpu)
{
	struct timespec ts;
	const char *name;
	db_expr_t offset;
	int width = sizeof(long) * 2;

	NSEC_TO_TIMESPEC(cl->cl_expiration, &ts);
	db_find_sym_and_offset((vaddr_t)cl->cl_func, &name, &offset);
	if (name == NULL)
		name = "?";

	db_printf("%10lld.%09ld %5s %3u 0x%0*lx %s\n",
	    ts.tv_sec, ts.tv_nsec, state, cpu, width,
	    (unsigned long)cl->cl_arg, name);
}

#endif /* DDB */