/*	$OpenBSD: kern_clockintr.c,v 1.1 2022/11/05 19:29:46 cheloha Exp $ */
/*
 * Copyright (c) 2003 Dale Rahn
 * Copyright (c) 2020 Mark Kettenis
 * Copyright (c) 2020-2022 Scott Cheloha
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/clockintr.h>
#include <sys/kernel.h>
#include <sys/mutex.h>
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/time.h>

#ifdef __HAVE_CLOCKINTR

/*
 * Protection for global variables in this file:
 *
 *	C	Global clockintr configuration mutex (clockintr_mtx).
 *	I	Immutable after initialization.
 */
struct mutex clockintr_mtx = MUTEX_INITIALIZER(IPL_CLOCK);

u_int clockintr_flags;			/* [I] global state + behavior flags */
uint32_t hardclock_period;		/* [I] hardclock period (ns) */
uint32_t schedclock_period;		/* [I] schedclock period (ns) */
volatile u_int statclock_gen = 1;	/* [C] statclock update generation */
volatile uint32_t statclock_avg;	/* [C] average statclock period (ns) */
uint32_t statclock_min;			/* [C] minimum statclock period (ns) */
uint32_t statclock_mask;		/* [C] set of allowed offsets */
uint32_t stat_avg;			/* [I] average stathz period (ns) */
uint32_t stat_min;			/* [I] minimum stathz period (ns) */
uint32_t stat_mask;			/* [I] set of allowed offsets */
uint32_t prof_avg;			/* [I] average profhz period (ns) */
uint32_t prof_min;			/* [I] minimum profhz period (ns) */
uint32_t prof_mask;			/* [I] set of allowed offsets */

void clockintr_statvar_init(int, uint32_t *, uint32_t *, uint32_t *);
uint64_t nsec_advance(uint64_t *, uint64_t, uint64_t);

/*
 * Initialize global state.  Set flags and compute intervals.
 */
void
clockintr_init(u_int flags)
{
	KASSERT(CPU_IS_PRIMARY(curcpu()));
	KASSERT(clockintr_flags == 0);
	KASSERT(!ISSET(flags, ~CL_FLAG_MASK));

	KASSERT(hz > 0 && hz <= 1000000000);
	hardclock_period = 1000000000 / hz;

	KASSERT(stathz >= 1 && stathz <= 1000000000);
	KASSERT(profhz >= stathz && profhz <= 1000000000);
	KASSERT(profhz % stathz == 0);
	clockintr_statvar_init(stathz, &stat_avg, &stat_min, &stat_mask);
	clockintr_statvar_init(profhz, &prof_avg, &prof_min, &prof_mask);
	SET(clockintr_flags, CL_STATCLOCK);
	clockintr_setstatclockrate(stathz);

	KASSERT(schedhz >= 0 && schedhz <= 1000000000);
	if (schedhz != 0) {
		schedclock_period = 1000000000 / schedhz;
		SET(clockintr_flags, CL_SCHEDCLOCK);
	}

	SET(clockintr_flags, flags | CL_INIT);
}
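/*
 * Worked example with illustrative frequencies (hz, stathz, profhz, and
 * schedhz are configured elsewhere): with hz = 100, stathz = 128,
 * profhz = 1024, and schedhz = 16, clockintr_init() computes
 * hardclock_period = 10000000 ns, schedclock_period = 62500000 ns,
 * stat_avg = 7812500 ns, and prof_avg = 976562 ns (integer division).
 */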
/*
 * Ready the calling CPU for clockintr_dispatch().  If this is our
 * first time here, install the intrclock, if any, and set necessary
 * flags.  Advance the schedule as needed.
 */
void
clockintr_cpu_init(const struct intrclock *ic)
{
	uint64_t multiplier, now;
	struct cpu_info *ci = curcpu();
	struct clockintr_queue *cq = &ci->ci_queue;

	KASSERT(ISSET(clockintr_flags, CL_INIT));

	if (!ISSET(cq->cq_flags, CL_CPU_INIT)) {
		if (ic != NULL) {
			cq->cq_intrclock = *ic;
			SET(cq->cq_flags, CL_CPU_INTRCLOCK);
		}
		cq->cq_gen = 1;
	}

	/*
	 * Until we understand scheduler lock contention better, stagger
	 * the hardclock and statclock so they don't all happen at once.
	 * If we have no intrclock it doesn't matter, we have no control
	 * anyway.  The primary CPU's starting offset is always zero, so
	 * set multiplier to zero.
	 */
	if (!CPU_IS_PRIMARY(ci) && ISSET(cq->cq_flags, CL_CPU_INTRCLOCK))
		multiplier = CPU_INFO_UNIT(ci);
	else
		multiplier = 0;

	now = nsecuptime();

	/*
	 * The first time we do this, the primary CPU cannot skip any
	 * hardclocks.  We can skip hardclocks on subsequent calls because
	 * the global tick value is advanced during inittodr(9) on our
	 * behalf.
	 */
	if (!CPU_IS_PRIMARY(ci) || ISSET(cq->cq_flags, CL_CPU_INIT)) {
		cq->cq_next_hardclock = hardclock_period / ncpus * multiplier;
		nsec_advance(&cq->cq_next_hardclock, hardclock_period, now);
	}

	/*
	 * We can always advance the statclock and schedclock.
	 */
	cq->cq_next_statclock = stat_avg / ncpus * multiplier;
	nsec_advance(&cq->cq_next_statclock, stat_avg, now);
	if (ISSET(clockintr_flags, CL_SCHEDCLOCK)) {
		cq->cq_next_schedclock = schedclock_period / ncpus * multiplier;
		nsec_advance(&cq->cq_next_schedclock, schedclock_period, now);
	}

	SET(cq->cq_flags, CL_CPU_INIT);
}

/*
 * If we have an intrclock, trigger it to start the dispatch cycle.
 */
void
clockintr_trigger(void)
{
	struct clockintr_queue *cq = &curcpu()->ci_queue;

	KASSERT(ISSET(cq->cq_flags, CL_CPU_INIT));

	if (ISSET(cq->cq_flags, CL_CPU_INTRCLOCK))
		intrclock_trigger(&cq->cq_intrclock);
}

/*
 * Run all expired events scheduled on the calling CPU.
 */
int
clockintr_dispatch(void *frame)
{
	uint64_t count, i, lateness, now, run = 0, start;
	struct cpu_info *ci = curcpu();
	struct clockintr_queue *cq = &ci->ci_queue;
	struct proc *p = curproc;
	uint32_t mask, min, off;
	u_int gen, ogen;

	if (cq->cq_dispatch != 0)
		panic("%s: recursive dispatch", __func__);
	cq->cq_dispatch = 1;

	splassert(IPL_CLOCK);
	KASSERT(ISSET(cq->cq_flags, CL_CPU_INIT));

	/*
	 * If we arrived too early we have nothing to do.
	 */
	start = nsecuptime();
	now = start;
	if (now < cq->cq_next)
		goto done;
	lateness = now - cq->cq_next;

	/*
	 * Dispatch expired events.
	 */
again:
	/* hardclock */
	count = nsec_advance(&cq->cq_next_hardclock, hardclock_period, now);
	for (i = 0; i < count; i++)
		hardclock(frame);
	run += count;

	/* statclock */
	if (ISSET(clockintr_flags, CL_RNDSTAT)) {
		do {
			gen = statclock_gen;
			membar_consumer();
			min = statclock_min;
			mask = statclock_mask;
			membar_consumer();
		} while (gen == 0 || gen != statclock_gen);
		count = 0;
		while (cq->cq_next_statclock <= now) {
			while ((off = (random() & mask)) == 0)
				continue;
			cq->cq_next_statclock += min + off;
			count++;
		}
	} else {
		count = nsec_advance(&cq->cq_next_statclock,
		    statclock_avg, now);
	}
	for (i = 0; i < count; i++)
		statclock(frame);
	run += count;
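	/*
	 * Note on the CL_RNDSTAT path above: statclock_min and
	 * statclock_mask are re-read until statclock_gen is nonzero and
	 * unchanged, so we only use a {min, mask} pair published by a
	 * single clockintr_setstatclockrate() call (gen == 0 means an
	 * update is in progress).  Each pseudorandom period is then
	 * min + (random() & mask) with zero offsets rejected, i.e. a
	 * value in [min + 1, min + mask] that averages out to roughly
	 * statclock_avg.
	 */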
	/* schedclock */
	if (ISSET(clockintr_flags, CL_SCHEDCLOCK)) {
		count = nsec_advance(&cq->cq_next_schedclock,
		    schedclock_period, now);
		if (p != NULL) {
			for (i = 0; i < count; i++)
				schedclock(p);
		}
		run += count;
	}

	/* Run the dispatch again if the next event has already expired. */
	cq->cq_next = cq->cq_next_hardclock;
	if (cq->cq_next_statclock < cq->cq_next)
		cq->cq_next = cq->cq_next_statclock;
	if (ISSET(clockintr_flags, CL_SCHEDCLOCK)) {
		if (cq->cq_next_schedclock < cq->cq_next)
			cq->cq_next = cq->cq_next_schedclock;
	}
	now = nsecuptime();
	if (cq->cq_next <= now)
		goto again;

	/*
	 * Dispatch complete.
	 */
done:
	/* Rearm the interrupt clock if we have one. */
	if (ISSET(cq->cq_flags, CL_CPU_INTRCLOCK))
		intrclock_rearm(&cq->cq_intrclock, cq->cq_next - now);

	/* Update our stats. */
	ogen = cq->cq_gen;
	cq->cq_gen = 0;
	membar_producer();
	cq->cq_stat.cs_dispatched += now - start;
	if (run > 0) {
		cq->cq_stat.cs_lateness += lateness;
		cq->cq_stat.cs_prompt++;
		cq->cq_stat.cs_run += run;
	} else {
		cq->cq_stat.cs_early++;
		cq->cq_stat.cs_earliness += cq->cq_next - now;
	}
	membar_producer();
	cq->cq_gen = MAX(1, ogen + 1);

	if (cq->cq_dispatch != 1)
		panic("%s: unexpected value: %u", __func__, cq->cq_dispatch);
	cq->cq_dispatch = 0;

	return run > 0;
}

/*
 * Compute the period (avg) for the given frequency and a range around
 * that period.  The range is [min + 1, min + mask].  The range is used
 * during dispatch to choose a new pseudorandom deadline for each statclock
 * event.
 */
void
clockintr_statvar_init(int freq, uint32_t *avg, uint32_t *min, uint32_t *mask)
{
	uint32_t half_avg, var;

	KASSERT(!ISSET(clockintr_flags, CL_INIT | CL_STATCLOCK));
	KASSERT(freq > 0 && freq <= 1000000000);

	/* Compute avg, the average period. */
	*avg = 1000000000 / freq;

	/* Find var, the largest power of two such that var <= avg / 2. */
	half_avg = *avg / 2;
	for (var = 1U << 31; var > half_avg; var /= 2)
		continue;

	/* Using avg and var, set a lower bound for the range. */
	*min = *avg - (var / 2);

	/* The mask is just (var - 1). */
	*mask = var - 1;
}
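/*
 * Worked example with an illustrative stathz of 128: avg = 7812500 ns,
 * half_avg = 3906250, so var = 2097152 (2^21), min = 7812500 - 1048576 =
 * 6763924, and mask = 2097151.  Randomized statclock periods therefore
 * fall in [6763925, 8861075] ns and average out to avg.
 */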
/*
 * Update the statclock_* variables according to the given frequency.
 * Must only be called after clockintr_statvar_init() initializes both
 * the stat_* and prof_* variables.
 */
void
clockintr_setstatclockrate(int freq)
{
	u_int ogen;

	KASSERT(ISSET(clockintr_flags, CL_STATCLOCK));

	mtx_enter(&clockintr_mtx);

	ogen = statclock_gen;
	statclock_gen = 0;
	membar_producer();
	if (freq == stathz) {
		statclock_avg = stat_avg;
		statclock_min = stat_min;
		statclock_mask = stat_mask;
	} else if (freq == profhz) {
		statclock_avg = prof_avg;
		statclock_min = prof_min;
		statclock_mask = prof_mask;
	} else {
		panic("%s: frequency is not stathz (%d) or profhz (%d): %d",
		    __func__, stathz, profhz, freq);
	}
	membar_producer();
	statclock_gen = MAX(1, ogen + 1);

	mtx_leave(&clockintr_mtx);
}

/*
 * Advance *next in increments of period until it exceeds now.
 * Returns the number of increments *next was advanced.
 *
 * We check the common cases first to avoid division if possible.
 * This does no overflow checking.
 */
uint64_t
nsec_advance(uint64_t *next, uint64_t period, uint64_t now)
{
	uint64_t elapsed;

	if (now < *next)
		return 0;

	if (now < *next + period) {
		*next += period;
		return 1;
	}

	elapsed = (now - *next) / period + 1;
	*next += period * elapsed;
	return elapsed;
}

int
sysctl_clockintr(int *name, u_int namelen, void *oldp, size_t *oldlenp,
    void *newp, size_t newlen)
{
	struct clockintr_stat sum = { 0 }, tmp;
	struct clockintr_queue *cq;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
	u_int gen;

	if (namelen != 1)
		return ENOTDIR;

	switch (name[0]) {
	case KERN_CLOCKINTR_STATS:
		CPU_INFO_FOREACH(cii, ci) {
			cq = &ci->ci_queue;
			if (!ISSET(cq->cq_flags, CL_CPU_INIT))
				continue;
			do {
				gen = cq->cq_gen;
				membar_consumer();
				tmp = cq->cq_stat;
				membar_consumer();
			} while (gen == 0 || gen != cq->cq_gen);
			sum.cs_dispatched += tmp.cs_dispatched;
			sum.cs_early += tmp.cs_early;
			sum.cs_earliness += tmp.cs_earliness;
			sum.cs_lateness += tmp.cs_lateness;
			sum.cs_prompt += tmp.cs_prompt;
			sum.cs_run += tmp.cs_run;
		}
		return sysctl_rdstruct(oldp, oldlenp, newp, &sum, sizeof sum);
	default:
		break;
	}

	return EINVAL;
}

#ifdef DDB

#include <machine/db_machdep.h>

#include <ddb/db_interface.h>
#include <ddb/db_output.h>
#include <ddb/db_sym.h>

void db_show_clockintr(uint64_t, u_int, const char *);
void db_show_clockintr_cpu(struct cpu_info *);

void
db_show_all_clockintr(db_expr_t addr, int haddr, db_expr_t count, char *modif)
{
	struct timespec now;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;

	nanouptime(&now);
	db_printf("%20s\n", "UPTIME");
	db_printf("%10lld.%09ld\n", now.tv_sec, now.tv_nsec);
	db_printf("\n");
	db_printf("%20s %3s %s\n", "EXPIRATION", "CPU", "NAME");
	CPU_INFO_FOREACH(cii, ci) {
		if (ISSET(ci->ci_queue.cq_flags, CL_CPU_INIT))
			db_show_clockintr_cpu(ci);
	}
}

void
db_show_clockintr_cpu(struct cpu_info *ci)
{
	struct clockintr_queue *cq = &ci->ci_queue;
	u_int cpu = CPU_INFO_UNIT(ci);

	db_show_clockintr(cq->cq_next_hardclock, cpu, "hardclock");
	db_show_clockintr(cq->cq_next_statclock, cpu, "statclock");
	if (ISSET(clockintr_flags, CL_SCHEDCLOCK))
		db_show_clockintr(cq->cq_next_schedclock, cpu, "schedclock");
}

void
db_show_clockintr(uint64_t expiration, u_int cpu, const char *name)
{
	struct timespec ts;

	NSEC_TO_TIMESPEC(expiration, &ts);
	db_printf("%10lld.%09ld %3u %s\n", ts.tv_sec, ts.tv_nsec, cpu, name);
}

#endif /* DDB */
#endif /* __HAVE_CLOCKINTR */
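/*
 * Userland usage sketch for sysctl_clockintr() above (illustrative only;
 * it assumes the stats leaf is reachable as kern.clockintr.stats via a
 * KERN_CLOCKINTR mib component under CTL_KERN -- check sys/sysctl.h for
 * the actual node names):
 *
 *	int mib[3] = { CTL_KERN, KERN_CLOCKINTR, KERN_CLOCKINTR_STATS };
 *	struct clockintr_stat cs;
 *	size_t len = sizeof(cs);
 *
 *	if (sysctl(mib, 3, &cs, &len, NULL, 0) == -1)
 *		err(1, "sysctl");
 *	printf("dispatched: %llu ns\n",
 *	    (unsigned long long)cs.cs_dispatched);
 */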