/* $OpenBSD: kern_clockintr.c,v 1.71 2024/11/07 16:02:29 miod Exp $ */
/*
 * Copyright (c) 2003 Dale Rahn
 * Copyright (c) 2020 Mark Kettenis
 * Copyright (c) 2020-2024 Scott Cheloha
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * The original header names were lost in extraction; the list below is
 * reconstructed from the symbols this file uses.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/clockintr.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/time.h>

void clockintr_cancel_locked(struct clockintr *);
void clockintr_hardclock(struct clockrequest *, void *, void *);
void clockintr_schedule_locked(struct clockintr *, uint64_t);
void clockqueue_intrclock_install(struct clockqueue *,
    const struct intrclock *);
void clockqueue_intrclock_reprogram(struct clockqueue *);
uint64_t clockqueue_next(const struct clockqueue *);
void clockqueue_pend_delete(struct clockqueue *, struct clockintr *);
void clockqueue_pend_insert(struct clockqueue *, struct clockintr *,
    uint64_t);
void intrclock_rearm(struct intrclock *, uint64_t);
void intrclock_trigger(struct intrclock *);
uint64_t nsec_advance(uint64_t *, uint64_t, uint64_t);

/*
 * Ready the calling CPU for clockintr_dispatch(). If this is our
 * first time here, install the intrclock, if any, and set necessary
 * flags. Advance the schedule as needed.
 */
void
clockintr_cpu_init(const struct intrclock *ic)
{
	uint64_t multiplier = 0;
	struct cpu_info *ci = curcpu();
	struct clockqueue *cq = &ci->ci_queue;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int reset_cq_intrclock = 0;

	if (ic != NULL)
		clockqueue_intrclock_install(cq, ic);

	/* TODO: Remove this from struct clockqueue. */
	if (CPU_IS_PRIMARY(ci) && cq->cq_hardclock.cl_expiration == 0) {
		clockintr_bind(&cq->cq_hardclock, ci, clockintr_hardclock,
		    NULL);
	}

	/*
	 * Mask CQ_INTRCLOCK while we're advancing the internal clock
	 * interrupts. We don't want the intrclock to fire until this
	 * thread reaches clockintr_trigger().
	 */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		CLR(cq->cq_flags, CQ_INTRCLOCK);
		reset_cq_intrclock = 1;
	}

	/*
	 * Until we understand scheduler lock contention better, stagger
	 * the hardclock and statclock so they don't all happen at once.
	 * If we have no intrclock it doesn't matter, we have no control
	 * anyway. The primary CPU's starting offset is always zero, so
	 * leave the multiplier zero.
	 */
	if (!CPU_IS_PRIMARY(ci) && reset_cq_intrclock)
		multiplier = CPU_INFO_UNIT(ci);

	/*
	 * The first time we do this, the primary CPU cannot skip any
	 * hardclocks. We can skip hardclocks on subsequent calls because
	 * the global tick value is advanced during inittodr(9) on our
	 * behalf.
	 */
	if (CPU_IS_PRIMARY(ci)) {
		if (cq->cq_hardclock.cl_expiration == 0)
			clockintr_schedule(&cq->cq_hardclock, 0);
		else
			clockintr_advance(&cq->cq_hardclock, hardclock_period);
	}

	/*
	 * We can always advance the statclock. There is no reason to
	 * stagger a randomized statclock.
	 */
	if (!statclock_is_randomized) {
		if (spc->spc_statclock.cl_expiration == 0) {
			clockintr_stagger(&spc->spc_statclock, statclock_avg,
			    multiplier, MAXCPUS);
		}
	}
	clockintr_advance(&spc->spc_statclock, statclock_avg);

	/*
	 * XXX Need to find a better place to do this. We can't do it in
	 * sched_init_cpu() because initclocks() runs after it.
	 */
	if (spc->spc_itimer.cl_expiration == 0) {
		clockintr_stagger(&spc->spc_itimer, hardclock_period,
		    multiplier, MAXCPUS);
	}
	if (spc->spc_profclock.cl_expiration == 0) {
		clockintr_stagger(&spc->spc_profclock, profclock_period,
		    multiplier, MAXCPUS);
	}
	if (spc->spc_roundrobin.cl_expiration == 0) {
		clockintr_stagger(&spc->spc_roundrobin, hardclock_period,
		    multiplier, MAXCPUS);
	}
	clockintr_advance(&spc->spc_roundrobin, roundrobin_period);

	if (reset_cq_intrclock)
		SET(cq->cq_flags, CQ_INTRCLOCK);
}
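
/*
 * For illustration only: a minimal sketch of how a machine-dependent
 * one-shot timer driver hands its clock to this layer.  The mytimer_*
 * names are hypothetical; real drivers follow this pattern from their
 * cpu_startclock() path with their own rearm/trigger routines and
 * cookie.
 *
 *	const struct intrclock mytimer_intrclock = {
 *		.ic_cookie = NULL,		(driver cookie, if any)
 *		.ic_rearm = mytimer_rearm,	(hypothetical)
 *		.ic_trigger = mytimer_trigger,	(hypothetical)
 *	};
 *
 *	void
 *	mytimer_startclock(void)
 *	{
 *		clockintr_cpu_init(&mytimer_intrclock);
 *		clockintr_trigger();
 *	}
 */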

/*
 * If we have an intrclock, trigger it to start the dispatch cycle.
 */
void
clockintr_trigger(void)
{
	struct clockqueue *cq = &curcpu()->ci_queue;

	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	if (ISSET(cq->cq_flags, CQ_INTRCLOCK))
		intrclock_trigger(&cq->cq_intrclock);
}

/*
 * Run all expired events scheduled on the calling CPU.
 */
int
clockintr_dispatch(void *frame)
{
	uint64_t lateness, run = 0, start;
	struct cpu_info *ci = curcpu();
	struct clockintr *cl;
	struct clockqueue *cq = &ci->ci_queue;
	struct clockrequest *request = &cq->cq_request;
	void *arg;
	void (*func)(struct clockrequest *, void *, void *);
	uint32_t ogen;

	if (cq->cq_dispatch != 0)
		panic("%s: recursive dispatch", __func__);
	cq->cq_dispatch = 1;

	splassert(IPL_CLOCK);
	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	mtx_enter(&cq->cq_mtx);

	/*
	 * If nothing is scheduled or we arrived too early, we have
	 * nothing to do.
	 */
	start = nsecuptime();
	cq->cq_uptime = start;
	if (TAILQ_EMPTY(&cq->cq_pend))
		goto stats;
	if (cq->cq_uptime < clockqueue_next(cq))
		goto rearm;
	lateness = start - clockqueue_next(cq);

	/*
	 * Dispatch expired events.
	 */
	for (;;) {
		cl = TAILQ_FIRST(&cq->cq_pend);
		if (cl == NULL)
			break;
		if (cq->cq_uptime < cl->cl_expiration) {
			/* Double-check the time before giving up. */
			cq->cq_uptime = nsecuptime();
			if (cq->cq_uptime < cl->cl_expiration)
				break;
		}

		/*
		 * This clockintr has expired. Execute it.
		 */
		clockqueue_pend_delete(cq, cl);
		request->cr_expiration = cl->cl_expiration;
		arg = cl->cl_arg;
		func = cl->cl_func;
		cq->cq_running = cl;
		mtx_leave(&cq->cq_mtx);

		func(request, frame, arg);

		mtx_enter(&cq->cq_mtx);
		cq->cq_running = NULL;
		if (ISSET(cq->cq_flags, CQ_IGNORE_REQUEST)) {
			CLR(cq->cq_flags, CQ_IGNORE_REQUEST);
			CLR(request->cr_flags, CR_RESCHEDULE);
		}
		if (ISSET(request->cr_flags, CR_RESCHEDULE)) {
			CLR(request->cr_flags, CR_RESCHEDULE);
			clockqueue_pend_insert(cq, cl, request->cr_expiration);
		}
		if (ISSET(cq->cq_flags, CQ_NEED_WAKEUP)) {
			CLR(cq->cq_flags, CQ_NEED_WAKEUP);
			mtx_leave(&cq->cq_mtx);
			wakeup(&cq->cq_running);
			mtx_enter(&cq->cq_mtx);
		}
		run++;
	}

	/*
	 * Dispatch complete.
	 */
rearm:
	/* Rearm the interrupt clock if we have one. */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (!TAILQ_EMPTY(&cq->cq_pend)) {
			intrclock_rearm(&cq->cq_intrclock,
			    clockqueue_next(cq) - cq->cq_uptime);
		}
	}
stats:
	/* Update our stats. */
	ogen = cq->cq_gen;
	cq->cq_gen = 0;
	membar_producer();
	cq->cq_stat.cs_dispatched += cq->cq_uptime - start;
	if (run > 0) {
		cq->cq_stat.cs_lateness += lateness;
		cq->cq_stat.cs_prompt++;
		cq->cq_stat.cs_run += run;
	} else if (!TAILQ_EMPTY(&cq->cq_pend)) {
		cq->cq_stat.cs_early++;
		cq->cq_stat.cs_earliness +=
		    clockqueue_next(cq) - cq->cq_uptime;
	} else
		cq->cq_stat.cs_spurious++;
	membar_producer();
	cq->cq_gen = MAX(1, ogen + 1);

	mtx_leave(&cq->cq_mtx);

	if (cq->cq_dispatch != 1)
		panic("%s: unexpected value: %u", __func__, cq->cq_dispatch);
	cq->cq_dispatch = 0;

	return run > 0;
}
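
/*
 * For illustration only: a sketch (with a hypothetical mytimer_intr
 * name) of how a machine-dependent clock interrupt handler, running at
 * IPL_CLOCK, would feed its interrupt frame to clockintr_dispatch()
 * above:
 *
 *	int
 *	mytimer_intr(void *frame)
 *	{
 *		(acknowledge the hardware interrupt here)
 *		return clockintr_dispatch(frame);
 *	}
 *
 * The return value reports whether any expired events actually ran.
 */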

uint64_t
clockintr_advance(struct clockintr *cl, uint64_t period)
{
	uint64_t count, expiration;
	struct clockqueue *cq = cl->cl_queue;

	mtx_enter(&cq->cq_mtx);
	expiration = cl->cl_expiration;
	count = nsec_advance(&expiration, period, nsecuptime());
	clockintr_schedule_locked(cl, expiration);
	mtx_leave(&cq->cq_mtx);

	return count;
}

uint64_t
clockrequest_advance(struct clockrequest *cr, uint64_t period)
{
	struct clockqueue *cq = cr->cr_queue;

	KASSERT(cr == &cq->cq_request);

	SET(cr->cr_flags, CR_RESCHEDULE);
	return nsec_advance(&cr->cr_expiration, period, cq->cq_uptime);
}

uint64_t
clockrequest_advance_random(struct clockrequest *cr, uint64_t min,
    uint32_t mask)
{
	uint64_t count = 0;
	struct clockqueue *cq = cr->cr_queue;
	uint32_t off;

	KASSERT(cr == &cq->cq_request);

	while (cr->cr_expiration <= cq->cq_uptime) {
		while ((off = (random() & mask)) == 0)
			continue;
		cr->cr_expiration += min + off;
		count++;
	}
	SET(cr->cr_flags, CR_RESCHEDULE);
	return count;
}

void
clockintr_cancel(struct clockintr *cl)
{
	struct clockqueue *cq = cl->cl_queue;

	mtx_enter(&cq->cq_mtx);
	clockintr_cancel_locked(cl);
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_cancel_locked(struct clockintr *cl)
{
	struct clockqueue *cq = cl->cl_queue;
	int was_next;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);

	if (ISSET(cl->cl_flags, CLST_PENDING)) {
		was_next = cl == TAILQ_FIRST(&cq->cq_pend);
		clockqueue_pend_delete(cq, cl);
		if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
			if (was_next && !TAILQ_EMPTY(&cq->cq_pend)) {
				if (cq == &curcpu()->ci_queue)
					clockqueue_intrclock_reprogram(cq);
			}
		}
	}
	if (cl == cq->cq_running)
		SET(cq->cq_flags, CQ_IGNORE_REQUEST);
}

void
clockintr_bind(struct clockintr *cl, struct cpu_info *ci,
    void (*func)(struct clockrequest *, void *, void *), void *arg)
{
	struct clockqueue *cq = &ci->ci_queue;

	splassert(IPL_NONE);
	KASSERT(cl->cl_queue == NULL);

	mtx_enter(&cq->cq_mtx);
	cl->cl_arg = arg;
	cl->cl_func = func;
	cl->cl_queue = cq;
	TAILQ_INSERT_TAIL(&cq->cq_all, cl, cl_alink);
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_unbind(struct clockintr *cl, uint32_t flags)
{
	struct clockqueue *cq = cl->cl_queue;

	KASSERT(!ISSET(flags, ~CL_FLAG_MASK));

	mtx_enter(&cq->cq_mtx);

	clockintr_cancel_locked(cl);

	cl->cl_arg = NULL;
	cl->cl_func = NULL;
	cl->cl_queue = NULL;
	TAILQ_REMOVE(&cq->cq_all, cl, cl_alink);

	if (ISSET(flags, CL_BARRIER) && cl == cq->cq_running) {
		SET(cq->cq_flags, CQ_NEED_WAKEUP);
		msleep_nsec(&cq->cq_running, &cq->cq_mtx, PWAIT | PNORELOCK,
		    "clkbar", INFSLP);
	} else
		mtx_leave(&cq->cq_mtx);
}

void
clockintr_schedule(struct clockintr *cl, uint64_t expiration)
{
	struct clockqueue *cq = cl->cl_queue;

	mtx_enter(&cq->cq_mtx);
	clockintr_schedule_locked(cl, expiration);
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_schedule_locked(struct clockintr *cl, uint64_t expiration)
{
	struct clockqueue *cq = cl->cl_queue;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);

	if (ISSET(cl->cl_flags, CLST_PENDING))
		clockqueue_pend_delete(cq, cl);
	clockqueue_pend_insert(cq, cl, expiration);

	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (cl == TAILQ_FIRST(&cq->cq_pend)) {
			if (cq == &curcpu()->ci_queue)
				clockqueue_intrclock_reprogram(cq);
		}
	}
	if (cl == cq->cq_running)
		SET(cq->cq_flags, CQ_IGNORE_REQUEST);
}
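
/*
 * Offset cl's first expiration to (period / denom) * numer so that the
 * same periodic event bound on different CPUs does not fire at the same
 * instant.  For illustration: with a 10000000 ns period, numer = 3, and
 * denom = 64, the initial expiration is 156250 * 3 = 468750 ns.  The
 * clockintr must not already be pending.
 */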
void
clockintr_stagger(struct clockintr *cl, uint64_t period, uint32_t numer,
    uint32_t denom)
{
	struct clockqueue *cq = cl->cl_queue;

	KASSERT(numer < denom);

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING))
		panic("%s: clock interrupt pending", __func__);
	cl->cl_expiration = period / denom * numer;
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_hardclock(struct clockrequest *cr, void *frame, void *arg)
{
	uint64_t count, i;

	count = clockrequest_advance(cr, hardclock_period);
	for (i = 0; i < count; i++)
		hardclock(frame);
}

void
clockqueue_init(struct clockqueue *cq)
{
	if (ISSET(cq->cq_flags, CQ_INIT))
		return;

	cq->cq_request.cr_queue = cq;
	mtx_init(&cq->cq_mtx, IPL_CLOCK);
	TAILQ_INIT(&cq->cq_all);
	TAILQ_INIT(&cq->cq_pend);
	cq->cq_gen = 1;
	SET(cq->cq_flags, CQ_INIT);
}

void
clockqueue_intrclock_install(struct clockqueue *cq,
    const struct intrclock *ic)
{
	mtx_enter(&cq->cq_mtx);
	if (!ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		cq->cq_intrclock = *ic;
		SET(cq->cq_flags, CQ_INTRCLOCK);
	}
	mtx_leave(&cq->cq_mtx);
}

uint64_t
clockqueue_next(const struct clockqueue *cq)
{
	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	return TAILQ_FIRST(&cq->cq_pend)->cl_expiration;
}

void
clockqueue_pend_delete(struct clockqueue *cq, struct clockintr *cl)
{
	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cl->cl_flags, CLST_PENDING));

	TAILQ_REMOVE(&cq->cq_pend, cl, cl_plink);
	CLR(cl->cl_flags, CLST_PENDING);
}

void
clockqueue_pend_insert(struct clockqueue *cq, struct clockintr *cl,
    uint64_t expiration)
{
	struct clockintr *elm;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(!ISSET(cl->cl_flags, CLST_PENDING));

	cl->cl_expiration = expiration;
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink) {
		if (cl->cl_expiration < elm->cl_expiration)
			break;
	}
	if (elm == NULL)
		TAILQ_INSERT_TAIL(&cq->cq_pend, cl, cl_plink);
	else
		TAILQ_INSERT_BEFORE(elm, cl, cl_plink);
	SET(cl->cl_flags, CLST_PENDING);
}

void
clockqueue_intrclock_reprogram(struct clockqueue *cq)
{
	uint64_t exp, now;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cq->cq_flags, CQ_INTRCLOCK));

	exp = clockqueue_next(cq);
	now = nsecuptime();
	if (now < exp)
		intrclock_rearm(&cq->cq_intrclock, exp - now);
	else
		intrclock_trigger(&cq->cq_intrclock);
}

void
intrclock_rearm(struct intrclock *ic, uint64_t nsecs)
{
	ic->ic_rearm(ic->ic_cookie, nsecs);
}

void
intrclock_trigger(struct intrclock *ic)
{
	ic->ic_trigger(ic->ic_cookie);
}

/*
 * Advance *next in increments of period until it exceeds now.
 * Returns the number of increments *next was advanced.
 *
 * We check the common cases first to avoid division if possible.
 * This does no overflow checking.
 */
uint64_t
nsec_advance(uint64_t *next, uint64_t period, uint64_t now)
{
	uint64_t elapsed;

	if (now < *next)
		return 0;

	if (now < *next + period) {
		*next += period;
		return 1;
	}

	elapsed = (now - *next) / period + 1;
	*next += period * elapsed;
	return elapsed;
}
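
/*
 * Worked example (illustration only): with *next = 100, period = 30,
 * and now = 175, neither fast path applies, so nsec_advance() computes
 * elapsed = (175 - 100) / 30 + 1 = 3 and leaves *next = 100 + 3 * 30 =
 * 190, the first increment of period past now.
 */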

int
sysctl_clockintr(int *name, u_int namelen, void *oldp, size_t *oldlenp,
    void *newp, size_t newlen)
{
	struct clockintr_stat sum, tmp;
	struct clockqueue *cq;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
	uint32_t gen;

	if (namelen != 1)
		return ENOTDIR;
	switch (name[0]) {
	case KERN_CLOCKINTR_STATS:
		memset(&sum, 0, sizeof sum);
		CPU_INFO_FOREACH(cii, ci) {
			cq = &ci->ci_queue;
			if (!ISSET(cq->cq_flags, CQ_INIT))
				continue;
			do {
				gen = cq->cq_gen;
				membar_consumer();
				tmp = cq->cq_stat;
				membar_consumer();
			} while (gen == 0 || gen != cq->cq_gen);
			sum.cs_dispatched += tmp.cs_dispatched;
			sum.cs_early += tmp.cs_early;
			sum.cs_earliness += tmp.cs_earliness;
			sum.cs_lateness += tmp.cs_lateness;
			sum.cs_prompt += tmp.cs_prompt;
			sum.cs_run += tmp.cs_run;
			sum.cs_spurious += tmp.cs_spurious;
		}
		return sysctl_rdstruct(oldp, oldlenp, newp, &sum, sizeof sum);
	default:
		break;
	}
	return EINVAL;
}

#ifdef DDB

/*
 * The original DDB header names were lost in extraction; the list below
 * is reconstructed from the symbols used.
 */
#include <machine/db_machdep.h>

#include <ddb/db_interface.h>
#include <ddb/db_output.h>
#include <ddb/db_sym.h>

void db_show_clockintr(const struct clockintr *, const char *, u_int);
void db_show_clockintr_cpu(struct cpu_info *);

void
db_show_all_clockintr(db_expr_t addr, int haddr, db_expr_t count, char *modif)
{
	struct timespec now;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
	int width = sizeof(long) * 2 + 2;	/* +2 for "0x" prefix */

	nanouptime(&now);
	db_printf("%20s\n", "UPTIME");
	db_printf("%10lld.%09ld\n", now.tv_sec, now.tv_nsec);
	db_printf("\n");
	db_printf("%20s %5s %3s %*s %s\n",
	    "EXPIRATION", "STATE", "CPU", width, "ARG", "NAME");
	CPU_INFO_FOREACH(cii, ci) {
		if (ISSET(ci->ci_queue.cq_flags, CQ_INIT))
			db_show_clockintr_cpu(ci);
	}
}

void
db_show_clockintr_cpu(struct cpu_info *ci)
{
	struct clockintr *elm;
	struct clockqueue *cq = &ci->ci_queue;
	u_int cpu = CPU_INFO_UNIT(ci);

	if (cq->cq_running != NULL)
		db_show_clockintr(cq->cq_running, "run", cpu);
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink)
		db_show_clockintr(elm, "pend", cpu);
	TAILQ_FOREACH(elm, &cq->cq_all, cl_alink) {
		if (!ISSET(elm->cl_flags, CLST_PENDING))
			db_show_clockintr(elm, "idle", cpu);
	}
}

void
db_show_clockintr(const struct clockintr *cl, const char *state, u_int cpu)
{
	struct timespec ts;
	const char *name;
	db_expr_t offset;
	int width = sizeof(long) * 2;

	NSEC_TO_TIMESPEC(cl->cl_expiration, &ts);
	db_find_sym_and_offset((vaddr_t)cl->cl_func, &name, &offset);
	if (name == NULL)
		name = "?";

	db_printf("%10lld.%09ld %5s %3u 0x%0*lx %s\n",
	    ts.tv_sec, ts.tv_nsec, state, cpu, width,
	    (unsigned long)cl->cl_arg, name);
}

#endif /* DDB */