Diffstat (limited to 'sys/kern/kern_sched.c')
-rw-r--r-- | sys/kern/kern_sched.c | 379
1 file changed, 344 insertions, 35 deletions
diff --git a/sys/kern/kern_sched.c b/sys/kern/kern_sched.c
index 79bc376314a..1b921159985 100644
--- a/sys/kern/kern_sched.c
+++ b/sys/kern/kern_sched.c
@@ -1,6 +1,6 @@
-/*	$OpenBSD: kern_sched.c,v 1.8 2008/11/06 19:49:13 deraadt Exp $	*/
+/*	$OpenBSD: kern_sched.c,v 1.9 2009/03/23 13:25:11 art Exp $	*/
 /*
- * Copyright (c) 2007 Artur Grabowski <art@openbsd.org>
+ * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -24,12 +24,27 @@
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/mutex.h>
+#include <machine/atomic.h>
 
 #include <uvm/uvm_extern.h>
 
+#include <sys/malloc.h>
+
+
 void sched_kthreads_create(void *);
 void sched_idle(void *);
 
+int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
+struct proc *sched_steal_proc(struct cpu_info *);
+
+/*
+ * To help choosing which cpu should run which process we keep track
+ * of cpus which are currently idle and which cpus have processes
+ * queued.
+ */
+struct cpuset sched_idle_cpus;
+struct cpuset sched_queued_cpus;
+
 /*
  * A few notes about cpu_switchto that is implemented in MD code.
  *
@@ -55,12 +70,22 @@ void
 sched_init_cpu(struct cpu_info *ci)
 {
 	struct schedstate_percpu *spc = &ci->ci_schedstate;
+	int i;
+
+	for (i = 0; i < SCHED_NQS; i++)
+		TAILQ_INIT(&spc->spc_qs[i]);
 
 	spc->spc_idleproc = NULL;
 
 	kthread_create_deferred(sched_kthreads_create, ci);
 
 	LIST_INIT(&spc->spc_deadproc);
+
+	/*
+	 * Slight hack here until the cpuset code handles cpu_info
+	 * structures.
+	 */
+	cpuset_init_cpu(ci);
 }
 
 void
@@ -79,27 +104,31 @@ sched_kthreads_create(void *v)
 void
 sched_idle(void *v)
 {
+	struct schedstate_percpu *spc;
 	struct proc *p = curproc;
 	struct cpu_info *ci = v;
 	int s;
 
 	KERNEL_PROC_UNLOCK(p);
 
+	spc = &ci->ci_schedstate;
+
 	/*
 	 * First time we enter here, we're not supposed to idle,
 	 * just go away for a while.
	 */
 	SCHED_LOCK(s);
+	cpuset_add(&sched_idle_cpus, ci);
 	p->p_stat = SSLEEP;
 	mi_switch();
+	cpuset_del(&sched_idle_cpus, ci);
 	SCHED_UNLOCK(s);
 
-	while (1) {
-		KASSERT(ci == curcpu());
-		KASSERT(curproc == ci->ci_schedstate.spc_idleproc);
+	KASSERT(ci == curcpu());
+	KASSERT(curproc == spc->spc_idleproc);
 
-		while (!sched_is_idle()) {
-			struct schedstate_percpu *spc = &ci->ci_schedstate;
+	while (1) {
+		while (!curcpu_is_idle()) {
 			struct proc *dead;
 
 			SCHED_LOCK(s);
@@ -115,10 +144,12 @@ sched_idle(void *v)
 
 		splassert(IPL_NONE);
 
+		cpuset_add(&sched_idle_cpus, ci);
 		cpu_idle_enter();
-		while (sched_is_idle())
+		while (spc->spc_whichqs == 0)
 			cpu_idle_cycle();
 		cpu_idle_leave();
+		cpuset_del(&sched_idle_cpus, ci);
 	}
 }
 
@@ -160,22 +191,10 @@ sched_exit(struct proc *p)
 
 /*
  * Run queue management.
- *
- * The run queue management is just like before, except that it's with
- * a bit more modern queue handling.
  */
-
-TAILQ_HEAD(prochead, proc) sched_qs[NQS];
-volatile int sched_whichqs;
-
 void
 sched_init_runqueues(void)
 {
-	int i;
-
-	for (i = 0; i < NQS; i++)
-		TAILQ_INIT(&sched_qs[i]);
-
 #ifdef MULTIPROCESSOR
 	__mp_lock_init(&sched_lock);
 #endif
@@ -184,37 +203,56 @@ sched_init_runqueues(void)
 void
 setrunqueue(struct proc *p)
 {
+	struct schedstate_percpu *spc;
 	int queue = p->p_priority >> 2;
 
 	SCHED_ASSERT_LOCKED();
+	sched_choosecpu(p);
+	spc = &p->p_cpu->ci_schedstate;
+	spc->spc_nrun++;
+
+	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
+	spc->spc_whichqs |= (1 << queue);
+	cpuset_add(&sched_queued_cpus, p->p_cpu);
 
-	TAILQ_INSERT_TAIL(&sched_qs[queue], p, p_runq);
-	sched_whichqs |= (1 << queue);
+	if (p->p_cpu != curcpu())
+		cpu_unidle(p->p_cpu);
 }
 
 void
 remrunqueue(struct proc *p)
 {
+	struct schedstate_percpu *spc;
 	int queue = p->p_priority >> 2;
 
 	SCHED_ASSERT_LOCKED();
-
-	TAILQ_REMOVE(&sched_qs[queue], p, p_runq);
-	if (TAILQ_EMPTY(&sched_qs[queue]))
-		sched_whichqs &= ~(1 << queue);
+	spc = &p->p_cpu->ci_schedstate;
+	spc->spc_nrun--;
+
+	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
+	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
+		spc->spc_whichqs &= ~(1 << queue);
+		if (spc->spc_whichqs == 0)
+			cpuset_del(&sched_queued_cpus, p->p_cpu);
+	}
 }
 
 struct proc *
 sched_chooseproc(void)
 {
+	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
 	struct proc *p;
 	int queue;
 
 	SCHED_ASSERT_LOCKED();
 
 again:
-	if (sched_is_idle()) {
-		p = curcpu()->ci_schedstate.spc_idleproc;
+	if (spc->spc_whichqs) {
+		queue = ffs(spc->spc_whichqs) - 1;
+		p = TAILQ_FIRST(&spc->spc_qs[queue]);
+		remrunqueue(p);
+	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
+		p = spc->spc_idleproc;
 		if (p == NULL) {
 			int s;
 			/*
@@ -232,13 +270,284 @@ again:
 		}
 		KASSERT(p);
 		p->p_stat = SRUN;
-	} else {
-		queue = ffs(sched_whichqs) - 1;
-		p = TAILQ_FIRST(&sched_qs[queue]);
-		TAILQ_REMOVE(&sched_qs[queue], p, p_runq);
-		if (TAILQ_EMPTY(&sched_qs[queue]))
-			sched_whichqs &= ~(1 << queue);
-	}
+	}
 
 	return (p);
 }
+
+uint64_t sched_nmigrations;
+uint64_t sched_noidle;
+uint64_t sched_stolen;
+
+uint64_t sched_choose;
+uint64_t sched_wasidle;
+uint64_t sched_nomigrations;
+
+void
+sched_choosecpu(struct proc *p)
+{
+	struct cpu_info *choice = NULL;
+	int last_cost = INT_MAX;
+	struct cpu_info *ci;
+	struct cpuset set;
+
+	sched_choose++;
+
+	/*
+	 * The simplest case. Our cpu of choice was idle. This happens
+	 * when we were sleeping and something woke us up.
+	 *
+	 * We also need to check sched_queued_cpus to make sure that
+	 * we're not thundering herding one cpu that hasn't managed to
+	 * get out of the idle loop yet.
+	 */
+	if (p->p_cpu && cpuset_isset(&sched_idle_cpus, p->p_cpu) &&
+	    !cpuset_isset(&sched_queued_cpus, p->p_cpu)) {
+		sched_wasidle++;
+		return;
+	}
+
+#if 0
+
+	/* Most likely, this is broken. don't do it. */
+	/*
+	 * Second case. (shouldn't be necessary in the future)
+	 * If our cpu is not idle, but has nothing else queued (which
+	 * means that we are curproc and roundrobin asks us to reschedule).
+	 */
+	if (p->p_cpu && p->p_cpu->ci_schedstate.spc_nrun == 0)
+		return;
+#endif
+
+	/*
+	 * Look at all cpus that are currently idle. Pick the cheapest of
+	 * those.
+	 */
+	cpuset_copy(&set, &sched_idle_cpus);
+	while ((ci = cpuset_first(&set)) != NULL) {
+		int cost = sched_proc_to_cpu_cost(ci, p);
+
+		if (choice == NULL || cost < last_cost) {
+			choice = ci;
+			last_cost = cost;
+		}
+		cpuset_del(&set, ci);
+	}
+
+	/*
+	 * All cpus are busy. Pick one.
+	 */
+	if (choice == NULL) {
+		CPU_INFO_ITERATOR cii;
+
+		sched_noidle++;
+
+		/*
+		 * Not curproc, pick the cpu with the lowest cost to switch to.
+		 */
+		CPU_INFO_FOREACH(cii, ci) {
+			int cost = sched_proc_to_cpu_cost(ci, p);
+
+			if (choice == NULL || cost < last_cost) {
+				choice = ci;
+				last_cost = cost;
+			}
+		}
+	}
+
+	KASSERT(choice);
+
+	if (p->p_cpu && p->p_cpu != choice)
+		sched_nmigrations++;
+	else if (p->p_cpu != NULL)
+		sched_nomigrations++;
+
+	p->p_cpu = choice;
+}
+
+/*
+ * Attempt to steal a proc from some cpu.
+ */
+struct proc *
+sched_steal_proc(struct cpu_info *self)
+{
+	struct schedstate_percpu *spc;
+	struct proc *best = NULL;
+	int bestcost = INT_MAX;
+	struct cpu_info *ci;
+	struct cpuset set;
+
+	cpuset_copy(&set, &sched_queued_cpus);
+
+	while ((ci = cpuset_first(&set)) != NULL) {
+		struct proc *p;
+		int cost;
+
+		cpuset_del(&set, ci);
+
+		spc = &ci->ci_schedstate;
+
+		p = TAILQ_FIRST(&spc->spc_qs[ffs(spc->spc_whichqs) - 1]);
+		KASSERT(p);
+		cost = sched_proc_to_cpu_cost(self, p);
+
+		if (best == NULL || cost < bestcost) {
+			best = p;
+			bestcost = cost;
+		}
+	}
+	if (best == NULL)
+		return (NULL);
+
+	spc = &best->p_cpu->ci_schedstate;
+	remrunqueue(best);
+	best->p_cpu = self;
+
+	sched_stolen++;
+
+	return (best);
+}
+
+/*
+ * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know).
+ */
+static int
+log2(unsigned int i)
+{
+	int ret = 0;
+
+	while (i >>= 1)
+		ret++;
+
+	return (ret);
+}
+
+/*
+ * Calculate the cost of moving the proc to this cpu.
+ *
+ * What we want is some guesstimate of how much "performance" it will
+ * cost us to move the proc here. Not just for caches and TLBs and NUMA
+ * memory, but also for the proc itself. A highly loaded cpu might not
+ * be the best candidate for this proc since it won't get run.
+ *
+ * Just total guesstimates for now.
+ */
+
+int sched_cost_load = 1;
+int sched_cost_priority = 1;
+int sched_cost_runnable = 3;
+int sched_cost_resident = 1;
+
+int
+sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
+{
+	struct schedstate_percpu *spc;
+	int l2resident = 0;
+	int cost;
+
+	spc = &ci->ci_schedstate;
+
+	cost = 0;
+
+	/*
+	 * First, account for the priority of the proc we want to move.
+	 * More willing to move, the lower the priority of the destination
+	 * and the higher the priority of the proc.
+	 */
+	if (!cpuset_isset(&sched_idle_cpus, ci)) {
+		cost += (p->p_priority - spc->spc_curpriority) *
+		    sched_cost_priority;
+		cost += sched_cost_runnable;
+	}
+	if (cpuset_isset(&sched_queued_cpus, ci)) {
+		cost += spc->spc_nrun * sched_cost_runnable;
+	}
+
+	/*
+	 * Higher load on the destination means we don't want to go there.
+	 */
+	cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);
+
+	/*
+	 * If the proc is on this cpu already, lower the cost by how much
+	 * it has been running and an estimate of its footprint.
+	 */
+	if (p->p_cpu == ci && p->p_slptime == 0) {
+		l2resident =
+		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
+		cost -= l2resident * sched_cost_resident;
+	}
+
+	return (cost);
+}
+
+/*
+ * Functions to manipulate cpu sets.
+ */
+struct cpu_info *cpuset_infos[MAXCPUS];
+static struct cpuset cpuset_all;
+
+void
+cpuset_init_cpu(struct cpu_info *ci)
+{
+	cpuset_add(&cpuset_all, ci);
+	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
+}
+
+void
+cpuset_clear(struct cpuset *cs)
+{
+	memset(cs, 0, sizeof(*cs));
+}
+
+/*
+ * XXX - implement it on SP architectures too
+ */
+#ifndef CPU_INFO_UNIT
+#define CPU_INFO_UNIT 0
+#endif
+
+void
+cpuset_add(struct cpuset *cs, struct cpu_info *ci)
+{
+	unsigned int num = CPU_INFO_UNIT(ci);
+	atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
+}
+
+void
+cpuset_del(struct cpuset *cs, struct cpu_info *ci)
+{
+	unsigned int num = CPU_INFO_UNIT(ci);
+	atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
+}
+
+int
+cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
+{
+	unsigned int num = CPU_INFO_UNIT(ci);
+	return (cs->cs_set[num/32] & (1 << (num % 32)));
+}
+
+void
+cpuset_add_all(struct cpuset *cs)
+{
+	cpuset_copy(cs, &cpuset_all);
+}
+
+void
+cpuset_copy(struct cpuset *to, struct cpuset *from)
+{
+	memcpy(to, from, sizeof(*to));
+}
+
+struct cpu_info *
+cpuset_first(struct cpuset *cs)
+{
+	int i;
+
+	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
+		if (cs->cs_set[i])
+			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);
+
+	return (NULL);
+}
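The setrunqueue() and sched_chooseproc() changes above keep one set of run queues per cpu plus a bitmask, spc_whichqs, with one bit per queue (queue = p_priority >> 2), so the best non-empty queue is found with a single ffs(). Below is a small standalone model of just that bookkeeping, assuming SCHED_NQS is 32; struct runqueues, enqueue() and dequeue_highest() are made-up names, and per-queue counters stand in for the TAILQs of struct proc.

#include <stdio.h>
#include <strings.h>	/* ffs() */

#define SCHED_NQS	32	/* 128 priorities, 4 per queue, as in the kernel */

/* Minimal stand-in for one cpu's run queues: a count per queue plus the bitmask. */
struct runqueues {
	int		nprocs[SCHED_NQS];
	unsigned int	whichqs;	/* bit i set <=> queue i non-empty */
};

static void
enqueue(struct runqueues *rq, int p_priority)
{
	int queue = p_priority >> 2;

	rq->nprocs[queue]++;
	rq->whichqs |= (1U << queue);
}

/* Returns the queue picked, or -1 when nothing is runnable. */
static int
dequeue_highest(struct runqueues *rq)
{
	int queue;

	if (rq->whichqs == 0)
		return (-1);	/* this is where the kernel falls back to spc_idleproc */
	queue = ffs(rq->whichqs) - 1;
	if (--rq->nprocs[queue] == 0)
		rq->whichqs &= ~(1U << queue);
	return (queue);
}

int
main(void)
{
	struct runqueues rq = { { 0 }, 0 };
	int q;

	enqueue(&rq, 50);	/* lands in queue 12 */
	enqueue(&rq, 127);	/* lands in queue 31 */

	while ((q = dequeue_highest(&rq)) != -1)
		printf("picked queue %d\n", q);
	return (0);
}

In the diff itself the same transition to an empty bitmask also drops the cpu from sched_queued_cpus, so sched_steal_proc() never has to look at a cpu with nothing queued.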
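sched_proc_to_cpu_cost() mixes a few guesstimate weights: a non-idle destination pays for the priority gap and for already being busy, queued processes and the load average add cost, and a proc that is still resident on that cpu gets a discount of log2 of its resident page count. The sketch below redoes that arithmetic in userspace with invented numbers; struct cpu_model and cost_for() are hypothetical stand-ins for schedstate_percpu and the kernel function, and FSHIFT = 11 is assumed to match the kernel's fixed-point load average.

#include <stdio.h>

/* Tunables, mirroring the defaults in the diff. */
static int sched_cost_load = 1;
static int sched_cost_priority = 1;
static int sched_cost_runnable = 3;
static int sched_cost_resident = 1;

#define FSHIFT	11	/* assumed fixed-point shift for load averages */

/* Hypothetical stand-in for the fields sched_proc_to_cpu_cost() reads. */
struct cpu_model {
	int idle;		/* member of sched_idle_cpus? */
	int has_queue;		/* member of sched_queued_cpus? */
	int curpriority;	/* spc_curpriority */
	int nrun;		/* spc_nrun */
	int ldavg;		/* spc_ldavg, fixed point */
};

static int
log2u(unsigned int i)
{
	int ret = 0;

	while (i >>= 1)
		ret++;
	return (ret);
}

/* Same arithmetic as the kernel function, applied to the model struct. */
static int
cost_for(struct cpu_model *c, int p_priority, int on_this_cpu, unsigned int resident_pages)
{
	int cost = 0;

	if (!c->idle) {
		cost += (p_priority - c->curpriority) * sched_cost_priority;
		cost += sched_cost_runnable;
	}
	if (c->has_queue)
		cost += c->nrun * sched_cost_runnable;
	cost += (sched_cost_load * c->ldavg) >> FSHIFT;
	if (on_this_cpu)
		cost -= log2u(resident_pages) * sched_cost_resident;
	return (cost);
}

int
main(void)
{
	struct cpu_model busy = { 0, 1, 40, 2, 3 << FSHIFT };	/* loaded cpu */
	struct cpu_model idle = { 1, 0, 0, 0, 0 };		/* idle cpu */

	/* A priority-50 proc with 1024 resident pages that last ran on the busy cpu. */
	printf("stay on busy cpu: %d\n", cost_for(&busy, 50, 1, 1024));
	printf("migrate to idle cpu: %d\n", cost_for(&idle, 50, 0, 1024));
	return (0);
}

With the default weights an idle cpu is nearly always the cheapest target, which is why sched_choosecpu() only scans every cpu once sched_idle_cpus turns out to be empty.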
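The cpuset functions at the end of the diff pack cpu membership into an array of 32-bit words indexed by CPU_INFO_UNIT, and cpuset_first() returns the lowest-numbered member using ffs(). Here is a minimal userspace model of that layout, assuming CPUSET_ASIZE() rounds up to whole 32-bit words; plain cpu numbers replace the struct cpu_info pointers and ordinary bit operations replace atomic_setbits_int()/atomic_clearbits_int().

#include <stdio.h>
#include <string.h>
#include <strings.h>	/* ffs() */

#define MAXCPUS		64
#define CPUSET_ASIZE(n)	(((n) + 31) / 32)	/* assumed rounding, one bit per cpu */

struct cpuset {
	unsigned int cs_set[CPUSET_ASIZE(MAXCPUS)];
};

static void
cpuset_add(struct cpuset *cs, unsigned int num)
{
	cs->cs_set[num / 32] |= (1U << (num % 32));	/* kernel uses atomic_setbits_int */
}

static void
cpuset_del(struct cpuset *cs, unsigned int num)
{
	cs->cs_set[num / 32] &= ~(1U << (num % 32));	/* kernel uses atomic_clearbits_int */
}

static int
cpuset_isset(struct cpuset *cs, unsigned int num)
{
	return (cs->cs_set[num / 32] & (1U << (num % 32)));
}

/* Lowest-numbered member, or -1 if empty (the kernel returns a cpu_info pointer). */
static int
cpuset_first(struct cpuset *cs, int ncpus)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		if (cs->cs_set[i])
			return (i * 32 + ffs(cs->cs_set[i]) - 1);
	return (-1);
}

int
main(void)
{
	struct cpuset idle;
	int ci;

	memset(&idle, 0, sizeof(idle));
	cpuset_add(&idle, 3);
	cpuset_add(&idle, 35);
	printf("cpu35 idle? %s\n", cpuset_isset(&idle, 35) ? "yes" : "no");

	/* Walk the set the way sched_choosecpu() walks sched_idle_cpus. */
	while ((ci = cpuset_first(&idle, MAXCPUS)) != -1) {
		printf("cpu%d is idle\n", ci);
		cpuset_del(&idle, ci);
	}
	return (0);
}

Both sched_choosecpu() and sched_steal_proc() copy a global set first and delete members from the copy as they visit them, which is the same walk the loop in main() performs.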