1 files changed, 344 insertions, 35 deletions
diff --git a/sys/kern/kern_sched.c b/sys/kern/kern_sched.c
index 79bc376314a..1b921159985 100644
--- a/sys/kern/kern_sched.c
+++ b/sys/kern/kern_sched.c
@@ -1,6 +1,6 @@
-/*	$OpenBSD: kern_sched.c,v 1.8 2008/11/06 19:49:13 deraadt Exp $	*/
+/*	$OpenBSD: kern_sched.c,v 1.9 2009/03/23 13:25:11 art Exp $	*/
 /*
- * Copyright (c) 2007 Artur Grabowski <art@openbsd.org>
+ * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -24,12 +24,27 @@
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/mutex.h>
+#include <machine/atomic.h>
 
 #include <uvm/uvm_extern.h>
 
+#include <sys/malloc.h>
+
+
 void sched_kthreads_create(void *);
 void sched_idle(void *);
 
+int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
+struct proc *sched_steal_proc(struct cpu_info *);
+
+/*
+ * To help choosing which cpu should run which process we keep track
+ * of cpus which are currently idle and which cpus have processes
+ * queued.
+ */
+struct cpuset sched_idle_cpus;
+struct cpuset sched_queued_cpus;
+
 /*
  * A few notes about cpu_switchto that is implemented in MD code.
  *
@@ -55,12 +70,22 @@ void
 sched_init_cpu(struct cpu_info *ci)
 {
 	struct schedstate_percpu *spc = &ci->ci_schedstate;
+	int i;
+
+	for (i = 0; i < SCHED_NQS; i++)
+		TAILQ_INIT(&spc->spc_qs[i]);
 
 	spc->spc_idleproc = NULL;
 
 	kthread_create_deferred(sched_kthreads_create, ci);
 
 	LIST_INIT(&spc->spc_deadproc);
+
+	/*
+	 * Slight hack here until the cpuset code handles cpu_info
+	 * structures.
+	 */
+	cpuset_init_cpu(ci);
 }
 
 void
@@ -79,27 +104,31 @@ sched_kthreads_create(void *v)
 void
 sched_idle(void *v)
 {
+	struct schedstate_percpu *spc;
 	struct proc *p = curproc;
 	struct cpu_info *ci = v;
 	int s;
 
 	KERNEL_PROC_UNLOCK(p);
 
+	spc = &ci->ci_schedstate;
+
 	/*
 	 * First time we enter here, we're not supposed to idle,
 	 * just go away for a while.
 	 */
 	SCHED_LOCK(s);
+	cpuset_add(&sched_idle_cpus, ci);
 	p->p_stat = SSLEEP;
 	mi_switch();
+	cpuset_del(&sched_idle_cpus, ci);
 	SCHED_UNLOCK(s);
 
-	while (1) {
-		KASSERT(ci == curcpu());
-		KASSERT(curproc == ci->ci_schedstate.spc_idleproc);
+	KASSERT(ci == curcpu());
+	KASSERT(curproc == spc->spc_idleproc);
 
-		while (!sched_is_idle()) {
-			struct schedstate_percpu *spc = &ci->ci_schedstate;
+	while (1) {
+		while (!curcpu_is_idle()) {
 			struct proc *dead;
 
 			SCHED_LOCK(s);
@@ -115,10 +144,12 @@ sched_idle(void *v)
 
 		splassert(IPL_NONE);
 
+		cpuset_add(&sched_idle_cpus, ci);
 		cpu_idle_enter();
-		while (sched_is_idle())
+		while (spc->spc_whichqs == 0)
 			cpu_idle_cycle();
 		cpu_idle_leave();
+		cpuset_del(&sched_idle_cpus, ci);
 	}
 }
 
@@ -160,22 +191,10 @@ sched_exit(struct proc *p)
 
 /*
  * Run queue management.
- *
- * The run queue management is just like before, except that it's with
- * a bit more modern queue handling.
  */
-
-TAILQ_HEAD(prochead, proc) sched_qs[NQS];
-volatile int sched_whichqs;
-
 void
 sched_init_runqueues(void)
 {
-	int i;
-
-	for (i = 0; i < NQS; i++)
-		TAILQ_INIT(&sched_qs[i]);
-
 #ifdef MULTIPROCESSOR
 	__mp_lock_init(&sched_lock);
 #endif
@@ -184,37 +203,56 @@ sched_init_runqueues(void)
 void
 setrunqueue(struct proc *p)
 {
+	struct schedstate_percpu *spc;
 	int queue = p->p_priority >> 2;
 
 	SCHED_ASSERT_LOCKED();
+	sched_choosecpu(p);
+	spc = &p->p_cpu->ci_schedstate;
+	spc->spc_nrun++;
+
+	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
+	spc->spc_whichqs |= (1 << queue);
+	cpuset_add(&sched_queued_cpus, p->p_cpu);
 
-	TAILQ_INSERT_TAIL(&sched_qs[queue], p, p_runq);
-	sched_whichqs |= (1 << queue);
+	if (p->p_cpu != curcpu())
+		cpu_unidle(p->p_cpu);
 }
 
 void
 remrunqueue(struct proc *p)
 {
+	struct schedstate_percpu *spc;
 	int queue = p->p_priority >> 2;
 
 	SCHED_ASSERT_LOCKED();
-
-	TAILQ_REMOVE(&sched_qs[queue], p, p_runq);
-	if (TAILQ_EMPTY(&sched_qs[queue]))
-		sched_whichqs &= ~(1 << queue);
+	spc = &p->p_cpu->ci_schedstate;
+	spc->spc_nrun--;
+
+	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
+	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
+		spc->spc_whichqs &= ~(1 << queue);
+		if (spc->spc_whichqs == 0)
+			cpuset_del(&sched_queued_cpus, p->p_cpu);
+	}
 }
 
 struct proc *
 sched_chooseproc(void)
 {
+	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
 	struct proc *p;
 	int queue;
 
 	SCHED_ASSERT_LOCKED();
 
 again:
-	if (sched_is_idle()) {
-		p = curcpu()->ci_schedstate.spc_idleproc;
+	if (spc->spc_whichqs) {
+		queue = ffs(spc->spc_whichqs) - 1;
+		p = TAILQ_FIRST(&spc->spc_qs[queue]);
+		remrunqueue(p);
+	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
+		p = spc->spc_idleproc;
 		if (p == NULL) {
                         int s;
 			/*
@@ -232,13 +270,284 @@ again:
                 }
 		KASSERT(p);
 		p->p_stat = SRUN;
-	} else {
-		queue = ffs(sched_whichqs) - 1;
-		p = TAILQ_FIRST(&sched_qs[queue]);
-		TAILQ_REMOVE(&sched_qs[queue], p, p_runq);
-		if (TAILQ_EMPTY(&sched_qs[queue]))
-			sched_whichqs &= ~(1 << queue);
-	}
+	} 
 
 	return (p);	
 }
+
+uint64_t sched_nmigrations;
+uint64_t sched_noidle;
+uint64_t sched_stolen;
+
+uint64_t sched_choose;
+uint64_t sched_wasidle;
+uint64_t sched_nomigrations;
+
+void
+sched_choosecpu(struct proc *p)
+{
+	struct cpu_info *choice = NULL;
+	int last_cost = INT_MAX;
+	struct cpu_info *ci;
+	struct cpuset set;
+
+	sched_choose++;
+
+	/*
+	 * The simplest case. Our cpu of choice was idle. This happens
+	 * when we were sleeping and something woke us up.
+	 *
+	 * We also need to check sched_queued_cpus to make sure that
+	 * we're not thundering herding one cpu that hasn't managed to
+	 * get out of the idle loop yet.
+	 */
+	if (p->p_cpu && cpuset_isset(&sched_idle_cpus, p->p_cpu) &&
+	    !cpuset_isset(&sched_queued_cpus, p->p_cpu)) {
+		sched_wasidle++;
+		return;
+	}
+
+#if 0
+
+		/* Most likely, this is broken. don't do it. */
+	/*
+	 * Second case. (shouldn't be necessary in the future)
+	 * If our cpu is not idle, but has nothing else queued (which
+	 * means that we are curproc and roundrobin asks us to reschedule).
+	 */
+	if (p->p_cpu && p->p_cpu->ci_schedstate.spc_nrun == 0)
+		return;
+#endif
+
+	/*
+	 * Look at all cpus that are currently idle. Pick the cheapest of
+	 * those.
+	 */
+	cpuset_copy(&set, &sched_idle_cpus);
+	while ((ci = cpuset_first(&set)) != NULL) {
+		int cost = sched_proc_to_cpu_cost(ci, p);
+
+		if (choice == NULL || cost < last_cost) {
+			choice = ci;
+			last_cost = cost;
+		}
+		cpuset_del(&set, ci);
+	}
+
+	/*
+	 * All cpus are busy. Pick one.
+	 */
+	if (choice == NULL) {
+		CPU_INFO_ITERATOR cii;
+
+		sched_noidle++;
+
+		/*
+		 * Not curproc, pick the cpu with the lowest cost to switch to.
+		 */
+		CPU_INFO_FOREACH(cii, ci) {
+			int cost = sched_proc_to_cpu_cost(ci, p);
+
+			if (choice == NULL || cost < last_cost) {
+				choice = ci;
+				last_cost = cost;
+			}
+		}
+	}
+
+	KASSERT(choice);
+
+	if (p->p_cpu && p->p_cpu != choice)
+		sched_nmigrations++;
+	else if (p->p_cpu != NULL)
+		sched_nomigrations++;
+
+	p->p_cpu = choice;
+}
+
+/*
+ * Attempt to steal a proc from some cpu.
+ */
+struct proc *
+sched_steal_proc(struct cpu_info *self)
+{
+	struct schedstate_percpu *spc;
+	struct proc *best = NULL;
+	int bestcost = INT_MAX;
+	struct cpu_info *ci;
+	struct cpuset set;
+
+	cpuset_copy(&set, &sched_queued_cpus);
+
+	while ((ci = cpuset_first(&set)) != NULL) {
+		struct proc *p;
+		int cost;
+
+		cpuset_del(&set, ci);
+
+		spc = &ci->ci_schedstate;
+
+		p = TAILQ_FIRST(&spc->spc_qs[ffs(spc->spc_whichqs) - 1]);
+		KASSERT(p);
+		cost = sched_proc_to_cpu_cost(self, p);
+
+		if (best == NULL || cost < bestcost) {
+			best = p;
+			bestcost = cost;
+		}
+	}
+	if (best == NULL)
+		return (NULL);
+
+	spc = &best->p_cpu->ci_schedstate;
+	remrunqueue(best);
+	best->p_cpu = self;
+
+	sched_stolen++;
+
+	return (best);
+}
+
+/*
+ * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know).
+ */
+static int
+log2(unsigned int i)
+{
+	int ret = 0;
+
+	while (i >>= 1)
+		ret++;
+
+	return (ret);
+}
+
+/*
+ * Calculate the cost of moving the proc to this cpu.
+ * 
+ * What we want is some guesstimate of how much "performance" it will
+ * cost us to move the proc here. Not just for caches and TLBs and NUMA
+ * memory, but also for the proc itself. A highly loaded cpu might not
+ * be the best candidate for this proc since it won't get run.
+ *
+ * Just total guesstimates for now.
+ */
+
+int sched_cost_load = 1;
+int sched_cost_priority = 1;
+int sched_cost_runnable = 3;
+int sched_cost_resident = 1;
+
+int
+sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
+{
+	struct schedstate_percpu *spc;
+	int l2resident = 0;
+	int cost;
+
+	spc = &ci->ci_schedstate;
+
+	cost = 0;
+
+	/*
+	 * First, account for the priority of the proc we want to move.
+	 * More willing to move, the lower the priority of the destination
+	 * and the higher the priority of the proc.
+	 */
+	if (!cpuset_isset(&sched_idle_cpus, ci)) {
+		cost += (p->p_priority - spc->spc_curpriority) *
+		    sched_cost_priority;
+		cost += sched_cost_runnable;
+	}
+	if (cpuset_isset(&sched_queued_cpus, ci)) {
+		cost += spc->spc_nrun * sched_cost_runnable;
+	}
+
+	/*
+	 * Higher load on the destination means we don't want to go there.
+	 */
+	cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);
+
+	/*
+	 * If the proc is on this cpu already, lower the cost by how much
+	 * it has been running and an estimate of its footprint.
+	 */
+	if (p->p_cpu == ci && p->p_slptime == 0) {
+		l2resident =
+		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
+		cost -= l2resident * sched_cost_resident;
+	}
+
+	return (cost);
+}
+
+/*
+ * Functions to manipulate cpu sets.
+ */
+struct cpu_info *cpuset_infos[MAXCPUS];
+static struct cpuset cpuset_all;
+
+void
+cpuset_init_cpu(struct cpu_info *ci)
+{
+	cpuset_add(&cpuset_all, ci);
+	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
+}
+
+void
+cpuset_clear(struct cpuset *cs)
+{
+	memset(cs, 0, sizeof(*cs));
+}
+
+/*
+ * XXX - implement it on SP architectures too
+ */
+#ifndef CPU_INFO_UNIT
+#define CPU_INFO_UNIT 0
+#endif
+
+void
+cpuset_add(struct cpuset *cs, struct cpu_info *ci)
+{
+	unsigned int num = CPU_INFO_UNIT(ci);
+	atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
+}
+
+void
+cpuset_del(struct cpuset *cs, struct cpu_info *ci)
+{
+	unsigned int num = CPU_INFO_UNIT(ci);
+	atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
+}
+
+int
+cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
+{
+	unsigned int num = CPU_INFO_UNIT(ci);
+	return (cs->cs_set[num/32] & (1 << (num % 32)));
+}
+
+void
+cpuset_add_all(struct cpuset *cs)
+{
+	cpuset_copy(cs, &cpuset_all);
+}
+
+void
+cpuset_copy(struct cpuset *to, struct cpuset *from)
+{
+	memcpy(to, from, sizeof(*to));
+}
+
+struct cpu_info *
+cpuset_first(struct cpuset *cs)
+{
+	int i;
+
+	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
+		if (cs->cs_set[i])
+			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);
+
+	return (NULL);
+}