Index: linux-2.6.12-rc4-ps5/include/linux/sched_drv.h =================================================================== --- linux-2.6.12-rc4-ps5.orig/include/linux/sched_drv.h 2005-05-26 12:17:59.000000000 +1000 +++ linux-2.6.12-rc4-ps5/include/linux/sched_drv.h 2005-05-26 13:13:12.000000000 +1000 @@ -27,6 +27,7 @@ struct sched_drv { int (*move_tasks)(runqueue_t *, int, runqueue_t *, unsigned long, struct sched_domain *, enum idle_type); #endif + void (*systime_hook)(runqueue_t *, cputime_t); void (*tick)(struct task_struct*, struct runqueue *, unsigned long long); #ifdef CONFIG_SCHED_SMT struct task_struct *(*head_of_queue)(union runqueue_queue *); Index: linux-2.6.12-rc4-ps5/include/linux/sched_runq.h =================================================================== --- linux-2.6.12-rc4-ps5.orig/include/linux/sched_runq.h 2005-05-26 12:17:53.000000000 +1000 +++ linux-2.6.12-rc4-ps5/include/linux/sched_runq.h 2005-05-26 12:22:42.000000000 +1000 @@ -41,6 +41,7 @@ struct staircase_runqueue_queue { struct list_head queue[STAIRCASE_NUM_PRIO_SLOTS - 1]; unsigned int cache_ticks; unsigned int preempted; + unsigned long systime_centile; }; #endif Index: linux-2.6.12-rc4-ps5/kernel/ingosched.c =================================================================== --- linux-2.6.12-rc4-ps5.orig/kernel/ingosched.c 2005-05-26 11:46:34.000000000 +1000 +++ linux-2.6.12-rc4-ps5/kernel/ingosched.c 2005-05-26 13:14:35.000000000 +1000 @@ -675,6 +675,10 @@ out: STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ ((rq)->curr->static_prio > (rq)->qu.ingosched.best_expired_prio)) +static void blank_hook(runqueue_t *rq, cputime_t cputime) +{ +} + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. 
@@ -1148,6 +1152,7 @@ const struct sched_drv ingo_sched_drv = #ifdef CONFIG_SMP .move_tasks = ingo_move_tasks, #endif + .systime_hook = blank_hook, .tick = ingo_tick, #ifdef CONFIG_SCHED_SMT .head_of_queue = ingo_head_of_queue, Index: linux-2.6.12-rc4-ps5/kernel/nicksched.c =================================================================== --- linux-2.6.12-rc4-ps5.orig/kernel/nicksched.c 2005-05-26 11:46:34.000000000 +1000 +++ linux-2.6.12-rc4-ps5/kernel/nicksched.c 2005-05-26 13:14:18.000000000 +1000 @@ -589,6 +589,10 @@ out: } #endif +static void blank_hook(runqueue_t *rq, cputime_t cputime) +{ +} + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -965,6 +969,7 @@ const struct sched_drv nick_sched_drv = #ifdef CONFIG_SMP .move_tasks = nick_move_tasks, #endif + .systime_hook = blank_hook, .tick = nick_tick, #ifdef CONFIG_SCHED_SMT .head_of_queue = nick_head_of_queue, Index: linux-2.6.12-rc4-ps5/kernel/sched.c =================================================================== --- linux-2.6.12-rc4-ps5.orig/kernel/sched.c 2005-05-26 12:17:28.000000000 +1000 +++ linux-2.6.12-rc4-ps5/kernel/sched.c 2005-05-26 13:13:43.000000000 +1000 @@ -1393,6 +1393,7 @@ void account_system_time(struct task_str acct_update_integrals(p); /* Update rss highwater mark */ update_mem_hiwater(p); + sched_drvp->systime_hook(rq, cputime); } /* Index: linux-2.6.12-rc4-ps5/kernel/sched_spa.c =================================================================== --- linux-2.6.12-rc4-ps5.orig/kernel/sched_spa.c 2005-05-26 11:46:34.000000000 +1000 +++ linux-2.6.12-rc4-ps5/kernel/sched_spa.c 2005-05-26 13:14:46.000000000 +1000 @@ -593,6 +593,10 @@ static inline void spa_reassess_at_end_o #define spa_runq_data_tick(p, numr) zaphod_runq_data_tick(p, numr) #endif +static void blank_hook(runqueue_t *rq, cputime_t cputime) +{ +} + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. 
@@ -1472,6 +1476,7 @@ const struct sched_drv spa_nf_sched_drv #ifdef CONFIG_SMP .move_tasks = spa_move_tasks, #endif + .systime_hook = blank_hook, .tick = spa_tick, #ifdef CONFIG_SCHED_SMT .head_of_queue = spa_head_of_queue, @@ -1513,6 +1518,7 @@ const struct sched_drv zaphod_sched_drv #ifdef CONFIG_SMP .move_tasks = spa_move_tasks, #endif + .systime_hook = blank_hook, .tick = spa_tick, #ifdef CONFIG_SCHED_SMT .head_of_queue = spa_head_of_queue, Index: linux-2.6.12-rc4-ps5/kernel/staircase.c =================================================================== --- linux-2.6.12-rc4-ps5.orig/kernel/staircase.c 2005-05-26 12:17:28.000000000 +1000 +++ linux-2.6.12-rc4-ps5/kernel/staircase.c 2005-05-26 13:07:49.000000000 +1000 @@ -2,8 +2,8 @@ * kernel/staircase.c * Copyright (C) 1991-2005 Linus Torvalds * - * 2005-02-13 Staircase scheduler by Con Kolivas - * Staircase v10.7 + * 2005-05-26 Staircase scheduler by Con Kolivas + * Staircase v11.2 */ #include #include @@ -38,7 +38,8 @@ static void staircase_init_runqueue_queu __set_bit(STAIRCASE_MAX_PRIO, qup->staircase.bitmap); } -static void staircase_set_oom_time_slice(struct task_struct *p, unsigned long t) +static void staircase_set_oom_time_slice(struct task_struct *p, + unsigned long t) { p->sdu.staircase.slice = p->sdu.staircase.time_slice = t; } @@ -73,26 +74,28 @@ int sched_compute = 0; /* * Get nanosecond clock difference without overflowing unsigned long. */ -static inline unsigned long ns_diff(unsigned long long v1, unsigned long long v2) +static inline unsigned long ns_diff(unsigned long long v1, + unsigned long long v2) { unsigned long long vdiff; - if (unlikely(v1 < v2)) + if (likely(v1 > v2)) { + vdiff = v1 - v2; + if (vdiff > (1 << 31)) + vdiff = 1 << 31; + } else /* - * Rarely the clock goes backwards. There should always be - * a positive difference so return 1. + * Rarely the clock appears to go backwards. There should + * always be a positive difference so return 1. 
*/ vdiff = 1; - else - vdiff = v1 - v2; - if (vdiff > (1 << 31)) - vdiff = 1 << 31; return (unsigned long)vdiff; } /* * Adding/removing a task to/from a priority array: */ -static inline void dequeue_task(struct task_struct *p, struct staircase_runqueue_queue *rqq) +static inline void dequeue_task(struct task_struct *p, + struct staircase_runqueue_queue *rqq) { list_del_init(&p->run_list); if (list_empty(rqq->queue + p->prio)) @@ -100,14 +103,16 @@ static inline void dequeue_task(struct t p->sdu.staircase.ns_debit = 0; } -static void enqueue_task(struct task_struct *p, struct staircase_runqueue_queue *rqq) +static void enqueue_task(struct task_struct *p, + struct staircase_runqueue_queue *rqq) { sched_info_queued(p); list_add_tail(&p->run_list, rqq->queue + p->prio); __set_bit(p->prio, rqq->bitmap); } -static void requeue_task(struct task_struct *p, struct staircase_runqueue_queue *rq) +static inline void requeue_task(struct task_struct *p, + struct staircase_runqueue_queue *rq) { list_move_tail(&p->run_list, rq->queue + p->prio); } @@ -117,7 +122,8 @@ static void requeue_task(struct task_str * remote queue so we want these tasks to show up at the head of the * local queue: */ -static inline void enqueue_task_head(struct task_struct *p, struct staircase_runqueue_queue *rqq) +static inline void enqueue_task_head(struct task_struct *p, + struct staircase_runqueue_queue *rqq) { list_add(&p->run_list, rqq->queue + p->prio); __set_bit(p->prio, rqq->bitmap); @@ -256,17 +262,24 @@ static void continue_slice(task_t *p) * or have just forked a thread/process and make them continue their old * slice instead of starting a new one at high priority. 
*/ -static void recalc_task_prio(task_t *p, unsigned long long now, unsigned long rq_load) +static inline void recalc_task_prio(task_t *p, unsigned long long now, + unsigned long rq_systime, unsigned long rq_running) { - unsigned long sleep_time; + unsigned long sleep_time = ns_diff(now, p->timestamp); - if (rq_load > 31) - rq_load = 31; - sleep_time = ns_diff(now, p->timestamp) / (1 << rq_load); + /* + * Priority is elevated back to best by amount of sleep_time. + * sleep_time is scaled down by in-kernel system time and by + * number of tasks currently running. + */ + sleep_time /= rq_running + 1; + if (rq_systime) + sleep_time = sleep_time / 200 * (100 - rq_systime); p->sdu.staircase.totalrun += p->sdu.staircase.runtime; - if (NS_TO_JIFFIES(p->sdu.staircase.totalrun) >= p->sdu.staircase.slice && - NS_TO_JIFFIES(sleep_time) < p->sdu.staircase.slice) { + if (NS_TO_JIFFIES(p->sdu.staircase.totalrun) >= + p->sdu.staircase.slice && NS_TO_JIFFIES(sleep_time) < + p->sdu.staircase.slice) { p->sdu.staircase.sflags &= ~SF_FORKED; dec_burst(p); goto new_slice; @@ -317,7 +330,8 @@ static void activate_task(task_t *p, run #endif p->sdu.staircase.slice = slice(p); p->sdu.staircase.time_slice = rr_interval(p); - recalc_task_prio(p, now, rq->nr_running); + recalc_task_prio(p, now, rq->qu.staircase.systime_centile / 100, + rq->nr_running); p->sdu.staircase.sflags &= ~SF_UISLEEP; p->prio = effective_prio(p); p->timestamp = now; @@ -348,10 +362,8 @@ static void preempt(task_t *p, struct ru if (!TASK_PREEMPTS_CURR(p, rq)) return; - if (p->prio == rq->curr->prio && - ((p->sdu.staircase.totalrun || p->sdu.staircase.slice != slice(p)) || - rt_task(rq->curr))) - return; + if (p->prio >= rq->curr->prio) + return; if (!sched_compute || rq->qu.staircase.cache_ticks >= cache_delay || !p->mm || rt_task(p)) @@ -366,7 +378,8 @@ static void preempt(task_t *p, struct ru * @sync: do a synchronous wakeup? 
* @rq: The run queue on which the task is to be placed (already locked) */ -static void staircase_wake_up_task(struct task_struct *p, struct runqueue *rq, unsigned int old_state, int sync) +static void staircase_wake_up_task(struct task_struct *p, struct runqueue *rq, + unsigned int old_state, int sync) { int same_cpu = (rq == this_rq()); @@ -420,29 +433,19 @@ static void staircase_wake_up_new_task(t if (likely(cpu == this_cpu)) { current->sdu.staircase.sflags |= SF_FORKED; - - if (!(clone_flags & CLONE_VM)) { + activate_task(p, rq, 1); + if (!(clone_flags & CLONE_VM)) /* * The VM isn't cloned, so we're in a good position to * do child-runs-first in anticipation of an exec. This * usually avoids a lot of COW overhead. */ - if (unlikely(!task_is_queued(current))) { - p->prio = effective_prio(p); - __activate_task(p, rq); - } else { - p->prio = current->prio; - list_add_tail(&p->run_list, &current->run_list); - inc_nr_running(p, rq); - } set_need_resched(); - } else { - p->prio = effective_prio(p); - /* Run child last */ - __activate_task(p, rq); - } /* * We skip the following code due to cpu == this_cpu + * + * task_rq_unlock(rq, &flags); + * this_rq = task_rq_lock(current, &flags); */ this_rq = rq; } else { @@ -459,8 +462,8 @@ static void staircase_wake_up_new_task(t preempt(p, rq); /* - * Parent and child are on different CPUs, now get the - * parent runqueue to update the parent's ->sdu.staircase.sleep_avg: + * Parent and child are on different CPUs, now get the parent + * runqueue to update the parent's ->sdu.staircase.sleep_avg: */ task_rq_unlock(rq, &flags); this_rq = task_rq_lock(current, &flags); @@ -487,8 +490,8 @@ static void staircase_exit(task_t * p) * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. 
*/ -static inline -void pull_task(runqueue_t *src_rq, task_t *p, runqueue_t *this_rq, int this_cpu) +static inline void pull_task(runqueue_t *src_rq, task_t *p, + runqueue_t *this_rq, int this_cpu) { dequeue_task(p, &src_rq->qu.staircase); dec_nr_running(p, src_rq); @@ -498,8 +501,8 @@ void pull_task(runqueue_t *src_rq, task_ p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; /* - * Note that idle threads have a prio of STAIRCASE_MAX_PRIO, for this test - * to be always true for them. + * Note that idle threads have a prio of STAIRCASE_MAX_PRIO, for this + * test to be always true for them. */ preempt(p, this_rq); } @@ -512,9 +515,9 @@ void pull_task(runqueue_t *src_rq, task_ * * Called with both runqueues locked. */ -static int staircase_move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) +static int staircase_move_tasks(runqueue_t *this_rq, int this_cpu, + runqueue_t *busiest, unsigned long max_nr_move, + struct sched_domain *sd, enum idle_type idle) { struct list_head *head, *curr; int idx, pulled = 0; @@ -529,7 +532,8 @@ skip_bitmap: if (!idx) idx = sched_find_first_bit(busiest->qu.staircase.bitmap); else - idx = find_next_bit(busiest->qu.staircase.bitmap, STAIRCASE_MAX_PRIO, idx); + idx = find_next_bit(busiest->qu.staircase.bitmap, + STAIRCASE_MAX_PRIO, idx); if (idx >= STAIRCASE_MAX_PRIO) goto out; @@ -578,14 +582,25 @@ static void time_slice_expired(task_t *p enqueue_task(p, rqq); } +static void staircase_systime_hook(runqueue_t *rq, cputime_t cputime) +{ + /* For calculating rolling percentage of sys time per runqueue */ + rq->qu.staircase.systime_centile += cputime * 100; +} + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. 
*/ -static void staircase_tick(struct task_struct *p, struct runqueue *rq, unsigned long long now) +static void staircase_tick(struct task_struct *p, struct runqueue *rq, + unsigned long long now) { int cpu = smp_processor_id(); - unsigned long debit; + unsigned long debit, expired_balance = rq->nr_running; + + /* Rolling percentage systime per runqueue */ + rq->qu.staircase.systime_centile = rq->qu.staircase.systime_centile * + 99 / 100; if (p == rq->idle) { if (wake_priority_sleeper(rq)) @@ -603,8 +618,10 @@ static void staircase_tick(struct task_s /* * SCHED_FIFO tasks never run out of timeslice. */ - if (unlikely(p->policy == SCHED_FIFO)) + if (unlikely(p->policy == SCHED_FIFO)) { + expired_balance = 0; goto out; + } spin_lock(&rq->lock); debit = ns_diff(rq->timestamp_last_tick, p->timestamp); @@ -631,12 +648,17 @@ static void staircase_tick(struct task_s goto out_unlock; } rq->qu.staircase.cache_ticks++; - if (rq->qu.staircase.preempted && rq->qu.staircase.cache_ticks >= cache_delay) + if (rq->qu.staircase.preempted && + rq->qu.staircase.cache_ticks >= cache_delay) { set_tsk_need_resched(p); + goto out_unlock; + } + expired_balance = 0; out_unlock: spin_unlock(&rq->lock); out: - rebalance_tick(cpu, rq, NOT_IDLE); + if (expired_balance > 1) + rebalance_tick(cpu, rq, NOT_IDLE); } #ifdef CONFIG_SCHED_SMT @@ -649,9 +671,9 @@ static struct task_struct *staircase_hea static int staircase_dependent_sleeper_trumps(const struct task_struct *p1, const struct task_struct * p2, struct sched_domain *sd) { - return ((p1->sdu.staircase.time_slice * (100 - sd->per_cpu_gain) / 100) > - slice(p2) || rt_task(p1)) && - p2->mm && p1->mm && !rt_task(p2); + return ((p1->sdu.staircase.time_slice * (100 - sd->per_cpu_gain) / + 100) > slice(p2) || rt_task(p1)) && p2->mm && p1->mm && + !rt_task(p2); } #endif @@ -743,7 +765,8 @@ switch_tasks: int newprio = effective_prio(next); next->sdu.staircase.sflags &= ~SF_YIELDED; if (newprio != next->prio) { - struct staircase_runqueue_queue *rqq 
= &rq->qu.staircase; + struct staircase_runqueue_queue *rqq = + &rq->qu.staircase; dequeue_task(next, rqq); next->prio = newprio; @@ -930,7 +953,8 @@ static void staircase_migrate_dead_tasks for (i = 0; i < STAIRCASE_MAX_PRIO; i++) { struct list_head *list = &rq->qu.staircase.queue[i]; while (!list_empty(list)) - migrate_dead(dead_cpu, list_entry(list->next, task_t, run_list)); + migrate_dead(dead_cpu, list_entry(list->next, task_t, + run_list)); } } #endif @@ -990,6 +1014,7 @@ const struct sched_drv staircase_sched_d #ifdef CONFIG_SMP .move_tasks = staircase_move_tasks, #endif + .systime_hook = staircase_systime_hook, .tick = staircase_tick, #ifdef CONFIG_SCHED_SMT .head_of_queue = staircase_head_of_queue,