diff -urN oldtree/kernel/sched_staircase.c newtree/kernel/sched_staircase.c --- oldtree/kernel/sched_staircase.c 2006-10-06 03:27:46.000000000 -0400 +++ newtree/kernel/sched_staircase.c 2006-10-06 03:30:39.000000000 -0400 @@ -49,6 +49,25 @@ #include /* + * sched_interactive - sysctl which allows interactive tasks to have a bonus + * that raises their priority. + * sched_compute - sysctl which enables long timeslices and delayed preemption + * for compute server usage. + * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks + * are allowed to run (over ISO_PERIOD ticks) as real time tasks. + */ +int sched_interactive __read_mostly = 1; +int sched_compute __read_mostly; +int sched_iso_cpu __read_mostly = 80; + +#define ISO_PERIOD (5 * HZ) +/* + * CACHE_DELAY is the time preemption is delayed in sched_compute mode + * and is set to a nominal 10ms. + */ +#define CACHE_DELAY (10 * (HZ) / 1001 + 1) + +/* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], * and back. @@ -78,9 +97,10 @@ /* * This is the time all tasks within the same priority round robin. - * Set to a minimum of 6ms. + * Set to a minimum of 6ms. It is 10 times longer in compute mode. 
*/ -#define RR_INTERVAL ((6 * HZ / 1001) + 1) +#define _RR_INTERVAL ((6 * HZ / 1001) + 1) +#define RR_INTERVAL (_RR_INTERVAL * (1 + 9 * sched_compute)) #define DEF_TIMESLICE (RR_INTERVAL * 19) @@ -118,6 +138,9 @@ unsigned long nr_uninterruptible; unsigned long long timestamp_last_tick; + unsigned short cache_ticks, preempted; + unsigned long iso_ticks; + unsigned short iso_refractory; struct task_struct *curr, *idle; struct mm_struct *prev_mm; unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO + 1)]; @@ -661,6 +684,12 @@ else #endif p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); + } else if (idleprio_task(p)) { + /* + * We want idleprio_tasks to have a presence on weighting but + * as small as possible. + */ + p->load_weight = 1; } else p->load_weight = TASK_LOAD_WEIGHT(p); } @@ -837,6 +866,17 @@ continue_slice(p); } +static inline int idleprio_suitable(const struct task_struct *p) +{ + return (!p->mutexes_held && + !(p->flags & (PF_FREEZE | PF_NONSLEEP | PF_EXITING))); +} + +static inline int idleprio(const struct task_struct *p) +{ + return (p->prio == IDLEPRIO_PRIO); +} + /* * __normal_prio - dynamic priority dependent on bonus. * The priority normally decreases by one each RR_INTERVAL. @@ -849,13 +889,36 @@ unsigned int full_slice, used_slice = 0; unsigned int best_bonus, rr; + if (iso_task(p)) { + if (likely(!(p->flags & PF_ISOREF))) + /* + * If SCHED_ISO tasks have not used up their real time + * quota they run just better than the highest + * SCHED_NORMAL priority. Otherwise they run as + * SCHED_NORMAL. + */ + return ISO_PRIO; + } + + if (idleprio_task(p)) { + if (unlikely(!idleprio_suitable(p))) { + /* + * If idleprio tasks hold a semaphore or mutex, are being + * frozen or are exiting, schedule at a normal priority. + */ + p->time_slice = p->slice % RR_INTERVAL ? 
: RR_INTERVAL; + return MIN_USER_PRIO; + } + return IDLEPRIO_PRIO; + } + full_slice = slice(p); if (full_slice > p->slice) used_slice = full_slice - p->slice; best_bonus = bonus(p); prio = MAX_RT_PRIO + best_bonus; - if (!batch_task(p)) + if (sched_interactive && !sched_compute && !batch_task(p)) prio -= p->bonus; rr = rr_interval(p); @@ -1318,14 +1381,22 @@ #endif /* - * Check to see if p preempts rq->curr and resched if it does. + * Check to see if p preempts rq->curr and resched if it does. In compute + * mode we do not preempt for at least CACHE_DELAY and set rq->preempted. */ -static inline void preempt(const struct task_struct *p, struct rq *rq) +static void fastcall preempt(const struct task_struct *p, struct rq *rq) { - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); -} + struct task_struct *curr = rq->curr; + if (p->prio >= curr->prio) + return; + if (!sched_compute || rq->cache_ticks >= CACHE_DELAY || !p->mm || + rt_task(p) || curr == rq->idle) { + resched_task(curr); + return; + } + rq->preempted = 1; +} /*** * try_to_wake_up - wake up a thread @@ -1477,6 +1548,8 @@ out_running: p->state = TASK_RUNNING; out: + if (idleprio_task(p) && (p->flags & PF_FREEZE) && idleprio(p)) + requeue_task(p, rq, effective_prio(p)); task_rq_unlock(rq, &flags); return success; @@ -2802,7 +2875,7 @@ /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); - if (TASK_NICE(p) > 0) + if (TASK_NICE(p) > 0 || idleprio_task(p)) cpustat->nice = cputime64_add(cpustat->nice, tmp); else cpustat->user = cputime64_add(cpustat->user, tmp); @@ -2869,6 +2942,22 @@ } /* + * Test if SCHED_ISO tasks have run longer than their allotted period as RT + * tasks and set the refractory flag if necessary. There is 10% hysteresis + * for unsetting the flag. 
+ */ +static inline unsigned int test_ret_isorefractory(struct rq *rq) +{ + if (likely(!rq->iso_refractory)) { + if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu) + rq->iso_refractory = 1; + } else + if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100)) + rq->iso_refractory = 0; + return rq->iso_refractory; +} + +/* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. */ @@ -2896,11 +2985,32 @@ set_tsk_need_resched(p); goto out; } - /* SCHED_FIFO tasks never run out of timeslice. */ - if (unlikely(p->policy == SCHED_FIFO)) - goto out; spin_lock(&rq->lock); + if (unlikely((rt_task(p) || (iso_task(p) && !rq->iso_refractory)) && + p->mm)) { + if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100) + rq->iso_ticks += 100; + } else + rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; + + if (iso_task(p)) { + if (unlikely(test_ret_isorefractory(rq))) { + if (!(p->flags & PF_ISOREF)) { + set_tsk_need_resched(p); + p->flags |= PF_ISOREF; + } + } else + p->flags &= ~PF_ISOREF; + } else { + if (idleprio_task(p) && !idleprio(p) && idleprio_suitable(p)) + set_tsk_need_resched(p); + else + /* SCHED_FIFO tasks never run out of timeslice. */ + if (unlikely(p->policy == SCHED_FIFO)) + goto out_unlock; + } + debit = ns_diff(rq->timestamp_last_tick, p->timestamp); p->ns_debit += debit; if (p->ns_debit < NSJIFFY) @@ -2924,6 +3034,9 @@ time_slice_expired(p, rq); goto out_unlock; } + rq->cache_ticks++; + if (rq->preempted && rq->cache_ticks >= CACHE_DELAY) + set_tsk_need_resched(p); out_unlock: spin_unlock(&rq->lock); out: @@ -2993,7 +3106,7 @@ int ret = 0, i; /* kernel/rt threads do not participate in dependent sleeping */ - if (!p->mm || rt_task(p)) + if (!p->mm || rt_task(p) || iso_task(p)) return 0; for_each_domain(this_cpu, tmp) { @@ -3030,7 +3143,7 @@ * task from using an unfair proportion of the * physical cpu's resources. 
-ck */ - if (rt_task(smt_curr)) { + if (rt_task(smt_curr) || iso_task(smt_curr)) { /* * With real time tasks we run non-rt tasks only * per_cpu_gain% of the time. @@ -3038,11 +3151,23 @@ if ((jiffies % DEF_TIMESLICE) > (sd->per_cpu_gain * DEF_TIMESLICE / 100)) ret = 1; + else if (idleprio(p)) + ret = 1; } else { if (smt_curr->static_prio < p->static_prio && !TASK_PREEMPTS_CURR(p, smt_rq) && smt_slice(smt_curr, sd) > slice(p)) ret = 1; + else if (idleprio(p) && !idleprio_task(smt_curr) && + smt_curr->slice * sd->per_cpu_gain > + slice(smt_curr)) { + /* + * Idleprio tasks run for just the last + * per_cpu_gain percent of the smt task's + * slice. + */ + ret = 1; + } } unlock: spin_unlock(&smt_rq->lock); @@ -3193,6 +3318,7 @@ sched_info_switch(prev, next); if (likely(prev != next)) { + rq->preempted = rq->cache_ticks = 0; next->timestamp = now; rq->nr_switches++; rq->curr = next; @@ -3698,8 +3824,9 @@ * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: */ - if (delta < 0 || (delta > 0 && task_running(rq, p))) - resched_task(rq->curr); + if (delta < 0 || ((delta > 0 || idleprio_task(p)) && + task_running(rq, p))) + resched_task(rq->curr); } out_unlock: task_rq_unlock(rq, &flags); @@ -3837,12 +3964,22 @@ int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { + struct sched_param zero_param = { .sched_priority = 0 }; int queued, retval, oldprio, oldpolicy = -1; unsigned long flags; struct rq *rq; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); + if (SCHED_RT(policy) && !capable(CAP_SYS_NICE)) { + /* + * If the caller requested an RT policy without having the + * necessary rights, we downgrade the policy to SCHED_ISO. + * We also set the parameter to zero to pass the checks. 
+ */ + policy = SCHED_ISO; + param = &zero_param; + } recheck: /* double check policy once rq lock held */ if (policy < 0) @@ -3891,6 +4028,11 @@ return -EPERM; } + if (!(p->mm) && policy == SCHED_IDLEPRIO) { + /* Don't allow kernel threads to be SCHED_IDLEPRIO. */ + return -EINVAL; + } + retval = security_task_setscheduler(p, policy, param); if (retval) return retval; @@ -4212,7 +4354,7 @@ schedstat_inc(rq, yld_cnt); current->slice = slice(current); current->time_slice = rr_interval(current); - if (likely(!rt_task(current))) + if (likely(!rt_task(current) && !idleprio(current))) newprio = MIN_USER_PRIO; requeue_task(current, rq, newprio); @@ -4376,6 +4518,8 @@ break; case SCHED_NORMAL: case SCHED_BATCH: + case SCHED_ISO: + case SCHED_IDLEPRIO: ret = 0; break; } @@ -4400,6 +4544,8 @@ break; case SCHED_NORMAL: case SCHED_BATCH: + case SCHED_ISO: + case SCHED_IDLEPRIO: ret = 0; } return ret; @@ -6629,7 +6775,8 @@ rq = cpu_rq(i); spin_lock_init(&rq->lock); lockdep_set_class(&rq->lock, &rq->rq_lock_key); - rq->nr_running = 0; + rq->nr_running = rq->cache_ticks = rq->preempted = + rq->iso_ticks = 0; #ifdef CONFIG_SMP rq->sd = NULL;