diff -urN oldtree/Documentation/sysctl/kernel.txt newtree/Documentation/sysctl/kernel.txt --- oldtree/Documentation/sysctl/kernel.txt 2006-09-24 18:06:07.000000000 -0400 +++ newtree/Documentation/sysctl/kernel.txt 2006-09-24 19:25:16.000000000 -0400 @@ -27,6 +27,7 @@ - hostname - hotplug - interactive +- iso_cpu - java-appletviewer [ binfmt_java, obsolete ] - java-interpreter [ binfmt_java, obsolete ] - l2cr [ PPC only ] @@ -181,6 +182,14 @@ ============================================================== +iso_cpu: + +This sets the percentage cpu that the unprivileged SCHED_ISO tasks can +run effectively at realtime priority, averaged over a rolling 5 seconds. +Set to 80% by default. + +============================================================== + l2cr: (PPC only) This flag controls the L2 cache of G3 processor boards. If diff -urN oldtree/include/linux/sched.h newtree/include/linux/sched.h --- oldtree/include/linux/sched.h 2006-09-24 18:25:15.000000000 -0400 +++ newtree/include/linux/sched.h 2006-09-24 19:18:17.000000000 -0400 @@ -34,10 +34,18 @@ #define SCHED_FIFO 1 #define SCHED_RR 2 #define SCHED_BATCH 3 +#ifdef CONFIG_STAIRCASE +#define SCHED_ISO 4 +#endif #ifdef __KERNEL__ +#ifdef CONFIG_INGOSCHED #define SCHED_MAX SCHED_BATCH +#endif +#ifdef CONFIG_STAIRCASE +#define SCHED_MAX SCHED_ISO +#endif #define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) #define SCHED_RT(policy) ((policy) == SCHED_FIFO || \ (policy) == SCHED_RR) @@ -213,7 +221,7 @@ void io_schedule(void); long io_schedule_timeout(long timeout); #ifdef CONFIG_STAIRCASE -extern int sched_interactive, sched_compute; +extern int sched_interactive, sched_compute, sched_iso_cpu; #endif extern void cpu_init (void); @@ -509,6 +517,9 @@ #define MAX_USER_RT_PRIO 100 #define MAX_RT_PRIO MAX_USER_RT_PRIO +#ifdef CONFIG_STAIRCASE +#define ISO_PRIO (MAX_RT_PRIO - 1) +#endif #define MAX_PRIO (MAX_RT_PRIO + 40) #ifdef CONFIG_STAIRCASE @@ -524,6 +535,7 @@ #endif #ifdef CONFIG_STAIRCASE #define has_rt_policy(p) 
unlikely(is_rt_policy((p)->policy) && SCHED_RT((p)->policy)) +#define iso_task(p) (unlikely((p)->policy == SCHED_ISO)) #endif /* Must be high prio: stop_machine expects to yield to it. */ @@ -1168,6 +1180,9 @@ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ +#ifdef CONFIG_STAIRCASE +#define PF_ISOREF 0x04000000 /* SCHED_ISO task has used up quota */ +#endif #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #ifdef CONFIG_STAIRCASE diff -urN oldtree/include/linux/sysctl.h newtree/include/linux/sysctl.h --- oldtree/include/linux/sysctl.h 2006-09-24 18:06:07.000000000 -0400 +++ newtree/include/linux/sysctl.h 2006-09-24 19:19:18.000000000 -0400 @@ -155,6 +155,7 @@ KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */ KERN_INTERACTIVE=77, /* interactive tasks can have cpu bursts */ KERN_COMPUTE=78, /* adjust timeslices for a compute server */ + KERN_ISO_CPU=79, /* percent cpu SCHED_ISO tasks run SCHED_RR */ }; diff -urN oldtree/kernel/sched_staircase.c newtree/kernel/sched_staircase.c --- oldtree/kernel/sched_staircase.c 2006-09-24 18:20:11.000000000 -0400 +++ newtree/kernel/sched_staircase.c 2006-09-24 19:23:56.000000000 -0400 @@ -53,10 +53,14 @@ * raise its priority. * sched_compute - sysctl which enables long timeslices and delayed preemption * for compute server usage. + * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks + * are allowed to run (over ISO_PERIOD jiffies) as real time tasks. */ int sched_interactive __read_mostly = 1; int sched_compute __read_mostly; +int sched_iso_cpu __read_mostly = 80; +#define ISO_PERIOD (5 * HZ) /* * CACHE_DELAY is the time preemption is delayed in sched_compute mode * and is set to a nominal 10ms. 
@@ -135,6 +139,8 @@ unsigned long long timestamp_last_tick; unsigned short cache_ticks, preempted; + unsigned long iso_ticks; + unsigned short iso_refractory; struct task_struct *curr, *idle; struct mm_struct *prev_mm; unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO + 1)]; @@ -866,6 +872,17 @@ unsigned int full_slice, used_slice = 0; unsigned int best_bonus, rr; + if (iso_task(p)) { + if (likely(!(p->flags & PF_ISOREF))) + /* + * If SCHED_ISO tasks have not used up their real time + * quota they run just better than the highest + * SCHED_NORMAL priority. Otherwise they run as + * SCHED_NORMAL. + */ + return ISO_PRIO; + } + full_slice = slice(p); if (full_slice > p->slice) used_slice = full_slice - p->slice; @@ -2894,6 +2911,22 @@ } /* + * Test if SCHED_ISO tasks have run longer than their allotted period as RT + * tasks and set the refractory flag if necessary. There is 10% hysteresis + * for unsetting the flag. + */ +static inline unsigned int test_ret_isorefractory(struct rq *rq) +{ + if (likely(!rq->iso_refractory)) { + if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu) + rq->iso_refractory = 1; + } else + if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100)) + rq->iso_refractory = 0; + return rq->iso_refractory; +} + +/* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. */ @@ -2921,11 +2954,29 @@ set_tsk_need_resched(p); goto out; } - /* SCHED_FIFO tasks never run out of timeslice. 
*/ - if (unlikely(p->policy == SCHED_FIFO)) - goto out; spin_lock(&rq->lock); + if (unlikely((rt_task(p) || (iso_task(p) && !rq->iso_refractory)) && + p->mm)) { + if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100) + rq->iso_ticks += 100; + } else + rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; + + if (iso_task(p)) { + if (unlikely(test_ret_isorefractory(rq))) { + if (!(p->flags & PF_ISOREF)) { + set_tsk_need_resched(p); + p->flags |= PF_ISOREF; + } + } else + p->flags &= ~PF_ISOREF; + } else + /* SCHED_FIFO tasks never run out of timeslice. */ + if (unlikely(p->policy == SCHED_FIFO)) + goto out_unlock; + + debit = ns_diff(rq->timestamp_last_tick, p->timestamp); p->ns_debit += debit; if (p->ns_debit < NSJIFFY) @@ -3021,7 +3072,7 @@ int ret = 0, i; /* kernel/rt threads do not participate in dependent sleeping */ - if (!p->mm || rt_task(p)) + if (!p->mm || rt_task(p) || iso_task(p)) return 0; for_each_domain(this_cpu, tmp) { @@ -3058,7 +3109,7 @@ * task from using an unfair proportion of the * physical cpu's resources. -ck */ - if (rt_task(smt_curr)) { + if (rt_task(smt_curr) || iso_task(smt_curr)) { /* * With real time tasks we run non-rt tasks only * per_cpu_gain% of the time. @@ -3866,12 +3917,22 @@ int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { + struct sched_param zero_param = { .sched_priority = 0 }; int queued, retval, oldprio, oldpolicy = -1; unsigned long flags; struct rq *rq; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); + if (SCHED_RT(policy) && !capable(CAP_SYS_NICE)) { + /* + * If the caller requested an RT policy without having the + * necessary rights, we downgrade the policy to SCHED_ISO. + * We also set the parameter to zero to pass the checks. 
+ */ + policy = SCHED_ISO; + param = &zero_param; + } recheck: /* double check policy once rq lock held */ if (policy < 0) @@ -4404,6 +4465,7 @@ break; case SCHED_NORMAL: case SCHED_BATCH: + case SCHED_ISO: ret = 0; break; } @@ -6656,7 +6718,8 @@ rq = cpu_rq(i); spin_lock_init(&rq->lock); lockdep_set_class(&rq->lock, &rq->rq_lock_key); - rq->nr_running = rq->cache_ticks = rq->preempted = 0; + rq->nr_running = rq->cache_ticks = rq->preempted = + rq->iso_ticks = 0; #ifdef CONFIG_SMP rq->sd = NULL; diff -urN oldtree/kernel/sysctl.c newtree/kernel/sysctl.c --- oldtree/kernel/sysctl.c 2006-09-24 18:06:07.000000000 -0400 +++ newtree/kernel/sysctl.c 2006-09-24 19:24:46.000000000 -0400 @@ -236,6 +236,11 @@ { .ctl_name = 0 } }; +/* Constants for minimum and maximum testing. + We use these as one-element integer vectors. */ +static int zero; +static int one_hundred = 100; + static ctl_table kern_table[] = { #ifndef CONFIG_UTS_NS { @@ -704,6 +709,17 @@ .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = KERN_ISO_CPU, + .procname = "iso_cpu", + .data = &sched_iso_cpu, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { @@ -805,12 +821,6 @@ { .ctl_name = 0 } }; -/* Constants for minimum and maximum testing in vm_table. - We use these as one-element integer vectors. */ -static int zero; -static int one_hundred = 100; - - static ctl_table vm_table[] = { { .ctl_name = VM_OVERCOMMIT_MEMORY,