Staircase scheduler: add a "compute" sysctl (kernel.compute)

Adds /proc/sys/kernel/compute (KERN_COMPUTE = 78), backed by the int
sched_compute, which has no initialiser and so starts at zero.  While it
is non-zero:

  * RR_INTERVAL is 10 times _RR_INTERVAL;
  * p->bonus is no longer subtracted from the priority in the place
    where sched_interactive previously allowed it;
  * preempt() reschedules a lower-priority running task at once only if
    rq->cache_ticks has reached CACHE_DELAY, the waking task has no mm,
    the waking task is real-time, or the running task is rq->idle.
    Otherwise it only sets rq->preempted, and the code that increments
    rq->cache_ticks sets need_resched on the running task once the
    count reaches CACHE_DELAY.

rq->cache_ticks and rq->preempted are cleared whenever prev != next at a
context switch, and at runqueue initialisation.

The sysctl entry uses proc_dointvec and supplies no minimum or maximum,
so sched_compute is not guaranteed to be 0 or 1.  Every other user only
tests it for truth, so RR_INTERVAL multiplies by !!sched_compute; using
the raw value would yield a negative or oversized round-robin interval.

diff -urN oldtree/Documentation/sysctl/kernel.txt newtree/Documentation/sysctl/kernel.txt
--- oldtree/Documentation/sysctl/kernel.txt	2006-09-24 17:58:58.000000000 -0400
+++ newtree/Documentation/sysctl/kernel.txt	2006-09-24 18:05:38.000000000 -0400
@@ -18,6 +18,7 @@
 show up in /proc/sys/kernel:
 - acpi_video_flags
 - acct
+- compute
 - core_pattern
 - core_uses_pid
 - ctrl-alt-del
@@ -84,6 +85,16 @@
 
 ==============================================================
 
+compute: (Staircase only)
+
+This flag controls the long timeslice, delayed preemption mode in the
+cpu scheduler suitable for scientific computation applications. It
+leads to large latencies so is unsuitable for normal usage.
+
+Disabled by default.
+
+==============================================================
+
 core_pattern:
 
 core_pattern is used to specify a core dumpfile pattern name.
diff -urN oldtree/include/linux/sched.h newtree/include/linux/sched.h
--- oldtree/include/linux/sched.h	2006-09-24 17:58:58.000000000 -0400
+++ newtree/include/linux/sched.h	2006-09-24 17:59:37.000000000 -0400
@@ -208,7 +208,7 @@
 void io_schedule(void);
 long io_schedule_timeout(long timeout);
 #ifdef CONFIG_STAIRCASE
-extern int sched_interactive;
+extern int sched_interactive, sched_compute;
 #endif
 
 extern void cpu_init (void);
diff -urN oldtree/include/linux/sysctl.h newtree/include/linux/sysctl.h
--- oldtree/include/linux/sysctl.h	2006-09-24 17:58:58.000000000 -0400
+++ newtree/include/linux/sysctl.h	2006-09-24 18:00:15.000000000 -0400
@@ -153,7 +153,8 @@
 	KERN_MAX_LOCK_DEPTH=74,
 	KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
 	KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
-	KERN_INTERACTIVE=77, /* interactive tasks can have cpu bursts */
+	KERN_INTERACTIVE=77,	/* interactive tasks can have cpu bursts */
+	KERN_COMPUTE=78,	/* adjust timeslices for a compute server */
 };
 
 
diff -urN oldtree/kernel/sched_staircase.c newtree/kernel/sched_staircase.c
--- oldtree/kernel/sched_staircase.c	2006-09-24 17:58:58.000000000 -0400
+++ newtree/kernel/sched_staircase.c	2006-09-24 18:04:38.000000000 -0400
@@ -51,8 +51,17 @@
 /*
  * sched_interactive - sysctl which allows interactive tasks to have bonus
  * raise its priority.
+ * sched_compute - sysctl which enables long timeslices and delayed preemption
+ * for compute server usage.
  */
 int sched_interactive __read_mostly = 1;
+int sched_compute __read_mostly;
+
+/*
+ * CACHE_DELAY is the time preemption is delayed in sched_compute mode
+ * and is set to a nominal 10ms.
+ */
+#define CACHE_DELAY	(10 * (HZ) / 1001 + 1)
 
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
@@ -84,9 +93,10 @@
 
 /*
  * This is the time all tasks within the same priority round robin.
- * Set to a minimum of 6ms.
+ * Set to a minimum of 6ms. 10 times longer when sched_compute is non-zero.
  */
-#define RR_INTERVAL		((6 * HZ / 1001) + 1)
+#define _RR_INTERVAL		((6 * HZ / 1001) + 1)
+#define RR_INTERVAL		(_RR_INTERVAL * (1 + 9 * !!sched_compute))
 
 #define DEF_TIMESLICE		(RR_INTERVAL * 19)
 
@@ -124,6 +134,7 @@
 	unsigned long nr_uninterruptible;
 
 	unsigned long long timestamp_last_tick;
+	unsigned short cache_ticks, preempted;
 	struct task_struct *curr, *idle;
 	struct mm_struct *prev_mm;
 	unsigned long bitmap[BITS_TO_LONGS(MAX_PRIO + 1)];
@@ -861,7 +872,7 @@
 
 	best_bonus = bonus(p);
 	prio = MAX_RT_PRIO + best_bonus;
-	if (sched_interactive && !batch_task(p))
+	if (sched_interactive && !sched_compute && !batch_task(p))
 		prio -= p->bonus;
 
 	rr = rr_interval(p);
@@ -1324,14 +1335,22 @@
 #endif
 
 /*
- * Check to see if p preempts rq->curr and resched if it does.
+ * Check to see if p preempts rq->curr and resched if it does. In compute
+ * mode we do not preempt for at least CACHE_DELAY and set rq->preempted.
  */
-static inline void preempt(const struct task_struct *p, struct rq *rq)
+static void fastcall preempt(const struct task_struct *p, struct rq *rq)
 {
-	if (TASK_PREEMPTS_CURR(p, rq))
-		resched_task(rq->curr);
-}
+	struct task_struct *curr = rq->curr;
 
+	if (p->prio >= curr->prio)
+		return;
+	if (!sched_compute || rq->cache_ticks >= CACHE_DELAY || !p->mm ||
+			rt_task(p) || curr == rq->idle) {
+		resched_task(curr);
+		return;
+	}
+	rq->preempted = 1;
+}
 
 /***
  * try_to_wake_up - wake up a thread
@@ -2930,6 +2949,9 @@
 		time_slice_expired(p, rq);
 		goto out_unlock;
 	}
+	rq->cache_ticks++;
+	if (rq->preempted && rq->cache_ticks >= CACHE_DELAY)
+		set_tsk_need_resched(p);
 out_unlock:
 	spin_unlock(&rq->lock);
 out:
@@ -3199,6 +3221,7 @@
 
 	sched_info_switch(prev, next);
 	if (likely(prev != next)) {
+		rq->preempted = rq->cache_ticks = 0;
 		next->timestamp = now;
 		rq->nr_switches++;
 		rq->curr = next;
@@ -6634,7 +6657,7 @@
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
-		rq->nr_running = 0;
+		rq->nr_running = rq->cache_ticks = rq->preempted = 0;
 
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
diff -urN oldtree/kernel/sysctl.c newtree/kernel/sysctl.c
--- oldtree/kernel/sysctl.c	2006-09-24 17:58:58.000000000 -0400
+++ newtree/kernel/sysctl.c	2006-09-24 18:05:02.000000000 -0400
@@ -696,6 +696,14 @@
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= KERN_COMPUTE,
+		.procname	= "compute",
+		.data		= &sched_compute,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 	{