Document the compute, interactive and iso_cpu entries under
/proc/sys/kernel.  The existing "l2cr: (PPC only)" heading is left
untouched and only serves as trailing context for the last hunk.

diff -urN newtree/Documentation/sysctl/kernel.txt newtree.2/Documentation/sysctl/kernel.txt
--- newtree/Documentation/sysctl/kernel.txt	2006-07-15 14:53:08.000000000 -0700
+++ newtree.2/Documentation/sysctl/kernel.txt	2006-08-02 08:28:57.000000000 -0700
@@ -18,6 +18,7 @@
 show up in /proc/sys/kernel:
 - acpi_video_flags
 - acct
+- compute
 - core_pattern
 - core_uses_pid
 - ctrl-alt-del
@@ -25,6 +26,8 @@
 - domainname
 - hostname
 - hotplug
+- interactive
+- iso_cpu
 - java-appletviewer [ binfmt_java, obsolete ]
 - java-interpreter [ binfmt_java, obsolete ]
 - l2cr [ PPC only ]
@@ -84,6 +87,16 @@
 
 ==============================================================
 
+compute:
+
+This flag controls the long timeslice, delayed preemption mode in the
+cpu scheduler suitable for scientific computation applications. It
+leads to large latencies so is unsuitable for normal usage.
+
+Disabled by default.
+
+==============================================================
+
 core_pattern:
 
 core_pattern is used to specify a core dumpfile pattern name.
@@ -161,7 +174,24 @@
 
 ==============================================================
 
+interactive:
+
+This flag controls the allocation of dynamic priorities in the cpu
+scheduler. It gives tasks that use little cpu a high priority for the
+lowest latencies. Nice values are still observed, but stricter cpu
+proportions are obeyed if this tunable is disabled. Enabled by default.
+
+==============================================================
+
+iso_cpu:
+
+This sets the percentage of cpu that unprivileged SCHED_ISO tasks can
+run effectively at realtime priority, averaged over a rolling 3 seconds.
+Set to 80% by default.
+
+==============================================================
+
 l2cr: (PPC only)
 
 This flag controls the L2 cache of G3 processor boards. If
 0, the cache is disabled. Enabled if nonzero.
/proc status output: print SleepAVG only for CONFIG_INGOSCHED and the
new Bonus field only for CONFIG_STAIRCASE.  p->bonus is an unsigned
int, so it is printed with %u.

diff -urN newtree/fs/proc/array.c newtree.2/fs/proc/array.c
--- newtree/fs/proc/array.c	2006-07-15 14:53:08.000000000 -0700
+++ newtree.2/fs/proc/array.c	2006-08-02 08:30:12.000000000 -0700
@@ -165,7 +165,12 @@
 	read_lock(&tasklist_lock);
 	buffer += sprintf(buffer,
 		"State:\t%s\n"
+#ifdef CONFIG_INGOSCHED
 		"SleepAVG:\t%lu%%\n"
+#endif
+#ifdef CONFIG_STAIRCASE
+		"Bonus:\t%u\n"
+#endif
 		"Tgid:\t%d\n"
 		"Pid:\t%d\n"
 		"PPid:\t%d\n"
@@ -173,7 +178,12 @@
 		"Uid:\t%d\t%d\t%d\t%d\n"
 		"Gid:\t%d\t%d\t%d\t%d\n",
 		get_task_state(p),
+#ifdef CONFIG_INGOSCHED
 		(p->sleep_avg/1024)*100/(1020000000/1024),
+#endif
+#ifdef CONFIG_STAIRCASE
+		p->bonus,
+#endif
 		p->tgid, p->pid,
 		pid_alive(p) ? p->group_leader->real_parent->tgid : 0,
 		pid_alive(p) && p->ptrace ? p->parent->pid : 0,

Initial task priority.  MAX_PRIO is MAX_RT_PRIO + 40 under
CONFIG_INGOSCHED and MAX_RT_PRIO + 41 under CONFIG_STAIRCASE, so
MAX_PRIO-20 and MAX_PRIO-21 respectively are both MAX_RT_PRIO+20.
Spelling it that way needs no preprocessor conditional, which could
not be placed inside the backslash-continued initializer macro anyway.
Also initialise the new mutexes_held counter.

diff -urN newtree/include/linux/init_task.h newtree.2/include/linux/init_task.h
--- newtree/include/linux/init_task.h	2006-08-02 07:14:22.000000000 -0700
+++ newtree.2/include/linux/init_task.h	2006-08-02 08:38:59.000000000 -0700
@@ -99,9 +99,9 @@
 	.usage		= ATOMIC_INIT(2),				\
 	.flags		= 0,						\
 	.lock_depth	= -1,						\
-	.prio		= MAX_PRIO-20,					\
-	.static_prio	= MAX_PRIO-20,					\
-	.normal_prio	= MAX_PRIO-20,					\
+	.prio		= MAX_RT_PRIO+20,				\
+	.static_prio	= MAX_RT_PRIO+20,				\
+	.normal_prio	= MAX_RT_PRIO+20,				\
 	.policy		= SCHED_NORMAL,					\
 	.cpus_allowed	= CPU_MASK_ALL,					\
 	.mm		= NULL,						\
@@ -135,6 +135,7 @@
 	.signal = {{0}}},						\
 	.blocked	= {{0}},					\
 	.alloc_lock	= __SPIN_LOCK_UNLOCKED(tsk.alloc_lock),		\
+	.mutexes_held	= 0,						\
 	.journal_info	= NULL,						\
 	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
 	.fs_excl	= ATOMIC_INIT(0),				\

Scheduler definitions: SCHED_ISO/SCHED_IDLEPRIO policies, policy
helper macros, the sysctl-backed tunables, and the per-scheduler
priority range and policy predicates.

diff -urN newtree/include/linux/sched.h newtree.2/include/linux/sched.h
--- newtree/include/linux/sched.h	2006-08-02 07:14:22.000000000 -0700
+++ newtree.2/include/linux/sched.h	2006-08-02 08:41:16.000000000 -0700
@@ -34,9 +34,20 @@
 #define SCHED_FIFO		1
 #define SCHED_RR		2
 #define SCHED_BATCH		3
+#ifdef CONFIG_STAIRCASE
+#define SCHED_ISO		4
+#define SCHED_IDLEPRIO		5
+#endif
 
 #ifdef __KERNEL__
 
+#ifdef CONFIG_STAIRCASE
+#define SCHED_MAX		SCHED_IDLEPRIO
+#define SCHED_RANGE(policy)	((policy) <= SCHED_MAX)
+#define SCHED_RT(policy)	((policy) == SCHED_FIFO || \
+					(policy) == SCHED_RR)
+#endif
+
 struct sched_param {
 	int sched_priority;
 };
@@ -207,6 +218,9 @@
 
 void io_schedule(void);
 long io_schedule_timeout(long timeout);
+#ifdef CONFIG_STAIRCASE
+extern int sched_interactive, sched_compute, sched_iso_cpu;
+#endif
 
 extern void cpu_init (void);
 extern void trap_init(void);
@@ -502,14 +516,32 @@
 
 #define MAX_USER_RT_PRIO	100
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO
+#ifdef CONFIG_STAIRCASE
+#define ISO_PRIO		(MAX_RT_PRIO - 1)
+#endif
 
+#ifdef CONFIG_INGOSCHED
 #define MAX_PRIO		(MAX_RT_PRIO + 40)
+#endif
+
+#ifdef CONFIG_STAIRCASE
+#define MAX_PRIO		(MAX_RT_PRIO + 41)
+#define MIN_USER_PRIO		(MAX_PRIO - 2)
+#define IDLEPRIO_PRIO		(MAX_PRIO - 1)
+#endif
 
 #define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
 #define rt_task(p)		rt_prio((p)->prio)
 #define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
+#ifdef CONFIG_INGOSCHED
 #define has_rt_policy(p) \
 	unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH)
+#endif
+#ifdef CONFIG_STAIRCASE
+#define has_rt_policy(p)	unlikely(SCHED_RT((p)->policy))
+#define iso_task(p)		(unlikely((p)->policy == SCHED_ISO))
+#define idleprio_task(p)	(unlikely((p)->policy == SCHED_IDLEPRIO))
+#endif
 
 /*
  * Some day this will be a full-fledged user tracking system..
@@ -776,6 +808,7 @@
 struct pipe_inode_info;
 struct uts_namespace;
 
+#ifdef CONFIG_INGOSCHED
 enum sleep_type {
 	SLEEP_NORMAL,
 	SLEEP_NONINTERACTIVE,
@@ -784,6 +817,7 @@
 };
 
 struct prio_array;
+#endif
 
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
@@ -802,19 +836,33 @@
 	int load_weight;	/* for niceness load balancing purposes */
 	int prio, static_prio, normal_prio;
 	struct list_head run_list;
+#ifdef CONFIG_INGOSCHED
 	struct prio_array *array;
+#endif
 
 	unsigned short ioprio;
 	unsigned int btrace_seq;
 
+#ifdef CONFIG_INGOSCHED
 	unsigned long sleep_avg;
 	unsigned long long timestamp, last_ran;
+#endif
+#ifdef CONFIG_STAIRCASE
+	unsigned long long timestamp;
+	unsigned long runtime, totalrun, ns_debit, systime;
+	unsigned int bonus;
+	unsigned int slice, time_slice;
+#endif
 	unsigned long long sched_time; /* sched_clock time spent running */
+#ifdef CONFIG_INGOSCHED
 	enum sleep_type sleep_type;
+#endif
 
 	unsigned long policy;
 	cpumask_t cpus_allowed;
+#ifdef CONFIG_INGOSCHED
 	unsigned int time_slice, first_time_slice;
+#endif
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	struct sched_info sched_info;
@@ -972,6 +1020,7 @@
 	struct held_lock held_locks[MAX_LOCK_DEPTH];
 	unsigned int lockdep_recursion;
 #endif
+	unsigned long mutexes_held;
 
 /* journalling filesystem info */
 	void *journal_info;
@@ -1090,8 +1139,15 @@
 #define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
 #define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
+#ifdef CONFIG_STAIRCASE
+#define PF_ISOREF	0x04000000	/* SCHED_ISO task has used up quota */
+#endif
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
+#ifdef CONFIG_STAIRCASE
+#define PF_NONSLEEP	0x40000000	/* Waiting on in kernel activity */
+#define PF_FORKED	0x80000000	/* Task just forked another process */
+#endif
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
@@ -1227,7 +1283,9 @@
 static inline void kick_process(struct task_struct *tsk) { }
 #endif
 extern void FASTCALL(sched_fork(struct task_struct * p, int clone_flags));
+#ifdef CONFIG_INGOSCHED
 extern void FASTCALL(sched_exit(struct task_struct * p));
+#endif
 
 extern int in_group_p(gid_t);
 extern int in_egroup_p(gid_t);

Reserve binary sysctl numbers 77-79 for the interactive, compute and
iso_cpu tunables.

diff -urN newtree/include/linux/sysctl.h newtree.2/include/linux/sysctl.h
--- newtree/include/linux/sysctl.h	2006-08-02 07:14:22.000000000 -0700
+++ newtree.2/include/linux/sysctl.h	2006-08-02 08:42:02.000000000 -0700
@@ -153,6 +153,9 @@
 	KERN_MAX_LOCK_DEPTH=74,
 	KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */
 	KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */
+	KERN_INTERACTIVE=77,	/* interactive tasks can have cpu bursts */
+	KERN_COMPUTE=78,	/* adjust timeslices for a compute server */
+	KERN_ISO_CPU=79,	/* percent cpu SCHED_ISO tasks run SCHED_RR */
 };
 
 

Call sched_exit() only when CONFIG_INGOSCHED is set, matching the
guarded declaration in sched.h.

diff -urN newtree/kernel/exit.c newtree.2/kernel/exit.c
--- newtree/kernel/exit.c	2006-08-02 07:14:23.000000000 -0700
+++ newtree.2/kernel/exit.c	2006-08-02 08:42:19.000000000 -0700
@@ -165,7 +165,9 @@
 		zap_leader = (leader->exit_signal == -1);
 	}
 
+#ifdef CONFIG_INGOSCHED
 	sched_exit(p);
+#endif
 	write_unlock_irq(&tasklist_lock);
 	proc_flush_task(p);
 	release_thread(p);

A newly forked task starts with mutexes_held cleared.

diff -urN newtree/kernel/fork.c newtree.2/kernel/fork.c
--- newtree/kernel/fork.c	2006-08-02 07:14:23.000000000 -0700
+++ newtree.2/kernel/fork.c	2006-08-02 08:42:47.000000000 -0700
@@ -1042,6 +1042,7 @@
 	p->io_context = NULL;
 	p->io_wait = NULL;
 	p->audit_context = NULL;
+	p->mutexes_held = 0;
 
 	p->tgid = p->pid;
 	if (clone_flags & CLONE_THREAD)

Maintain current->mutexes_held: incremented after a successful
mutex_lock(), mutex_lock_interruptible() or mutex_trylock(), and
decremented in mutex_unlock().

diff -urN newtree/kernel/mutex.c newtree.2/kernel/mutex.c
--- newtree/kernel/mutex.c	2006-07-15 14:53:08.000000000 -0700
+++ newtree.2/kernel/mutex.c	2006-08-02 08:47:13.000000000 -0700
@@ -60,6 +60,16 @@
 static void fastcall noinline __sched
 __mutex_lock_slowpath(atomic_t *lock_count);
 
+static inline void inc_mutex_count(void)
+{
+	current->mutexes_held++;
+}
+
+static inline void dec_mutex_count(void)
+{
+	current->mutexes_held--;
+}
+
 /***
  * mutex_lock - acquire the mutex
  * @lock: the mutex to be acquired
@@ -89,6 +99,7 @@
 	 * 'unlocked' into 'locked' state.
 	 */
 	__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
+	inc_mutex_count();
 }
 
 EXPORT_SYMBOL(mutex_lock);
@@ -114,6 +125,7 @@
 	 * into 'unlocked' state:
 	 */
 	__mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
+	dec_mutex_count();
 }
 
 EXPORT_SYMBOL(mutex_unlock);
@@ -274,9 +286,14 @@
  */
 int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
 {
+	int ret;
+
 	might_sleep();
-	return __mutex_fastpath_lock_retval
+	ret = __mutex_fastpath_lock_retval
 			(&lock->count, __mutex_lock_interruptible_slowpath);
+	if (likely(!ret))
+		inc_mutex_count();
+	return ret;
 }
 
 EXPORT_SYMBOL(mutex_lock_interruptible);
@@ -331,8 +348,12 @@
  */
 int fastcall __sched mutex_trylock(struct mutex *lock)
 {
-	return __mutex_fastpath_trylock(&lock->count,
+	int ret = __mutex_fastpath_trylock(&lock->count,
 					__mutex_trylock_slowpath);
+
+	if (likely(ret))
+		inc_mutex_count();
+	return ret;
 }
 
 EXPORT_SYMBOL(mutex_trylock);

Register the interactive, compute and iso_cpu entries in kern_table
under CONFIG_STAIRCASE.  iso_cpu is range-checked against the zero and
one_hundred bounds; the other two use plain proc_dointvec.

diff -urN newtree/kernel/sysctl.c newtree.2/kernel/sysctl.c
--- newtree/kernel/sysctl.c	2006-08-02 07:14:23.000000000 -0700
+++ newtree.2/kernel/sysctl.c	2006-08-02 08:49:50.000000000 -0700
@@ -234,6 +234,11 @@
 	{ .ctl_name = 0 }
 };
 
+/* Constants for minimum and maximum testing.
+   We use these as one-element integer vectors. */
+static int zero;
+static int one_hundred = 100;
+
 static ctl_table kern_table[] = {
 #ifndef CONFIG_UTS_NS
 	{
@@ -685,6 +690,35 @@
 		.mode		= 0444,
 		.proc_handler	= &proc_dointvec,
 	},
+#ifdef CONFIG_STAIRCASE
+	{
+		.ctl_name	= KERN_INTERACTIVE,
+		.procname	= "interactive",
+		.data		= &sched_interactive,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_COMPUTE,
+		.procname	= "compute",
+		.data		= &sched_compute,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_ISO_CPU,
+		.procname	= "iso_cpu",
+		.data		= &sched_iso_cpu,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+#endif
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 	{
 		.ctl_name       = KERN_UNKNOWN_NMI_PANIC,