diff -Naur linux-2.6.11-rc2-mm1/fs/proc/array.c linux-2.6.11-rc2/fs/proc/array.c --- linux-2.6.11-rc2-mm1/fs/proc/array.c 2005-01-24 13:38:21.000000000 -0800 +++ linux-2.6.11-rc2/fs/proc/array.c 2005-01-24 13:43:13.805072848 -0800 @@ -163,7 +163,8 @@ read_lock(&tasklist_lock); buffer += sprintf(buffer, "State:\t%s\n" - "SleepAVG:\t%lu%%\n" + "sleep_time:\t%lu\n" + "total_time:\t%lu\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -171,7 +172,7 @@ "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1020000000/1024), + p->sleep_time, p->total_time, p->tgid, p->pid, pid_alive(p) ? p->group_leader->real_parent->tgid : 0, pid_alive(p) && p->ptrace ? p->parent->pid : 0, diff -Naur linux-2.6.11-rc2-mm1/include/linux/init_task.h linux-2.6.11-rc2/include/linux/init_task.h --- linux-2.6.11-rc2-mm1/include/linux/init_task.h 2005-01-24 13:38:27.000000000 -0800 +++ linux-2.6.11-rc2/include/linux/init_task.h 2005-01-24 14:41:11.636362128 -0800 @@ -74,15 +74,14 @@ .usage = ATOMIC_INIT(2), \ .flags = 0, \ .lock_depth = -1, \ - .prio = MAX_PRIO-20, \ - .static_prio = MAX_PRIO-20, \ + .prio = MAX_PRIO-29, \ + .static_prio = MAX_PRIO-29, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ .iso_list = LIST_HEAD_INIT(tsk.iso_list), \ - .time_slice = HZ, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ @@ -109,7 +108,6 @@ .blocked = {{0}}, \ .alloc_lock = SPIN_LOCK_UNLOCKED, \ .proc_lock = SPIN_LOCK_UNLOCKED, \ - .switch_lock = SPIN_LOCK_UNLOCKED, \ .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .private_pages = LIST_HEAD_INIT(tsk.private_pages), \ diff -Naur linux-2.6.11-rc2-mm1/include/linux/sched.h linux-2.6.11-rc2/include/linux/sched.h --- linux-2.6.11-rc2-mm1/include/linux/sched.h 2005-01-24 13:38:27.000000000 -0800 +++ linux-2.6.11-rc2/include/linux/sched.h 2005-01-24 14:42:57.979195560 -0800 @@ -362,6 +362,11 @@ struct list_head cpu_timers[3]; }; +/* Context switch must be unlocked if interrupts are to be enabled */ +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +# define __ARCH_WANT_UNLOCKED_CTXSW +#endif + /* * Bits in flags field of signal_struct. */ @@ -387,7 +392,7 @@ #define MAX_USER_RT_PRIO 100 #define MAX_RT_PRIO MAX_USER_RT_PRIO -#define MAX_PRIO (MAX_RT_PRIO + 40) +#define MAX_PRIO (MAX_RT_PRIO + 59) #define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO)) #define iso_task(p) (unlikely((p)->policy == SCHED_ISO)) @@ -460,7 +465,6 @@ #define SD_WAKE_IDLE 8 /* Wake to idle CPU on task wakeup */ #define SD_WAKE_AFFINE 16 /* Wake task to waking CPU */ #define SD_WAKE_BALANCE 32 /* Perform balancing at task wakeup */ -#define SD_SHARE_CPUPOWER 64 /* Domain members share cpu power */ struct sched_group { struct sched_group *next; /* Must be a circular list */ @@ -484,7 +488,9 @@ unsigned int imbalance_pct; /* No balance until over watermark */ unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ - unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ + unsigned int load_idx; + unsigned int idle_load_idx; + unsigned int wake_balance_load_idx; int flags; /* See SD_* */ /* Runtime fields. */ @@ -496,17 +502,26 @@ /* load_balance() stats */ unsigned long lb_cnt[MAX_IDLE_TYPES]; unsigned long lb_failed[MAX_IDLE_TYPES]; + unsigned long lb_balanced[MAX_IDLE_TYPES]; unsigned long lb_imbalance[MAX_IDLE_TYPES]; + unsigned long lb_gained[MAX_IDLE_TYPES]; + unsigned long lb_hot_gained[MAX_IDLE_TYPES]; unsigned long lb_nobusyg[MAX_IDLE_TYPES]; unsigned long lb_nobusyq[MAX_IDLE_TYPES]; + /* Active load balancing */ + unsigned long alb_cnt; + unsigned long alb_failed; + unsigned long alb_pushed; + /* sched_balance_exec() stats */ unsigned long sbe_attempts; unsigned long sbe_pushed; /* try_to_wake_up() stats */ - unsigned long ttwu_wake_affine; - unsigned long ttwu_wake_balance; + unsigned long ttwu_wake_remote; + unsigned long ttwu_move_affine; + unsigned long ttwu_move_balance; #endif }; @@ -568,20 +583,23 @@ unsigned long ptrace; int lock_depth; /* Lock depth */ - +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + int oncpu; +#endif int prio, static_prio; struct list_head run_list; struct list_head iso_list; prio_array_t *array; - unsigned long sleep_avg; + unsigned long array_sequence; unsigned long long timestamp, last_ran; unsigned long long sched_time; /* sched_clock time spent running */ - int activated; + int used_slice; + unsigned long total_time, sleep_time; unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; + #ifdef CONFIG_SCHEDSTATS struct sched_info sched_info; @@ -689,8 +707,6 @@ spinlock_t alloc_lock; /* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */ spinlock_t proc_lock; -/* context-switch lock */ - spinlock_t switch_lock; /* journalling filesystem info */ void *journal_info; diff -Naur linux-2.6.11-rc2-mm1/include/linux/sysctl.h linux-2.6.11-rc2/include/linux/sysctl.h --- linux-2.6.11-rc2-mm1/include/linux/sysctl.h 2005-01-24 13:38:28.000000000 -0800 +++ linux-2.6.11-rc2/include/linux/sysctl.h 2005-01-24 14:43:38.508034240 -0800 @@ -137,6 +137,7 @@ KERN_BOOTLOADER_TYPE=67, /* int: boot loader type */ KERN_ISO_CPU=68, /* int: cpu% allowed by SCHED_ISO class */ KERN_ISO_PERIOD=69, /* int: seconds over which SCHED_ISO cpu is decayed */ + KERN_SCHED_TIMESLICE=70, /* int: base timeslice for scheduler */ }; diff -Naur linux-2.6.11-rc2-mm1/kernel/#sched.c# linux-2.6.11-rc2/kernel/#sched.c# --- linux-2.6.11-rc2-mm1/kernel/#sched.c# 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.11-rc2/kernel/#sched.c# 2005-01-24 13:57:34.124284448 -0800 @@ -0,0 +1,5323 @@ +/* + * kernel/sched.c + * + * Kernel scheduler and related syscalls + * + * Copyright (C) 1991-2002 Linus Torvalds + * + * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and + * make semaphores SMP safe + * 1998-11-19 Implemented schedule_timeout() and related stuff + * by Andrea Arcangeli + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2004-04-02 Scheduler domains code by Nick Piggin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * and back. + */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 30) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 30) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) + +#define US_TO_JIFFIES(x) ((x) * HZ / 1000000) +#define JIFFIES_TO_US(x) ((x) * 1000000 / HZ) + + +/* + * MIN_TIMESLICE is the timeslice that a minimum priority process gets if there + * is a maximum priority process runnable. MAX_TIMESLICE is derived from the + * formula in task_timeslice. It cannot be changed here. It is the timesilce + * that the maximum priority process will get. Larger timeslices are attainable + * by low priority processes however. + */ +int sched_base_timeslice = 64; +int sched_min_base = 1; +int sched_max_base = 10000; + + ++ #define RT_TIMESLICE (50 * 1000 / HZ) /* 50ms */ ++ #define BASE_TIMESLICE (sched_base_timeslice) ++ #define MIN_TIMESLICE 1 + ++ /* Maximum amount of history that will be used to calculate priority */ ++ #define MAX_SLEEP_SHIFT 19 ++ #define MAX_SLEEP (1UL << MAX_SLEEP_SHIFT) /* roughly 0.52s */ + ++ /* ++ * Maximum effect that 1 block of activity (run/sleep/etc) can have. This is ++ * will moderate dicard freak events (eg. SIGSTOP) ++ */ ++ #define MAX_SLEEP_AFFECT (MAX_SLEEP/4) + ++ /* ++ * The amount of history can be decreased (on fork for example). This puts a ++ * lower bound on it. ++ */ + ++ #define MIN_HISTORY (MAX_SLEEP/8) + ++ #define FORKED_TS_MAX (US_TO_JIFFIES(MIN_HISTORY) ?: 1) + ++ /* ++ * SLEEP_FACTOR is a fixed point factor used to scale history tracking things. ++ * In particular: total_time, sleep_time, sleep_avg. ++ */ ++ #define SLEEP_FACTOR 1024 + + + +int iso_cpu = 70; /* The soft %cpu limit on SCHED_ISO tasks */ +int iso_period = 5; /* The time over which SCHED_ISO cpu decays */ + +#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ + < (long long) (sd)->cache_hot_time) + +/* + * These are the runqueue data structures: + */ + +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) + +typedef struct runqueue runqueue_t; + +struct prio_array { + int min_prio; + unsigned int nr_active; + unsigned long bitmap[BITMAP_SIZE]; + struct list_head queue[MAX_PRIO]; +}; + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct runqueue { + spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. + */ + unsigned long nr_running; +#ifdef CONFIG_SMP + unsigned long cpu_load; +#endif + unsigned long iso_ticks; + struct list_head iso_queue; + int iso_refractory; + /* + * Refractory is the flag that we've hit the maximum iso cpu and are + * in the refractory period where SCHED_ISO tasks can only run as + * SCHED_NORMAL until their cpu usage drops to 90% of their iso_cpu + * limit. + */ + + unsigned long long nr_switches; + + /* + * This is part of a global counter where only the total sum + * over all CPUs matters. A task can increase this counter on + * one CPU and if it got migrated afterwards it may decrease + * it on another CPU. Always updated under the runqueue lock: + */ + + unsigned long long timestamp_last_tick; + task_t *curr, *idle; + struct mm_struct *prev_mm; + atomic_t nr_iowait; + prio_array_t *active, *expired, arrays[2]; + +#ifdef CONFIG_SMP + unsigned long long timestamp_last_tick; + + struct sched_domain *sd; + + /* For active balancing */ + int active_balance; + int push_cpu; + + task_t *migration_thread; + struct list_head migration_queue; +#endif + +#ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + + /* sys_sched_yield() stats */ + unsigned long yld_exp_empty; + unsigned long yld_act_empty; + unsigned long yld_both_empty; + unsigned long yld_cnt; + + /* schedule() stats */ + unsigned long sched_switch; + unsigned long sched_cnt; + unsigned long sched_goidle; + + /* try_to_wake_up() stats */ + unsigned long ttwu_cnt; + unsigned long ttwu_local; +#endif +}; + +static DEFINE_PER_CPU(struct runqueue, runqueues); + +#define for_each_domain(cpu, domain) \ + for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) + +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() (&__get_cpu_var(runqueues)) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) + +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev) do { } while (0) +#endif + +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline int task_running(runqueue_t *rq, task_t *p) +{ + return rq->curr == p; +} + +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +{ +} + +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +{ + spin_unlock_irq(&rq->lock); +} + +#else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline int task_running(runqueue_t *rq, task_t *p) +{ +#ifdef CONFIG_SMP + return p->oncpu; +#else + return rq->curr == p; +#endif +} + +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->oncpu = 1; +#endif +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + spin_unlock_irq(&rq->lock); +#else + spin_unlock(&rq->lock); +#endif +} + +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->oncpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->oncpu = 0; +#endif +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_enable(); +#endif +} +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ + +static inline int task_preempts_curr(task_t *p, runqueue_t *rq) +{ + if ((!iso_task(p) && !iso_task(rq->curr)) || rq->iso_refractory || + rt_task(p) || rt_task(rq->curr)) { + if (p->prio < rq->curr->prio) + return 1; + return 0; + } + if (iso_task(p) && !iso_task(rq->curr)) + return 1; + return 0; +} + +/* + * task_rq_lock - lock the runqueue a given task resides on and disable + * interrupts. Note the ordering: we can safely lookup the task_rq without + * explicitly disabling preemption. + */ +static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) + __acquires(rq->lock) +{ + struct runqueue *rq; + + ltt_ev_process(LTT_EV_PROCESS_WAKEUP, p->pid, p->state); + +repeat_lock_task: + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); + if (unlikely(rq != task_rq(p))) { + spin_unlock_irqrestore(&rq->lock, *flags); + goto repeat_lock_task; + } + return rq; +} + +static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) + __releases(rq->lock) +{ + spin_unlock_irqrestore(&rq->lock, *flags); +} + +#ifdef CONFIG_SCHEDSTATS +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 11 + +static int show_schedstat(struct seq_file *seq, void *v) +{ + int cpu; + enum idle_type itype; + + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); + seq_printf(seq, "timestamp %lu\n", jiffies); + for_each_online_cpu(cpu) { + runqueue_t *rq = cpu_rq(cpu); +#ifdef CONFIG_SMP + struct sched_domain *sd; + int dcnt = 0; +#endif + + /* runqueue-specific stats */ + seq_printf(seq, + "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", + cpu, rq->yld_both_empty, + rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, + rq->sched_switch, rq->sched_cnt, rq->sched_goidle, + rq->ttwu_cnt, rq->ttwu_local, + rq->rq_sched_info.cpu_time, + rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); + + seq_printf(seq, "\n"); + +#ifdef CONFIG_SMP + /* domain-specific stats */ + for_each_domain(cpu, sd) { + char mask_str[NR_CPUS]; + + cpumask_scnprintf(mask_str, NR_CPUS, sd->span); + seq_printf(seq, "domain%d %s", dcnt++, mask_str); + for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; + itype++) { + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", + sd->lb_cnt[itype], + sd->lb_balanced[itype], + sd->lb_failed[itype], + sd->lb_imbalance[itype], + sd->lb_gained[itype], + sd->lb_hot_gained[itype], + sd->lb_nobusyq[itype], + sd->lb_nobusyg[itype]); + } + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n", + sd->alb_cnt, sd->alb_failed, sd->alb_pushed, + sd->sbe_pushed, sd->sbe_attempts, + sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); + } +#endif + } + return 0; +} + +static int schedstat_open(struct inode *inode, struct file *file) +{ + unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); + char *buf = kmalloc(size, GFP_KERNEL); + struct seq_file *m; + int res; + + if (!buf) + return -ENOMEM; + res = single_open(file, show_schedstat, NULL); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = size; + } else + kfree(buf); + return res; +} + +struct file_operations proc_schedstat_operations = { + .open = schedstat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +# define schedstat_inc(rq, field) do { rq->field++; } while (0) +# define schedstat_add(rq, field, amt) do { rq->field += amt; } while (0) +#else /* !CONFIG_SCHEDSTATS */ +# define schedstat_inc(rq, field) do { } while (0) +# define schedstat_add(rq, field, amt) do { } while (0) +#endif + + +#ifdef CONFIG_SCHED_SMT +static int cpu_and_siblings_are_idle(int cpu) +{ + int sib; + for_each_cpu_mask(sib, cpu_sibling_map[cpu]) { + if (idle_cpu(sib)) + continue; + return 0; + } + + return 1; +} +#else +#define cpu_and_siblings_are_idle(A) idle_cpu(A) +#endif + +#ifdef CONFIG_SCHEDSTATS +/* + * Called when a process is dequeued from the active array and given + * the cpu. We should note that with the exception of interactive + * tasks, the expired queue will become the active queue after the active + * queue is empty, without explicitly dequeuing and requeuing tasks in the + * expired queue. (Interactive tasks may be requeued directly to the + * active queue, thus delaying tasks in the expired queue from running; + * see scheduler_tick()). + * + * This function is only called from sched_info_arrive(), rather than + * dequeue_task(). Even though a task may be queued and dequeued multiple + * times as it is shuffled about, we're really interested in knowing how + * long it was from the *first* time it was queued to the time that it + * finally hit a cpu. + */ +static inline void sched_info_dequeued(task_t *t) +{ + t->sched_info.last_queued = 0; +} + +/* + * Called when a task finally hits the cpu. We can now calculate how + * long it was waiting to run. We also note when it began so that we + * can keep stats on how long its timeslice is. + */ +static inline void sched_info_arrive(task_t *t) +{ + unsigned long now = jiffies, diff = 0; + struct runqueue *rq = task_rq(t); + + if (t->sched_info.last_queued) + diff = now - t->sched_info.last_queued; + sched_info_dequeued(t); + t->sched_info.run_delay += diff; + t->sched_info.last_arrival = now; + t->sched_info.pcnt++; + + if (!rq) + return; + + rq->rq_sched_info.run_delay += diff; + rq->rq_sched_info.pcnt++; +} + +/* + * Called when a process is queued into either the active or expired + * array. The time is noted and later used to determine how long we + * had to wait for us to reach the cpu. Since the expired queue will + * become the active queue after active queue is empty, without dequeuing + * and requeuing any tasks, we are interested in queuing to either. It + * is unusual but not impossible for tasks to be dequeued and immediately + * requeued in the same or another array: this can happen in sched_yield(), + * set_user_nice(), and even load_balance() as it moves tasks from runqueue + * to runqueue. + * + * This function is only called from enqueue_task(), but also only updates + * the timestamp if it is already not set. It's assumed that + * sched_info_dequeued() will clear that stamp when appropriate. + */ +static inline void sched_info_queued(task_t *t) +{ + if (!t->sched_info.last_queued) + t->sched_info.last_queued = jiffies; +} + +/* + * Called when a process ceases being the active-running process, either + * voluntarily or involuntarily. Now we can calculate how long we ran. + */ +static inline void sched_info_depart(task_t *t) +{ + struct runqueue *rq = task_rq(t); + unsigned long diff = jiffies - t->sched_info.last_arrival; + + t->sched_info.cpu_time += diff; + + if (rq) + rq->rq_sched_info.cpu_time += diff; +} + +/* + * Called when tasks are switched involuntarily due, typically, to expiring + * their time slice. (This may also be called when switching to or from + * the idle task.) We are only called when prev != next. + */ +static inline void sched_info_switch(task_t *prev, task_t *next) +{ + struct runqueue *rq = task_rq(prev); + + /* + * prev now departs the cpu. It's not interesting to record + * stats about how efficient we were at scheduling the idle + * process, however. + */ + if (prev != rq->idle) + sched_info_depart(prev); + + if (next != rq->idle) + sched_info_arrive(next); +} +#else +#define sched_info_queued(t) do { } while (0) +#define sched_info_switch(t, next) do { } while (0) +#endif /* CONFIG_SCHEDSTATS */ + +static inline int iso_queued(runqueue_t *rq) +{ + return !list_empty(&rq->iso_queue); +} + +static inline void dequeue_iso_task(struct task_struct *p) +{ + list_del(&p->iso_list); +} + +/* + * Adding/removing a task to/from a priority array: + */ +static void dequeue_task(struct task_struct *p, prio_array_t *array) +{ + array->nr_active--; + if (iso_task(p)) + dequeue_iso_task(p); + list_del(&p->run_list); + if (list_empty(array->queue + p->prio)) + __clear_bit(p->prio, array->bitmap); +} + +/* + * SCHED_ISO tasks are queued at both runqueues. Their actual priority is + * either better than SCHED_NORMAL if below starvation limits, or + * the underlying SCHED_NORMAL dynamic priority. + */ +static void enqueue_iso_task(struct task_struct *p) +{ + runqueue_t *rq = task_rq(p); + list_add_tail(&p->iso_list, &rq->iso_queue); +} + +static void enqueue_task(struct task_struct *p, prio_array_t *array) +{ + struct list_head *entry = array->queue + p->prio; + sched_info_queued(p); + + if (!rt_task(p)) { + /* + * Cycle tasks on the same priority level. This reduces their + * timeslice fluctuations due to higher priority tasks expiring. + */ + if (!list_empty(entry)) + entry = entry->next; + } + list_add_tail(&p->run_list, entry); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; + if (iso_task(p)) + enqueue_iso_task(p); +} + +static void requeue_iso_task(struct task_struct *p) +{ + runqueue_t *rq = task_rq(p); + list_move_tail(&p->iso_list, &rq->iso_queue); +} + +/* + * Put task to the end of the run list without the overhead of dequeue + * followed by enqueue. + */ +static void requeue_task(struct task_struct *p, prio_array_t *array) +{ + list_move_tail(&p->run_list, array->queue + p->prio); + if (iso_task(p)) + requeue_iso_task(p); +} + +static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) +{ + list_add(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; +} + +static inline unsigned long long clock_us(void) +{ + return sched_clock() >> 10; +} + +/* + * add_task_time updates a task @p after @time of doing the specified @type + * of activity. See STIME_*. This is used for priority calculation. + */ +static inline void add_task_time(task_t *p, unsigned long long time, unsigned long type) +{ + unsigned long ratio; + unsigned long long tmp; + unsigned long t; + + if (type == STIME_SLEEP) { + if (time > MAX_SLEEP_AFFECT*4) + time = MAX_SLEEP_AFFECT*4; + t = ((unsigned long)time + 3) / 4; + } else { + unsigned long div = 60 - USER_PRIO(p->static_prio); + t = (unsigned long)time * 30; + t = t / div; + t = t * 30; + t = t / div; + } + + ratio = MAX_SLEEP - t; + tmp = (unsigned long long)ratio*p->total_time + MAX_SLEEP/2; + tmp >>= MAX_SLEEP_SHIFT; + p->total_time = (unsigned long)tmp; + + tmp = (unsigned long long)ratio*p->sleep_time + MAX_SLEEP/2; + tmp >>= MAX_SLEEP_SHIFT; + p->sleep_time = (unsigned long)tmp; + + p->total_time += t; + if (type == STIME_SLEEP) + p->sleep_time += t; +} + +static unsigned long task_sleep_avg(task_t *p) +{ + return (SLEEP_FACTOR * p->sleep_time) / (p->total_time + 1); +} + +/* + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. + * + * Timeslices are scaled, so if only low priority processes are running, + * they will all get long timeslices. + */ +static int task_timeslice(task_t *p, runqueue_t *rq) +{ + int idx, base, delta; + int timeslice; + + if (rt_task(p)) + return RT_TIMESLICE; + + idx = min(p->prio, rq->expired->min_prio); + delta = p->prio - idx; + base = BASE_TIMESLICE * (MAX_USER_PRIO + 1) / (delta + 2); + base = base * (MAX_USER_PRIO + 1) / (delta + 2); + + base = base * 40 / (70 - USER_PRIO(idx)); + base = base * 40 / (70 - USER_PRIO(idx)); + + timeslice = base >> 10; + timeslice = timeslice * HZ / 1000; + if (timeslice < MIN_TIMESLICE) + timeslice = MIN_TIMESLICE; + + return timeslice; +} + +/* + * task_priority: calculates a task's priority based on previous running + * history (see add_task_time). The priority is just a simple linear function + * based on sleep_avg and static_prio. + */ +static int task_priority(task_t *p) +{ + unsigned long sleep_avg; + int bonus, prio; + + if (rt_task(p)) + return p->prio; + + sleep_avg = task_sleep_avg(p); + + prio = USER_PRIO(p->static_prio) + 10; + bonus = (((MAX_USER_PRIO + 1) / 3) * sleep_avg + (SLEEP_FACTOR / 2)) + / SLEEP_FACTOR; + prio = MAX_RT_PRIO + prio - bonus; + + if (prio < MAX_RT_PRIO) + return MAX_RT_PRIO; + if (prio > MAX_PRIO-1) + return MAX_PRIO-1; + + return prio; +} + +/* + * __activate_task - move a task to the runqueue. + */ +static inline void __activate_task(task_t *p, runqueue_t *rq, prio_array_t *array) +{ + enqueue_task(p, array); + rq->nr_running++; + if (!rt_task(p)) { + if (p->prio < array->min_prio) + array->min_prio = p->prio; + } +} + +/* + * __activate_idle_task - move idle task to the _front_ of runqueue. + */ +static inline void __activate_idle_task(task_t *p, runqueue_t *rq) +{ + enqueue_task_head(p, rq->active); + rq->nr_running++; +} + +/* + * activate_task - move a task to the runqueue and do priority recalculation + * + * Update all the scheduling statistics stuff. (sleep average + * calculation, priority modifiers, etc.) + */ +static void activate_task(task_t *p, runqueue_t *rq, int local) +{ + unsigned long long now, sleep; + prio_array_t *array; + + now = clock_us(); +#ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ + runqueue_t *this_rq = this_rq(); + now = (now - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; + } +#endif + + + /* + * If we have slept through an active/expired array switch, restart + * our timeslice too. + */ + + sleep = now - p->timestamp; + p->timestamp = now; + add_task_time(p, sleep, STIME_SLEEP); + p->prio = task_priority(p); + + array = rq->active; + if (unlikely(p->used_slice == -1)) { + /* This only applys to newly woken children */ + array = rq->expired; + p->used_slice = 0; + } else if (rq->array_sequence != p->array_sequence) + p->used_slice = 0; + + __activate_task(p, rq, array); +} + +/* + * deactivate_task - remove a task from the runqueue. + */ +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) +{ + p->array_sequence = rq->array_sequence; + rq->nr_running--; + dequeue_task(p, p->array); + p->array = NULL; +} + +/* + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. + */ +#ifdef CONFIG_SMP +static void resched_task(task_t *p) +{ + int need_resched, nrpolling; + + BUG_ON(!spin_is_locked(&task_rq(p)->lock)); + + /* minimise the chance of sending an interrupt to poll_idle() */ + nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); + need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED); + nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); + + if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) + smp_send_reschedule(task_cpu(p)); +} +#else +static inline void resched_task(task_t *p) +{ + set_tsk_need_resched(p); +} +#endif + +/** + * task_curr - is this task currently executing on a CPU? + * @p: the task in question. + */ +inline int task_curr(const task_t *p) +{ + return cpu_curr(task_cpu(p)) == p; +} + +#ifdef CONFIG_SMP +enum request_type { + REQ_MOVE_TASK, + REQ_SET_DOMAIN, +}; + +typedef struct { + struct list_head list; + enum request_type type; + + /* For REQ_MOVE_TASK */ + task_t *task; + int dest_cpu; + + /* For REQ_SET_DOMAIN */ + struct sched_domain *sd; + + struct completion done; +} migration_req_t; + +/* + * The task's runqueue lock must be held. + * Returns true if you have to wait for migration thread. + */ +static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) +{ + runqueue_t *rq = task_rq(p); + + /* + * If the task is not on a runqueue (and not running), then + * it is sufficient to simply update the task's cpu field. + */ + if (!p->array && !task_running(rq, p)) { + set_task_cpu(p, dest_cpu); + return 0; + } + + init_completion(&req->done); + req->type = REQ_MOVE_TASK; + req->task = p; + req->dest_cpu = dest_cpu; + list_add(&req->list, &rq->migration_queue); + return 1; +} + +/* + * wait_task_inactive - wait for a thread to unschedule. + * + * The caller must ensure that the task *will* unschedule sometime soon, + * else this function might spin for a *long* time. This function can't + * be called with interrupts off, or it may introduce deadlock with + * smp_call_function() if an IPI is sent by the same process we are + * waiting to become inactive. + */ +void wait_task_inactive(task_t * p) +{ + unsigned long flags; + runqueue_t *rq; + int preempted; + +repeat: + rq = task_rq_lock(p, &flags); + /* Must be off runqueue entirely, not preempted. */ + if (unlikely(p->array)) { + /* If it's preempted, we yield. It could be a while. */ + preempted = !task_running(rq, p); + task_rq_unlock(rq, &flags); + cpu_relax(); + if (preempted) + yield(); + goto repeat; + } + task_rq_unlock(rq, &flags); +} + +/*** + * kick_process - kick a running thread to enter/exit the kernel + * @p: the to-be-kicked thread + * + * Cause a process which is running on another CPU to enter + * kernel-mode, without any delay. (to get signals handled.) + */ +void kick_process(task_t *p) +{ + int cpu; + + preempt_disable(); + cpu = task_cpu(p); + if ((cpu != smp_processor_id()) && task_curr(p)) + smp_send_reschedule(cpu); + preempt_enable(); +} + +/* + * Return a low guess at the load of a migration-source cpu. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively. + */ +static inline unsigned long source_load(int cpu, int type) +{ + runqueue_t *rq = cpu_rq(cpu); + unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + + return min(rq->cpu_load[type], load_now); +} + +/* + * Return a high guess at the load of a migration-target cpu + */ +static inline unsigned long target_load(int cpu, int type) +{ + runqueue_t *rq = cpu_rq(cpu); + unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + + return max(rq->cpu_load[type], load_now); +} + +#endif + +/* + * wake_idle() will wake a task on an idle cpu if task->cpu is + * not idle and an idle cpu is available. The span of cpus to + * search starts with cpus closest then further out as needed, + * so we always favor a closer, idle cpu. + * + * Returns the CPU we should wake onto. + */ +#if defined(ARCH_HAS_SCHED_WAKE_IDLE) +static int wake_idle(int cpu, task_t *p) +{ + cpumask_t tmp; + struct sched_domain *sd; + int i; + + if (idle_cpu(cpu)) + return cpu; + + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_IDLE) { + cpus_and(tmp, sd->span, cpu_online_map); + cpus_and(tmp, tmp, p->cpus_allowed); + for_each_cpu_mask(i, tmp) { + if (idle_cpu(i)) + return i; + } + } + else break; + } + return cpu; +} +#else +static inline int wake_idle(int cpu, task_t *p) +{ + return cpu; +} +#endif + +/*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread + * @state: the mask of task states that can be woken + * @sync: do a synchronous wakeup? + * + * Put it on the run-queue if it's not already there. The "current" + * thread is always on the run-queue (except when the actual + * re-schedule is in progress), and as such you're allowed to do + * the simpler "current->state = TASK_RUNNING" to mark yourself + * runnable without the overhead of this. + * + * returns failure only if the task is already active. + */ +static int try_to_wake_up(task_t * p, unsigned int state, int sync) +{ + int cpu, this_cpu, success = 0; + unsigned long flags; + long old_state; + runqueue_t *rq; +#ifdef CONFIG_SMP + unsigned long load, this_load; + struct sched_domain *sd, *this_sd = NULL; + int new_cpu; +#endif + + rq = task_rq_lock(p, &flags); + old_state = p->state; + if (!(old_state & state)) + goto out; + + if (p->array) + goto out_running; + + cpu = task_cpu(p); + this_cpu = smp_processor_id(); + +#ifdef CONFIG_SMP + if (unlikely(task_running(rq, p))) + goto out_activate; + + schedstat_inc(rq, ttwu_cnt); + if (cpu == this_cpu) { + schedstat_inc(rq, ttwu_local); + } else { + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + schedstat_inc(sd, ttwu_wake_remote); + this_sd = sd; + break; + } + } + } + if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + goto out_set_cpu; + + /* + * Check for affine wakeup and passive balancing possibilities. + */ + if (this_sd) { + unsigned int imbalance; + + load = source_load(cpu, this_sd->wake_balance_load_idx); + this_load = target_load(this_cpu, this_sd->wake_balance_load_idx); + + /* Don't pull the task off an idle CPU to a busy one */ + if (load < SCHED_LOAD_SCALE + && load + this_load > SCHED_LOAD_SCALE + && this_load > load) + goto out_set_cpu; + + new_cpu = this_cpu; /* Wake to this CPU if we can */ + + /* + * Start passive balancing when half the imbalance_pct + * limit is reached. + */ + imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; + + if ((this_sd->flags & SD_WAKE_AFFINE) && + !task_hot(p, rq->timestamp_last_tick, this_sd)) { + unsigned long load, this_load; + load = target_load(cpu, this_sd->wake_balance_load_idx); + this_load = source_load(this_cpu, this_sd->wake_balance_load_idx); + + if (imbalance*load > 100*this_load) { + /* + * This domain has SD_WAKE_AFFINE and + * p is cache cold in this domain, and + * there is no bad imbalance. + */ + schedstat_inc(this_sd, ttwu_move_affine); + goto out_set_cpu; + } + } + if ((this_sd->flags & SD_WAKE_BALANCE) && + imbalance*this_load <= 100*load) { + /* + * This domain has SD_WAKE_BALANCE and there is + * an imbalance. + */ + schedstat_inc(this_sd, ttwu_move_balance); + goto out_set_cpu; + } + } + + new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ +out_set_cpu: + new_cpu = wake_idle(new_cpu, p); + if (new_cpu != cpu) { + set_task_cpu(p, new_cpu); + task_rq_unlock(rq, &flags); + /* might preempt at this point */ + rq = task_rq_lock(p, &flags); + old_state = p->state; + if (!(old_state & state)) + goto out; + if (p->array) + goto out_running; + + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + } + +out_activate: +#endif /* CONFIG_SMP */ + if (old_state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; + activate_task(p, rq, cpu == this_cpu); + if (!sync || cpu != this_cpu) { + if (task_preempts_curr(p, rq)) + resched_task(rq->curr); + } + success = 1; + +out_running: + p->state = TASK_RUNNING; +out: + task_rq_unlock(rq, &flags); + + return success; +} + +int fastcall wake_up_process(task_t * p) +{ + return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); +} + +EXPORT_SYMBOL(wake_up_process); + +int fastcall wake_up_state(task_t *p, unsigned int state) +{ + return try_to_wake_up(p, state, 0); +} + +#ifdef CONFIG_SMP +static int find_idlest_cpu(struct task_struct *p, int this_cpu, + struct sched_domain *sd); +#endif + +/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. + */ +void fastcall sched_fork(task_t *p) +{ + unsigned long sleep_avg; + runqueue_t *rq; + + /* + * We mark the process as running here, but have not actually + * inserted it onto the runqueue yet. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_RUNNING; + INIT_LIST_HEAD(&p->run_list); + INIT_LIST_HEAD(&p->iso_list); + p->array = NULL; +#ifdef CONFIG_SCHEDSTATS + memset(&p->sched_info, 0, sizeof(p->sched_info)); +#endif +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + p->oncpu = 0; +#endif +#ifdef CONFIG_PREEMPT + /* Want to start with kernel preemption disabled. */ + p->thread_info->preempt_count = 1; +#endif + p->timestamp = clock_us(); + p->used_slice = 0; + if (rt_task(p)) { + BUG_ON(!rt_task(current)); + return; + } + + preempt_disable(); + rq = this_rq(); + + /* XXX */ + if (unlikely(p->comm[0] == 'X' && p->comm[1] == 'F')) { + static int warned = 0; + if (!warned) { + printk(KERN_INFO "Renicing %s for you\n", p->comm); + warned = 1; + } + p->static_prio = NICE_TO_PRIO(-10); + } + + /* Get MIN_HISTORY of history with the same sleep_avg as parent. */ + sleep_avg = task_sleep_avg(current); + p->total_time = MIN_HISTORY; + p->sleep_time = p->total_time * sleep_avg / SLEEP_FACTOR; + + /* Parent loses 1/4 of sleep time for forking */ + current->sleep_time = 3*current->sleep_time/4; + + local_irq_disable(); + if (unlikely(current->used_slice == -1 || current == rq->idle)) + p->used_slice = -1; + else { + int ts = task_timeslice(current, rq); + current->used_slice += (ts + 3) / 4; + if (current->used_slice >= ts) { + current->used_slice = -1; + set_need_resched(); + } + } + local_irq_enable(); + preempt_enable(); +} + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. + */ +void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) +{ + unsigned long flags; + int this_cpu, cpu; + runqueue_t *rq, *this_rq; + + rq = task_rq_lock(p, &flags); + cpu = task_cpu(p); + this_cpu = smp_processor_id(); + + BUG_ON(p->state != TASK_RUNNING); + + schedstat_inc(rq, wunt_cnt); + /* + * We decrease the sleep average of forking parents + * and children as well, to keep max-interactive tasks + * from forking tasks that are max-interactive. The parent + * (current) is done further down, under its lock. + */ + p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * + CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + + p->prio = effective_prio(p); + + if (likely(cpu == this_cpu)) { + if (!(clone_flags & CLONE_VM)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. + */ + if (unlikely(!current->array)) + __activate_task(p, rq); + else { + p->prio = current->prio; + list_add_tail(&p->run_list, ¤t->run_list); + p->array = current->array; + p->array->nr_active++; + rq->nr_running++; + if (iso_task(p)) + enqueue_iso_task(p); + } + set_need_resched(); + } else + /* Run child last */ + __activate_task(p, rq); + /* + * We skip the following code due to cpu == this_cpu + * + * task_rq_unlock(rq, &flags); + * this_rq = task_rq_lock(current, &flags); + */ + this_rq = rq; + } else { + this_rq = cpu_rq(this_cpu); + + /* + * Not the local CPU - must adjust timestamp. This should + * get optimised away in the !CONFIG_SMP case. + */ + p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; + __activate_task(p, rq); + if (task_preempts_curr(p, rq)) + resched_task(rq->curr); + + schedstat_inc(rq, wunt_moved); + /* + * Parent and child are on different CPUs, now get the + * parent runqueue to update the parent's ->sleep_avg: + */ + task_rq_unlock(rq, &flags); + this_rq = task_rq_lock(current, &flags); + } + current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * + PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + task_rq_unlock(this_rq, &flags); +} + +/* + * Potentially available exiting-child timeslices are + * retrieved here - this way the parent does not get + * penalized for creating too many threads. + * + * (this cannot be used to 'generate' timeslices + * artificially, because any timeslice recovered here + * was given away by the parent in the first place.) + */ +void fastcall sched_exit(task_t * p) +{ + unsigned long flags; + runqueue_t *rq; + + /* + * If the child was a (relative-) CPU hog then decrease + * the sleep_avg of the parent as well. + */ + rq = task_rq_lock(p->parent, &flags); + if (p->first_time_slice) { + p->parent->time_slice += p->time_slice; + if (unlikely(p->parent->time_slice > task_timeslice(p))) + p->parent->time_slice = task_timeslice(p); + } + if (p->sleep_avg < p->parent->sleep_avg) + p->parent->sleep_avg = p->parent->sleep_avg / + (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / + (EXIT_WEIGHT + 1); + task_rq_unlock(rq, &flags); +} + +/** + * finish_task_switch - clean up after a task-switch + * @prev: the thread we just switched away from. + * + * We enter this with the runqueue still locked, and finish_arch_switch() + * will unlock it along with doing any other architecture-specific cleanup + * actions. + * + * Note that we may have delayed dropping an mm in context_switch(). If + * so, we finish that here outside of the runqueue lock. (Doing it + * with the lock held can cause deadlocks; see schedule() for + * details.) + */ +static void finish_task_switch(task_t *prev) + __releases(rq->lock) +{ + runqueue_t *rq = this_rq(); + struct mm_struct *mm = rq->prev_mm; + unsigned long prev_task_flags; + + rq->prev_mm = NULL; + + /* + * A task struct has one reference for the use as "current". + * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and + * calls schedule one last time. The schedule call will never return, + * and the scheduled task must drop that reference. + * The test for EXIT_ZOMBIE must occur while the runqueue locks are + * still held, otherwise prev could be scheduled on another cpu, die + * there before we look at prev->state, and then the reference would + * be dropped twice. + * Manfred Spraul + */ + prev_task_flags = prev->flags; + finish_arch_switch(prev); + finish_lock_switch(rq, prev); + if (mm) + mmdrop(mm); + if (unlikely(prev_task_flags & PF_DEAD)) + put_task_struct(prev); +} + +/** + * schedule_tail - first thing a freshly forked thread must call. + * @prev: the thread we just switched away from. + */ +asmlinkage void schedule_tail(task_t *prev) + __releases(rq->lock) +{ + finish_task_switch(prev); +#ifdef __ARCH_WANT_UNLOCKED_CTXSW + /* In this case, finish_task_switch does not reenable preemption */ + preempt_enable(); +#endif + if (current->set_child_tid) + put_user(current->pid, current->set_child_tid); +} + +/* + * context_switch - switch to the new MM and the new + * thread's register state. + */ +static inline +task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) +{ + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; + + if (unlikely(!mm)) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next); + } else + switch_mm(oldmm, mm, next); + + if (unlikely(!prev->mm)) { + prev->active_mm = NULL; + WARN_ON(rq->prev_mm); + rq->prev_mm = oldmm; + } + + /* Here we just switch the register state and the stack. */ + switch_to(prev, next, prev); + + return prev; +} + +/* + * nr_running, nr_uninterruptible and nr_context_switches: + * + * externally visible scheduler statistics: current number of runnable + * threads, current number of uninterruptible-sleeping threads, total + * number of context switches performed since bootup. + */ +unsigned long nr_running(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->nr_running; + + return sum; +} + +unsigned long nr_uninterruptible(void) +{ + unsigned long i, sum = 0; + + for_each_cpu(i) + sum += cpu_rq(i)->nr_uninterruptible; + + /* + * Since we read the counters lockless, it might be slightly + * inaccurate. Do not allow it to go below zero though: + */ + if (unlikely((long)sum < 0)) + sum = 0; + + return sum; +} + +unsigned long long nr_context_switches(void) +{ + unsigned long long i, sum = 0; + + for_each_cpu(i) + sum += cpu_rq(i)->nr_switches; + + return sum; +} + +unsigned long nr_iowait(void) +{ + unsigned long i, sum = 0; + + for_each_cpu(i) + sum += atomic_read(&cpu_rq(i)->nr_iowait); + + return sum; +} + +#ifdef CONFIG_SMP + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + if (rq1 == rq2) { + spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else { + if (rq1 < rq2) { + spin_lock(&rq1->lock); + spin_lock(&rq2->lock); + } else { + spin_lock(&rq2->lock); + spin_lock(&rq1->lock); + } + } +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + spin_unlock(&rq1->lock); + if (rq1 != rq2) + spin_unlock(&rq2->lock); + else + __release(rq2->lock); +} + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + if (unlikely(!spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + spin_unlock(&this_rq->lock); + spin_lock(&busiest->lock); + spin_lock(&this_rq->lock); + } else + spin_lock(&busiest->lock); + } +} + +/* + * find_idlest_cpu - find the least busy runqueue. + */ +static int find_idlest_cpu(struct task_struct *p, int this_cpu, + struct sched_domain *sd) +{ + unsigned long load, min_load, this_load; + int i, min_cpu; + cpumask_t mask; + + min_cpu = UINT_MAX; + min_load = ULONG_MAX; + + cpus_and(mask, sd->span, p->cpus_allowed); + + for_each_cpu_mask(i, mask) { + load = target_load(i, sd->wake_balance_load_idx); + + if (load < min_load) { + min_cpu = i; + min_load = load; + + /* break out early on an idle CPU: */ + if (!min_load) + break; + } + } + + /* add +1 to account for the new task */ + this_load = source_load(this_cpu, sd->wake_balance_load_idx) + SCHED_LOAD_SCALE; + + /* + * Would with the addition of the new task to the + * current CPU there be an imbalance between this + * CPU and the idlest CPU? + * + * Use half of the balancing threshold - new-context is + * a good opportunity to balance. + */ + if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100) + return min_cpu; + + return this_cpu; +} + +/* + * If dest_cpu is allowed for this process, migrate the task to it. + * This is accomplished by forcing the cpu_allowed mask to only + * allow dest_cpu, which will force the cpu onto dest_cpu. Then + * the cpu_allowed mask is restored. + */ +static void sched_migrate_task(task_t *p, int dest_cpu) +{ + migration_req_t req; + runqueue_t *rq; + unsigned long flags; + + rq = task_rq_lock(p, &flags); + if (!cpu_isset(dest_cpu, p->cpus_allowed) + || unlikely(cpu_is_offline(dest_cpu))) + goto out; + + /* force the process onto the specified CPU */ + if (migrate_task(p, dest_cpu, &req)) { + /* Need to wait for migration thread (might exit: take ref). */ + struct task_struct *mt = rq->migration_thread; + get_task_struct(mt); + task_rq_unlock(rq, &flags); + wake_up_process(mt); + put_task_struct(mt); + wait_for_completion(&req.done); + return; + } +out: + task_rq_unlock(rq, &flags); +} + +/* + * sched_exec(): find the highest-level, exec-balance-capable + * domain and try to migrate the task to the least loaded CPU. + * + * execve() is a valuable balancing opportunity, because at this point + * the task has the smallest effective memory and cache footprint. + */ +void sched_exec(void) +{ + struct sched_domain *tmp, *sd = NULL; + int new_cpu, this_cpu = get_cpu(); + + /* Prefer the current CPU if there's only this task running */ + if (this_rq()->nr_running <= 1) + goto out; + + for_each_domain(this_cpu, tmp) + if (tmp->flags & SD_BALANCE_EXEC) + sd = tmp; + + if (sd) { + schedstat_inc(sd, sbe_attempts); + new_cpu = find_idlest_cpu(current, this_cpu, sd); + if (new_cpu != this_cpu) { + schedstat_inc(sd, sbe_pushed); + put_cpu(); + sched_migrate_task(current, new_cpu); + return; + } + } +out: + put_cpu(); +} + +/* + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +static inline +void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, + runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) +{ + dequeue_task(p, src_array); + src_rq->nr_running--; + set_task_cpu(p, this_cpu); + this_rq->nr_running++; + enqueue_task(p, this_array); + if (!rt_task(p)) { + if (p->prio < this_array->min_prio) + this_array->min_prio = p->prio; + } + p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + + this_rq->timestamp_last_tick; + /* + * Note that idle threads have a prio of MAX_PRIO, for this test + * to be always true for them. + */ + if (task_preempts_curr(p, this_rq)) + resched_task(this_rq->curr); +} + +/* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + */ +static inline +int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, + struct sched_domain *sd, enum idle_type idle) +{ + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 3) are cache-hot on their current CPU. + */ + if (task_running(rq, p)) + return 0; + if (!cpu_isset(this_cpu, p->cpus_allowed)) + return 0; + + /* + * Aggressive migration if: + * 1) the [whole] cpu is idle, or + * 2) too many balance attempts have failed. + */ + + if (cpu_and_siblings_are_idle(this_cpu) || \ + sd->nr_balance_failed > sd->cache_nice_tries) + return 1; + + if (task_hot(p, rq->timestamp_last_tick, sd)) + return 0; + return 1; +} + +/* + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. + * + * Called with both runqueues locked. + */ +static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) +{ + prio_array_t *array, *dst_array; + struct list_head *head, *curr; + int idx, pulled = 0; + task_t *tmp; + + if (max_nr_move <= 0 || busiest->nr_running <= 1) + goto out; + + /* + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. + */ + if (busiest->expired->nr_active) { + array = busiest->expired; + dst_array = this_rq->expired; + } else { + array = busiest->active; + dst_array = this_rq->active; + } + +new_array: + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) { + if (array == busiest->expired && busiest->active->nr_active) { + array = busiest->active; + dst_array = this_rq->active; + goto new_array; + } + goto out; + } + + head = array->queue + idx; + curr = head->prev; +skip_queue: + tmp = list_entry(curr, task_t, run_list); + + curr = curr->prev; + + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + +#ifdef CONFIG_SCHEDSTATS + if (task_hot(tmp, busiest->timestamp_last_tick, sd)) + schedstat_inc(sd, lb_hot_gained[idle]); +#endif + + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pulled++; + + /* We only want to steal up to the prescribed number of tasks. */ + if (pulled < max_nr_move) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } +out: + /* + * Right now, this is the only place pull_task() is called, + * so we can safely collect pull_task() stats here rather than + * inside pull_task(). + */ + schedstat_add(sd, lb_gained[idle], pulled); + + return pulled; +} + +/* + * find_busiest_group finds and returns the busiest CPU group within the + * domain. It calculates and returns the number of tasks which should be + * moved to restore balance via the imbalance parameter. + */ +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, + unsigned long *imbalance, enum idle_type idle) +{ + struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; + unsigned long max_load, avg_load, total_load, this_load, total_pwr; + int load_idx = sd->idle_load_idx; + + max_load = this_load = total_load = total_pwr = 0; + if (idle == NOT_IDLE) + load_idx = sd->load_idx; + + do { + unsigned long load; + int local_group; + int i, nr_cpus = 0; + + local_group = cpu_isset(this_cpu, group->cpumask); + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + + for_each_cpu_mask(i, group->cpumask) { + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = target_load(i, load_idx); + else + load = source_load(i, load_idx); + + nr_cpus++; + avg_load += load; + } + + if (!nr_cpus) + goto nextgroup; + + total_load += avg_load; + total_pwr += group->cpu_power; + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + this = group; + goto nextgroup; + } else if (avg_load > max_load) { + max_load = avg_load; + busiest = group; + } +nextgroup: + group = group->next; + } while (group != sd->groups); + + if (!busiest || this_load >= max_load) + goto out_balanced; + + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + + if (this_load >= avg_load || + 100*max_load <= sd->imbalance_pct*this_load) + goto out_balanced; + + /* + * We're trying to get all the cpus to the average_load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load, as either of these + * actions would just result in more rebalancing later, and ping-pong + * tasks around. Thus we look for the minimum possible imbalance. + * Negative imbalances (*we* are more loaded than anyone else) will + * be counted as no imbalance for these purposes -- we can't fix that + * by pulling tasks to us. Be careful of negative numbers as they'll + * appear as very large values with unsigned longs. + */ + *imbalance = min(max_load - avg_load, avg_load - this_load); + + /* How much load to actually move to equalise the imbalance */ + *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) + / SCHED_LOAD_SCALE; + + if (*imbalance < SCHED_LOAD_SCALE) { + unsigned long pwr_now = 0, pwr_move = 0; + unsigned long tmp; + + if (max_load - this_load >= SCHED_LOAD_SCALE*2) { + *imbalance = 1; + return busiest; + } + + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them. + */ + + pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); + pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); + pwr_now /= SCHED_LOAD_SCALE; + + /* Amount of load we'd subtract */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; + if (max_load > tmp) + pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, + max_load - tmp); + + /* Amount of load we'd add */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; + if (max_load < tmp) + tmp = max_load; + pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); + pwr_move /= SCHED_LOAD_SCALE; + + /* Move if we gain another 8th of a CPU worth of throughput */ + if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8) + goto out_balanced; + + *imbalance = 1; + return busiest; + } + + /* Get rid of the scaling factor, rounding down as we divide */ + *imbalance = *imbalance / SCHED_LOAD_SCALE; + + return busiest; + +out_balanced: + if (busiest && idle != NOT_IDLE && max_load > SCHED_LOAD_SCALE) { + *imbalance = 1; + return busiest; + } + + *imbalance = 0; + return NULL; +} + +/* + * find_busiest_queue - find the busiest runqueue among the cpus in group. + */ +static runqueue_t *find_busiest_queue(struct sched_group *group) +{ + unsigned long load, max_load = 0; + runqueue_t *busiest = NULL; + int i; + + for_each_cpu_mask(i, group->cpumask) { + load = source_load(i, 0); + + if (load > max_load) { + max_load = load; + busiest = cpu_rq(i); + } + } + + return busiest; +} + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + * + * Called with this_rq unlocked. + */ +static int load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, enum idle_type idle) +{ + struct sched_group *group; + runqueue_t *busiest; + unsigned long imbalance; + int nr_moved; + + schedstat_inc(sd, lb_cnt[idle]); + + group = find_busiest_group(sd, this_cpu, &imbalance, idle); + if (!group) { + schedstat_inc(sd, lb_nobusyg[idle]); + goto out_balanced; + } + + busiest = find_busiest_queue(group); + if (!busiest) { + schedstat_inc(sd, lb_nobusyq[idle]); + goto out_balanced; + } + + /* + * This should be "impossible", but since load + * balancing is inherently racy and statistical, + * it could happen in theory. + */ + if (unlikely(busiest == this_rq)) { + WARN_ON(1); + goto out_balanced; + } + + schedstat_add(sd, lb_imbalance[idle], imbalance); + + nr_moved = 0; + if (busiest->nr_running > 1) { + /* + * Attempt to move tasks. If find_busiest_group has found + * an imbalance but busiest->nr_running <= 1, the group is + * still unbalanced. nr_moved simply stays zero, so it is + * correctly treated as an imbalance. + */ + double_rq_lock(this_rq, busiest); + nr_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, idle); + double_rq_unlock(this_rq, busiest); + } + + if (!nr_moved) { + schedstat_inc(sd, lb_failed[idle]); + sd->nr_balance_failed++; + + if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { + int wake = 0; + + spin_lock(&busiest->lock); + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + wake = 1; + } + spin_unlock(&busiest->lock); + if (wake) + wake_up_process(busiest->migration_thread); + + /* + * We've kicked active balancing, reset the failure + * counter. + */ + sd->nr_balance_failed = sd->cache_nice_tries; + } + + /* + * We were unbalanced, but unsuccessful in move_tasks(), + * so bump the balance_interval to lessen the lock contention. + */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval++; + } else { + sd->nr_balance_failed = 0; + + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; + } + + return nr_moved; + +out_balanced: + schedstat_inc(sd, lb_balanced[idle]); + + /* tune up the balancing interval */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; + + return 0; +} + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + * + * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). + * this_rq is locked. + */ +static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd) +{ + struct sched_group *group; + runqueue_t *busiest = NULL; + unsigned long imbalance; + int nr_moved = 0; + + schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); + group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); + if (!group) { + schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); + goto out; + } + + busiest = find_busiest_queue(group); + if (!busiest || busiest == this_rq) { + schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); + goto out; + } + + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); + + schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); + nr_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, NEWLY_IDLE); + if (!nr_moved) { + schedstat_inc(sd, lb_failed[NEWLY_IDLE]); + } else { + schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + } + + spin_unlock(&busiest->lock); + +out: + return nr_moved; +} + +/* + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. + */ +static inline void idle_balance(int this_cpu, runqueue_t *this_rq) +{ + struct sched_domain *sd; + + for_each_domain(this_cpu, sd) { + if (sd->flags & SD_BALANCE_NEWIDLE) { + if (load_balance_newidle(this_cpu, this_rq, sd)) { + /* We've pulled tasks over so stop searching */ + break; + } + } + } +} + +/* + * active_load_balance is run by migration threads. It pushes running tasks + * off the busiest CPU onto idle CPUs. It requires at least 1 task to be + * running on each physical CPU where possible, and avoids physical / + * logical imbalances. + * + * Called with busiest_rq locked. + */ +static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) +{ + struct sched_domain *sd; + struct sched_group *cpu_group; + runqueue_t *target_rq; + cpumask_t visited_cpus; + int cpu; + + + /* + * Search for suitable CPUs to push tasks to in successively higher + * domains with SD_LOAD_BALANCE set. + */ + visited_cpus = CPU_MASK_NONE; + for_each_domain(busiest_cpu, sd) { + if (!(sd->flags & SD_LOAD_BALANCE)) + /* no more domains to search */ + break; + schedstat_inc(sd, alb_cnt); + + if (busiest->nr_running <= 1) + return; + + cpu_group = sd->groups; + do { + for_each_cpu_mask(cpu, cpu_group->cpumask) { + if (cpu_isset(cpu, visited_cpus)) + continue; + cpu_set(cpu, visited_cpus); + if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) + continue; + + target_rq = cpu_rq(cpu); + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by + * Bjorn Helgaas on a 128-cpu setup. + */ + BUG_ON(busiest_rq == target_rq); + + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); + if (move_tasks(target_rq, cpu, busiest_rq, + 1, sd, SCHED_IDLE)) { + schedstat_inc(sd, alb_pushed); + } else { + schedstat_inc(sd, alb_failed); + } + spin_unlock(&target_rq->lock); + } + cpu_group = cpu_group->next; + } while (cpu_group != sd->groups); + } +} + +/* + * rebalance_tick will get called every timer tick, on every CPU. + * + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in arch_init_sched_domains. + */ + +/* Don't have all balancing operations going off at once */ +#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) + +static void rebalance_tick(int this_cpu, runqueue_t *this_rq, + enum idle_type idle) +{ + unsigned long old_load, this_load; + unsigned long j = jiffies + CPU_OFFSET(this_cpu); + struct sched_domain *sd; + int i; + + this_load = this_rq->nr_running * SCHED_LOAD_SCALE; + /* Update our load */ + for (i = 0; i < 4; i++) { + unsigned long new_load = this_load; + int scale = 1<cpu_load[i]; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale-1; + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; + } + + for_each_domain(this_cpu, sd) { + unsigned long interval; + + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + interval = sd->balance_interval; + if (idle != SCHED_IDLE) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = msecs_to_jiffies(interval); + if (unlikely(!interval)) + interval = 1; + + if (j - sd->last_balance >= interval) { + if (load_balance(this_cpu, this_rq, sd, idle)) { + /* We've pulled tasks over so no longer idle */ + idle = NOT_IDLE; + } + sd->last_balance += interval; + } + } +} +#else +/* + * on UP we do not need to balance between CPUs: + */ +static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) +{ +} +static inline void idle_balance(int cpu, runqueue_t *rq) +{ +} +#endif + +static inline int wake_priority_sleeper(runqueue_t *rq) +{ + int ret = 0; +#ifdef CONFIG_SCHED_SMT + spin_lock(&rq->lock); + /* + * If an SMT sibling task has been put to sleep for priority + * reasons reschedule the idle task to see if it can now run. + */ + if (rq->nr_running) { + resched_task(rq->idle); + ret = 1; + } + spin_unlock(&rq->lock); +#endif + return ret; +} + +DEFINE_PER_CPU(struct kernel_stat, kstat); + +EXPORT_PER_CPU_SYMBOL(kstat); + +/* + * This is called on clock ticks and on context switches. + * Bank in p->sched_time the ns elapsed since the last tick or switch. + */ +static inline void update_cpu_clock(task_t *p, runqueue_t *rq, + unsigned long long now) +{ + unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); + p->sched_time += now - last; +} + +/* + * Return current->sched_time plus any more ns on the sched_clock + * that have not yet been banked. + */ +unsigned long long current_sched_time(const task_t *tsk) +{ + unsigned long long ns; + unsigned long flags; + local_irq_save(flags); + ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick); + ns = tsk->sched_time + (sched_clock() - ns); + local_irq_restore(flags); + return ns; +} + +/* + * We place interactive tasks back into the active array, if possible. + * + * To guarantee that this does not starve expired tasks we ignore the + * interactivity of a task if the first expired task had to wait more + * than a 'reasonable' amount of time. This deadline timeout is + * load-dependent, as the frequency of array switched decreases with + * increasing number of running tasks. We also ignore the interactivity + * if a better static_prio task has expired: + */ +#define EXPIRED_STARVING(rq) \ + ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ + (jiffies - (rq)->expired_timestamp >= \ + STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ + ((rq)->curr->static_prio > (rq)->best_expired_prio)) + +/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in user space since the last update + */ +void account_user_time(struct task_struct *p, cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t tmp; + + p->utime = cputime_add(p->utime, cputime); + + /* Add user time to cpustat. */ + tmp = cputime_to_cputime64(cputime); + if (TASK_NICE(p) > 0) + cpustat->nice = cputime64_add(cpustat->nice, tmp); + else + cpustat->user = cputime64_add(cpustat->user, tmp); +} + +/* + * Account system cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + */ +void account_system_time(struct task_struct *p, int hardirq_offset, + cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + runqueue_t *rq = this_rq(); + cputime64_t tmp; + + p->stime = cputime_add(p->stime, cputime); + + /* Add system time to cpustat. */ + tmp = cputime_to_cputime64(cputime); + if (hardirq_count() - hardirq_offset) + cpustat->irq = cputime64_add(cpustat->irq, tmp); + else if (softirq_count()) + cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + else if (p != rq->idle) + cpustat->system = cputime64_add(cpustat->system, tmp); + else if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, tmp); + else + cpustat->idle = cputime64_add(cpustat->idle, tmp); +} + +/* + * Account for involuntary wait time. + * @p: the process from which the cpu time has been stolen + * @steal: the cpu time spent in involuntary wait + */ +void account_steal_time(struct task_struct *p, cputime_t steal) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t tmp = cputime_to_cputime64(steal); + runqueue_t *rq = this_rq(); + + if (p == rq->idle) { + p->stime = cputime_add(p->stime, steal); + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, tmp); + else + cpustat->idle = cputime64_add(cpustat->idle, tmp); + } else + cpustat->steal = cputime64_add(cpustat->steal, tmp); +} + +static inline void inc_iso_ticks(runqueue_t *rq, task_t *p) +{ + if (rq->iso_ticks < (iso_period * HZ * 100 - 99)) + rq->iso_ticks += 100; + spin_lock(&rq->lock); + if (!rq->iso_refractory && (rq->iso_ticks / + ((iso_period * HZ) + 1) > iso_cpu)) + rq->iso_refractory = 1; + spin_unlock(&rq->lock); +} + +static inline void dec_iso_ticks(runqueue_t *rq, task_t *p) +{ + if (rq->iso_ticks) + rq->iso_ticks = rq->iso_ticks * (iso_period * HZ - 1) / + (iso_period * HZ); + spin_lock(&rq->lock); + if (rq->iso_refractory && (rq->iso_ticks / + ((iso_period * HZ) + 1) < (iso_cpu * 9 / 10))) + rq->iso_refractory = 0; + spin_unlock(&rq->lock); +} + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + * + * It also gets called by the fork code, when changing the parent's + * timeslices. + */ +void scheduler_tick(void) +{ + int cpu = smp_processor_id(); + runqueue_t *rq = this_rq(); + task_t *p = current; + unsigned long long now = sched_clock(); + + update_cpu_clock(p, rq, now); + + rq->timestamp_last_tick = now; + + /* + * The iso_ticks accounting is incremented only when a SCHED_ISO task + * is running in soft rt mode. Running rt_tasks are also accounted + * to make the iso_cpu a proportion of cpu available for SCHED_NORMAL + * tasks only. + */ + if (rt_task(p) || (iso_task(p) && !rq->iso_refractory)) + inc_iso_ticks(rq, p); + else + dec_iso_ticks(rq, p); + + if (p == rq->idle) { + if (wake_priority_sleeper(rq)) + goto out; + rebalance_tick(cpu, rq, SCHED_IDLE); + return; + } + + /* Task might have expired already, but not scheduled off yet */ + if (p->array != rq->active) { + set_tsk_need_resched(p); + goto out; + } + spin_lock(&rq->lock); + + /* + * The task was running during this tick - update the + * time slice counter. Note: we do not update a thread's + * priority until it either goes to sleep or uses up its + * timeslice. This makes it possible for interactive tasks + * to use up their timeslices at their highest priority levels. + */ + if (rt_task(p)) { + /* + * RR tasks need a special form of timeslice management. + * FIFO tasks have no timeslices. + */ + if ((p->policy == SCHED_RR) && !--p->time_slice) { + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; + set_tsk_need_resched(p); + + /* put it at the end of the queue: */ + requeue_task(p, rq->active); + } + goto out_unlock; + } + + if (iso_task(p)) { + if (rq->iso_refractory) { + /* + * If we are in the refractory period for SCHED_ISO + * tasks we schedule them as SCHED_NORMAL. If their + * priority is set to the ISO priority, change it. + */ + if (p->prio == MAX_RT_PRIO) { + dequeue_task(p, rq->active); + p->prio = effective_prio(p); + enqueue_task(p, rq->active); + } + goto sched_normal; + } + if (p->prio > MAX_RT_PRIO) { + dequeue_task(p, rq->active); + p->prio = MAX_RT_PRIO; + enqueue_task(p, rq->active); + } + if (!(--p->time_slice % GRANULARITY)) { + set_tsk_need_resched(p); + requeue_iso_task(p); + } + if (!p->time_slice) + p->time_slice = task_timeslice(p); + goto out_unlock; + } + +sched_normal: + if (!--p->time_slice) { + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; + + if (!rq->expired_timestamp) + rq->expired_timestamp = jiffies; + if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { + enqueue_task(p, rq->expired); + if (p->static_prio < rq->best_expired_prio) + rq->best_expired_prio = p->static_prio; + } else + enqueue_task(p, rq->active); + } else { + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority. (one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. + * + * This only applies to tasks in the interactive + * delta range with at least TIMESLICE_GRANULARITY to requeue. + */ + if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - + p->time_slice) % TIMESLICE_GRANULARITY(p)) && + (p->time_slice >= TIMESLICE_GRANULARITY(p)) && + (p->array == rq->active)) { + + requeue_task(p, rq->active); + set_tsk_need_resched(p); + } + } +out_unlock: + spin_unlock(&rq->lock); +out: + rebalance_tick(cpu, rq, NOT_IDLE); +} + +static inline int iso_ready(runqueue_t *rq) +{ + if (iso_queued(rq) && !rq->iso_refractory) + return 1; + return 0; +} + +/* + * When a SCHED_ISO task is ready to be scheduled, we re-queue it with an + * effective prio of MAX_RT_PRIO for userspace to know its relative prio. + */ +static task_t* queue_iso(runqueue_t *rq, prio_array_t *array) +{ + task_t *p = list_entry(rq->iso_queue.next, task_t, iso_list); + if (p->prio == MAX_RT_PRIO) + goto out; + list_del(&p->run_list); + if (list_empty(array->queue + p->prio)) + __clear_bit(p->prio, array->bitmap); + p->prio = MAX_RT_PRIO; + list_add_tail(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); +out: + return p; +} + +#ifdef CONFIG_SCHED_SMT +static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) +{ + struct sched_domain *sd = this_rq->sd; + cpumask_t sibling_map; + int i; + + if (!(sd->flags & SD_SHARE_CPUPOWER)) + return; + + /* + * Unlock the current runqueue because we have to lock in + * CPU order to avoid deadlocks. Caller knows that we might + * unlock. We keep IRQs disabled. + */ + spin_unlock(&this_rq->lock); + + sibling_map = sd->span; + + for_each_cpu_mask(i, sibling_map) + spin_lock(&cpu_rq(i)->lock); + /* + * We clear this CPU from the mask. This both simplifies the + * inner loop and keps this_rq locked when we exit: + */ + cpu_clear(this_cpu, sibling_map); + + for_each_cpu_mask(i, sibling_map) { + runqueue_t *smt_rq = cpu_rq(i); + + /* + * If an SMT sibling task is sleeping due to priority + * reasons wake it up now. + */ + if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) + resched_task(smt_rq->idle); + } + + for_each_cpu_mask(i, sibling_map) + spin_unlock(&cpu_rq(i)->lock); + /* + * We exit with this_cpu's rq still held and IRQs + * still disabled: + */ +} + +static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) +{ + struct sched_domain *sd = this_rq->sd; + cpumask_t sibling_map; + prio_array_t *array; + int ret = 0, i, idx; + task_t *p; + + if (!(sd->flags & SD_SHARE_CPUPOWER)) + return 0; + + /* + * The same locking rules and details apply as for + * wake_sleeping_dependent(): + */ + spin_unlock(&this_rq->lock); + sibling_map = sd->span; + for_each_cpu_mask(i, sibling_map) + spin_lock(&cpu_rq(i)->lock); + cpu_clear(this_cpu, sibling_map); + + /* + * Establish next task to be run - it might have gone away because + * we released the runqueue lock above: + */ + if (!this_rq->nr_running) + goto out_unlock; + array = this_rq->active; + if (!array->nr_active) + array = this_rq->expired; + BUG_ON(!array->nr_active); + + idx = sched_find_first_bit(array->bitmap); + if (unlikely(iso_ready(this_rq) && idx >= MAX_RT_PRIO)) + p = queue_iso(this_rq, array); + else + p = list_entry(array->queue[idx].next, task_t, run_list); + + for_each_cpu_mask(i, sibling_map) { + runqueue_t *smt_rq = cpu_rq(i); + task_t *smt_curr = smt_rq->curr; + + /* + * If a user task with lower static priority than the + * running task on the SMT sibling is trying to schedule, + * delay it till there is proportionately less timeslice + * left of the sibling task to prevent a lower priority + * task from using an unfair proportion of the + * physical cpu's resources. -ck + */ + if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > + task_timeslice(p) || rt_task(smt_curr)) && + p->mm && smt_curr->mm && !rt_task(p) && !iso_task(p)) + ret = 1; + + /* + * Reschedule a lower priority task on the SMT sibling, + * or wake it up if it has been put to sleep for priority + * reasons. + */ + if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > + task_timeslice(smt_curr) || rt_task(p) || + iso_task(p)) && smt_curr->mm && p->mm && + !rt_task(smt_curr)) || (smt_curr == smt_rq->idle && + smt_rq->nr_running)) + resched_task(smt_curr); + } +out_unlock: + for_each_cpu_mask(i, sibling_map) + spin_unlock(&cpu_rq(i)->lock); + return ret; +} +#else +static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) +{ +} + +static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) +{ + return 0; +} +#endif + +#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) + +void fastcall add_preempt_count(int val) +{ + /* + * Underflow? + */ + BUG_ON(((int)preempt_count() < 0)); + preempt_count() += val; + /* + * Spinlock count overflowing soon? + */ + BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); +} +EXPORT_SYMBOL(add_preempt_count); + +void fastcall sub_preempt_count(int val) +{ + /* + * Underflow? + */ + BUG_ON(val > preempt_count()); + /* + * Is the spinlock portion underflowing? + */ + BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); + preempt_count() -= val; +} +EXPORT_SYMBOL(sub_preempt_count); + +#endif + +/* + * schedule() is the main scheduler function. + */ +asmlinkage void __sched schedule(void) +{ + long *switch_count; + task_t *prev, *next; + runqueue_t *rq; + prio_array_t *array; + struct list_head *queue; + unsigned long long now; + unsigned long run_time; + int cpu, idx; + + /* + * Test if we are atomic. Since do_exit() needs to call into + * schedule() atomically, we ignore that path for now. + * Otherwise, whine if we are scheduling when we should not be. + */ + if (unlikely(in_atomic()) && + likely(!(current->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)))) { + printk(KERN_ERR "scheduling while atomic: %s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + dump_stack(); + } + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); + +need_resched: + preempt_disable(); + prev = current; + release_kernel_lock(prev); +need_resched_nonpreemptible: + rq = this_rq(); + + /* + * The idle thread is not allowed to schedule! + * Remove this check after it has been exercised a bit. + */ + if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + dump_stack(); + } + + + + + + + schedstat_inc(rq, sched_cnt); + now = clock_us(); + run_time = now - prev->timestamp; + prev->timestamp = prev->last_ran = now; + add_task_time(prev, run_time, STIME_RUN); + + spin_lock_irq(&rq->lock); + + if (unlikely(prev->flags & PF_DEAD)) + prev->state = EXIT_DEAD; + + switch_count = &prev->nivcsw; + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + switch_count = &prev->nvcsw; + if (unlikely((prev->state & TASK_INTERRUPTIBLE) && + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else { + deactivate_task(prev, rq); + if (prev->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + goto no_check_expired; + } + + } + + if (unlikely(prev->used_slice == -1)) { + if (rt_task(prev)) { + /* SCHED_FIFO can come in here too, from sched_yield */ + dequeue_task(prev, prev->array); + enqueue_task(prev, rq->active); + } else { + dequeue_task(prev, prev->array); + prev->prio = task_priority(prev); + enqueue_task(prev, rq->expired); + if (prev->prio < rq->expired->min_prio) + rq->expired->min_prio = prev->prio; + } + prev->used_slice = 0; + } +no_check_expired: + + cpu = smp_processor_id(); + if (unlikely(!rq->nr_running)) { + rq->array_sequence++; + idle_balance(cpu, rq); + if (!rq->nr_running) { + next = rq->idle; + rq->arrays[0].min_prio = MAX_PRIO; + rq->arrays[1].min_prio = MAX_PRIO; + goto switch_tasks; + } + } + + array = rq->active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. + */ + schedstat_inc(rq, sched_switch); + rq->active = rq->expired; + rq->expired = array; + array = rq->active; + rq->expired_timestamp = 0; + rq->best_expired_prio = MAX_PRIO; + } else + schedstat_inc(rq, sched_noswitch); + + idx = sched_find_first_bit(array->bitmap); + if (unlikely(iso_ready(rq) && idx >= MAX_RT_PRIO)) + next = queue_iso(rq, array); + else { + queue = array->queue + idx; + next = list_entry(queue->next, task_t, run_list); + } + + if (!rt_task(next) && !(iso_task(next) && !rq->iso_refractory) && + next->activated > 0) { + unsigned long long delta = now - next->timestamp; + + if (next->activated == 1) + delta = delta * + (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; + array = next->array; + dequeue_task(next, array); + recalc_task_prio(next, next->timestamp + delta); + enqueue_task(next, array); + } + next->activated = 0; +switch_tasks: + if (next == rq->idle) + schedstat_inc(rq, sched_goidle); + prefetch(next); + clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); + + update_cpu_clock(prev, rq, now); + + sched_info_switch(prev, next); + if (likely(prev != next)) { + next->timestamp = now; + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + + prepare_arch_switch(rq, next); + ltt_ev_schedchange(prev, next); + prev = context_switch(rq, prev, next); + barrier(); + + finish_task_switch(prev); + } else + spin_unlock_irq(&rq->lock); + + prev = current; + if (unlikely(reacquire_kernel_lock(prev) < 0)) + goto need_resched_nonpreemptible; + preempt_enable_no_resched(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; +} + +EXPORT_SYMBOL(schedule); + +#ifdef CONFIG_PREEMPT +/* + * this is is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. Kernel preemptions off return from interrupt + * occur there and call schedule directly. + */ +asmlinkage void __sched preempt_schedule(void) +{ + struct thread_info *ti = current_thread_info(); +#ifdef CONFIG_PREEMPT_BKL + struct task_struct *task = current; + int saved_lock_depth; +#endif + /* + * If there is a non-zero preempt_count or interrupts are disabled, + * we do not want to preempt the current task. Just return.. + */ + if (unlikely(ti->preempt_count || irqs_disabled())) + return; + +need_resched: + add_preempt_count(PREEMPT_ACTIVE); + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ +#ifdef CONFIG_PREEMPT_BKL + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; +#endif + schedule(); +#ifdef CONFIG_PREEMPT_BKL + task->lock_depth = saved_lock_depth; +#endif + sub_preempt_count(PREEMPT_ACTIVE); + + /* we could miss a preemption opportunity between schedule and now */ + barrier(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; +} + +EXPORT_SYMBOL(preempt_schedule); + +/* + * this is is the entry point to schedule() from kernel preemption + * off of irq context. + * Note, that this is called and return with irqs disabled. This will + * protect us against recursive calling from irq. + */ +asmlinkage void __sched preempt_schedule_irq(void) +{ + struct thread_info *ti = current_thread_info(); +#ifdef CONFIG_PREEMPT_BKL + struct task_struct *task = current; + int saved_lock_depth; +#endif + /* Catch callers which need to be fixed*/ + BUG_ON(ti->preempt_count || !irqs_disabled()); + +need_resched: + add_preempt_count(PREEMPT_ACTIVE); + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ +#ifdef CONFIG_PREEMPT_BKL + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; +#endif + local_irq_enable(); + schedule(); + local_irq_disable(); +#ifdef CONFIG_PREEMPT_BKL + task->lock_depth = saved_lock_depth; +#endif + sub_preempt_count(PREEMPT_ACTIVE); + + /* we could miss a preemption opportunity between schedule and now */ + barrier(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; +} + +#endif /* CONFIG_PREEMPT */ + +int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) +{ + task_t *p = curr->task; + return try_to_wake_up(p, mode, sync); +} + +EXPORT_SYMBOL(default_wake_function); + +/* + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. + * + * There are circumstances in which we can try to wake a task which has already + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. + */ +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int sync, void *key) +{ + struct list_head *tmp, *next; + + list_for_each_safe(tmp, next, &q->task_list) { + wait_queue_t *curr; + unsigned flags; + curr = list_entry(tmp, wait_queue_t, task_list); + flags = curr->flags; + if (curr->func(curr, mode, sync, key) && + (flags & WQ_FLAG_EXCLUSIVE) && + !--nr_exclusive) + break; + } +} + +/** + * __wake_up - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + */ +void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, 0, key); + spin_unlock_irqrestore(&q->lock, flags); +} + +EXPORT_SYMBOL(__wake_up); + +/* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. + */ +void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) +{ + __wake_up_common(q, mode, 1, 0, NULL); +} + +/** + * __wake_up - sync- wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * + * The sync wakeup differs that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. + */ +void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + unsigned long flags; + int sync = 1; + + if (unlikely(!q)) + return; + + if (unlikely(!nr_exclusive)) + sync = 0; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, sync, NULL); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ + +void fastcall complete(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done++; + __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 1, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete); + +void fastcall complete_all(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += UINT_MAX/2; + __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 0, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_all); + +void fastcall __sched wait_for_completion(struct completion *x) +{ + might_sleep(); + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + schedule(); + spin_lock_irq(&x->wait.lock); + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; + spin_unlock_irq(&x->wait.lock); +} +EXPORT_SYMBOL(wait_for_completion); + +unsigned long fastcall __sched +wait_for_completion_timeout(struct completion *x, unsigned long timeout) +{ + might_sleep(); + + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + timeout = schedule_timeout(timeout); + if (!timeout) + goto out; + spin_lock_irq(&x->wait.lock); + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; + spin_unlock_irq(&x->wait.lock); +out: + return timeout; +} +EXPORT_SYMBOL(wait_for_completion_timeout); + +int fastcall __sched wait_for_completion_interruptible(struct completion *x) +{ + int ret = 0; + + might_sleep(); + + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + goto out; + } + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + schedule(); + spin_lock_irq(&x->wait.lock); + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; +out: + spin_unlock_irq(&x->wait.lock); + + return ret; +} +EXPORT_SYMBOL(wait_for_completion_interruptible); + +unsigned long fastcall __sched +wait_for_completion_interruptible_timeout(struct completion *x, + unsigned long timeout) +{ + might_sleep(); + + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + if (signal_pending(current)) { + timeout = -ERESTARTSYS; + goto out_unlock; + } + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + timeout = schedule_timeout(timeout); + if (!timeout) + goto out; + spin_lock_irq(&x->wait.lock); + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; +out_unlock: + spin_unlock_irq(&x->wait.lock); +out: + return timeout; +} +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); + + +#define SLEEP_ON_VAR \ + unsigned long flags; \ + wait_queue_t wait; \ + init_waitqueue_entry(&wait, current); + +#define SLEEP_ON_HEAD \ + spin_lock_irqsave(&q->lock,flags); \ + __add_wait_queue(q, &wait); \ + spin_unlock(&q->lock); + +#define SLEEP_ON_TAIL \ + spin_lock_irq(&q->lock); \ + __remove_wait_queue(q, &wait); \ + spin_unlock_irqrestore(&q->lock, flags); + +void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) +{ + SLEEP_ON_VAR + + current->state = TASK_INTERRUPTIBLE; + + SLEEP_ON_HEAD + schedule(); + SLEEP_ON_TAIL +} + +EXPORT_SYMBOL(interruptible_sleep_on); + +long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + SLEEP_ON_VAR + + current->state = TASK_INTERRUPTIBLE; + + SLEEP_ON_HEAD + timeout = schedule_timeout(timeout); + SLEEP_ON_TAIL + + return timeout; +} + +EXPORT_SYMBOL(interruptible_sleep_on_timeout); + +void fastcall __sched sleep_on(wait_queue_head_t *q) +{ + SLEEP_ON_VAR + + current->state = TASK_UNINTERRUPTIBLE; + + SLEEP_ON_HEAD + schedule(); + SLEEP_ON_TAIL +} + +EXPORT_SYMBOL(sleep_on); + +long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + SLEEP_ON_VAR + + current->state = TASK_UNINTERRUPTIBLE; + + SLEEP_ON_HEAD + timeout = schedule_timeout(timeout); + SLEEP_ON_TAIL + + return timeout; +} + +EXPORT_SYMBOL(sleep_on_timeout); + +void set_user_nice(task_t *p, long nice) +{ + unsigned long flags; + prio_array_t *array; + runqueue_t *rq; + int old_prio, new_prio, delta; + + if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. + */ + rq = task_rq_lock(p, &flags); + /* + * The RT priorities are set via sched_setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it wont have any effect on scheduling until the task is + * not SCHED_NORMAL: + */ + if (rt_task(p)) { + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } + array = p->array; + if (array) + dequeue_task(p, array); + + old_prio = p->prio; + new_prio = NICE_TO_PRIO(nice); + delta = new_prio - old_prio; + p->static_prio = NICE_TO_PRIO(nice); + p->prio += delta; + + if (array) { + enqueue_task(p, array); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + if (delta < 0 || (delta > 0 && task_running(rq, p))) + resched_task(rq->curr); + } +out_unlock: + task_rq_unlock(rq, &flags); +} + +EXPORT_SYMBOL(set_user_nice); + +#ifdef CONFIG_KGDB +struct task_struct *kgdb_get_idle(int this_cpu) +{ + return cpu_rq(this_cpu)->idle; +} +#endif + +#ifdef __ARCH_WANT_SYS_NICE + +/* + * sys_nice - change the priority of the current process. + * @increment: priority increment + * + * sys_setpriority is a more generic, but much slower function that + * does similar things. + */ +asmlinkage long sys_nice(int increment) +{ + int retval; + long nice; + + /* + * Setpriority might change our priority at the same moment. + * We don't have to worry. Conceptually one call occurs first + * and we have a single winner. + */ + if (increment < 0) { + if (!capable(CAP_SYS_NICE)) + return -EPERM; + if (increment < -40) + increment = -40; + } + if (increment > 40) + increment = 40; + + nice = PRIO_TO_NICE(current->static_prio) + increment; + if (nice < -20) + nice = -20; + if (nice > 19) + nice = 19; + + retval = security_task_setnice(current, nice); + if (retval) + return retval; + + set_user_nice(current, nice); + return 0; +} + +#endif + +/** + * task_prio - return the priority value of a given task. + * @p: the task in question. + * + * This is the priority value as seen by users in /proc. + * RT tasks are offset by -200. Normal tasks are centered + * around 0, value goes from -16 to +15. + */ +int task_prio(const task_t *p) +{ + return p->prio - MAX_RT_PRIO; +} + +/** + * task_nice - return the nice value of a given task. + * @p: the task in question. + */ +int task_nice(const task_t *p) +{ + return TASK_NICE(p); +} + +/* + * The only users of task_nice are binfmt_elf and binfmt_elf32. + * binfmt_elf is no longer modular, but binfmt_elf32 still is. + * Therefore, task_nice is needed if there is a compat_mode. + */ +#ifdef CONFIG_COMPAT +EXPORT_SYMBOL_GPL(task_nice); +#endif + +/** + * idle_cpu - is a given cpu idle currently? + * @cpu: the processor in question. + */ +int idle_cpu(int cpu) +{ + return cpu_curr(cpu) == cpu_rq(cpu)->idle; +} + +EXPORT_SYMBOL_GPL(idle_cpu); + +/** + * idle_task - return the idle task for a given cpu. + * @cpu: the processor in question. + */ +task_t *idle_task(int cpu) +{ + return cpu_rq(cpu)->idle; +} + +/** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. + */ +static inline task_t *find_process_by_pid(pid_t pid) +{ + return pid ? find_task_by_pid(pid) : current; +} + +/* Actually do priority change: must hold rq lock. */ +static void __setscheduler(struct task_struct *p, int policy, int prio) +{ + BUG_ON(p->array); + p->policy = policy; + p->rt_priority = prio; + if (SCHED_RT(policy)) + p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; + else + p->prio = p->static_prio; +} + +/** + * sched_setscheduler - change the scheduling policy and/or RT priority of + * a thread. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + */ +int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) +{ + int retval; + int oldprio, oldpolicy = -1; + prio_array_t *array; + unsigned long flags; + runqueue_t *rq; + +recheck: + /* double check policy once rq lock held */ + if (policy < 0) + policy = oldpolicy = p->policy; + else if (!SCHED_RANGE(policy)) + return -EINVAL; + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. + */ + if (param->sched_priority < 0 || + param->sched_priority > MAX_USER_RT_PRIO-1) + return -EINVAL; + if ((!SCHED_RT(policy)) != (param->sched_priority == 0)) + return -EINVAL; + + if (SCHED_RT(policy) && !capable(CAP_SYS_NICE)) + /* + * If the caller requested an RT policy without having the + * necessary rights, we downgrade the policy to SCHED_ISO. + * Temporary hack for testing. + */ + policy = SCHED_ISO; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + return -EPERM; + + retval = security_task_setscheduler(p, policy, param); + if (retval) + return retval; + /* + * To be able to change p->policy safely, the apropriate + * runqueue lock must be held. + */ + rq = task_rq_lock(p, &flags); + /* recheck policy now with rq lock held */ + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { + policy = oldpolicy = -1; + task_rq_unlock(rq, &flags); + goto recheck; + } + array = p->array; + if (array) + deactivate_task(p, rq); + oldprio = p->prio; + __setscheduler(p, policy, param->sched_priority); + if (array) { + __activate_task(p, rq,array); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else if (task_preempts_curr(p, rq)) + resched_task(rq->curr); + } + task_rq_unlock(rq, &flags); + return 0; +} +EXPORT_SYMBOL_GPL(sched_setscheduler); + +static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +{ + int retval; + struct sched_param lparam; + struct task_struct *p; + + if (!param || pid < 0) + return -EINVAL; + if (copy_from_user(&lparam, param, sizeof(struct sched_param))) + return -EFAULT; + read_lock_irq(&tasklist_lock); + p = find_process_by_pid(pid); + if (!p) { + read_unlock_irq(&tasklist_lock); + return -ESRCH; + } + retval = sched_setscheduler(p, policy, &lparam); + read_unlock_irq(&tasklist_lock); + return retval; +} + +/** + * sys_sched_setscheduler - set/change the scheduler policy and RT priority + * @pid: the pid in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + */ +asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, + struct sched_param __user *param) +{ + return do_sched_setscheduler(pid, policy, param); +} + +/** + * sys_sched_setparam - set/change the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the new RT priority. + */ +asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) +{ + return do_sched_setscheduler(pid, -1, param); +} + +/** + * sys_sched_getscheduler - get the policy (scheduling class) of a thread + * @pid: the pid in question. + */ +asmlinkage long sys_sched_getscheduler(pid_t pid) +{ + int retval = -EINVAL; + task_t *p; + + if (pid < 0) + goto out_nounlock; + + retval = -ESRCH; + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + if (p) { + retval = security_task_getscheduler(p); + if (!retval) + retval = p->policy; + } + read_unlock(&tasklist_lock); + +out_nounlock: + return retval; +} + +/** + * sys_sched_getscheduler - get the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the RT priority. + */ +asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) +{ + struct sched_param lp; + int retval = -EINVAL; + task_t *p; + + if (!param || pid < 0) + goto out_nounlock; + + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + retval = -ESRCH; + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + lp.sched_priority = p->rt_priority; + read_unlock(&tasklist_lock); + + /* + * This one might sleep, we cannot do it with a spinlock held ... + */ + retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; + +out_nounlock: + return retval; + +out_unlock: + read_unlock(&tasklist_lock); + return retval; +} + +long sched_setaffinity(pid_t pid, cpumask_t new_mask) +{ + task_t *p; + int retval; + cpumask_t cpus_allowed; + + lock_cpu_hotplug(); + read_lock(&tasklist_lock); + + p = find_process_by_pid(pid); + if (!p) { + read_unlock(&tasklist_lock); + unlock_cpu_hotplug(); + return -ESRCH; + } + + /* + * It is not safe to call set_cpus_allowed with the + * tasklist_lock held. We will bump the task_struct's + * usage count and then drop tasklist_lock. + */ + get_task_struct(p); + read_unlock(&tasklist_lock); + + retval = -EPERM; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + + cpus_allowed = cpuset_cpus_allowed(p); + cpus_and(new_mask, new_mask, cpus_allowed); + retval = set_cpus_allowed(p, new_mask); + +out_unlock: + put_task_struct(p); + unlock_cpu_hotplug(); + return retval; +} + +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, + cpumask_t *new_mask) +{ + if (len < sizeof(cpumask_t)) { + memset(new_mask, 0, sizeof(cpumask_t)); + } else if (len > sizeof(cpumask_t)) { + len = sizeof(cpumask_t); + } + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; +} + +/** + * sys_sched_setaffinity - set the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to the new cpu mask + */ +asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, + unsigned long __user *user_mask_ptr) +{ + cpumask_t new_mask; + int retval; + + retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); + if (retval) + return retval; + + return sched_setaffinity(pid, new_mask); +} + +/* + * Represents all cpu's present in the system + * In systems capable of hotplug, this map could dynamically grow + * as new cpu's are detected in the system via any platform specific + * method, such as ACPI for e.g. + */ + +cpumask_t cpu_present_map; +EXPORT_SYMBOL(cpu_present_map); + +#ifndef CONFIG_SMP +cpumask_t cpu_online_map = CPU_MASK_ALL; +cpumask_t cpu_possible_map = CPU_MASK_ALL; +#endif + +long sched_getaffinity(pid_t pid, cpumask_t *mask) +{ + int retval; + task_t *p; + + lock_cpu_hotplug(); + read_lock(&tasklist_lock); + + retval = -ESRCH; + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = 0; + cpus_and(*mask, p->cpus_allowed, cpu_possible_map); + +out_unlock: + read_unlock(&tasklist_lock); + unlock_cpu_hotplug(); + if (retval) + return retval; + + return 0; +} + +/** + * sys_sched_getaffinity - get the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current cpu mask + */ +asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, + unsigned long __user *user_mask_ptr) +{ + int ret; + cpumask_t mask; + + if (len < sizeof(cpumask_t)) + return -EINVAL; + + ret = sched_getaffinity(pid, &mask); + if (ret < 0) + return ret; + + if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) + return -EFAULT; + + return sizeof(cpumask_t); +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * this function yields the current CPU by moving the calling thread + * to the expired array. If there are no other threads running on this + * CPU then this function will return. + */ +asmlinkage long sys_sched_yield(void) +{ +#ifdef CONFIG_SCHEDSTATS + runqueue_t *rq; +#endif + + local_irq_disable(); +#ifdef CONFIG_SCHEDSTATS + rq = this_rq(); + + schedstat_inc(rq, yld_cnt); + spin_lock(&rq->lock); + if (current->array->nr_active == 1) { + schedstat_inc(rq, yld_act_empty); + if (!rq->expired->nr_active) + schedstat_inc(rq, yld_both_empty); + } else if (!rq->expired->nr_active) + schedstat_inc(rq, yld_exp_empty); + + /* + * requeue_task is cheaper so perform that if possible. + */ + requeue_task(current, array); + + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: + */ + __release(rq->lock); + _raw_spin_unlock(&rq->lock); + preempt_enable_no_resched(); +#endif + current->used_slice = -1; + local_irq_enable(); + + schedule(); + + return 0; +} + +static inline void __cond_resched(void) +{ + do { + add_preempt_count(PREEMPT_ACTIVE); + schedule(); + sub_preempt_count(PREEMPT_ACTIVE); + } while (need_resched()); +} + +int __sched cond_resched(void) +{ + if (need_resched()) { + __cond_resched(); + return 1; + } + return 0; +} + +EXPORT_SYMBOL(cond_resched); + +/* + * cond_resched_lock() - if a reschedule is pending, drop the given lock, + * call schedule, and on return reacquire the lock. + * + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * operations here to prevent schedule() from being called twice (once via + * spin_unlock(), once by hand). + */ +int cond_resched_lock(spinlock_t * lock) +{ +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) + if (lock->break_lock) { + lock->break_lock = 0; + spin_unlock(lock); + cpu_relax(); + spin_lock(lock); + } +#endif + if (need_resched()) { + _raw_spin_unlock(lock); + preempt_enable_no_resched(); + __cond_resched(); + spin_lock(lock); + return 1; + } + return 0; +} + +EXPORT_SYMBOL(cond_resched_lock); + +int __sched cond_resched_softirq(void) +{ + BUG_ON(!in_softirq()); + + if (need_resched()) { + __local_bh_enable(); + __cond_resched(); + local_bh_disable(); + return 1; + } + return 0; +} + +EXPORT_SYMBOL(cond_resched_softirq); + + +/** + * yield - yield the current processor to other threads. + * + * this is a shortcut for kernel-space yielding - it marks the + * thread runnable and calls sys_sched_yield(). + */ +void __sched yield(void) +{ + set_current_state(TASK_RUNNING); + sys_sched_yield(); +} + +EXPORT_SYMBOL(yield); + +/* + * This task is about to go to sleep on IO. Increment rq->nr_iowait so + * that process accounting knows that this is a task in IO wait state. + * + * But don't do that if it is a deliberate, throttling IO wait (this task + * has set its backing_dev_info: the queue against which it should throttle) + */ +void __sched io_schedule(void) +{ + struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); + + atomic_inc(&rq->nr_iowait); + schedule(); + atomic_dec(&rq->nr_iowait); +} + +EXPORT_SYMBOL(io_schedule); + +long __sched io_schedule_timeout(long timeout) +{ + struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); + long ret; + + atomic_inc(&rq->nr_iowait); + ret = schedule_timeout(timeout); + atomic_dec(&rq->nr_iowait); + return ret; +} + +/** + * sys_sched_get_priority_max - return maximum RT priority. + * @policy: scheduling class. + * + * this syscall returns the maximum rt_priority that can be used + * by a given scheduling class. + */ +asmlinkage long sys_sched_get_priority_max(int policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = MAX_USER_RT_PRIO-1; + break; + case SCHED_NORMAL: + case SCHED_ISO: + ret = 0; + break; + } + return ret; +} + +/** + * sys_sched_get_priority_min - return minimum RT priority. + * @policy: scheduling class. + * + * this syscall returns the minimum rt_priority that can be used + * by a given scheduling class. + */ +asmlinkage long sys_sched_get_priority_min(int policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = 1; + break; + case SCHED_NORMAL: + case SCHED_ISO: + ret = 0; + } + return ret; +} + +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + */ +asmlinkage +long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) +{ + int retval = -EINVAL; + struct timespec t; + task_t *p; + unsigned long flags; + runqueue_t *rq; + + if (pid < 0) + goto out_nounlock; + + retval = -ESRCH; + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + rq = task_rq_lock(p, &flags); + jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : task_timeslice(p, rq), &t); + task_rq_unlock(rq, &flags); + read_unlock(&tasklist_lock); + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; +out_nounlock: + return retval; +out_unlock: + read_unlock(&tasklist_lock); + return retval; +} + +static inline struct task_struct *eldest_child(struct task_struct *p) +{ + if (list_empty(&p->children)) return NULL; + return list_entry(p->children.next,struct task_struct,sibling); +} + +static inline struct task_struct *older_sibling(struct task_struct *p) +{ + if (p->sibling.prev==&p->parent->children) return NULL; + return list_entry(p->sibling.prev,struct task_struct,sibling); +} + +static inline struct task_struct *younger_sibling(struct task_struct *p) +{ + if (p->sibling.next==&p->parent->children) return NULL; + return list_entry(p->sibling.next,struct task_struct,sibling); +} + +static void show_task(task_t * p) +{ + task_t *relative; + unsigned state; + unsigned long free = 0; + static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; + + printk("%-13.13s ", p->comm); + state = p->state ? __ffs(p->state) + 1 : 0; + if (state < ARRAY_SIZE(stat_nam)) + printk(stat_nam[state]); + else + printk("?"); +#if (BITS_PER_LONG == 32) + if (state == TASK_RUNNING) + printk(" running "); + else + printk(" %08lX ", thread_saved_pc(p)); +#else + if (state == TASK_RUNNING) + printk(" running task "); + else + printk(" %016lx ", thread_saved_pc(p)); +#endif +#ifdef CONFIG_DEBUG_STACK_USAGE + { + unsigned long * n = (unsigned long *) (p->thread_info+1); + while (!*n) + n++; + free = (unsigned long) n - (unsigned long)(p->thread_info+1); + } +#endif + printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); + if ((relative = eldest_child(p))) + printk("%5d ", relative->pid); + else + printk(" "); + if ((relative = younger_sibling(p))) + printk("%7d", relative->pid); + else + printk(" "); + if ((relative = older_sibling(p))) + printk(" %5d", relative->pid); + else + printk(" "); + if (!p->mm) + printk(" (L-TLB)\n"); + else + printk(" (NOTLB)\n"); + + if (state != TASK_RUNNING) + show_stack(p, NULL); +} + +void show_state(void) +{ + task_t *g, *p; + +#if (BITS_PER_LONG == 32) + printk("\n" + " sibling\n"); + printk(" task PC pid father child younger older\n"); +#else + printk("\n" + " sibling\n"); + printk(" task PC pid father child younger older\n"); +#endif + read_lock(&tasklist_lock); + do_each_thread(g, p) { + /* + * reset the NMI-timeout, listing all files on a slow + * console might take alot of time: + */ + touch_nmi_watchdog(); + show_task(p); + } while_each_thread(g, p); + + read_unlock(&tasklist_lock); +} + +void __devinit init_idle(task_t *idle, int cpu) +{ + runqueue_t *rq = cpu_rq(cpu); + unsigned long flags; + + idle->array = NULL; + idle->prio = MAX_PRIO; + idle->state = TASK_RUNNING; + idle->used_slice = 0; + set_task_cpu(idle, cpu); + + spin_lock_irqsave(&rq->lock, flags); + rq->curr = rq->idle = idle; +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + idle->oncpu = 1; +#endif + set_tsk_need_resched(idle); + spin_unlock_irqrestore(&rq->lock, flags); + + /* Set the preempt count _outside_ the spinlocks! */ +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) + idle->thread_info->preempt_count = (idle->lock_depth >= 0); +#else + idle->thread_info->preempt_count = 0; +#endif +} + +/* + * In a system that switches off the HZ timer nohz_cpu_mask + * indicates which cpus entered this state. This is used + * in the rcu update to wait only for active cpus. For system + * which do not switch off the HZ timer nohz_cpu_mask should + * always be CPU_MASK_NONE. + */ +cpumask_t nohz_cpu_mask = CPU_MASK_NONE; + +#ifdef CONFIG_SMP +/* + * This is how migration works: + * + * 1) we queue a migration_req_t structure in the source CPU's + * runqueue and wake up that CPU's migration thread. + * 2) we down() the locked semaphore => thread blocks. + * 3) migration thread wakes up (implicitly it forces the migrated + * thread off the CPU) + * 4) it gets the migration request and checks whether the migrated + * task is still in the wrong runqueue. + * 5) if it's in the wrong runqueue then the migration thread removes + * it and puts it into the right queue. + * 6) migration thread up()s the semaphore. + * 7) we wake up and the migration is done. + */ + +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +int set_cpus_allowed(task_t *p, cpumask_t new_mask) +{ + unsigned long flags; + int ret = 0; + migration_req_t req; + runqueue_t *rq; + + perfctr_set_cpus_allowed(p, new_mask); + + rq = task_rq_lock(p, &flags); + if (!cpus_intersects(new_mask, cpu_online_map)) { + ret = -EINVAL; + goto out; + } + + p->cpus_allowed = new_mask; + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpu_isset(task_cpu(p), new_mask)) + goto out; + + if (migrate_task(p, any_online_cpu(new_mask), &req)) { + /* Need help from migration thread: drop lock and wait. */ + task_rq_unlock(rq, &flags); + wake_up_process(rq->migration_thread); + wait_for_completion(&req.done); + tlb_migrate_finish(p->mm); + return 0; + } +out: + task_rq_unlock(rq, &flags); + return ret; +} + +EXPORT_SYMBOL_GPL(set_cpus_allowed); + +/* + * Move (not current) task off this cpu, onto dest cpu. We're doing + * this because either it can't run here any more (set_cpus_allowed() + * away from this CPU, or CPU going down), or because we're + * attempting to rebalance this task on exec (sched_exec). + * + * So we race with normal scheduler movements, but that's OK, as long + * as the task is no longer on this CPU. + */ +static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) +{ + runqueue_t *rq_dest, *rq_src; + + if (unlikely(cpu_is_offline(dest_cpu))) + return; + + rq_src = cpu_rq(src_cpu); + rq_dest = cpu_rq(dest_cpu); + + double_rq_lock(rq_src, rq_dest); + /* Already moved. */ + if (task_cpu(p) != src_cpu) + goto out; + /* Affinity changed (again). */ + if (!cpu_isset(dest_cpu, p->cpus_allowed)) + goto out; + + set_task_cpu(p, dest_cpu); + if (p->array) { + /* + * Sync timestamp with rq_dest's before activating. + * The same thing could be achieved by doing this step + * afterwards, and pretending it was a local activate. + * This way is cleaner and logically correct. + */ + p->timestamp = p->timestamp - rq_src->timestamp_last_tick + + rq_dest->timestamp_last_tick; + deactivate_task(p, rq_src); + activate_task(p, rq_dest, 0); + if (task_preempts_curr(p, rq_dest)) + resched_task(rq_dest->curr); + } + +out: + double_rq_unlock(rq_src, rq_dest); +} + +/* + * migration_thread - this is a highprio system thread that performs + * thread migration by bumping thread off CPU then 'pushing' onto + * another runqueue. + */ +static int migration_thread(void * data) +{ + runqueue_t *rq; + int cpu = (long)data; + + rq = cpu_rq(cpu); + BUG_ON(rq->migration_thread != current); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + struct list_head *head; + migration_req_t *req; + + if (current->flags & PF_FREEZE) + refrigerator(PF_FREEZE); + + spin_lock_irq(&rq->lock); + + if (cpu_is_offline(cpu)) { + spin_unlock_irq(&rq->lock); + goto wait_to_die; + } + + if (rq->active_balance) { + active_load_balance(rq, cpu); + rq->active_balance = 0; + } + + head = &rq->migration_queue; + + if (list_empty(head)) { + spin_unlock_irq(&rq->lock); + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + continue; + } + req = list_entry(head->next, migration_req_t, list); + list_del_init(head->next); + + if (req->type == REQ_MOVE_TASK) { + spin_unlock(&rq->lock); + __migrate_task(req->task, cpu, req->dest_cpu); + local_irq_enable(); + } else if (req->type == REQ_SET_DOMAIN) { + rq->sd = req->sd; + spin_unlock_irq(&rq->lock); + } else { + spin_unlock_irq(&rq->lock); + WARN_ON(1); + } + + complete(&req->done); + } + __set_current_state(TASK_RUNNING); + return 0; + +wait_to_die: + /* Wait for kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +/* Figure out where task on dead CPU should go, use force if neccessary. */ +static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) +{ + int dest_cpu; + cpumask_t mask; + + /* On same node? */ + mask = node_to_cpumask(cpu_to_node(dead_cpu)); + cpus_and(mask, mask, tsk->cpus_allowed); + dest_cpu = any_online_cpu(mask); + + /* On any allowed CPU? */ + if (dest_cpu == NR_CPUS) + dest_cpu = any_online_cpu(tsk->cpus_allowed); + + /* No more Mr. Nice Guy. */ + if (dest_cpu == NR_CPUS) { + tsk->cpus_allowed = cpuset_cpus_allowed(tsk); + dest_cpu = any_online_cpu(tsk->cpus_allowed); + + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (tsk->mm && printk_ratelimit()) + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + tsk->pid, tsk->comm, dead_cpu); + } + __migrate_task(tsk, dead_cpu, dest_cpu); +} + +/* + * While a dead CPU has no uninterruptible tasks queued at this point, + * it might still have a nonzero ->nr_uninterruptible counter, because + * for performance reasons the counter is not stricly tracking tasks to + * their home CPUs. So we just add the counter to another CPU's counter, + * to keep the global sum constant after CPU-down: + */ +static void migrate_nr_uninterruptible(runqueue_t *rq_src) +{ + runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); + unsigned long flags; + + local_irq_save(flags); + double_rq_lock(rq_src, rq_dest); + rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; + rq_src->nr_uninterruptible = 0; + double_rq_unlock(rq_src, rq_dest); + local_irq_restore(flags); +} + +/* Run through task list and migrate tasks from the dead cpu. */ +static void migrate_live_tasks(int src_cpu) +{ + struct task_struct *tsk, *t; + + write_lock_irq(&tasklist_lock); + + do_each_thread(t, tsk) { + if (tsk == current) + continue; + + if (task_cpu(tsk) == src_cpu) + move_task_off_dead_cpu(src_cpu, tsk); + } while_each_thread(t, tsk); + + write_unlock_irq(&tasklist_lock); +} + +/* Schedules idle task to be the next runnable task on current CPU. + * It does so by boosting its priority to highest possible and adding it to + * the _front_ of runqueue. Used by CPU offline code. + */ +void sched_idle_next(void) +{ + int cpu = smp_processor_id(); + runqueue_t *rq = this_rq(); + struct task_struct *p = rq->idle; + unsigned long flags; + + /* cpu has to be offline */ + BUG_ON(cpu_online(cpu)); + + /* Strictly not necessary since rest of the CPUs are stopped by now + * and interrupts disabled on current cpu. + */ + spin_lock_irqsave(&rq->lock, flags); + + __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); + /* Add idle task to _front_ of it's priority queue */ + __activate_idle_task(p, rq); + + spin_unlock_irqrestore(&rq->lock, flags); +} + +/* Ensures that the idle task is using init_mm right before its cpu goes + * offline. + */ +void idle_task_exit(void) +{ + struct mm_struct *mm = current->active_mm; + + BUG_ON(cpu_online(smp_processor_id())); + + if (mm != &init_mm) + switch_mm(mm, &init_mm, current); + mmdrop(mm); +} + +static void migrate_dead(unsigned int dead_cpu, task_t *tsk) +{ + struct runqueue *rq = cpu_rq(dead_cpu); + + /* Must be exiting, otherwise would be on tasklist. */ + BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); + + /* Cannot have done final schedule yet: would have vanished. */ + BUG_ON(tsk->flags & PF_DEAD); + + get_task_struct(tsk); + + /* + * Drop lock around migration; if someone else moves it, + * that's OK. No task can be added to this CPU, so iteration is + * fine. + */ + spin_unlock_irq(&rq->lock); + move_task_off_dead_cpu(dead_cpu, tsk); + spin_lock_irq(&rq->lock); + + put_task_struct(tsk); +} + +/* release_task() removes task from tasklist, so we won't find dead tasks. */ +static void migrate_dead_tasks(unsigned int dead_cpu) +{ + unsigned arr, i; + struct runqueue *rq = cpu_rq(dead_cpu); + + for (arr = 0; arr < 2; arr++) { + for (i = 0; i < MAX_PRIO; i++) { + struct list_head *list = &rq->arrays[arr].queue[i]; + while (!list_empty(list)) + migrate_dead(dead_cpu, + list_entry(list->next, task_t, + run_list)); + } + } +} +#endif /* CONFIG_HOTPLUG_CPU */ + +/* + * migration_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary migration thread for the new CPU. + */ +static int migration_call(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + struct task_struct *p; + struct runqueue *rq; + unsigned long flags; + + switch (action) { + case CPU_UP_PREPARE: + p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + p->flags |= PF_NOFREEZE; + kthread_bind(p, cpu); + /* Must be high prio: stop_machine expects to yield to it. */ + rq = task_rq_lock(p, &flags); + __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); + task_rq_unlock(rq, &flags); + cpu_rq(cpu)->migration_thread = p; + break; + case CPU_ONLINE: + /* Strictly unneccessary, as first user will wake it. */ + wake_up_process(cpu_rq(cpu)->migration_thread); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind it from offline cpu so it can run. Fall thru. */ + kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id()); + kthread_stop(cpu_rq(cpu)->migration_thread); + cpu_rq(cpu)->migration_thread = NULL; + break; + case CPU_DEAD: + migrate_live_tasks(cpu); + rq = cpu_rq(cpu); + kthread_stop(rq->migration_thread); + rq->migration_thread = NULL; + /* Idle task back to normal (off runqueue, low prio) */ + rq = task_rq_lock(rq->idle, &flags); + deactivate_task(rq->idle, rq); + rq->idle->static_prio = MAX_PRIO; + __setscheduler(rq->idle, SCHED_NORMAL, 0); + migrate_dead_tasks(cpu); + task_rq_unlock(rq, &flags); + migrate_nr_uninterruptible(rq); + BUG_ON(rq->nr_running != 0); + + /* No need to migrate the tasks: it was best-effort if + * they didn't do lock_cpu_hotplug(). Just wake up + * the requestors. */ + spin_lock_irq(&rq->lock); + while (!list_empty(&rq->migration_queue)) { + migration_req_t *req; + req = list_entry(rq->migration_queue.next, + migration_req_t, list); + BUG_ON(req->type != REQ_MOVE_TASK); + list_del_init(&req->list); + complete(&req->done); + } + spin_unlock_irq(&rq->lock); + break; +#endif + } + return NOTIFY_OK; +} + +/* Register at highest priority so that task migration (migrate_all_tasks) + * happens before everything else. + */ +static struct notifier_block __devinitdata migration_notifier = { + .notifier_call = migration_call, + .priority = 10 +}; + +int __init migration_init(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + /* Start one for boot CPU. */ + migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); + migration_call(&migration_notifier, CPU_ONLINE, cpu); + register_cpu_notifier(&migration_notifier); + return 0; +} +#endif + +#ifdef CONFIG_SMP +#define SCHED_DOMAIN_DEBUG +#ifdef SCHED_DOMAIN_DEBUG +static void sched_domain_debug(struct sched_domain *sd, int cpu) +{ + int level = 0; + + printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + + do { + int i; + char str[NR_CPUS]; + struct sched_group *group = sd->groups; + cpumask_t groupmask; + + cpumask_scnprintf(str, NR_CPUS, sd->span); + cpus_clear(groupmask); + + printk(KERN_DEBUG); + for (i = 0; i < level + 1; i++) + printk(" "); + printk("domain %d: ", level); + + if (!(sd->flags & SD_LOAD_BALANCE)) { + printk("does not load-balance\n"); + if (sd->parent) + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); + break; + } + + printk("span %s\n", str); + + if (!cpu_isset(cpu, sd->span)) + printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); + if (!cpu_isset(cpu, group->cpumask)) + printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); + + printk(KERN_DEBUG); + for (i = 0; i < level + 2; i++) + printk(" "); + printk("groups:"); + do { + if (!group) { + printk("\n"); + printk(KERN_ERR "ERROR: group is NULL\n"); + break; + } + + if (!group->cpu_power) { + printk("\n"); + printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); + } + + if (!cpus_weight(group->cpumask)) { + printk("\n"); + printk(KERN_ERR "ERROR: empty group\n"); + } + + if (cpus_intersects(groupmask, group->cpumask)) { + printk("\n"); + printk(KERN_ERR "ERROR: repeated CPUs\n"); + } + + cpus_or(groupmask, groupmask, group->cpumask); + + cpumask_scnprintf(str, NR_CPUS, group->cpumask); + printk(" %s", str); + + group = group->next; + } while (group != sd->groups); + printk("\n"); + + if (!cpus_equal(sd->span, groupmask)) + printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + + level++; + sd = sd->parent; + + if (sd) { + if (!cpus_subset(groupmask, sd->span)) + printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); + } + + } while (sd); +} +#else +#define sched_domain_debug(sd, cpu) {} +#endif + +/* + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must + * hold the hotplug lock. + */ +void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) +{ + migration_req_t req; + unsigned long flags; + runqueue_t *rq = cpu_rq(cpu); + int local = 1; + + sched_domain_debug(sd, cpu); + + spin_lock_irqsave(&rq->lock, flags); + + if (cpu == smp_processor_id() || !cpu_online(cpu)) { + rq->sd = sd; + } else { + init_completion(&req.done); + req.type = REQ_SET_DOMAIN; + req.sd = sd; + list_add(&req.list, &rq->migration_queue); + local = 0; + } + + spin_unlock_irqrestore(&rq->lock, flags); + + if (!local) { + wake_up_process(rq->migration_thread); + wait_for_completion(&req.done); + } +} + +/* cpus with isolated domains */ +cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; + +/* Setup the mask of cpus configured for isolated domains */ +static int __init isolated_cpu_setup(char *str) +{ + int ints[NR_CPUS], i; + + str = get_options(str, ARRAY_SIZE(ints), ints); + cpus_clear(cpu_isolated_map); + for (i = 1; i <= ints[0]; i++) + if (ints[i] < NR_CPUS) + cpu_set(ints[i], cpu_isolated_map); + return 1; +} + +__setup ("isolcpus=", isolated_cpu_setup); + +/* + * init_sched_build_groups takes an array of groups, the cpumask we wish + * to span, and a pointer to a function which identifies what group a CPU + * belongs to. The return value of group_fn must be a valid index into the + * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we + * keep track of groups covered with a cpumask_t). + * + * init_sched_build_groups will build a circular linked list of the groups + * covered by the given span, and will set each group's ->cpumask correctly, + * and ->cpu_power to 0. + */ +void __devinit init_sched_build_groups(struct sched_group groups[], + cpumask_t span, int (*group_fn)(int cpu)) +{ + struct sched_group *first = NULL, *last = NULL; + cpumask_t covered = CPU_MASK_NONE; + int i; + + for_each_cpu_mask(i, span) { + int group = group_fn(i); + struct sched_group *sg = &groups[group]; + int j; + + if (cpu_isset(i, covered)) + continue; + + sg->cpumask = CPU_MASK_NONE; + sg->cpu_power = 0; + + for_each_cpu_mask(j, span) { + if (group_fn(j) != group) + continue; + + cpu_set(j, covered); + cpu_set(j, sg->cpumask); + } + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + } + last->next = first; +} + + +#ifdef ARCH_HAS_SCHED_DOMAIN +extern void __devinit arch_init_sched_domains(void); +extern void __devinit arch_destroy_sched_domains(void); +#else +#ifdef CONFIG_SCHED_SMT +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static struct sched_group sched_group_cpus[NR_CPUS]; +static int __devinit cpu_to_cpu_group(int cpu) +{ + return cpu; +} +#endif + +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +static struct sched_group sched_group_phys[NR_CPUS]; +static int __devinit cpu_to_phys_group(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + return first_cpu(cpu_sibling_map[cpu]); +#else + return cpu; +#endif +} + +#ifdef CONFIG_NUMA + +static DEFINE_PER_CPU(struct sched_domain, node_domains); +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +static int __devinit cpu_to_node_group(int cpu) +{ + return cpu_to_node(cpu); +} +#endif + +#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) +/* + * The domains setup code relies on siblings not spanning + * multiple nodes. Make sure the architecture has a proper + * siblings map: + */ +static void check_sibling_maps(void) +{ + int i, j; + + for_each_online_cpu(i) { + for_each_cpu_mask(j, cpu_sibling_map[i]) { + if (cpu_to_node(i) != cpu_to_node(j)) { + printk(KERN_INFO "warning: CPU %d siblings map " + "to different node - isolating " + "them.\n", i); + cpu_sibling_map[i] = cpumask_of_cpu(i); + break; + } + } + } +} +#endif + +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + */ +static void __devinit arch_init_sched_domains(void) +{ + int i; + cpumask_t cpu_default_map; + +#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) + check_sibling_maps(); +#endif + /* + * Setup mask for cpus without special case scheduling requirements. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. + */ + cpus_complement(cpu_default_map, cpu_isolated_map); + cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); + + /* + * Set up domains. Isolated domains just stay on the dummy domain. + */ + for_each_cpu_mask(i, cpu_default_map) { + int group; + struct sched_domain *sd = NULL, *p; + cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); + + cpus_and(nodemask, nodemask, cpu_default_map); + +#ifdef CONFIG_NUMA + sd = &per_cpu(node_domains, i); + group = cpu_to_node_group(i); + *sd = SD_NODE_INIT; + sd->span = cpu_default_map; + sd->groups = &sched_group_nodes[group]; +#endif + + p = sd; + sd = &per_cpu(phys_domains, i); + group = cpu_to_phys_group(i); + *sd = SD_CPU_INIT; + sd->span = nodemask; + sd->parent = p; + sd->groups = &sched_group_phys[group]; + +#ifdef CONFIG_SCHED_SMT + p = sd; + sd = &per_cpu(cpu_domains, i); + group = cpu_to_cpu_group(i); + *sd = SD_SIBLING_INIT; + sd->span = cpu_sibling_map[i]; + cpus_and(sd->span, sd->span, cpu_default_map); + sd->parent = p; + sd->groups = &sched_group_cpus[group]; +#endif + } + +#ifdef CONFIG_SCHED_SMT + /* Set up CPU (sibling) groups */ + for_each_online_cpu(i) { + cpumask_t this_sibling_map = cpu_sibling_map[i]; + cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); + if (i != first_cpu(this_sibling_map)) + continue; + + init_sched_build_groups(sched_group_cpus, this_sibling_map, + &cpu_to_cpu_group); + } +#endif + + /* Set up physical groups */ + for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t nodemask = node_to_cpumask(i); + + cpus_and(nodemask, nodemask, cpu_default_map); + if (cpus_empty(nodemask)) + continue; + + init_sched_build_groups(sched_group_phys, nodemask, + &cpu_to_phys_group); + } + +#ifdef CONFIG_NUMA + /* Set up node groups */ + init_sched_build_groups(sched_group_nodes, cpu_default_map, + &cpu_to_node_group); +#endif + + /* Calculate CPU power for physical packages and nodes */ + for_each_cpu_mask(i, cpu_default_map) { + int power; + struct sched_domain *sd; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i); + power = SCHED_LOAD_SCALE; + sd->groups->cpu_power = power; +#endif + + sd = &per_cpu(phys_domains, i); + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + sd->groups->cpu_power = power; + +#ifdef CONFIG_NUMA + if (i == first_cpu(sd->groups->cpumask)) { + /* Only add "power" once for each physical package. */ + sd = &per_cpu(node_domains, i); + sd->groups->cpu_power += power; + } +#endif + } + + /* Attach the domains */ + for_each_online_cpu(i) { + struct sched_domain *sd; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i); +#else + sd = &per_cpu(phys_domains, i); +#endif + cpu_attach_domain(sd, i); + } + init_sched_domain_sysctl(); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void __devinit arch_destroy_sched_domains(void) +{ + destroy_sched_domain_sysctl(); +} +#endif + +#endif /* ARCH_HAS_SCHED_DOMAIN */ + +/* + * Initial dummy domain for early boot and for hotplug cpu. Being static, + * it is initialized to zero, so all balancing flags are cleared which is + * what we want. + */ +static struct sched_domain sched_domain_dummy; + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Force a reinitialization of the sched domains hierarchy. The domains + * and groups cannot be updated in place without racing with the balancing + * code, so we temporarily attach all running cpus to a "dummy" domain + * which will prevent rebalancing while the sched domains are recalculated. + */ +static int update_sched_domains(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int i; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_DOWN_PREPARE: + for_each_online_cpu(i) + cpu_attach_domain(&sched_domain_dummy, i); + arch_destroy_sched_domains(); + return NOTIFY_OK; + + case CPU_UP_CANCELED: + case CPU_DOWN_FAILED: + case CPU_ONLINE: + case CPU_DEAD: + /* + * Fall through and re-initialise the domains. + */ + break; + default: + return NOTIFY_DONE; + } + + /* The hotplug lock is already held by cpu_up/cpu_down */ + arch_init_sched_domains(); + + return NOTIFY_OK; +} +#endif + +void __init sched_init_smp(void) +{ + lock_cpu_hotplug(); + arch_init_sched_domains(); + unlock_cpu_hotplug(); + /* XXX: Theoretical race here - CPU may be hotplugged now */ + hotcpu_notifier(update_sched_domains, 0); +} +#else +void __init sched_init_smp(void) +{ +} +#endif /* CONFIG_SMP */ + +int in_sched_functions(unsigned long addr) +{ + /* Linker adds these: start and end of __sched functions */ + extern char __sched_text_start[], __sched_text_end[]; + return in_lock_functions(addr) || + (addr >= (unsigned long)__sched_text_start + && addr < (unsigned long)__sched_text_end); +} + +void __init sched_init(void) +{ + runqueue_t *rq; + int i, j, k; + + for (i = 0; i < NR_CPUS; i++) { + prio_array_t *array; + + rq = cpu_rq(i); + spin_lock_init(&rq->lock); + rq->active = rq->arrays; + rq->expired = rq->arrays + 1; + rq->best_expired_prio = MAX_PRIO; + rq->iso_refractory = rq->iso_ticks = 0; + +#ifdef CONFIG_SMP + rq->sd = &sched_domain_dummy; + rq->cpu_load = 0; + rq->active_balance = 0; + rq->push_cpu = 0; + rq->migration_thread = NULL; + INIT_LIST_HEAD(&rq->migration_queue); +#endif + atomic_set(&rq->nr_iowait, 0); + + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + array->min_prio = MAX_PRIO; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + /* delimiter for bitsearch */ + __set_bit(MAX_PRIO, array->bitmap); + } + INIT_LIST_HEAD(&rq->iso_queue); + } + + /* + * The boot idle thread does lazy MMU switching as well: + */ + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current); + + /* + * Make us the idle thread. Technically, schedule() should not be + * called from this thread, however somewhere below it might be, + * but because we are the idle thread, we just pick up running again + * when this runqueue becomes "idle". + */ + init_idle(current, smp_processor_id()); +} + +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +void __might_sleep(char *file, int line) +{ +#if defined(in_atomic) + static unsigned long prev_jiffy; /* ratelimiting */ + + if ((in_atomic() || irqs_disabled()) && + system_state == SYSTEM_RUNNING && !oops_in_progress) { + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + printk(KERN_ERR "Debug: sleeping function called from invalid" + " context at %s:%d\n", file, line); + printk("in_atomic():%d, irqs_disabled():%d\n", + in_atomic(), irqs_disabled()); + dump_stack(); + } +#endif +} +EXPORT_SYMBOL(__might_sleep); +#endif + +#if defined(CONFIG_DEBUG_KERNEL)&&defined(CONFIG_SYSCTL)&&defined(CONFIG_SMP) +static struct ctl_table sd_ctl_dir[] = { + {1, "sched_domain", NULL, 0, 0755, NULL, }, + {0,}, +}; + +static struct ctl_table sd_ctl_root[] = { + {1, "kernel", NULL, 0, 0755, sd_ctl_dir, }, + {0,}, +}; +static char *sched_strdup(char *str) +{ + int n = strlen(str)+1; + char *s = kmalloc(n, GFP_KERNEL); + if (!s) + return NULL; + return strcpy(s, str); +} +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); + BUG_ON(!entry); + memset(entry, 0, n * sizeof(struct ctl_table)); + return entry; +} + +static void +set_table_entry(struct ctl_table *entry, int ctl_name, const char *procname, + void *data, int maxlen, mode_t mode, + proc_handler *proc_handler) +{ + entry->ctl_name = ctl_name; + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table; + table = sd_alloc_ctl_entry(9); + + set_table_entry(&table[0], 1, "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[1], 2, "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[2], 3, "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[3], 4, "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[4], 5, "cache_hot_time", &sd->cache_hot_time, + sizeof(long long), 0644, proc_doulonglongvec_minmax); + set_table_entry(&table[5], 6, "cache_nice_tries", &sd->cache_nice_tries, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[6], 7, "per_cpu_gain", &sd->per_cpu_gain, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[7], 8, "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax); + return table; +} + +static ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct sched_domain *sd; + int domain_num = 0, i; + struct ctl_table *entry, *table; + char buf[32]; + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + + i = 0; + for_each_domain(cpu, sd) { + sprintf(buf, "domain-%d", i); + entry->ctl_name = i + 1; + entry->procname = sched_strdup(buf); + entry->mode = 0755; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +void init_sched_domain_sysctl() +{ + int i, cpu_num = num_online_cpus(); + char buf[32]; + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + + sd_ctl_dir[0].child = entry; + + for (i = 0; i < cpu_num; i++, entry++) { + sprintf(buf, "cpu%d", i); + entry->ctl_name = i + 1; + entry->procname = sched_strdup(buf); + entry->mode = 0755; + entry->child = sd_alloc_ctl_cpu_table(i); + } + sd_sysctl_header = register_sysctl_table(sd_ctl_root, 0); +} + +void destroy_sched_domain_sysctl() +{ + int cpu, cpu_num = num_online_cpus(); + struct sched_domain *sd; + struct ctl_table *root = sd_ctl_dir[0].child; + struct ctl_table *entry, *table; + + unregister_sysctl_table(sd_sysctl_header); + entry = root; + for (cpu = 0; cpu < cpu_num; cpu++, entry++) { + kfree(entry->procname); + table = entry->child; + for_each_domain(cpu, sd) { + kfree(table->procname); + kfree(table->child); + table++; + } + kfree(entry->child); + } + kfree(root); +} +#else +void init_sched_domain_sysctl() +{ +} +void destroy_sched_domain_sysctl() +{ +} +#endif + +#ifdef CONFIG_MAGIC_SYSRQ +void normalize_rt_tasks(void) +{ + struct task_struct *p; + prio_array_t *array; + unsigned long flags; + runqueue_t *rq; + + read_lock_irq(&tasklist_lock); + for_each_process (p) { + if (!rt_task(p) && !iso_task(p)) + continue; + + rq = task_rq_lock(p, &flags); + + array = p->array; + if (array) + deactivate_task(p, rq); + __setscheduler(p, SCHED_NORMAL, 0); + if (policy == SCHED_FIFO || policy == SCHED_RR) + p->used_slice = 0; + + if (array) { + __activate_task(p, rq, array); + resched_task(rq->curr); + } + + task_rq_unlock(rq, &flags); + } + read_unlock_irq(&tasklist_lock); +} + +#endif /* CONFIG_MAGIC_SYSRQ */ diff -Naur linux-2.6.11-rc2-mm1/kernel/sched.c linux-2.6.11-rc2/kernel/sched.c --- linux-2.6.11-rc2-mm1/kernel/sched.c 2005-01-24 13:38:29.000000000 -0800 +++ linux-2.6.11-rc2/kernel/sched.c 2005-01-24 14:56:22.270924632 -0800 @@ -58,120 +58,68 @@ * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], * and back. */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 30) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 30) #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) /* * 'User priority' is the nice value converted to something we * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. + * it's a [ 0 ... 58 ] range. */ #define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) +#define US_TO_JIFFIES(x) ((x) * HZ / 1000000) +#define JIFFIES_TO_US(x) ((x) * 1000000 / HZ) + /* - * Some helpers for converting nanosecond timing to jiffy resolution + * MIN_TIMESLICE is the timeslice that a minimum priority process gets if the$ + * is a maximum priority process runnable. MAX_TIMESLICE is derived from the + * formula in task_timeslice. It cannot be changed here. It is the timesilce + * that the maximum priority process will get. Larger timeslices are attainab$ + * by low priority processes however. */ -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +int sched_base_timeslice = 64; +int sched_min_base = 1; +int sched_max_base = 10000; + + #define RT_TIMESLICE (50 * 1000 / HZ) /* 50ms */ +#define BASE_TIMESLICE (sched_base_timeslice) +#define MIN_TIMESLICE 1 + + /* Maximum amount of history that will be used to calculate priority */ +#define MAX_SLEEP_SHIFT 19 +#define MAX_SLEEP (1UL << MAX_SLEEP_SHIFT) /* roughly 0.52s */ /* - * These are the 'tuning knobs' of the scheduler: - * - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), - * default timeslice is 100 msecs, maximum timeslice is 800 msecs. - * Timeslices get refilled after they expire. + * Maximum effect that 1 block of activity (run/sleep/etc) can have. This is + * will moderate dicard freak events (eg. SIGSTOP) */ -#define MIN_TIMESLICE max(5 * HZ / 1000, 1) -#define DEF_TIMESLICE (100 * HZ / 1000) -#define ON_RUNQUEUE_WEIGHT 30 -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) -#define STARVATION_LIMIT (MAX_SLEEP_AVG) -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) +#define MAX_SLEEP_AFFECT (MAX_SLEEP/4) /* - * If a task is 'interactive' then we reinsert it in the active - * array after it has expired its current timeslice. (it will not - * continue to run immediately, it will still roundrobin with - * other interactive tasks.) - * - * This part scales the interactivity limit depending on niceness. - * - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. - * Here are a few examples of different nice levels: - * - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] - * - * (the X axis represents the possible -5 ... 0 ... +5 dynamic - * priority range a task can explore, a value of '1' means the - * task is rated interactive.) - * - * Ie. nice +19 tasks can never get 'interactive' enough to be - * reinserted into the active array. And only heavily CPU-hog nice -20 - * tasks will be expired. Default nice 0 tasks are somewhere between, - * it takes some effort for them to get interactive, but it's not - * too hard. + * The amount of history can be decreased (on fork for example). This puts a + * lower bound on it. */ +#define MIN_HISTORY (MAX_SLEEP/8) -#define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ - MAX_SLEEP_AVG) - -#define GRANULARITY (10 * HZ / 1000 ? : 1) - -#ifdef CONFIG_SMP -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ - num_online_cpus()) -#else -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) -#endif - -#define SCALE(v1,v1_max,v2_max) \ - (v1) * (v2_max) / (v1_max) - -#define DELTA(p) \ - (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) - -#define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) - -#define INTERACTIVE_SLEEP(p) \ - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) +#define FORKED_TS_MAX (US_TO_JIFFIES(MIN_HISTORY) ?: 1) /* - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] - * to time slice values: [800ms ... 100ms ... 5ms] - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. + * SLEEP_FACTOR is a fixed point factor used to scale history tracking things. + * In particular: total_time, sleep_time, sleep_avg. */ +#define SLEEP_FACTOR 1024 -#define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) +/* + * The scheduler classifies a process as performing one of the following + * activities + */ +#define STIME_SLEEP 1 /* Sleeping */ +#define STIME_RUN 2 /* Using CPU */ -static unsigned int task_timeslice(task_t *p) -{ - if (p->static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); - else - return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); -} +#define TASK_PREEMPTS_CURR(p, rq) ( (p)->prio < (rq)->curr->prio ) int iso_cpu = 70; /* The soft %cpu limit on SCHED_ISO tasks */ int iso_period = 5; /* The time over which SCHED_ISO cpu decays */ @@ -188,6 +136,7 @@ typedef struct runqueue runqueue_t; struct prio_array { + int min_prio; unsigned int nr_active; unsigned long bitmap[BITMAP_SIZE]; struct list_head queue[MAX_PRIO]; @@ -209,8 +158,10 @@ */ unsigned long nr_running; #ifdef CONFIG_SMP - unsigned long cpu_load; + unsigned long cpu_load[4]; #endif + unsigned long array_sequence; + unsigned long nr_uninterruptible; unsigned long iso_ticks; struct list_head iso_queue; int iso_refractory; @@ -229,17 +180,16 @@ * one CPU and if it got migrated afterwards it may decrease * it on another CPU. Always updated under the runqueue lock: */ - unsigned long nr_uninterruptible; - - unsigned long expired_timestamp; + unsigned long long timestamp_last_tick; task_t *curr, *idle; struct mm_struct *prev_mm; - prio_array_t *active, *expired, arrays[2]; - int best_expired_prio; atomic_t nr_iowait; + prio_array_t *active, *expired, arrays[2]; #ifdef CONFIG_SMP + unsigned long long timestamp_last_tick; + struct sched_domain *sd; /* For active balancing */ @@ -261,35 +211,13 @@ unsigned long yld_cnt; /* schedule() stats */ - unsigned long sched_noswitch; unsigned long sched_switch; unsigned long sched_cnt; unsigned long sched_goidle; - /* pull_task() stats */ - unsigned long pt_gained[MAX_IDLE_TYPES]; - unsigned long pt_lost[MAX_IDLE_TYPES]; - - /* active_load_balance() stats */ - unsigned long alb_cnt; - unsigned long alb_lost; - unsigned long alb_gained; - unsigned long alb_failed; - /* try_to_wake_up() stats */ unsigned long ttwu_cnt; - unsigned long ttwu_attempts; - unsigned long ttwu_moved; - - /* wake_up_new_task() stats */ - unsigned long wunt_cnt; - unsigned long wunt_moved; - - /* sched_migrate_task() stats */ - unsigned long smt_cnt; - - /* sched_balance_exec() stats */ - unsigned long sbe_cnt; + unsigned long ttwu_local; #endif }; @@ -303,14 +231,71 @@ #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -/* - * Default context-switch locking: - */ #ifndef prepare_arch_switch -# define prepare_arch_switch(rq, next) do { } while (0) -# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) -# define task_running(rq, p) ((rq)->curr == (p)) +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev) do { } while (0) +#endif + +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline int task_running(runqueue_t *rq, task_t *p) +{ + return rq->curr == p; +} + +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +{ +} + +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +{ + spin_unlock_irq(&rq->lock); +} + +#else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline int task_running(runqueue_t *rq, task_t *p) +{ +#ifdef CONFIG_SMP + return p->oncpu; +#else + return rq->curr == p; #endif +} + +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->oncpu = 1; +#endif +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + spin_unlock_irq(&rq->lock); +#else + spin_unlock(&rq->lock); +#endif +} + +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->oncpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->oncpu = 0; +#endif +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_enable(); +#endif +} +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ static inline int task_preempts_curr(task_t *p, runqueue_t *rq) { @@ -359,7 +344,7 @@ * bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 10 +#define SCHEDSTAT_VERSION 11 static int show_schedstat(struct seq_file *seq, void *v) { @@ -377,22 +362,14 @@ /* runqueue-specific stats */ seq_printf(seq, - "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu " - "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", + "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", cpu, rq->yld_both_empty, - rq->yld_act_empty, rq->yld_exp_empty, - rq->yld_cnt, rq->sched_noswitch, + rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, rq->sched_switch, rq->sched_cnt, rq->sched_goidle, - rq->alb_cnt, rq->alb_gained, rq->alb_lost, - rq->alb_failed, - rq->ttwu_cnt, rq->ttwu_moved, rq->ttwu_attempts, - rq->wunt_cnt, rq->wunt_moved, - rq->smt_cnt, rq->sbe_cnt, rq->rq_sched_info.cpu_time, + rq->ttwu_cnt, rq->ttwu_local, + rq->rq_sched_info.cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); - for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++) - seq_printf(seq, " %lu %lu", rq->pt_gained[itype], - rq->pt_lost[itype]); seq_printf(seq, "\n"); #ifdef CONFIG_SMP @@ -403,17 +380,21 @@ cpumask_scnprintf(mask_str, NR_CPUS, sd->span); seq_printf(seq, "domain%d %s", dcnt++, mask_str); for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; - itype++) { - seq_printf(seq, " %lu %lu %lu %lu %lu", + itype++) { + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", sd->lb_cnt[itype], + sd->lb_balanced[itype], sd->lb_failed[itype], sd->lb_imbalance[itype], + sd->lb_gained[itype], + sd->lb_hot_gained[itype], sd->lb_nobusyq[itype], sd->lb_nobusyg[itype]); } - seq_printf(seq, " %lu %lu %lu %lu\n", + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n", + sd->alb_cnt, sd->alb_failed, sd->alb_pushed, sd->sbe_pushed, sd->sbe_attempts, - sd->ttwu_wake_affine, sd->ttwu_wake_balance); + sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); } #endif } @@ -446,27 +427,13 @@ .release = single_release, }; -# define schedstat_inc(rq, field) rq->field++; -# define schedstat_add(rq, field, amt) rq->field += amt; +# define schedstat_inc(rq, field) do { rq->field++; } while (0) +# define schedstat_add(rq, field, amt) do { rq->field += amt; } while (0) #else /* !CONFIG_SCHEDSTATS */ -# define schedstat_inc(rq, field) do { } while (0); -# define schedstat_add(rq, field, amt) do { } while (0); +# define schedstat_inc(rq, field) do { } while (0) +# define schedstat_add(rq, field, amt) do { } while (0) #endif -/* - * rq_lock - lock a given runqueue and disable interrupts. - */ -static runqueue_t *this_rq_lock(void) - __acquires(rq->lock) -{ - runqueue_t *rq; - - local_irq_disable(); - rq = this_rq(); - spin_lock(&rq->lock); - - return rq; -} #ifdef CONFIG_SCHED_SMT static int cpu_and_siblings_are_idle(int cpu) @@ -626,8 +593,18 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array) { + struct list_head *entry = array->queue + p->prio; sched_info_queued(p); - list_add_tail(&p->run_list, array->queue + p->prio); + + if (!rt_task(p)) { + /* + * Cycle tasks on the same priority level. This reduces their + * timeslice fluctuations due to higher priority tasks expiring. + */ + if (!list_empty(entry)) + entry = entry->next; + } + list_add_tail(&p->run_list, entry); __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; @@ -660,44 +637,123 @@ p->array = array; } +static inline unsigned long long clock_us(void) +{ + return sched_clock() >> 10; +} + /* - * effective_prio - return the priority that is based on the static - * priority but is modified by bonuses/penalties. - * - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] - * into the -5 ... 0 ... +5 bonus/penalty range. - * - * We use 25% of the full 0...39 priority range so that: - * - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. + * add_task_time updates a task @p after @time of doing the specified @type + * of activity. See STIME_*. This is used for priority calculation. + */ +static inline void add_task_time(task_t *p, unsigned long long time, unsigned long type) +{ + unsigned long ratio; + unsigned long long tmp; + unsigned long t; + + if (type == STIME_SLEEP) { + if (time > MAX_SLEEP_AFFECT*4) + time = MAX_SLEEP_AFFECT*4; + t = ((unsigned long)time + 3) / 4; + } else { + unsigned long div = 60 - USER_PRIO(p->static_prio); + t = (unsigned long)time * 30; + t = t / div; + t = t * 30; + t = t / div; + } + + ratio = MAX_SLEEP - t; + tmp = (unsigned long long)ratio*p->total_time + MAX_SLEEP/2; + tmp >>= MAX_SLEEP_SHIFT; + p->total_time = (unsigned long)tmp; + + tmp = (unsigned long long)ratio*p->sleep_time + MAX_SLEEP/2; + tmp >>= MAX_SLEEP_SHIFT; + p->sleep_time = (unsigned long)tmp; + + p->total_time += t; + if (type == STIME_SLEEP) + p->sleep_time += t; +} + +static unsigned long task_sleep_avg(task_t *p) +{ + return (SLEEP_FACTOR * p->sleep_time) / (p->total_time + 1); +} + +/* + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. * - * Both properties are important to certain workloads. + * Timeslices are scaled, so if only low priority processes are running, + * they will all get long timeslices. */ -static int effective_prio(task_t *p) +static int task_timeslice(task_t *p, runqueue_t *rq) { + int idx, base, delta; + int timeslice; + + if (rt_task(p)) + return RT_TIMESLICE; + + idx = min(p->prio, rq->expired->min_prio); + delta = p->prio - idx; + base = BASE_TIMESLICE * (MAX_USER_PRIO + 1) / (delta + 2); + base = base * (MAX_USER_PRIO + 1) / (delta + 2); + + base = base * 40 / (70 - USER_PRIO(idx)); + base = base * 40 / (70 - USER_PRIO(idx)); + + timeslice = base >> 10; + timeslice = timeslice * HZ / 1000; + if (timeslice < MIN_TIMESLICE) + timeslice = MIN_TIMESLICE; + + return timeslice; +} + +/* + * task_priority: calculates a task's priority based on previous running + * history (see add_task_time). The priority is just a simple linear function + * based on sleep_avg and static_prio. + */ +static int task_priority(task_t *p) +{ + unsigned long sleep_avg; int bonus, prio; if (rt_task(p)) return p->prio; - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; + sleep_avg = task_sleep_avg(p); + + prio = USER_PRIO(p->static_prio) + 10; + bonus = (((MAX_USER_PRIO + 1) / 3) * sleep_avg + (SLEEP_FACTOR / 2)) + / SLEEP_FACTOR; + prio = MAX_RT_PRIO + prio - bonus; - prio = p->static_prio - bonus; if (prio < MAX_RT_PRIO) - prio = MAX_RT_PRIO; + return MAX_RT_PRIO; if (prio > MAX_PRIO-1) - prio = MAX_PRIO-1; + return MAX_PRIO-1; + return prio; } /* * __activate_task - move a task to the runqueue. */ -static inline void __activate_task(task_t *p, runqueue_t *rq) +static inline void __activate_task(task_t *p, runqueue_t *rq, prio_array_t *array) { - enqueue_task(p, rq->active); + enqueue_task(p, array); rq->nr_running++; + if (!rt_task(p)) { + if (p->prio < array->min_prio) + array->min_prio = p->prio; + } } /* @@ -709,67 +765,6 @@ rq->nr_running++; } -static void recalc_task_prio(task_t *p, unsigned long long now) -{ - unsigned long long __sleep_time = now - p->timestamp; - unsigned long sleep_time; - - if (__sleep_time > NS_MAX_SLEEP_AVG) - sleep_time = NS_MAX_SLEEP_AVG; - else - sleep_time = (unsigned long)__sleep_time; - - if (likely(sleep_time > 0)) { - /* - * User tasks that sleep a long time are categorised as - * idle and will get just interactive status to stay active & - * prevent them suddenly becoming cpu hogs and starving - * other processes. - */ - if (p->mm && p->activated != -1 && - sleep_time > INTERACTIVE_SLEEP(p)) { - p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - - DEF_TIMESLICE); - } else { - /* - * The lower the sleep avg a task has the more - * rapidly it will rise with sleep time. - */ - sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; - - /* - * Tasks waking from uninterruptible sleep are - * limited in their sleep_avg rise as they - * are likely to be waiting on I/O - */ - if (p->activated == -1 && p->mm) { - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) - sleep_time = 0; - else if (p->sleep_avg + sleep_time >= - INTERACTIVE_SLEEP(p)) { - p->sleep_avg = INTERACTIVE_SLEEP(p); - sleep_time = 0; - } - } - - /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->timestamp. The more time a - * task spends sleeping, the higher the average gets - - * and the higher the priority boost gets as well. - */ - p->sleep_avg += sleep_time; - - if (p->sleep_avg > NS_MAX_SLEEP_AVG) - p->sleep_avg = NS_MAX_SLEEP_AVG; - } - } - - p->prio = effective_prio(p); -} - /* * activate_task - move a task to the runqueue and do priority recalculation * @@ -778,9 +773,10 @@ */ static void activate_task(task_t *p, runqueue_t *rq, int local) { - unsigned long long now; + unsigned long long now, sleep; + prio_array_t *array; - now = sched_clock(); + now = clock_us(); #ifdef CONFIG_SMP if (!local) { /* Compensate for drifting sched_clock */ @@ -790,40 +786,34 @@ } #endif - recalc_task_prio(p, now); /* - * This checks to make sure it's not an uninterruptible task - * that is now waking up. + * If we have slept through an active/expired array switch, restart + * our timeslice too. */ - if (!p->activated) { - /* - * Tasks which were woken up by interrupts (ie. hw events) - * are most likely of interactive nature. So we give them - * the credit of extending their sleep time to the period - * of time they spend on the runqueue, waiting for execution - * on a CPU, first time around: - */ - if (in_interrupt()) - p->activated = 2; - else { - /* - * Normal first-time wakeups get a credit too for - * on-runqueue time, but it will be weighted down: - */ - p->activated = 1; - } - } + + sleep = now - p->timestamp; p->timestamp = now; + add_task_time(p, sleep, STIME_SLEEP); + p->prio = task_priority(p); + + array = rq->active; + if (unlikely(p->used_slice == -1)) { + /* This only applys to newly woken children */ + array = rq->expired; + p->used_slice = 0; + } else if (rq->array_sequence != p->array_sequence) + p->used_slice = 0; - __activate_task(p, rq); + __activate_task(p, rq, array); } /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct task_struct *p, runqueue_t *rq) +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) { + p->array_sequence = rq->array_sequence; rq->nr_running--; dequeue_task(p, p->array); p->array = NULL; @@ -966,23 +956,23 @@ * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static inline unsigned long source_load(int cpu) +static inline unsigned long source_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; - return min(rq->cpu_load, load_now); + return min(rq->cpu_load[type], load_now); } /* * Return a high guess at the load of a migration-target cpu */ -static inline unsigned long target_load(int cpu) +static inline unsigned long target_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; - return max(rq->cpu_load, load_now); + return max(rq->cpu_load[type], load_now); } #endif @@ -1047,12 +1037,11 @@ runqueue_t *rq; #ifdef CONFIG_SMP unsigned long load, this_load; - struct sched_domain *sd; + struct sched_domain *sd, *this_sd = NULL; int new_cpu; #endif rq = task_rq_lock(p, &flags); - schedstat_inc(rq, ttwu_cnt); old_state = p->state; if (!(old_state & state)) goto out; @@ -1067,68 +1056,75 @@ if (unlikely(task_running(rq, p))) goto out_activate; - new_cpu = cpu; - + schedstat_inc(rq, ttwu_cnt); + if (cpu == this_cpu) { + schedstat_inc(rq, ttwu_local); + } else { + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + schedstat_inc(sd, ttwu_wake_remote); + this_sd = sd; + break; + } + } + } if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) goto out_set_cpu; - load = source_load(cpu); - this_load = target_load(this_cpu); - /* - * If sync wakeup then subtract the (maximum possible) effect of - * the currently running task from the load of the current CPU: + * Check for affine wakeup and passive balancing possibilities. */ - if (sync) - this_load -= SCHED_LOAD_SCALE; + if (this_sd) { + unsigned int imbalance; - /* Don't pull the task off an idle CPU to a busy one */ - if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) - goto out_set_cpu; + load = source_load(cpu, this_sd->wake_balance_load_idx); + this_load = target_load(this_cpu, this_sd->wake_balance_load_idx); - new_cpu = this_cpu; /* Wake to this CPU if we can */ + /* Don't pull the task off an idle CPU to a busy one */ + if (load < SCHED_LOAD_SCALE + && load + this_load > SCHED_LOAD_SCALE + && this_load > load) + goto out_set_cpu; + + new_cpu = this_cpu; /* Wake to this CPU if we can */ - /* - * Scan domains for affine wakeup and passive balancing - * possibilities. - */ - for_each_domain(this_cpu, sd) { - unsigned int imbalance; /* * Start passive balancing when half the imbalance_pct * limit is reached. */ - imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; + imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; - if ((sd->flags & SD_WAKE_AFFINE) && - !task_hot(p, rq->timestamp_last_tick, sd)) { - /* - * This domain has SD_WAKE_AFFINE and p is cache cold - * in this domain. - */ - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_wake_affine); + if ((this_sd->flags & SD_WAKE_AFFINE) && + !task_hot(p, rq->timestamp_last_tick, this_sd)) { + unsigned long load, this_load; + load = target_load(cpu, this_sd->wake_balance_load_idx); + this_load = source_load(this_cpu, this_sd->wake_balance_load_idx); + + if (imbalance*load > 100*this_load) { + /* + * This domain has SD_WAKE_AFFINE and + * p is cache cold in this domain, and + * there is no bad imbalance. + */ + schedstat_inc(this_sd, ttwu_move_affine); goto out_set_cpu; } - } else if ((sd->flags & SD_WAKE_BALANCE) && - imbalance*this_load <= 100*load) { + } + if ((this_sd->flags & SD_WAKE_BALANCE) && + imbalance*this_load <= 100*load) { /* * This domain has SD_WAKE_BALANCE and there is * an imbalance. */ - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_wake_balance); - goto out_set_cpu; - } + schedstat_inc(this_sd, ttwu_move_balance); + goto out_set_cpu; } } new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ out_set_cpu: - schedstat_inc(rq, ttwu_attempts); new_cpu = wake_idle(new_cpu, p); if (new_cpu != cpu) { - schedstat_inc(rq, ttwu_moved); set_task_cpu(p, new_cpu); task_rq_unlock(rq, &flags); /* might preempt at this point */ @@ -1145,23 +1141,8 @@ out_activate: #endif /* CONFIG_SMP */ - if (old_state == TASK_UNINTERRUPTIBLE) { + if (old_state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - /* - * Tasks on involuntary sleep don't earn - * sleep_avg beyond just interactive state. - */ - p->activated = -1; - } - - /* - * Sync wakeups (i.e. those types of wakeups where the waker - * has indicated that it will leave the CPU in short order) - * don't trigger a preemption, if the woken up task will run on - * this cpu. (in this case the 'I will reschedule' promise of - * the waker guarantees that the freshly woken up task is going - * to be considered on this CPU.) - */ activate_task(p, rq, cpu == this_cpu); if (!sync || cpu != this_cpu) { if (task_preempts_curr(p, rq)) @@ -1201,6 +1182,9 @@ */ void fastcall sched_fork(task_t *p) { + unsigned long sleep_avg; + runqueue_t *rq; + /* * We mark the process as running here, but have not actually * inserted it onto the runqueue yet. This guarantees that @@ -1211,46 +1195,57 @@ INIT_LIST_HEAD(&p->run_list); INIT_LIST_HEAD(&p->iso_list); p->array = NULL; - spin_lock_init(&p->switch_lock); #ifdef CONFIG_SCHEDSTATS memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + p->oncpu = 0; +#endif #ifdef CONFIG_PREEMPT - /* - * During context-switch we hold precisely one spinlock, which - * schedule_tail drops. (in the common case it's this_rq()->lock, - * but it also can be p->switch_lock.) So we compensate with a count - * of 1. Also, we want to start with kernel preemption disabled. - */ + /* Want to start with kernel preemption disabled. */ p->thread_info->preempt_count = 1; #endif - /* - * Share the timeslice between parent and child, thus the - * total amount of pending timeslices in the system doesn't change, - * resulting in more scheduling fairness. - */ + p->timestamp = clock_us(); + p->used_slice = 0; + if (rt_task(p)) { + BUG_ON(!rt_task(current)); + return; + } + + preempt_disable(); + rq = this_rq(); + + /* XXX */ + if (unlikely(p->comm[0] == 'X' && p->comm[1] == 'F')) { + static int warned = 0; + if (!warned) { + printk(KERN_INFO "Renicing %s for you\n", p->comm); + warned = 1; + } + p->static_prio = NICE_TO_PRIO(-10); + } + + /* Get MIN_HISTORY of history with the same sleep_avg as parent. */ + sleep_avg = task_sleep_avg(current); + p->total_time = MIN_HISTORY; + p->sleep_time = p->total_time * sleep_avg / SLEEP_FACTOR; + + /* Parent loses 1/4 of sleep time for forking */ + current->sleep_time = 3*current->sleep_time/4; + local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; - /* - * The remainder of the first timeslice might be recovered by - * the parent if the child exits early enough. - */ - p->first_time_slice = 1; - current->time_slice >>= 1; - p->timestamp = sched_clock(); - if (unlikely(!current->time_slice)) { - /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. Taking the - * runqueue lock is not a problem. - */ - current->time_slice = 1; - preempt_disable(); - scheduler_tick(); - local_irq_enable(); - preempt_enable(); - } else - local_irq_enable(); + if (unlikely(current->used_slice == -1 || current == rq->idle)) + p->used_slice = -1; + else { + int ts = task_timeslice(current, rq); + current->used_slice += (ts + 3) / 4; + if (current->used_slice >= ts) { + current->used_slice = -1; + set_need_resched(); + } + } + local_irq_enable(); + preempt_enable(); } /* @@ -1264,36 +1259,38 @@ { unsigned long flags; int this_cpu, cpu; - runqueue_t *rq, *this_rq; - - rq = task_rq_lock(p, &flags); - cpu = task_cpu(p); - this_cpu = smp_processor_id(); + runqueue_t *rq; + prio_array_t *array; BUG_ON(p->state != TASK_RUNNING); - schedstat_inc(rq, wunt_cnt); - /* - * We decrease the sleep average of forking parents - * and children as well, to keep max-interactive tasks - * from forking tasks that are max-interactive. The parent - * (current) is done further down, under its lock. - */ - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + p->prio = task_priority(p); - p->prio = effective_prio(p); + rq = task_rq_lock(p, &flags); + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + + array = rq->active; + if (!rt_task(p)) { + if (unlikely(p->used_slice == -1)) { + p->used_slice = 0; + array = rq->expired; + } else { + int total = task_timeslice(p, rq); + int ts = max((total + 3) / 4, MIN_TIMESLICE); + ts = min(ts, (int)FORKED_TS_MAX); + p->used_slice = total - ts; + } + } if (likely(cpu == this_cpu)) { - if (!(clone_flags & CLONE_VM)) { + if (!(clone_flags & CLONE_VM) && likely(array == rq->active)) { /* * The VM isn't cloned, so we're in a good position to * do child-runs-first in anticipation of an exec. This * usually avoids a lot of COW overhead. */ - if (unlikely(!current->array)) - __activate_task(p, rq); - else { + if (p->prio >= current->prio) { p->prio = current->prio; list_add_tail(&p->run_list, ¤t->run_list); p->array = current->array; @@ -1301,20 +1298,16 @@ rq->nr_running++; if (iso_task(p)) enqueue_iso_task(p); - } + } else + __activate_task(p, rq, array); set_need_resched(); - } else + } else { /* Run child last */ - __activate_task(p, rq); - /* - * We skip the following code due to cpu == this_cpu - * - * task_rq_unlock(rq, &flags); - * this_rq = task_rq_lock(current, &flags); - */ - this_rq = rq; - } else { - this_rq = cpu_rq(this_cpu); + __activate_task(p, rq, array); + } +#ifdef CONFIG_SMP + } else { + runqueue_t *this_rq = this_rq(); /* * Not the local CPU - must adjust timestamp. This should @@ -1322,52 +1315,16 @@ */ p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) + rq->timestamp_last_tick; - __activate_task(p, rq); + __activate_task(p, rq, array); if (task_preempts_curr(p, rq)) resched_task(rq->curr); - - schedstat_inc(rq, wunt_moved); - /* - * Parent and child are on different CPUs, now get the - * parent runqueue to update the parent's ->sleep_avg: - */ - task_rq_unlock(rq, &flags); - this_rq = task_rq_lock(current, &flags); - } - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - task_rq_unlock(this_rq, &flags); +#endif + } + task_rq_unlock(rq, &flags); } -/* - * Potentially available exiting-child timeslices are - * retrieved here - this way the parent does not get - * penalized for creating too many threads. - * - * (this cannot be used to 'generate' timeslices - * artificially, because any timeslice recovered here - * was given away by the parent in the first place.) - */ void fastcall sched_exit(task_t * p) { - unsigned long flags; - runqueue_t *rq; - - /* - * If the child was a (relative-) CPU hog then decrease - * the sleep_avg of the parent as well. - */ - rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > task_timeslice(p))) - p->parent->time_slice = task_timeslice(p); - } - if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / - (EXIT_WEIGHT + 1); - task_rq_unlock(rq, &flags); } /** @@ -1404,7 +1361,8 @@ * Manfred Spraul */ prev_task_flags = prev->flags; - finish_arch_switch(rq, prev); + finish_arch_switch(prev); + finish_lock_switch(rq, prev); if (mm) mmdrop(mm); if (unlikely(prev_task_flags & PF_DEAD)) @@ -1419,7 +1377,10 @@ __releases(rq->lock) { finish_task_switch(prev); - +#ifdef __ARCH_WANT_UNLOCKED_CTXSW + /* In this case, finish_task_switch does not reenable preemption */ + preempt_enable(); +#endif if (current->set_child_tid) put_user(current->pid, current->set_child_tid); } @@ -1584,7 +1545,7 @@ cpus_and(mask, sd->span, p->cpus_allowed); for_each_cpu_mask(i, mask) { - load = target_load(i); + load = target_load(i, sd->wake_balance_load_idx); if (load < min_load) { min_cpu = i; @@ -1597,7 +1558,7 @@ } /* add +1 to account for the new task */ - this_load = source_load(this_cpu) + SCHED_LOAD_SCALE; + this_load = source_load(this_cpu, sd->wake_balance_load_idx) + SCHED_LOAD_SCALE; /* * Would with the addition of the new task to the @@ -1630,7 +1591,6 @@ || unlikely(cpu_is_offline(dest_cpu))) goto out; - schedstat_inc(rq, smt_cnt); /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ @@ -1658,7 +1618,6 @@ struct sched_domain *tmp, *sd = NULL; int new_cpu, this_cpu = get_cpu(); - schedstat_inc(this_rq(), sbe_cnt); /* Prefer the current CPU if there's only this task running */ if (this_rq()->nr_running <= 1) goto out; @@ -1694,6 +1653,10 @@ set_task_cpu(p, this_cpu); this_rq->nr_running++; enqueue_task(p, this_array); + if (!rt_task(p)) { + if (p->prio < this_array->min_prio) + this_array->min_prio = p->prio; + } p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; /* @@ -1745,8 +1708,8 @@ * Called with both runqueues locked. */ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) { prio_array_t *array, *dst_array; struct list_head *head, *curr; @@ -1801,13 +1764,10 @@ goto skip_bitmap; } - /* - * Right now, this is the only place pull_task() is called, - * so we can safely collect pull_task() stats here rather than - * inside pull_task(). - */ - schedstat_inc(this_rq, pt_gained[idle]); - schedstat_inc(busiest, pt_lost[idle]); +#ifdef CONFIG_SCHEDSTATS + if (task_hot(tmp, busiest->timestamp_last_tick, sd)) + schedstat_inc(sd, lb_hot_gained[idle]); +#endif pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); pulled++; @@ -1820,6 +1780,13 @@ goto skip_bitmap; } out: + /* + * Right now, this is the only place pull_task() is called, + * so we can safely collect pull_task() stats here rather than + * inside pull_task(). + */ + schedstat_add(sd, lb_gained[idle], pulled); + return pulled; } @@ -1834,8 +1801,11 @@ { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; + int load_idx = sd->idle_load_idx; max_load = this_load = total_load = total_pwr = 0; + if (idle == NOT_IDLE) + load_idx = sd->load_idx; do { unsigned long load; @@ -1850,9 +1820,9 @@ for_each_cpu_mask(i, group->cpumask) { /* Bias balancing toward cpus of our domain */ if (local_group) - load = target_load(i); + load = target_load(i, load_idx); else - load = source_load(i); + load = source_load(i, load_idx); nr_cpus++; avg_load += load; @@ -1905,7 +1875,7 @@ *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) / SCHED_LOAD_SCALE; - if (*imbalance < SCHED_LOAD_SCALE - 1) { + if (*imbalance < SCHED_LOAD_SCALE) { unsigned long pwr_now = 0, pwr_move = 0; unsigned long tmp; @@ -1946,13 +1916,12 @@ } /* Get rid of the scaling factor, rounding down as we divide */ - *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE; + *imbalance = *imbalance / SCHED_LOAD_SCALE; return busiest; out_balanced: - if (busiest && (idle == NEWLY_IDLE || - (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) { + if (busiest && idle != NOT_IDLE && max_load > SCHED_LOAD_SCALE) { *imbalance = 1; return busiest; } @@ -1971,7 +1940,7 @@ int i; for_each_cpu_mask(i, group->cpumask) { - load = source_load(i); + load = source_load(i, 0); if (load > max_load) { max_load = load; @@ -1996,7 +1965,6 @@ unsigned long imbalance; int nr_moved; - spin_lock(&this_rq->lock); schedstat_inc(sd, lb_cnt[idle]); group = find_busiest_group(sd, this_cpu, &imbalance, idle); @@ -2031,12 +1999,11 @@ * still unbalanced. nr_moved simply stays zero, so it is * correctly treated as an imbalance. */ - double_lock_balance(this_rq, busiest); + double_rq_lock(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle); - spin_unlock(&busiest->lock); + double_rq_unlock(this_rq, busiest); } - spin_unlock(&this_rq->lock); if (!nr_moved) { schedstat_inc(sd, lb_failed[idle]); @@ -2078,7 +2045,7 @@ return nr_moved; out_balanced: - spin_unlock(&this_rq->lock); + schedstat_inc(sd, lb_balanced[idle]); /* tune up the balancing interval */ if (sd->balance_interval < sd->max_interval) @@ -2121,8 +2088,11 @@ schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, NEWLY_IDLE); - if (!nr_moved) + if (!nr_moved) { schedstat_inc(sd, lb_failed[NEWLY_IDLE]); + } else { + schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + } spin_unlock(&busiest->lock); @@ -2164,7 +2134,7 @@ cpumask_t visited_cpus; int cpu; - schedstat_inc(busiest_rq, alb_cnt); + /* * Search for suitable CPUs to push tasks to in successively higher * domains with SD_LOAD_BALANCE set. @@ -2174,13 +2144,14 @@ if (!(sd->flags & SD_LOAD_BALANCE)) /* no more domains to search */ break; + schedstat_inc(sd, alb_cnt); + + if (busiest->nr_running <= 1) + return; cpu_group = sd->groups; do { for_each_cpu_mask(cpu, cpu_group->cpumask) { - if (busiest_rq->nr_running <= 1) - /* no more tasks left to move */ - return; if (cpu_isset(cpu, visited_cpus)) continue; cpu_set(cpu, visited_cpus); @@ -2199,10 +2170,9 @@ double_lock_balance(busiest_rq, target_rq); if (move_tasks(target_rq, cpu, busiest_rq, 1, sd, SCHED_IDLE)) { - schedstat_inc(busiest_rq, alb_lost); - schedstat_inc(target_rq, alb_gained); + schedstat_inc(sd, alb_pushed); } else { - schedstat_inc(busiest_rq, alb_failed); + schedstat_inc(sd, alb_failed); } spin_unlock(&target_rq->lock); } @@ -2229,18 +2199,23 @@ unsigned long old_load, this_load; unsigned long j = jiffies + CPU_OFFSET(this_cpu); struct sched_domain *sd; + int i; - /* Update our load */ - old_load = this_rq->cpu_load; this_load = this_rq->nr_running * SCHED_LOAD_SCALE; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (this_load > old_load) - old_load++; - this_rq->cpu_load = (old_load + this_load) / 2; + /* Update our load */ + for (i = 0; i < 4; i++) { + unsigned long new_load = this_load; + int scale = 1<cpu_load[i]; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale-1; + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; + } for_each_domain(this_cpu, sd) { unsigned long interval; @@ -2278,24 +2253,6 @@ } #endif -static inline int wake_priority_sleeper(runqueue_t *rq) -{ - int ret = 0; -#ifdef CONFIG_SCHED_SMT - spin_lock(&rq->lock); - /* - * If an SMT sibling task has been put to sleep for priority - * reasons reschedule the idle task to see if it can now run. - */ - if (rq->nr_running) { - resched_task(rq->idle); - ret = 1; - } - spin_unlock(&rq->lock); -#endif - return ret; -} - DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); @@ -2327,22 +2284,6 @@ } /* - * We place interactive tasks back into the active array, if possible. - * - * To guarantee that this does not starve expired tasks we ignore the - * interactivity of a task if the first expired task had to wait more - * than a 'reasonable' amount of time. This deadline timeout is - * load-dependent, as the frequency of array switched decreases with - * increasing number of running tasks. We also ignore the interactivity - * if a better static_prio task has expired: - */ -#define EXPIRED_STARVING(rq) \ - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ - ((rq)->curr->static_prio > (rq)->best_expired_prio)) - -/* * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() @@ -2445,14 +2386,19 @@ */ void scheduler_tick(void) { + enum idle_type cpu_status; int cpu = smp_processor_id(); runqueue_t *rq = this_rq(); task_t *p = current; unsigned long long now = sched_clock(); + int ts; update_cpu_clock(p, rq, now); - rq->timestamp_last_tick = now; +#ifdef CONFIG_SMP + rq->timestamp_last_tick = clock_us(); +#endif + /* * The iso_ticks accounting is incremented only when a SCHED_ISO task @@ -2466,117 +2412,32 @@ dec_iso_ticks(rq, p); if (p == rq->idle) { - if (wake_priority_sleeper(rq)) - goto out; - rebalance_tick(cpu, rq, SCHED_IDLE); - return; + cpu_status = SCHED_IDLE; + goto out; } - + cpu_status = NOT_IDLE; /* Task might have expired already, but not scheduled off yet */ - if (p->array != rq->active) { - set_tsk_need_resched(p); + if (unlikely(p->used_slice == -1)) goto out; - } - spin_lock(&rq->lock); - /* - * The task was running during this tick - update the - * time slice counter. Note: we do not update a thread's - * priority until it either goes to sleep or uses up its - * timeslice. This makes it possible for interactive tasks - * to use up their timeslices at their highest priority levels. - */ - if (rt_task(p)) { - /* - * RR tasks need a special form of timeslice management. - * FIFO tasks have no timeslices. - */ - if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - set_tsk_need_resched(p); + if (unlikely(p->policy == SCHED_FIFO)) + goto out; - /* put it at the end of the queue: */ - requeue_task(p, rq->active); - } - goto out_unlock; + /* p was running during this tick. Update its time slice counter. */ + p->used_slice++; + ts = task_timeslice(p, rq); + if (unlikely(p->used_slice >= ts)) { + p->used_slice = -1; + set_tsk_need_resched(p); } +out: + rebalance_tick(cpu, rq, cpu_status); + +} + - if (iso_task(p)) { - if (rq->iso_refractory) { - /* - * If we are in the refractory period for SCHED_ISO - * tasks we schedule them as SCHED_NORMAL. If their - * priority is set to the ISO priority, change it. - */ - if (p->prio == MAX_RT_PRIO) { - dequeue_task(p, rq->active); - p->prio = effective_prio(p); - enqueue_task(p, rq->active); - } - goto sched_normal; - } - if (p->prio > MAX_RT_PRIO) { - dequeue_task(p, rq->active); - p->prio = MAX_RT_PRIO; - enqueue_task(p, rq->active); - } - if (!(--p->time_slice % GRANULARITY)) { - set_tsk_need_resched(p); - requeue_iso_task(p); - } - if (!p->time_slice) - p->time_slice = task_timeslice(p); - goto out_unlock; - } -sched_normal: - if (!--p->time_slice) { - dequeue_task(p, rq->active); - set_tsk_need_resched(p); - p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { - enqueue_task(p, rq->expired); - if (p->static_prio < rq->best_expired_prio) - rq->best_expired_prio = p->static_prio; - } else - enqueue_task(p, rq->active); - } else { - /* - * Prevent a too long timeslice allowing a task to monopolize - * the CPU. We do this by splitting up the timeslice into - * smaller pieces. - * - * Note: this does not mean the task's timeslices expire or - * get lost in any way, they just might be preempted by - * another task of equal priority. (one with higher - * priority would have preempted this task already.) We - * requeue this task to the end of the list on this priority - * level, which is in essence a round-robin of tasks with - * equal priority. - * - * This only applies to tasks in the interactive - * delta range with at least TIMESLICE_GRANULARITY to requeue. - */ - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - - p->time_slice) % TIMESLICE_GRANULARITY(p)) && - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { - requeue_task(p, rq->active); - set_tsk_need_resched(p); - } - } -out_unlock: - spin_unlock(&rq->lock); -out: - rebalance_tick(cpu, rq, NOT_IDLE); -} static inline int iso_ready(runqueue_t *rq) { @@ -2603,136 +2464,6 @@ out: return p; } - -#ifdef CONFIG_SCHED_SMT -static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) -{ - struct sched_domain *sd = this_rq->sd; - cpumask_t sibling_map; - int i; - - if (!(sd->flags & SD_SHARE_CPUPOWER)) - return; - - /* - * Unlock the current runqueue because we have to lock in - * CPU order to avoid deadlocks. Caller knows that we might - * unlock. We keep IRQs disabled. - */ - spin_unlock(&this_rq->lock); - - sibling_map = sd->span; - - for_each_cpu_mask(i, sibling_map) - spin_lock(&cpu_rq(i)->lock); - /* - * We clear this CPU from the mask. This both simplifies the - * inner loop and keps this_rq locked when we exit: - */ - cpu_clear(this_cpu, sibling_map); - - for_each_cpu_mask(i, sibling_map) { - runqueue_t *smt_rq = cpu_rq(i); - - /* - * If an SMT sibling task is sleeping due to priority - * reasons wake it up now. - */ - if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) - resched_task(smt_rq->idle); - } - - for_each_cpu_mask(i, sibling_map) - spin_unlock(&cpu_rq(i)->lock); - /* - * We exit with this_cpu's rq still held and IRQs - * still disabled: - */ -} - -static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) -{ - struct sched_domain *sd = this_rq->sd; - cpumask_t sibling_map; - prio_array_t *array; - int ret = 0, i, idx; - task_t *p; - - if (!(sd->flags & SD_SHARE_CPUPOWER)) - return 0; - - /* - * The same locking rules and details apply as for - * wake_sleeping_dependent(): - */ - spin_unlock(&this_rq->lock); - sibling_map = sd->span; - for_each_cpu_mask(i, sibling_map) - spin_lock(&cpu_rq(i)->lock); - cpu_clear(this_cpu, sibling_map); - - /* - * Establish next task to be run - it might have gone away because - * we released the runqueue lock above: - */ - if (!this_rq->nr_running) - goto out_unlock; - array = this_rq->active; - if (!array->nr_active) - array = this_rq->expired; - BUG_ON(!array->nr_active); - - idx = sched_find_first_bit(array->bitmap); - if (unlikely(iso_ready(this_rq) && idx >= MAX_RT_PRIO)) - p = queue_iso(this_rq, array); - else - p = list_entry(array->queue[idx].next, task_t, run_list); - - for_each_cpu_mask(i, sibling_map) { - runqueue_t *smt_rq = cpu_rq(i); - task_t *smt_curr = smt_rq->curr; - - /* - * If a user task with lower static priority than the - * running task on the SMT sibling is trying to schedule, - * delay it till there is proportionately less timeslice - * left of the sibling task to prevent a lower priority - * task from using an unfair proportion of the - * physical cpu's resources. -ck - */ - if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(p) || rt_task(smt_curr)) && - p->mm && smt_curr->mm && !rt_task(p) && !iso_task(p)) - ret = 1; - - /* - * Reschedule a lower priority task on the SMT sibling, - * or wake it up if it has been put to sleep for priority - * reasons. - */ - if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(smt_curr) || rt_task(p) || - iso_task(p)) && smt_curr->mm && p->mm && - !rt_task(smt_curr)) || (smt_curr == smt_rq->idle && - smt_rq->nr_running)) - resched_task(smt_curr); - } -out_unlock: - for_each_cpu_mask(i, sibling_map) - spin_unlock(&cpu_rq(i)->lock); - return ret; -} -#else -static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) -{ -} - -static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) -{ - return 0; -} -#endif - #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) void fastcall add_preempt_count(int val) @@ -2784,13 +2515,11 @@ * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (likely(!current->exit_state)) { - if (unlikely(in_atomic())) { - printk(KERN_ERR "scheduling while atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), current->pid); - dump_stack(); - } + if (unlikely(in_atomic()) && + likely(!(current->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)))) { + printk(KERN_ERR "scheduling while atomic: %s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + dump_stack(); } profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -2810,18 +2539,16 @@ dump_stack(); } - schedstat_inc(rq, sched_cnt); - now = sched_clock(); - if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) - run_time = now - prev->timestamp; - else - run_time = NS_MAX_SLEEP_AVG; + + + - /* - * Tasks charged proportionately less run_time at high sleep_avg to - * delay them losing their interactive status - */ - run_time /= (CURRENT_BONUS(prev) ? : 1); + + schedstat_inc(rq, sched_cnt); + now = clock_us(); + run_time = now - prev->timestamp; + prev->timestamp = prev->last_ran = now; + add_task_time(prev, run_time, STIME_RUN); spin_lock_irq(&rq->lock); @@ -2835,40 +2562,40 @@ unlikely(signal_pending(prev)))) prev->state = TASK_RUNNING; else { + deactivate_task(prev, rq); if (prev->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; - deactivate_task(prev, rq); - } + goto no_check_expired; + } + } - cpu = smp_processor_id(); + if (unlikely(prev->used_slice == -1)) { + if (rt_task(prev)) { + /* SCHED_FIFO can come in here too, from sched_yield */ + dequeue_task(prev, prev->array); + enqueue_task(prev, rq->active); + } else { + dequeue_task(prev, prev->array); + prev->prio = task_priority(prev); + enqueue_task(prev, rq->expired); + if (prev->prio < rq->expired->min_prio) + rq->expired->min_prio = prev->prio; + } + prev->used_slice = 0; + } +no_check_expired: + + cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) { -go_idle: + rq->array_sequence++; idle_balance(cpu, rq); if (!rq->nr_running) { next = rq->idle; - rq->expired_timestamp = 0; - wake_sleeping_dependent(cpu, rq); - /* - * wake_sleeping_dependent() might have released - * the runqueue, so break out if we got new - * tasks meanwhile: - */ - if (!rq->nr_running) - goto switch_tasks; + rq->arrays[0].min_prio = MAX_PRIO; + rq->arrays[1].min_prio = MAX_PRIO; + goto switch_tasks; } - } else { - if (dependent_sleeper(cpu, rq)) { - next = rq->idle; - goto switch_tasks; - } - /* - * dependent_sleeper() releases and reacquires the runqueue - * lock, hence go into the idle loop if the rq went - * empty meanwhile: - */ - if (unlikely(!rq->nr_running)) - goto go_idle; } array = rq->active; @@ -2877,35 +2604,22 @@ * Switch the active and expired arrays. */ schedstat_inc(rq, sched_switch); + rq->array_sequence++; rq->active = rq->expired; rq->expired = array; + rq->expired->min_prio = MAX_PRIO; array = rq->active; - rq->expired_timestamp = 0; - rq->best_expired_prio = MAX_PRIO; - } else - schedstat_inc(rq, sched_noswitch); + + } - idx = sched_find_first_bit(array->bitmap); - if (unlikely(iso_ready(rq) && idx >= MAX_RT_PRIO)) + idx = sched_find_first_bit(array->bitmap); +if (unlikely(iso_ready(rq) && idx >= MAX_RT_PRIO)) next = queue_iso(rq, array); else { queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); } - if (!rt_task(next) && !(iso_task(next) && !rq->iso_refractory) && - next->activated > 0) { - unsigned long long delta = now - next->timestamp; - - if (next->activated == 1) - delta = delta * - (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - array = next->array; - dequeue_task(next, array); - recalc_task_prio(next, next->timestamp + delta); - enqueue_task(next, array); - } - next->activated = 0; switch_tasks: if (next == rq->idle) schedstat_inc(rq, sched_goidle); @@ -2915,11 +2629,6 @@ update_cpu_clock(prev, rq, now); - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) - prev->sleep_avg = 0; - prev->timestamp = prev->last_ran = now; - sched_info_switch(prev, next); if (likely(prev != next)) { next->timestamp = now; @@ -2927,7 +2636,8 @@ rq->curr = next; ++*switch_count; - prepare_arch_switch(rq, next); + prepare_lock_switch(rq, next); + prepare_arch_switch(next); ltt_ev_schedchange(prev, next); prev = context_switch(rq, prev, next); barrier(); @@ -3570,7 +3280,7 @@ oldprio = p->prio; __setscheduler(p, policy, param->sched_priority); if (array) { - __activate_task(p, rq); + __activate_task(p, rq,array); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -3839,33 +3549,24 @@ */ asmlinkage long sys_sched_yield(void) { - runqueue_t *rq = this_rq_lock(); - prio_array_t *array = current->array; - prio_array_t *target = rq->expired; +#ifdef CONFIG_SCHEDSTATS + runqueue_t *rq; +#endif - schedstat_inc(rq, yld_cnt); - /* - * We implement yielding by moving the task into the expired - * queue. - * - * (special rule: RT tasks will just roundrobin in the active - * array.) - */ - if (rt_task(current)) - target = rq->active; + local_irq_disable(); +#ifdef CONFIG_SCHEDSTATS + rq = this_rq(); + schedstat_inc(rq, yld_cnt); + spin_lock(&rq->lock); if (current->array->nr_active == 1) { schedstat_inc(rq, yld_act_empty); if (!rq->expired->nr_active) schedstat_inc(rq, yld_both_empty); } else if (!rq->expired->nr_active) schedstat_inc(rq, yld_exp_empty); - - if (array != target) { - dequeue_task(current, array); - enqueue_task(current, target); - } else - /* + + /* * requeue_task is cheaper so perform that if possible. */ requeue_task(current, array); @@ -3877,6 +3578,9 @@ __release(rq->lock); _raw_spin_unlock(&rq->lock); preempt_enable_no_resched(); +#endif + current->used_slice = -1; + local_irq_enable(); schedule(); @@ -4053,6 +3757,8 @@ int retval = -EINVAL; struct timespec t; task_t *p; + unsigned long flags; + runqueue_t *rq; if (pid < 0) goto out_nounlock; @@ -4067,8 +3773,9 @@ if (retval) goto out_unlock; - jiffies_to_timespec(p->policy & SCHED_FIFO ? - 0 : task_timeslice(p), &t); + rq = task_rq_lock(p, &flags); + jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : task_timeslice(p, rq), &t); + task_rq_unlock(rq, &flags); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; out_nounlock: @@ -4181,14 +3888,17 @@ runqueue_t *rq = cpu_rq(cpu); unsigned long flags; - idle->sleep_avg = 0; idle->array = NULL; idle->prio = MAX_PRIO; idle->state = TASK_RUNNING; + idle->used_slice = 0; set_task_cpu(idle, cpu); spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + idle->oncpu = 1; +#endif set_tsk_need_resched(idle); spin_unlock_irqrestore(&rq->lock, flags); @@ -5091,12 +4801,12 @@ spin_lock_init(&rq->lock); rq->active = rq->arrays; rq->expired = rq->arrays + 1; - rq->best_expired_prio = MAX_PRIO; rq->iso_refractory = rq->iso_ticks = 0; #ifdef CONFIG_SMP rq->sd = &sched_domain_dummy; - rq->cpu_load = 0; + for (j = 0; j < 4; j++) + rq->cpu_load[j] = 0; rq->active_balance = 0; rq->push_cpu = 0; rq->migration_thread = NULL; @@ -5106,11 +4816,12 @@ for (j = 0; j < 2; j++) { array = rq->arrays + j; + array->min_prio = MAX_PRIO; for (k = 0; k < MAX_PRIO; k++) { INIT_LIST_HEAD(array->queue + k); __clear_bit(k, array->bitmap); } - // delimiter for bitsearch + /* delimiter for bitsearch */ __set_bit(MAX_PRIO, array->bitmap); } INIT_LIST_HEAD(&rq->iso_queue); @@ -5307,10 +5018,13 @@ array = p->array; if (array) - deactivate_task(p, task_rq(p)); + deactivate_task(p, rq); __setscheduler(p, SCHED_NORMAL, 0); + if (policy == SCHED_FIFO || policy == SCHED_RR) + p->used_slice = 0; + if (array) { - __activate_task(p, task_rq(p)); + __activate_task(p, rq, array); resched_task(rq->curr); } diff -Naur linux-2.6.11-rc2-mm1/kernel/sysctl.c linux-2.6.11-rc2/kernel/sysctl.c --- linux-2.6.11-rc2-mm1/kernel/sysctl.c 2005-01-24 13:38:29.000000000 -0800 +++ linux-2.6.11-rc2/kernel/sysctl.c 2005-01-24 14:47:22.839930608 -0800 @@ -65,6 +65,9 @@ extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; extern int pid_max_min, pid_max_max; +extern int sched_base_timeslice; +extern int sched_min_base; +extern int sched_max_base; #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) int unknown_nmi_panic; @@ -215,12 +218,25 @@ .mode = 0555, .child = dev_table, }, + { + .ctl_name = KERN_SCHED_TIMESLICE, + .procname = "base_timeslice", + .data = &sched_base_timeslice, + .maxlen = sizeof (sched_base_timeslice), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &sched_min_base, + .extra2 = &sched_max_base, + }, + { .ctl_name = 0 } }; /* Constants for minimum and maximum testing in vm_table. We use these as one-element integer vectors. */ static int zero; +static int one = 1; static int one_hundred = 100; static ctl_table kern_table[] = { @@ -730,17 +746,7 @@ .mode = 0444 /* read-only*/, .proc_handler = &proc_dointvec, }, - { - .ctl_name = VM_SWAPPINESS, - .procname = "swappiness", - .data = &vm_swappiness, - .maxlen = sizeof(vm_swappiness), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &zero, - .extra2 = &one_hundred, - }, + #ifdef CONFIG_HUGETLB_PAGE { .ctl_name = VM_HUGETLB_PAGES, @@ -810,16 +816,6 @@ .strategy = &sysctl_intvec, .extra1 = &zero, }, - { - .ctl_name = VM_VFS_CACHE_PRESSURE, - .procname = "vfs_cache_pressure", - .data = &sysctl_vfs_cache_pressure, - .maxlen = sizeof(sysctl_vfs_cache_pressure), - .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, - .extra1 = &zero, - }, #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT { .ctl_name = VM_LEGACY_VA_LAYOUT, @@ -842,6 +838,7 @@ .proc_handler = &proc_dointvec_jiffies, .strategy = &sysctl_jiffies, }, + #endif { .ctl_name = 0 } }; diff -Naur linux-2.6.11-rc2-mm1/mm/oom_kill.c linux-2.6.11-rc2/mm/oom_kill.c --- linux-2.6.11-rc2-mm1/mm/oom_kill.c 2005-01-24 13:38:29.000000000 -0800 +++ linux-2.6.11-rc2/mm/oom_kill.c 2005-01-24 14:48:20.573153824 -0800 @@ -189,14 +189,15 @@ return; } task_unlock(p); - printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm); + +printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm); /* - * We give our sacrificial lamb high priority and access to - * all the memory it needs. That way it should be able to - * exit() and clear out its resources quickly... + * We give our sacrificial lamb access to all the memory it needs. + * That way it should be able to exit() and clear out its resources + * quickly... */ - p->time_slice = HZ; + set_tsk_thread_flag(p, TIF_MEMDIE); /* This process has hardware access, be more careful. */