diff -Naur linux-2.6.12-rc5-mm1/CREDITS linux-2.6.12-rc5-mm1-plug/CREDITS --- linux-2.6.12-rc5-mm1/CREDITS 2005-05-25 16:23:27.558431864 -0700 +++ linux-2.6.12-rc5-mm1-plug/CREDITS 2005-05-25 17:04:42.718150392 -0700 @@ -2624,7 +2624,6 @@ E: mikpe@csd.uu.se W: http://www.csd.uu.se/~mikpe/ D: Miscellaneous fixes -D: Performance-monitoring counters driver N: Reed H. Petty E: rhp@draper.net diff -Naur linux-2.6.12-rc5-mm1/Documentation/cpusets.txt linux-2.6.12-rc5-mm1-plug/Documentation/cpusets.txt --- linux-2.6.12-rc5-mm1/Documentation/cpusets.txt 2005-05-25 16:23:27.714408152 -0700 +++ linux-2.6.12-rc5-mm1-plug/Documentation/cpusets.txt 2005-05-25 17:02:50.595195672 -0700 @@ -51,14 +51,6 @@ If a cpuset is cpu or mem exclusive, no other cpuset, other than a direct ancestor or descendent, may share any of the same CPUs or Memory Nodes. -A cpuset that is cpu exclusive has a sched domain associated with it. -The sched domain consists of all cpus in the current cpuset that are not -part of any exclusive child cpusets. -This ensures that the scheduler load balacing code only balances -against the cpus that are in the sched domain as defined above and not -all of the cpus in the system. This removes any overhead due to -load balancing code trying to pull tasks outside of the cpu exclusive -cpuset only to be prevented by the tasks' cpus_allowed mask. User level code may create and destroy cpusets by name in the cpuset virtual file system, manage the attributes and permissions of these @@ -92,9 +84,6 @@ and a database), or * NUMA systems running large HPC applications with demanding performance characteristics. 
- * Also cpu_exclusive cpusets are useful for servers running orthogonal - workloads such as RT applications requiring low latency and HPC - applications that are throughput sensitive These subsets, or "soft partitions" must be able to be dynamically adjusted, as the job mix changes, without impacting other concurrently @@ -136,8 +125,6 @@ - A cpuset may be marked exclusive, which ensures that no other cpuset (except direct ancestors and descendents) may contain any overlapping CPUs or Memory Nodes. - Also a cpu_exclusive cpuset would be associated with a sched - domain. - You can list all the tasks (by pid) attached to any cpuset. The implementation of cpusets requires a few, simple hooks @@ -149,9 +136,6 @@ allowed in that tasks cpuset. - in sched.c migrate_all_tasks(), to keep migrating tasks within the CPUs allowed by their cpuset, if possible. - - in sched.c, a new API partition_sched_domains for handling - sched domain changes associated with cpu_exclusive cpusets - and related changes in both sched.c and arch/ia64/kernel/domain.c - in the mbind and set_mempolicy system calls, to mask the requested Memory Nodes by what's allowed in that tasks cpuset. - in page_alloc, to restrict memory to allowed nodes. diff -Naur linux-2.6.12-rc5-mm1/Documentation/perfctr/low-level-api.txt linux-2.6.12-rc5-mm1-plug/Documentation/perfctr/low-level-api.txt --- linux-2.6.12-rc5-mm1/Documentation/perfctr/low-level-api.txt 2005-05-25 16:23:27.859386112 -0700 +++ linux-2.6.12-rc5-mm1-plug/Documentation/perfctr/low-level-api.txt 1969-12-31 16:00:00.000000000 -0800 @@ -1,216 +0,0 @@ -$Id: low-level-api.txt,v 1.1 2004/07/02 18:57:05 mikpe Exp $ - -PERFCTR LOW-LEVEL DRIVERS API -============================= - -This document describes the common low-level API. -See low-level-$ARCH.txt for architecture-specific documentation. - -General Model -============= -The model is that of a processor with: -- A non-programmable clock-like counter, the "TSC". 
- The TSC frequency is assumed to be constant, but it is not - assumed to be identical to the core frequency. - The TSC may be absent. -- A set of programmable counters, the "perfctrs" or "pmcs". - Control data may be per-counter, global, or both. - The counters are not assumed to be interchangeable. - - A normal counter that simply counts events is referred to - as an "accumulation-mode" or "a-mode" counter. Its total - count is computed by adding the counts for the individual - periods during which the counter is active. Two per-counter - state variables are used for this: "sum", which is the - total count up to but not including the current period, - and "start", which records the value of the hardware counter - at the start of the current period. At the end of a period, - the hardware counter's value is read again, and the increment - relative the start value is added to the sum. This strategy - is used because it avoids a number of hardware problems. - - A counter that has been programmed to generate an interrupt - on overflow is referred to as an "interrupt-mode" or "i-mode" - counter. I-mode counters are initialised to specific values, - and after overflowing are reset to their (re)start values. - The total event count is available just as for a-mode counters. - - The set of counters may be empty, in which case only the - TSC (which must be present) can be sampled. - -Contents of -================================= - -"struct perfctr_sum_ctrs" -------------------------- -struct perfctr_sum_ctrs { - unsigned long long tsc; - unsigned long long pmc[..]; /* one per counter */ -}; - -Architecture-specific container for counter values. -Used in the kernel/user API, but not by the low-level drivers. 
- -"struct perfctr_cpu_control" ----------------------------- -This struct includes at least the following fields: - - unsigned int tsc_on; - unsigned int nractrs; /* # of a-mode counters */ - unsigned int nrictrs; /* # of i-mode counters */ - unsigned int pmc_map[..]; /* one per counter: virt-to-phys mapping */ - unsigned int evntsel[..]; /* one per counter: hw control data */ - int ireset[..]; /* one per counter: i-mode (re)start value */ - -Architecture-specific container for control data. -Used both in the kernel/user API and by the low-level drivers -(embedded in "struct perfctr_cpu_state"). - -"tsc_on" is non-zero if the TSC should be sampled. - -"nractrs" is the number of a-mode counters, corresponding to -elements 0..nractrs-1 in the per-counter arrays. - -"nrictrs" is the number of i-mode counters, corresponding to -elements nractrs..nractrs+nrictrs-1 in the per-counter arrays. - -"nractrs+nrictrs" is the total number of counters to program -and sample. A-mode and i-mode counters are separated in order -to allow quick enumeration of either set, which is needed in -some low-level driver operations. - -"pmc_map[]" maps each counter to its corresponding hardware counter -identification. No two counters may map to the same hardware counter. -This mapping is present because the hardware may have asymmetric -counters or other addressing quirks, which means that a counter's index -may not suffice to address its hardware counter. - -"evntsel[]" contains the per-counter control data. Architecture-specific -global control data, if any, is placed in architecture-specific fields. - -"ireset[]" contains the (re)start values for the i-mode counters. -Only indices nractrs..nractrs+nrictrs-1 are used. 
- -"struct perfctr_cpu_state" --------------------------- -This struct includes at least the following fields: - - unsigned int cstatus; - unsigned int tsc_start; - unsigned long long tsc_sum; - struct { - unsigned int map; - unsigned int start; - unsigned long long sum; - } pmc[..]; /* one per counter; the size is not part of the user ABI */ -#ifdef __KERNEL__ - struct perfctr_cpu_control control; -#endif - -This type records the state and control data for a collection -of counters. It is used by many low-level operations, and may -be exported to user-space via mmap(). - -"cstatus" is a re-encoding of control.tsc_on/nractrs/nrictrs, -used because it reduces overheads in key low-level operations. -Operations on cstatus values include: -- unsigned int perfctr_mk_cstatus(unsigned int tsc_on, unsigned int nractrs, unsigned int nrictrs); - Construct a cstatus value. -- unsigned int perfctr_cstatus_enabled(unsigned int cstatus); - Check if any part (tsc_on, nractrs, nrictrs) of the cstatus is non-zero. -- int perfctr_cstatus_has_tsc(unsigned int cstatus); - Check if the tsc_on part of the cstatus is non-zero. -- unsigned int perfctr_cstatus_nrctrs(unsigned int cstatus); - Retrieve nractrs+nrictrs from the cstatus. -- unsigned int perfctr_cstatus_has_ictrs(unsigned int cstatus); - Check if the nrictrs part of cstatus is non-zero. - -"tsc_start" and "tsc_sum" record the state of the TSC. - -"pmc[]" contains the per-counter state, in the "start" and "sum" -fields. The "map" field contains the corresponding hardware counter -identification, from the counter's entry in "control.pmc_map[]"; -it is copied into pmc[] to reduce overheads in key low-level operations. - -"control" contains the control data which determines the -behaviour of the counters. 
- -User-space overflow signal handler items ----------------------------------------- -After a counter has overflowed, a user-space signal handler may -be invoked with a "struct siginfo" identifying the source of the -signal and the set of overflown counters. - -#define SI_PMC_OVF .. - -Value to be stored in "si.si_code". - -#define si_pmc_ovf_mask .. - -Field in which to store a bit-mask of the overflown counters. - -Kernel-internal API -------------------- - -/* Driver init/exit. - perfctr_cpu_init() performs hardware detection and may fail. */ -extern int perfctr_cpu_init(void); -extern void perfctr_cpu_exit(void); - -/* CPU type name. Set if perfctr_cpu_init() was successful. */ -extern char *perfctr_cpu_name; - -/* Hardware reservation. A high-level driver must reserve the - hardware before it may use it, and release it afterwards. - "service" is a unique string identifying the high-level driver. - perfctr_cpu_reserve() returns NULL on success; if another - high-level driver has reserved the hardware, then that - driver's "service" string is returned. */ -extern const char *perfctr_cpu_reserve(const char *service); -extern void perfctr_cpu_release(const char *service); - -/* PRE: state has no running interrupt-mode counters. - Check that the new control data is valid. - Update the low-level driver's private control data. - is_global should be zero for per-process counters and non-zero - for global-mode counters. - Returns a negative error code if the control data is invalid. */ -extern int perfctr_cpu_update_control(struct perfctr_cpu_state *state, int is_global); - -/* Stop i-mode counters. Update sums and start values. - Read a-mode counters. Subtract from start and accumulate into sums. - Must be called with preemption disabled. */ -extern void perfctr_cpu_suspend(struct perfctr_cpu_state *state); - -/* Reset i-mode counters to their start values. - Write control registers. - Read a-mode counters and update their start values. 
- Must be called with preemption disabled. */ -extern void perfctr_cpu_resume(struct perfctr_cpu_state *state); - -/* Perform an efficient combined suspend/resume operation. - Must be called with preemption disabled. */ -extern void perfctr_cpu_sample(struct perfctr_cpu_state *state); - -/* The type of a perfctr overflow interrupt handler. - It will be called in IRQ context, with preemption disabled. */ -typedef void (*perfctr_ihandler_t)(unsigned long pc); - -/* Install a perfctr overflow interrupt handler. - Should be called after perfctr_cpu_reserve() but before - any counter state has been activated. */ -extern void perfctr_cpu_set_ihandler(perfctr_ihandler_t); - -/* PRE: The state has been suspended and sampled by perfctr_cpu_suspend(). - Should be called from the high-level driver's perfctr_ihandler_t, - and preemption must not have been enabled. - Identify which counters have overflown, reset their start values - from ireset[], and perform any necessary hardware cleanup. - Returns a bit-mask of the overflown counters. */ -extern unsigned int perfctr_cpu_identify_overflow(struct perfctr_cpu_state*); - -/* Call perfctr_cpu_ireload() just before perfctr_cpu_resume() to - bypass internal caching and force a reload of the i-mode pmcs. - This ensures that perfctr_cpu_identify_overflow()'s state changes - are propagated to the hardware. */ -extern void perfctr_cpu_ireload(struct perfctr_cpu_state*); diff -Naur linux-2.6.12-rc5-mm1/Documentation/perfctr/low-level-ppc32.txt linux-2.6.12-rc5-mm1-plug/Documentation/perfctr/low-level-ppc32.txt --- linux-2.6.12-rc5-mm1/Documentation/perfctr/low-level-ppc32.txt 2005-05-25 16:23:27.860385960 -0700 +++ linux-2.6.12-rc5-mm1-plug/Documentation/perfctr/low-level-ppc32.txt 1969-12-31 16:00:00.000000000 -0800 @@ -1,164 +0,0 @@ -$Id: low-level-ppc32.txt,v 1.1 2004/07/02 18:57:05 mikpe Exp $ - -PERFCTRS PPC32 LOW-LEVEL API -============================ - -See low-level-api.txt for the common low-level API. 
-This document only describes ppc32-specific behaviour. -For detailed hardware control register layouts, see -the manufacturers' documentation. - -Supported processors -==================== -- PowerPC 604, 604e, 604ev. -- PowerPC 750/740, 750CX, 750FX, 750GX. -- PowerPC 7400, 7410, 7451/7441, 7457/7447. -- Any generic PowerPC with a timebase register. - -Contents of -================================= - -"struct perfctr_sum_ctrs" -------------------------- -struct perfctr_sum_ctrs { - unsigned long long tsc; - unsigned long long pmc[8]; -}; - -The pmc[] array has room for 8 counters. - -"struct perfctr_cpu_control" ----------------------------- -struct perfctr_cpu_control { - unsigned int tsc_on; - unsigned int nractrs; /* # of a-mode counters */ - unsigned int nrictrs; /* # of i-mode counters */ - unsigned int pmc_map[8]; - unsigned int evntsel[8]; /* one per counter, even on P5 */ - int ireset[8]; /* [0,0x7fffffff], for i-mode counters */ - struct { - unsigned int mmcr0; /* sans PMC{1,2}SEL */ - unsigned int mmcr2; /* only THRESHMULT */ - /* IABR/DABR/BAMR not supported */ - } ppc; - unsigned int _reserved1; - unsigned int _reserved2; - unsigned int _reserved3; - unsigned int _reserved4; -}; - -The per-counter arrays have room for 8 elements. - -ireset[] values must be non-negative, since overflow occurs on -the non-negative-to-negative transition. - -The ppc sub-struct contains PowerPC-specific control data: -- mmcr0: global control data for the MMCR0 SPR; the event - selectors for PMC1 and PMC2 are in evntsel[], not in mmcr0 -- mmcr2: global control data for the MMCR2 SPR; only the - THRESHMULT field can be specified - -"struct perfctr_cpu_state" --------------------------- -struct perfctr_cpu_state { - unsigned int cstatus; - struct { /* k1 is opaque in the user ABI */ - unsigned int id; - int isuspend_cpu; - } k1; - /* The two tsc fields must be inlined. Placing them in a - sub-struct causes unwanted internal padding on x86-64. 
*/ - unsigned int tsc_start; - unsigned long long tsc_sum; - struct { - unsigned int map; - unsigned int start; - unsigned long long sum; - } pmc[8]; /* the size is not part of the user ABI */ -#ifdef __KERNEL__ - unsigned int ppc_mmcr[3]; - struct perfctr_cpu_control control; -#endif -}; - -The k1 sub-struct is used by the low-level driver for -caching purposes. "id" identifies the control data, and -"isuspend_cpu" identifies the CPU on which the i-mode -counters were last suspended. - -The pmc[] array has room for 8 elements. - -ppc_mmcr[] is computed from control by the low-level driver, -and provides the data for the MMCR0, MMCR1, and MMCR2 SPRs. - -User-space overflow signal handler items ----------------------------------------- -#ifdef __KERNEL__ -#define SI_PMC_OVF (__SI_FAULT|'P') -#else -#define SI_PMC_OVF ('P') -#endif -#define si_pmc_ovf_mask _sifields._pad[0] - -Kernel-internal API -------------------- - -In perfctr_cpu_update_control(), the is_global parameter -is ignored. (It is only relevant for x86.) - -CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK is never defined. -(It is only relevant for x86.) - -Overflow interrupt handling is not yet implemented. - -Processor-specific Notes -======================== - -General -------- -pmc_map[] contains a counter number, an integer between 0 and 5. -It never contains an SPR number. - -Basic operation (the strategy for a-mode counters, caching -control register contents, recording "suspend CPU" for i-mode -counters) is the same as in the x86 driver. - -PowerPC 604/750/74xx --------------------- -These processors use similar hardware layouts, differing -mainly in the number of counter and control registers. -The set of available events differ greatly, but that only -affects users, not the low-level driver itself. - -The hardware has 2 (604), 4 (604e/750/7400/7410), or 6 -(745x) counters (PMC1 to PMC6), and 1 (604), 2 (604e/750), -or 3 (74xx) control registers (MMCR0 to MMCR2). 
- -MMCR0 contains global control bits, and the event selection -fields for PMC1 and PMC2. MMCR1 contains event selection fields -for PMC3-PMC6. MMCR2 contains the THRESHMULT flag, which -specifies how MMCR0[THRESHOLD] should be scaled. - -In control.ppc.mmcr0, the PMC1SEL and PMC2SEL fields (0x00001FFF) -are reserved. The PMXE flag (0x04000000) may only be set when -the driver supports overflow interrupts. - -If FCECE or TRIGGER is set in MMCR0 on a 74xx processor, then -MMCR0 can change asynchronously. The driver handles this, at -the cost of some additional work in perfctr_cpu_suspend(). -Not setting these flags avoids that overhead. - -In control.ppc.mmcr2, only the THRESHMULT flag (0x80000000) -may be set, and only on 74xx processors. - -The SIA (sampled instruction address) register is not used. -The SDA (sampled data address) register is 604/604e-only, -and is not used. The BAMR (breakpoint address mask) register -is not used, but it is cleared by the driver. - -Generic PowerPC with timebase ------------------------------ -The driver supports any PowerPC as long as it has a timebase -register, and the TB frequency is available via Open Firmware. -In this case, the only valid usage mode is with tsc_on == 1 -and nractrs == nrictrs == 0 in the control data. diff -Naur linux-2.6.12-rc5-mm1/Documentation/perfctr/low-level-x86.txt linux-2.6.12-rc5-mm1-plug/Documentation/perfctr/low-level-x86.txt --- linux-2.6.12-rc5-mm1/Documentation/perfctr/low-level-x86.txt 2005-05-25 16:23:27.861385808 -0700 +++ linux-2.6.12-rc5-mm1-plug/Documentation/perfctr/low-level-x86.txt 1969-12-31 16:00:00.000000000 -0800 @@ -1,360 +0,0 @@ -$Id: low-level-x86.txt,v 1.1 2004/07/02 18:57:05 mikpe Exp $ - -PERFCTRS X86 LOW-LEVEL API -========================== - -See low-level-api.txt for the common low-level API. -This document only describes x86-specific behaviour. -For detailed hardware control register layouts, see -the manufacturers' documentation. 
- -Contents -======== -- Supported processors -- Contents of -- Processor-specific Notes -- Implementation Notes - -Supported processors -==================== -- Intel P5, P5MMX, P6, P4. -- AMD K7, K8. (P6 clones, with some changes) -- Cyrix 6x86MX, MII, and III. (good P5 clones) -- Centaur WinChip C6, 2, and 3. (bad P5 clones) -- VIA C3. (bad P6 clone) -- Any generic x86 with a TSC. - -Contents of -================================ - -"struct perfctr_sum_ctrs" -------------------------- -struct perfctr_sum_ctrs { - unsigned long long tsc; - unsigned long long pmc[18]; -}; - -The pmc[] array has room for 18 counters. - -"struct perfctr_cpu_control" ----------------------------- -struct perfctr_cpu_control { - unsigned int tsc_on; - unsigned int nractrs; /* # of a-mode counters */ - unsigned int nrictrs; /* # of i-mode counters */ - unsigned int pmc_map[18]; - unsigned int evntsel[18]; /* one per counter, even on P5 */ - struct { - unsigned int escr[18]; - unsigned int pebs_enable; /* for replay tagging */ - unsigned int pebs_matrix_vert; /* for replay tagging */ - } p4; - int ireset[18]; /* < 0, for i-mode counters */ - unsigned int _reserved1; - unsigned int _reserved2; - unsigned int _reserved3; - unsigned int _reserved4; -}; - -The per-counter arrays have room for 18 elements. - -ireset[] values must be negative, since overflow occurs on -the negative-to-non-negative transition. - -The p4 sub-struct contains P4-specific control data: -- escr[]: the control data to write to the ESCR register - associatied with the counter -- pebs_enable: the control data to write to the PEBS_ENABLE MSR -- pebs_matrix_vert: the control data to write to the - PEBS_MATRIX_VERT MSR - -"struct perfctr_cpu_state" --------------------------- -struct perfctr_cpu_state { - unsigned int cstatus; - struct { /* k1 is opaque in the user ABI */ - unsigned int id; - int isuspend_cpu; - } k1; - /* The two tsc fields must be inlined. 
Placing them in a - sub-struct causes unwanted internal padding on x86-64. */ - unsigned int tsc_start; - unsigned long long tsc_sum; - struct { - unsigned int map; - unsigned int start; - unsigned long long sum; - } pmc[18]; /* the size is not part of the user ABI */ -#ifdef __KERNEL__ - struct perfctr_cpu_control control; - unsigned int p4_escr_map[18]; -#endif -}; - -The k1 sub-struct is used by the low-level driver for -caching purposes. "id" identifies the control data, and -"isuspend_cpu" identifies the CPU on which the i-mode -counters were last suspended. - -The pmc[] array has room for 18 elements. - -p4_escr_map[] is computed from control by the low-level driver, -and provides the MSR number for the counter's associated ESCR. - -User-space overflow signal handler items ----------------------------------------- -#ifdef __KERNEL__ -#define SI_PMC_OVF (__SI_FAULT|'P') -#else -#define SI_PMC_OVF ('P') -#endif -#define si_pmc_ovf_mask _sifields._pad[0] - -Kernel-internal API -------------------- - -In perfctr_cpu_update_control(), the is_global parameter controls -whether monitoring the other thread (T1) on HT P4s is permitted -or not. On other processors the parameter is ignored. - -SMP kernels define CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK and -"extern cpumask_t perfctr_cpus_forbidden_mask;". -On HT P4s, resource conflicts can occur because both threads -(T0 and T1) in a processor share the same perfctr registers. -To prevent conflicts, only thread 0 in each processor is allowed -to access the counters. perfctr_cpus_forbidden_mask contains the -smp_processor_id()s of each processor's thread 1, and it is the -responsibility of the high-level driver to ensure that it never -accesses the perfctr state from a forbidden thread. - -Overflow interrupt handling requires local APIC support in the kernel. - -Processor-specific Notes -======================== - -General -------- -pmc_map[] contains a counter number, as used by the RDPMC instruction. 
-It never contains an MSR number. - -Counters are 32, 40, or 48 bits wide. The driver always only -reads the low 32 bits. This avoids performance issues, and -errata on some processors. - -Writing to counters or their control registers tends to be -very expensive. This is why a-mode counters only use read -operations on the counter registers. Caching of control -register contents is done to avoid writing them. "Suspend CPU" -is recorded for i-mode counters to avoid writing the counter -registers when the counters are resumed (their control -registers must be written at both suspend and resume, however). - -Some processors are unable to stop the counters (Centaur/VIA), -and some are unable to reinitialise them to arbitrary values (P6). -Storing the counters' total counts in the hardware counters -would break as soon as context-switches occur. This is another -reason why the accumulate-differences method for maintaining the -counter values is used. - -Intel P5 --------- -The hardware stores both counters' control data in a single -control register, the CESR MSR. The evntsel values are -limited to 16 bits each, and are combined by the low-level -driver to form the value for the CESR. Apart from that, -the evntsel values are direct images of the CESR. - -Bits 0xFE00 in an evntsel value are reserved. -At least one evntsel CPL bit (0x00C0) must be set. - -For Cyrix' P5 clones, evntsel bits 0xFA00 are reserved. - -For Centaur's P5 clones, evntsel bits 0xFF00 are reserved. -It has no CPL bits to set. The TSC is broken and cannot be used. - -Intel P6 --------- -The evntsel values are mapped directly onto the counters' -EVNTSEL control registers. - -The global enable bit (22) in EVNTSEL0 must be set. That bit is -reserved in EVNTSEL1. - -Bits 21 and 19 (0x00280000) in each evntsel are reserved. - -For an i-mode counter, bit 20 (0x00100000) of its evntsel must be -set. For a-mode counters, that bit must not be set. 
- -Hardware quirk: Counters are 40 bits wide, but writing to a -counter only writes the low 32 bits: remaining bits are -sign-extended from bit 31. - -AMD K7/K8 ---------- -Similar to Intel P6. The main difference is that each evntsel has -its own enable bit, which must be set. - -VIA C3 ------- -Superficially similar to Intel P6, but only PERFCTR1/EVNTSEL1 -are programmable. pmc_map[0] must be 1, if nractrs == 1. - -Bits 0xFFFFFE00 in the evntsel are reserved. There are no auxiliary -control bits to set. - -Generic -------- -Only permits TSC sampling, with tsc_on == 1 and nractrs == nrictrs == 0 -in the control data. - -Intel P4 --------- -For each counter, its evntsel[] value is mapped onto its CCCR -control register, and its p4.escr[] value is mapped onto its -associated ESCR control register. - -The ESCR register number is computed from the hardware counter -number (from pmc_map[]) and the ESCR SELECT field in the CCCR, -and is cached in p4_escr_map[]. - -pmc_map[] contains the value to pass to RDPMC when reading the -counter. It is strongly recommended to set bit 31 (fast rdpmc). 
- -In each evntsel/CCCR value: -- the OVF, OVF_PMI_T1 and hardware-reserved bits (0xB80007FF) - are reserved and must not be set -- bit 11 (EXTENDED_CASCADE) is only permitted on P4 models >= 2, - and for counters 12 and 15-17 -- bits 16 and 17 (ACTIVE_THREAD) must both be set on non-HT processors -- at least one of bits 12 (ENABLE), 30 (CASCADE), or 11 (EXTENDED_CASCADE) - must be set -- bit 26 (OVF_PMI_T0) must be clear for a-mode counters, and set - for i-mode counters; if bit 25 (FORCE_OVF) also is set, then - the corresponding ireset[] value must be exactly -1 - -In each p4.escr[] value: -- bit 32 is reserved and must not be set -- the CPL_T1 field (bits 0 and 1) must be zero except on HT processors - when global-mode counters are used -- IQ_ESCR0 and IQ_ESCR1 can only be used on P4 models <= 2 - -PEBS is not supported, but the replay tagging bits in PEBS_ENABLE -and PEBS_MATRIX_VERT may be used. - -If p4.pebs_enable is zero, then p4.pebs_matrix_vert must also be zero. - -If p4.pebs_enable is non-zero: -- only bits 24, 10, 9, 2, 1, and 0 may be set; note that in contrast - to Intel's documentation, bit 25 (ENABLE_PEBS_MY_THR) is not needed - and must not be set -- bit 24 (UOP_TAG) must be set -- at least one of bits 10, 9, 2, 1, or 0 must be set -- in p4.pebs_matrix_vert, all bits except 1 and 0 must be clear, - and at least one of bits 1 and 0 must be set - -Implementation Notes -==================== - -Caching -------- -Each 'struct perfctr_cpu_state' contains two cache-related fields: -- 'id': a unique identifier for the control data contents -- 'isuspend_cpu': the identity of the CPU on which a state containing - interrupt-mode counters was last suspended - -To this the driver adds a per-CPU cache, recording: -- the 'id' of the control data currently in that CPU -- the current contents of each control register - -When perfctr_cpu_update_control() has validated the new control data, -it also updates the id field. 
- -The driver's internal 'write_control' function, called from the -perfctr_cpu_resume() API function, first checks if the state's id -matches that of the CPU's cache, and if so, returns. Otherwise -it checks each control register in the state and updates those -that do not match the cache. Finally, it writes the state's id -to the cache. Tests on various x86 processor types have shown that -MSR writes are very expensive: the purpose of these cache checks -is to avoid MSR writes whenever possible. - -Unlike accumulation-mode counters, interrupt-mode counters must be -physically stopped when suspended, primilarly to avoid overflow -interrupts in contexts not expecting them, and secondarily to avoid -increments to the counters themselves (see below). - -When suspending interrupt-mode counters, the driver: -- records the CPU identity in the per-CPU cache -- stops each interrupt-mode counter by disabling its control register -- lets the cache and state id values remain the same - -Later, when resuming interrupt-mode counters, the driver: -- if the state and cache id values match: - * the cache id is cleared, to force a reload of the control - registers stopped at suspend (see below) - * if the state's "suspend" CPU identity matches the current CPU, - the counter registers are still valid, and the procedure returns -- if the procedure did not return above, it then loops over each - interrupt-mode counter: - * the counter's control register is physically disabled, unless - the cache indicates that it already is disabled; this is necessary - to prevent premature events and overflow interrupts if the CPU's - registers previously belonged to some other state - * then the counter register itself is restored -After this interrupt-mode specific resume code is complete, the -driver continues by calling 'write_control' as described above. -The state and cache ids will not match, forcing write_control to -reload the disabled interrupt-mode control registers. 
- -Call-site Backpatching ----------------------- -The x86 family of processors is quite diverse in how their -performance counters work and are accessed. There are three -main designs (P5, P6, and P4) with several variations. -To handle this the processor type detection and initialisation -code sets up a number of function pointers to point to the -correct procedures for the actual CPU type. - -Calls via function pointers are more expensive than direct calls, -so the driver actually performs direct calls to wrappers that -backpatch the original call sites to instead call the actual -CPU-specific functions in the future. - -Unsynchronised code backpatching in SMP systems doesn't work -on Intel P6 processors due to an erratum, so the driver performs -a "finalise backpatching" step after the CPU-specific function -pointers have been set up. This step invokes the API procedures -on a temporary state object, set up to force every backpatchable -call site to be invoked and adjusted. - -Several low-level API procedures are called in the context-switch -path by the per-process perfctrs kernel extension, which motivates -the efforts to reduce runtime overheads as much as possible. - -Overflow Interrupts -------------------- -The x86 hardware enables overflow interrupts via the local -APIC's LVTPC entry, which is only present in P6/K7/K8/P4. - -The low-level driver supports overflow interrupts as follows: -- It reserves a local APIC vector, 0xee, as LOCAL_PERFCTR_VECTOR. -- It adds a local APIC exception handler to entry.S, which - invokes the driver's smp_perfctr_interrupt() procedure. -- It adds code to i8259.c to bind the LOCAL_PERFCTR_VECTOR - interrupt gate to the exception handler in entry.S. -- During processor type detection, it records whether the - processor supports the local APIC, and sets up function pointers - for the suspend and resume operations on interrupt-mode counters. 
-- When the low-level driver is activated, it enables overflow - interrupts by writing LOCAL_PERFCTR_VECTOR to each CPU's APIC_LVTPC. -- Overflow interrupts now end up in smp_perfctr_interrupt(), which - ACKs the interrupt and invokes the interrupt handler installed - by the high-level service/driver. -- When the low-level driver is deactivated, it disables overflow - interrupts by masking APIC_LVTPC in each CPU. It then releases - the local APIC back to the NMI watchdog. - -At compile-time, the low-level driver indicates overflow interrupt -support by enabling CONFIG_PERFCTR_INTERRUPT_SUPPORT. If the feature -is also available at runtime, it sets the PERFCTR_FEATURE_PCINT flag -in the perfctr_info object. diff -Naur linux-2.6.12-rc5-mm1/Documentation/perfctr/overview.txt linux-2.6.12-rc5-mm1-plug/Documentation/perfctr/overview.txt --- linux-2.6.12-rc5-mm1/Documentation/perfctr/overview.txt 2005-05-25 16:23:27.862385656 -0700 +++ linux-2.6.12-rc5-mm1-plug/Documentation/perfctr/overview.txt 1969-12-31 16:00:00.000000000 -0800 @@ -1,129 +0,0 @@ -$Id: perfctr-documentation-update.patch,v 1.1 2004/07/12 05:41:57 akpm Exp $ - -AN OVERVIEW OF PERFCTR -====================== -The perfctr package adds support to the Linux kernel for using -the performance-monitoring counters found in many processors. - -Perfctr is internally organised in three layers: - -- The low-level drivers, one for each supported architecture. - Currently there are two, one for 32 and 64-bit x86 processors, - and one for 32-bit PowerPC processors. - - low-level-api.txt documents the model of the performance counters - used in this package, and the internal API to the low-level drivers. - - low-level-{x86,ppc}.txt provide documentation specific for those - architectures and their low-level drivers. - -- The high-level services. - There is currently one, a kernel extension adding support for - virtualised per-process performance counters. - See virtual.txt for documentation on this kernel extension. 
- - [There used to be a second high-level service, a simple driver - to control and access all performance counters in all processors. - This driver is currently removed, pending an acceptable new API.] - -- The top-level, which performs initialisation and implements - common procedures and system calls. - -Rationale ---------- -The perfctr package solves three problems: - -- Hardware invariably restricts programming of the performance - counter registers to kernel-level code, and sometimes also - restricts reading the counters to kernel-level code. - - Perfctr adds APIs allowing user-space code access the counters. - In the case of the per-process counters kernel extension, - even non-privileged processes are allowed access. - -- Hardware often limits the precision of the hardware counters, - making them unsuitable for storing total event counts. - - The counts are instead maintained as 64-bit values in software, - with the hardware counters used to derive increments over given - time periods. - -- In a non-modified kernel, the thread state does not include the - performance monitoring counters, and the context switch code - does not save and restore them. In this situation the counters - are system-wide, making them unreliable and inaccurate when used - for monitoring specific processes or specific segments of code. - - The per-process counters kernel extension treats the counter state as - part of the thread state, solving the reliability and accuracy problems. - -Non-goals ---------- -Providing high-level interfaces that abstract and hide the -underlying hardware is a non-goal. Such abstractions can -and should be implemented in user-space, for several reasons: - -- The complexity and variability of the hardware means that - any abstraction would be inaccurate. There would be both - loss of functionality, and presence of functionality which - isn't supportable on any given processor. 
User-space tools - and libraries can implement this, on top of the processor- - specific interfaces provided by the kernel. - -- The implementation of such an abstraction would be large - and complex. (Consider ESCR register assignment on P4.) - Performing complex actions in user-space simplifies the - kernel, allowing it to concentrate on validating control - data, managing processes, and driving the hardware. - (C.f. the role of compilers.) - -- The abstraction is purely a user-convenience thing. The - kernel-level components have no need for it. - -Common System Calls -=================== -This lists those system calls that are not tied to -a specific high-level service/driver. - -Querying CPU and Driver Information ------------------------------------ -int err = sys_perfctr_info(struct perfctr_info *info, - struct perfctr_cpu_mask *cpus, - struct perfctr_cpu_mask *forbidden); - -This operation retrieves information from the kernel about -the processors in the system. - -If non-NULL, '*info' will be updated with information about the -capabilities of the processor and the low-level driver. - -If non-NULL, '*cpus' will be updated with a bitmask listing the -set of processors in the system. The size of this bitmask is not -statically known, so the protocol is: - -1. User-space initialises cpus->nrwords to the number of elements - allocated for cpus->mask[]. -2. The kernel reads cpus->nrwords, and then writes the required - number of words to cpus->nrwords. -3. If the required number of words is less than the original value - of cpus->nrwords, then an EOVERFLOW error is signalled. -4. Otherwise, the kernel converts its internal cpumask_t value - to the external format and writes that to cpus->mask[]. - -If non-NULL, '*forbidden' will be updated with a bitmask listing -the set of processors in the system on which users must not try -to use performance counters. This is currently only relevant for -hyper-threaded Pentium 4/Xeon systems. 
The protocol is the same -as for '*cpus'. - -Notes: -- The internal representation of a cpumask_t is as an array of - unsigned long. This representation is unsuitable for user-space, - because it is not binary-compatible between 32 and 64-bit - variants of a big-endian processor. The 'struct perfctr_cpu_mask' - type uses an array of unsigned 32-bit integers. -- The protocol for retrieving a 'struct perfctr_cpu_mask' was - designed to allow user-space to quickly determine the correct - size of the 'mask[]' array. Other system calls use weaker protocols, - which force user-space to guess increasingly larger values in a - loop, until finally an acceptable value was guessed. diff -Naur linux-2.6.12-rc5-mm1/Documentation/perfctr/virtual.txt linux-2.6.12-rc5-mm1-plug/Documentation/perfctr/virtual.txt --- linux-2.6.12-rc5-mm1/Documentation/perfctr/virtual.txt 2005-05-25 16:23:27.863385504 -0700 +++ linux-2.6.12-rc5-mm1-plug/Documentation/perfctr/virtual.txt 1969-12-31 16:00:00.000000000 -0800 @@ -1,357 +0,0 @@ -$Id: virtual.txt,v 1.3 2004/08/09 09:42:22 mikpe Exp $ - -VIRTUAL PER-PROCESS PERFORMANCE COUNTERS -======================================== -This document describes the virtualised per-process performance -counters kernel extension. See "General Model" in low-level-api.txt -for the model of the processor's performance counters. 
- -Contents -======== -- Summary -- Design & Implementation Notes - * State - * Thread Management Hooks - * Synchronisation Rules - * The Pseudo File System -- API For User-Space - * Opening/Creating the State - * Updating the Control - * Unlinking the State - * Reading the State - * Resuming After Handling Overflow Signal - * Reading the Counter Values -- Limitations / TODO List - -Summary -======= -The virtualised per-process performance counters facility -(virtual perfctrs) is a kernel extension which extends the -thread state to record perfctr settings and values, and augments -the context-switch code to save perfctr values at suspends and -restore them at resumes. This "virtualises" the performance -counters in much the same way as the kernel already virtualises -general-purpose and floating-point registers. - -Virtual perfctrs also adds an API allowing non-privileged -user-space processes to set up and access their perfctrs. - -As this facility is primarily intended to support developers -of user-space code, both virtualisation and allowing access -from non-privileged code are essential features. - -Design & Implementation Notes -============================= - -State ------ -The state of a thread's perfctrs is packaged up in an object of -type 'struct vperfctr'. It consists of CPU-dependent state, a -sampling timer, and some auxiliary administrative data. This is -an independent object, with its own lifetime and access rules. - -The state object is attached to the thread via a pointer in its -thread_struct. While attached, the object records the identity -of its owner thread: this is used for user-space API accesses -from threads other than the owner. - -The state is separate from the thread_struct for several resons: -- It's potentially large, hence it's allocated only when needed. -- It can outlive its owner thread. The state can be opened as - a pseudo file: as long as that file is live, so is the object. 
-- It can be mapped, via mmap() on the pseudo file's descriptor. - To facilitate this, a full page is allocated and reserved. - -Thread Management Hooks ------------------------ -Virtual perfctrs hooks into several thread management events: - -- exit_thread(): Calls perfctr_exit_thread() to stop the counters - and mark the vperfctr object as dead. - -- copy_thread(): Calls perfctr_copy_thread() to initialise - the child's vperfctr pointer. The child gets a new vperfctr - object containing the same control data as its parent. - Kernel-generated threads do not inherit any vperfctr state. - -- release_task(): Calls perfctr_release_task() to detach the - vperfctr object from the thread. If the child and its parent - still have the same perfctr control settings, then the child's - final counts are propagated back into its parent. - -- switch_to(): - * Calls perfctr_suspend_thread() on the previous thread, to - suspend its counters. - * Calls perfctr_resume_thread() on the next thread, to resume - its counters. Also resets the sampling timer (see below). - -- update_process_times(): Calls perfctr_sample_thread(), which - decrements the sampling timer and samples the counters if the - timer reaches zero. - - Sampling is normally only done at switch_to(), but if too much - time passes before the next switch_to(), a hardware counter may - increment by more than its range (usually 2^32). If this occurs, - the difference from its start value will be incorrect, causing - its updated sum to also be incorrect. The sampling timer is used - to prevent this problem, which has been observed on SMP machines, - and on high clock frequency UP machines. - -- set_cpus_allowed(): Calls perfctr_set_cpus_allowed() to detect - attempts to migrate the thread to a "forbidden" CPU, in which - case a flag in the vperfctr object is set. perfctr_resume_thread() - checks this flag, and if set, marks the counters as stopped and - sends a SIGILL to the thread. 
- - The notion of forbidden CPUs is a workaround for a design flaw - in hyper-threaded Pentium 4s and Xeons. See low-level-x86.txt - for details. - -To reduce overheads, these hooks are implemented as inline functions -that check if the thread is using perfctrs before calling the code -that implements the behaviour. The hooks also reduce to no-ops if -CONFIG_PERFCTR_VIRTUAL is disabled. - -Synchronisation Rules ---------------------- -There are five types of accesses to a thread's perfctr state: - -1. Thread management events (see above) done by the thread itself. - Suspend, resume, and sample are lock-less. - -2. API operations done by the thread itself. - These are lock-less, except when an individual operation - has specific synchronisation needs. For instance, preemption - is often disabled to prevent accesses due to context switches. - -3. API operations done by a different thread ("monitor thread"). - The owner thread must be suspended for the duration of the operation. - This is ensured by requiring that the monitor thread is ptrace()ing - the owner thread, and that the owner thread is in TASK_STOPPED state. - -4. set_cpus_allowed(). - The kernel does not lock the target during set_cpus_allowed(), - so it can execute concurrently with the owner thread or with - some monitor thread. In particular, the state may be deallocated. - - To solve this problem, both perfctr_set_cpus_allowed() and the - operations that can change the owner thread's perfctr pointer - (creat, unlink, exit) perform a task_lock() on the owner thread - before accessing the perfctr pointer. - -5. release_task(). - Reaping a child may or may not be done by the parent of that child. - When done by the parent, no lock is taken. Otherwise, a task_lock() - on the parent is done before accessing its thread's perfctr pointer. - -The Pseudo File System ----------------------- -The perfctr state is accessed from user-space via a file descriptor. 
- -The main reason for this is to enable mmap() on the file descriptor, -which gives read-only access to the state. - -The file descriptor is a handle to the perfctr state object. This -allows a very simple implementation of the user-space 'perfex' -program, which runs another program with given perfctr settings -and reports their final values. Without this handle, monitoring -applications like perfex would have to be implemented like debuggers -in order to catch the target thread's exit and retrieve the counter -values before the exit completes and the state disappears. - -The file for a perfctr state object belongs to the vperfctrs pseudo -file system. Files in this file system support only a few operations: -- mmap() -- release() decrements the perfctr object's reference count and - deallocates the object when no references remain -- the listing of a thread's open file descriptors identifies - perfctr state file descriptors as belonging to "vperfctrfs" -The implementation is based on the code for pipefs. - -In previous versions of the perfctr package, the file descriptors -for perfctr state objects also supported the API's ioctl() method. - -API For User-Space -================== - -Opening/Creating the State --------------------------- -int fd = sys_vperfctr_open(int tid, int creat); - -'tid' must be the id of a thread, or 0 which is interpreted as an -alias for the current thread. - -This operation returns an open file descriptor which is a handle -on the thread's perfctr state object. - -If 'creat' is non-zero and the object did not exist, then it is -created and attached to the thread. The newly created state object -is inactive, with all control fields disabled and all counters -having the value zero. If 'creat' is non-zero and the object -already existed, then an EEXIST error is signalled. - -If 'tid' does not denote the current thread, then it must denote a -thread that is stopped and under ptrace control by the current thread. 
- -Notes: -- The access rule in the non-self case is the same as for the - ptrace() system call. It ensures that no other thread, including - the target thread itself, can access or change the target thread's - perfctr state during the operation. -- An open file descriptor for a perfctr state object counts as a - reference to that object; even if detached from its thread the - object will not be deallocated until the last reference is gone. -- The file descriptor can be passed to mmap(), for low-overhead - counter sampling. See "READING THE COUNTER VALUES" for details. -- The file descriptor can be passed to another thread. Accesses - from threads other than the owner are permitted as long as they - posses the file descriptor and use ptrace() for synchronisation. - -Updating the Control --------------------- -int err = sys_vperfctr_control(int fd, const struct vperfctr_control *control); - -'fd' must be the return value from a call to sys_vperfctr_open(), -The perfctr object must still be attached to its owner thread. - -This operation stops and samples any currently running counters in -the thread, and then updates the control settings. If the resulting -state has any enabled counters, then the counters are restarted. - -Before restarting, the counter sums are reset to zero. However, -if a counter's bit is set in the control object's 'preserve' -bitmask field, then that counter's sum is not reset. The TSC's -sum is only reset if the TSC is disabled in the new state. - -If any of the programmable counters are enabled, then the thread's -CPU affinity mask is adjusted to exclude the set of forbidden CPUs. - -If the control data activates any interrupt-mode counters, then -a signal (specified by the 'si_signo' control field) will be sent -to the owner thread after an overflow interrupt. The documentation -for sys_vperfctr_iresume() describes this mechanism. 
- -If 'fd' does not denote the current thread, then it must denote a -thread that is stopped and under ptrace control by the current thread. -The perfctr state object denoted by 'fd' must still be attached -to its owner thread. - -Notes: -- It is strongly recommended to memset() the vperfctr_control object - to all-bits-zero before setting the fields of interest. -- Stopping the counters is done by invoking the control operation - with a control object that activates neither the TSC nor any PMCs. - -Unlinking the State -------------------- -int err = sys_vperfctr_unlink(int fd); - -'fd' must be the return value from a call to sys_vperfctr_open(). - -This operation stops and samples the thread's counters, and then -detaches the perfctr state object from the thread. If the object -already had been detached, then no action is performed. - -If 'fd' does not denote the current thread, then it must denote a -thread that is stopped and under ptrace control by the current thread. - -Reading the State ------------------ -int err = sys_vperfctr_read(int fd, struct perfctr_sum_ctrs *sum, - struct vperfctr_control *control, - struct perfctr_sum_ctrs *children); - -'fd' must be the return value from a call to sys_vperfctr_open(). - -This operation copies data from the perfctr state object to -user-space. If 'sum' is non-NULL, then the counter sums are -written to it. If 'control' is non-NULL, then the control data -is written to it. If 'children' is non-NULL, then the sums of -exited childrens' counters are written to it. - -If the perfctr state object is attached to the current thread, -then the counters are sampled and updated first. - -If 'fd' does not denote the current thread, then it must denote a -thread that is stopped and under ptrace control by the current thread. - -Notes: -- An alternate and faster way to retrieve the counter sums is described - below. This system call can be used if the hardware does not permit - user-space reads of the counters. 
- -Resuming After Handling Overflow Signal ---------------------------------------- -int err = sys_vperfctr_iresume(int fd); - -'fd' must be the return value from a call to sys_vperfctr_open(). -The perfctr object must still be attached to its owner thread. - -When an interrupt-mode counter has overflowed, the counters -are sampled and suspended (TSC remains active). Then a signal, -as specified by the 'si_signo' control field, is sent to the -owner thread: the associated 'struct siginfo' has 'si_code' -equal to 'SI_PMC_OVF', and 'si_pmc_ovf_mask' equal to the set -of overflown counters. - -The counters are suspended to avoid generating new performance -counter events during the execution of the signal handler, but -the previous settings are saved. Calling sys_vperfctr_iresume() -restores the previous settings and resumes the counters. Doing -this is optional. - -If 'fd' does not denote the current thread, then it must denote a -thread that is stopped and under ptrace control by the current thread. - -Reading the Counter Values --------------------------- -The value of a counter is computed from three components: - - value = sum + (now - start); - -Two of these (sum and start) reside in the kernel's state object, -and the third (now) is the contents of the hardware counter. -To perform this computation in user-space requires access to -the state object. This is achieved by passing the file descriptor -from sys_vperfctr_open() to mmap(): - - volatile const struct vperfctr_state *kstate; - kstate = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_SHARED, fd, 0); - -Reading the three components is a non-atomic operation. If the -thread is scheduled during the operation, the three values will -not be consistent and the wrong result will be computed. -To detect this situation, user-space should check the kernel -state's TSC start value before and after the operation, and -retry the operation in case of a mismatch. 
- -The algorithm for retrieving the value of counter 'i' is: - - tsc0 = kstate->cpu_state.tsc_start; - for(;;) { - rdpmcl(kstate->cpu_state.pmc[i].map, now); - start = kstate->cpu_state.pmc[i].start; - sum = kstate->cpu_state.pmc[i].sum; - tsc1 = kstate->cpu_state.tsc_start; - if (likely(tsc1 == tsc0)) - break; - tsc0 = tsc1; - } - return sum + (now - start); - -The algorithm for retrieving the value of the TSC is similar, -as is the algorithm for retrieving the values of all counters. - -Notes: -- Since the state's TSC time-stamps are used, the algorithm requires - that user-space enables TSC sampling. -- The algorithm requires that the hardware allows user-space reads - of the counter registers. If this property isn't statically known - for the architecture, user-space should retrieve the kernel's - 'struct perfctr_info' object and check that the PERFCTR_FEATURE_RDPMC - flag is set. - -Limitations / TODO List -======================= -- Buffering of overflow samples is not implemented. So far, not a - single user has requested it. diff -Naur linux-2.6.12-rc5-mm1/MAINTAINERS linux-2.6.12-rc5-mm1-plug/MAINTAINERS --- linux-2.6.12-rc5-mm1/MAINTAINERS 2005-05-25 16:23:47.444408736 -0700 +++ linux-2.6.12-rc5-mm1-plug/MAINTAINERS 2005-05-25 17:04:42.730148568 -0700 @@ -1832,12 +1832,6 @@ L: netdev@oss.sgi.com S: Supported -PERFORMANCE-MONITORING COUNTERS DRIVER -P: Mikael Pettersson -M: mikpe@csd.uu.se -W: http://www.csd.uu.se/~mikpe/linux/perfctr/ -S: Maintained - PNP SUPPORT P: Adam Belay M: ambx1@neo.rr.com diff -Naur linux-2.6.12-rc5-mm1/arch/i386/Kconfig linux-2.6.12-rc5-mm1-plug/arch/i386/Kconfig --- linux-2.6.12-rc5-mm1/arch/i386/Kconfig 2005-05-25 16:23:25.989670352 -0700 +++ linux-2.6.12-rc5-mm1-plug/arch/i386/Kconfig 2005-05-25 17:05:25.083709848 -0700 @@ -939,8 +939,6 @@ If unsure, say Y. Only embedded should say N here. 
-source "drivers/perfctr/Kconfig" - config PHYSICAL_START hex "Physical address where the kernel is loaded" if EMBEDDED default "0x100000" diff -Naur linux-2.6.12-rc5-mm1/arch/i386/kernel/entry.S linux-2.6.12-rc5-mm1-plug/arch/i386/kernel/entry.S --- linux-2.6.12-rc5-mm1/arch/i386/kernel/entry.S 2005-05-25 16:23:26.142647096 -0700 +++ linux-2.6.12-rc5-mm1-plug/arch/i386/kernel/entry.S 2005-05-25 17:04:43.721997784 -0700 @@ -445,16 +445,6 @@ /* The include is where all of the SMP etc. interrupts come from */ #include "entry_arch.h" -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PERFCTR) -ENTRY(perfctr_interrupt) - pushl $LOCAL_PERFCTR_VECTOR-256 - SAVE_ALL - pushl %esp - call smp_perfctr_interrupt - addl $4, %esp - jmp ret_from_intr -#endif - ENTRY(divide_error) pushl $0 # no error code pushl $do_divide_error diff -Naur linux-2.6.12-rc5-mm1/arch/i386/kernel/i8259.c linux-2.6.12-rc5-mm1-plug/arch/i386/kernel/i8259.c --- linux-2.6.12-rc5-mm1/arch/i386/kernel/i8259.c 2005-05-25 16:23:26.150645880 -0700 +++ linux-2.6.12-rc5-mm1-plug/arch/i386/kernel/i8259.c 2005-05-25 17:04:43.722997632 -0700 @@ -24,7 +24,6 @@ #include #include #include -#include #include @@ -425,8 +424,6 @@ */ intr_init_hook(); - perfctr_vector_init(); - /* * Set the clock to HZ Hz, we already have a valid * vector now: diff -Naur linux-2.6.12-rc5-mm1/arch/i386/kernel/process.c linux-2.6.12-rc5-mm1-plug/arch/i386/kernel/process.c --- linux-2.6.12-rc5-mm1/arch/i386/kernel/process.c 2005-05-25 16:23:26.181641168 -0700 +++ linux-2.6.12-rc5-mm1-plug/arch/i386/kernel/process.c 2005-05-25 17:04:43.724997328 -0700 @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -183,8 +182,6 @@ { int cpu = _smp_processor_id(); - set_tsk_need_resched(current); - /* endless idle loop with no priority at all */ while (1) { while (!need_resched()) { @@ -401,7 +398,6 @@ tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; put_cpu(); } - perfctr_exit_thread(&tsk->thread); } void 
flush_thread(void) @@ -481,8 +477,6 @@ savesegment(fs,p->thread.fs); savesegment(gs,p->thread.gs); - perfctr_copy_task(p, regs); - tsk = current; if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); @@ -721,7 +715,6 @@ disable_tsc(prev_p->thread_info, next_p->thread_info); - perfctr_resume_thread(next); return prev_p; } diff -Naur linux-2.6.12-rc5-mm1/arch/i386/kernel/syscall_table.S linux-2.6.12-rc5-mm1-plug/arch/i386/kernel/syscall_table.S --- linux-2.6.12-rc5-mm1/arch/i386/kernel/syscall_table.S 2005-05-25 16:23:26.201638128 -0700 +++ linux-2.6.12-rc5-mm1-plug/arch/i386/kernel/syscall_table.S 2005-05-25 17:04:43.811984104 -0700 @@ -291,7 +291,3 @@ .long sys_keyctl .long sys_ioprio_set .long sys_ioprio_get /* 290 */ - .long sys_vperfctr_open - .long sys_vperfctr_control - .long sys_vperfctr_write - .long sys_vperfctr_read diff -Naur linux-2.6.12-rc5-mm1/arch/ia64/kernel/domain.c linux-2.6.12-rc5-mm1-plug/arch/ia64/kernel/domain.c --- linux-2.6.12-rc5-mm1/arch/ia64/kernel/domain.c 2005-05-25 16:23:26.374611832 -0700 +++ linux-2.6.12-rc5-mm1-plug/arch/ia64/kernel/domain.c 2005-05-25 17:02:50.598195216 -0700 @@ -14,7 +14,7 @@ #include #include -#define SD_NODES_PER_DOMAIN 16 +#define SD_NODES_PER_DOMAIN 6 #ifdef CONFIG_NUMA /** @@ -27,7 +27,7 @@ * * Should use nodemask_t. */ -static int find_next_best_node(int node, unsigned long *used_nodes) +static int __devinit find_next_best_node(int node, unsigned long *used_nodes) { int i, n, val, min_val, best_node = 0; @@ -66,7 +66,7 @@ * should be one that prevents unnecessary balancing, but also spreads tasks * out optimally. 
*/ -static cpumask_t sched_domain_node_span(int node) +static cpumask_t __devinit sched_domain_node_span(int node) { int i; cpumask_t span, nodemask; @@ -96,7 +96,7 @@ #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static struct sched_group sched_group_cpus[NR_CPUS]; -static int cpu_to_cpu_group(int cpu) +static int __devinit cpu_to_cpu_group(int cpu) { return cpu; } @@ -104,7 +104,7 @@ static DEFINE_PER_CPU(struct sched_domain, phys_domains); static struct sched_group sched_group_phys[NR_CPUS]; -static int cpu_to_phys_group(int cpu) +static int __devinit cpu_to_phys_group(int cpu) { #ifdef CONFIG_SCHED_SMT return first_cpu(cpu_sibling_map[cpu]); @@ -125,36 +125,44 @@ static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); static struct sched_group sched_group_allnodes[MAX_NUMNODES]; -static int cpu_to_allnodes_group(int cpu) +static int __devinit cpu_to_allnodes_group(int cpu) { return cpu_to_node(cpu); } #endif /* - * Build sched domains for a given set of cpus and attach the sched domains - * to the individual cpus + * Set up scheduler domains and groups. Callers must hold the hotplug lock. */ -void build_sched_domains(const cpumask_t *cpu_map) +void __devinit arch_init_sched_domains(void) { int i; + cpumask_t cpu_default_map; /* - * Set up domains for cpus specified by the cpu_map. + * Setup mask for cpus without special case scheduling requirements. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. */ - for_each_cpu_mask(i, *cpu_map) { + cpus_complement(cpu_default_map, cpu_isolated_map); + cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); + + /* + * Set up domains. Isolated domains just stay on the dummy domain. 
+ */ + for_each_cpu_mask(i, cpu_default_map) { int group; struct sched_domain *sd = NULL, *p; cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); - cpus_and(nodemask, nodemask, *cpu_map); + cpus_and(nodemask, nodemask, cpu_default_map); #ifdef CONFIG_NUMA if (num_online_cpus() > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { sd = &per_cpu(allnodes_domains, i); *sd = SD_ALLNODES_INIT; - sd->span = *cpu_map; + sd->span = cpu_default_map; group = cpu_to_allnodes_group(i); sd->groups = &sched_group_allnodes[group]; p = sd; @@ -165,7 +173,7 @@ *sd = SD_NODE_INIT; sd->span = sched_domain_node_span(cpu_to_node(i)); sd->parent = p; - cpus_and(sd->span, sd->span, *cpu_map); + cpus_and(sd->span, sd->span, cpu_default_map); #endif p = sd; @@ -182,7 +190,7 @@ group = cpu_to_cpu_group(i); *sd = SD_SIBLING_INIT; sd->span = cpu_sibling_map[i]; - cpus_and(sd->span, sd->span, *cpu_map); + cpus_and(sd->span, sd->span, cpu_default_map); sd->parent = p; sd->groups = &sched_group_cpus[group]; #endif @@ -190,9 +198,9 @@ #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask(i, cpu_default_map) { cpumask_t this_sibling_map = cpu_sibling_map[i]; - cpus_and(this_sibling_map, this_sibling_map, *cpu_map); + cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); if (i != first_cpu(this_sibling_map)) continue; @@ -205,7 +213,7 @@ for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); - cpus_and(nodemask, nodemask, *cpu_map); + cpus_and(nodemask, nodemask, cpu_default_map); if (cpus_empty(nodemask)) continue; @@ -214,7 +222,7 @@ } #ifdef CONFIG_NUMA - init_sched_build_groups(sched_group_allnodes, *cpu_map, + init_sched_build_groups(sched_group_allnodes, cpu_default_map, &cpu_to_allnodes_group); for (i = 0; i < MAX_NUMNODES; i++) { @@ -225,12 +233,12 @@ cpumask_t covered = CPU_MASK_NONE; int j; - cpus_and(nodemask, nodemask, *cpu_map); + cpus_and(nodemask, nodemask, cpu_default_map); if 
(cpus_empty(nodemask)) continue; domainspan = sched_domain_node_span(i); - cpus_and(domainspan, domainspan, *cpu_map); + cpus_and(domainspan, domainspan, cpu_default_map); sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); sched_group_nodes[i] = sg; @@ -258,7 +266,7 @@ int n = (i + j) % MAX_NUMNODES; cpus_complement(notcovered, covered); - cpus_and(tmp, notcovered, *cpu_map); + cpus_and(tmp, notcovered, cpu_default_map); cpus_and(tmp, tmp, domainspan); if (cpus_empty(tmp)) break; @@ -285,7 +293,7 @@ #endif /* Calculate CPU power for physical packages and nodes */ - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask(i, cpu_default_map) { int power; struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT @@ -351,35 +359,13 @@ cpu_attach_domain(sd, i); } } -/* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. - */ -void arch_init_sched_domains(const cpumask_t *cpu_map) -{ - cpumask_t cpu_default_map; - - /* - * Setup mask for cpus without special case scheduling requirements. - * For now this just excludes isolated cpus, but could be used to - * exclude other special cases in the future. 
- */ - cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); - - build_sched_domains(&cpu_default_map); -} -void arch_destroy_sched_domains(const cpumask_t *cpu_map) +void __devinit arch_destroy_sched_domains(void) { #ifdef CONFIG_NUMA int i; for (i = 0; i < MAX_NUMNODES; i++) { - cpumask_t nodemask = node_to_cpumask(i); struct sched_group *oldsg, *sg = sched_group_nodes[i]; - - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) - continue; - if (sg == NULL) continue; sg = sg->next; diff -Naur linux-2.6.12-rc5-mm1/arch/ia64/kernel/process.c linux-2.6.12-rc5-mm1-plug/arch/ia64/kernel/process.c --- linux-2.6.12-rc5-mm1/arch/ia64/kernel/process.c 2005-05-25 16:23:26.415605600 -0700 +++ linux-2.6.12-rc5-mm1-plug/arch/ia64/kernel/process.c 2005-05-25 17:02:50.599195064 -0700 @@ -262,8 +262,6 @@ { void (*mark_idle)(int) = ia64_mark_idle; - set_tsk_need_resched(current); - /* endless idle loop with no priority at all */ while (1) { #ifdef CONFIG_SMP diff -Naur linux-2.6.12-rc5-mm1/arch/ppc/Kconfig linux-2.6.12-rc5-mm1-plug/arch/ppc/Kconfig --- linux-2.6.12-rc5-mm1/arch/ppc/Kconfig 2005-05-25 16:23:26.950524280 -0700 +++ linux-2.6.12-rc5-mm1-plug/arch/ppc/Kconfig 2005-05-25 17:04:43.750993376 -0700 @@ -277,8 +277,6 @@ depends on 4xx || 8xx default y -source "drivers/perfctr/Kconfig" - endmenu menu "Platform options" diff -Naur linux-2.6.12-rc5-mm1/arch/ppc/kernel/head.S linux-2.6.12-rc5-mm1-plug/arch/ppc/kernel/head.S --- linux-2.6.12-rc5-mm1/arch/ppc/kernel/head.S 2005-05-25 16:23:26.967521696 -0700 +++ linux-2.6.12-rc5-mm1-plug/arch/ppc/kernel/head.S 2005-05-25 17:04:43.766990944 -0700 @@ -502,11 +502,7 @@ Trap_0f: EXCEPTION_PROLOG addi r3,r1,STACK_FRAME_OVERHEAD -#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT - EXC_XFER_EE(0xf00, do_perfctr_interrupt) -#else EXC_XFER_EE(0xf00, UnknownException) -#endif /* * Handle TLB miss for instruction on 603/603e. 
diff -Naur linux-2.6.12-rc5-mm1/arch/ppc/kernel/misc.S linux-2.6.12-rc5-mm1-plug/arch/ppc/kernel/misc.S --- linux-2.6.12-rc5-mm1/arch/ppc/kernel/misc.S 2005-05-25 16:23:26.985518960 -0700 +++ linux-2.6.12-rc5-mm1-plug/arch/ppc/kernel/misc.S 2005-05-25 17:04:43.751993224 -0700 @@ -1443,7 +1443,3 @@ .long sys_waitid .long sys_ioprio_set .long sys_ioprio_get - .long sys_vperfctr_open /* 275 */ - .long sys_vperfctr_control - .long sys_vperfctr_write - .long sys_vperfctr_read diff -Naur linux-2.6.12-rc5-mm1/arch/ppc/kernel/process.c linux-2.6.12-rc5-mm1-plug/arch/ppc/kernel/process.c --- linux-2.6.12-rc5-mm1/arch/ppc/kernel/process.c 2005-05-25 16:23:26.999516832 -0700 +++ linux-2.6.12-rc5-mm1-plug/arch/ppc/kernel/process.c 2005-05-25 17:04:43.752993072 -0700 @@ -35,7 +35,6 @@ #include #include #include -#include #include #include @@ -302,9 +301,7 @@ #endif /* CONFIG_SPE */ new_thread = &new->thread; old_thread = &current->thread; - perfctr_suspend_thread(&prev->thread); last = _switch(old_thread, new_thread); - perfctr_resume_thread(&current->thread); local_irq_restore(s); return last; } @@ -366,7 +363,6 @@ if (last_task_used_spe == current) last_task_used_spe = NULL; #endif - perfctr_exit_thread(&current->thread); } void flush_thread(void) @@ -459,8 +455,6 @@ p->thread.last_syscall = -1; - perfctr_copy_task(p, regs); - return 0; } diff -Naur linux-2.6.12-rc5-mm1/arch/ppc64/Kconfig linux-2.6.12-rc5-mm1-plug/arch/ppc64/Kconfig --- linux-2.6.12-rc5-mm1/arch/ppc64/Kconfig 2005-05-25 16:23:26.689563952 -0700 +++ linux-2.6.12-rc5-mm1-plug/arch/ppc64/Kconfig 2005-05-25 17:04:43.775989576 -0700 @@ -287,7 +287,6 @@ depends on PPC_ISERIES default y -source "drivers/perfctr/Kconfig" config PPC_RTAS bool diff -Naur linux-2.6.12-rc5-mm1/arch/ppc64/kernel/idle.c linux-2.6.12-rc5-mm1-plug/arch/ppc64/kernel/idle.c --- linux-2.6.12-rc5-mm1/arch/ppc64/kernel/idle.c 2005-05-25 16:23:26.714560152 -0700 +++ linux-2.6.12-rc5-mm1-plug/arch/ppc64/kernel/idle.c 2005-05-25 17:02:50.599195064 -0700 @@ -305,7
+305,6 @@ void cpu_idle(void) { - set_tsk_need_resched(current); idle_loop(); } diff -Naur linux-2.6.12-rc5-mm1/arch/ppc64/kernel/misc.S linux-2.6.12-rc5-mm1-plug/arch/ppc64/kernel/misc.S --- linux-2.6.12-rc5-mm1/arch/ppc64/kernel/misc.S 2005-05-25 16:23:26.732557416 -0700 +++ linux-2.6.12-rc5-mm1-plug/arch/ppc64/kernel/misc.S 2005-05-25 17:04:43.776989424 -0700 @@ -1131,12 +1131,6 @@ .llong .sys32_request_key .llong .compat_sys_keyctl .llong .compat_sys_waitid - .llong .sys_ni_syscall /* 273 reserved for sys_ioprio_set */ - .llong .sys_ni_syscall /* 274 reserved for sys_ioprio_get */ - .llong .sys_vperfctr_open /* 275 */ - .llong .sys_vperfctr_control - .llong .sys_vperfctr_write - .llong .sys_vperfctr_read .balign 8 _GLOBAL(sys_call_table) @@ -1413,9 +1407,3 @@ .llong .sys_request_key /* 270 */ .llong .sys_keyctl .llong .sys_waitid - .llong .sys_ni_syscall /* 273 reserved for sys_ioprio_set */ - .llong .sys_ni_syscall /* 274 reserved for sys_ioprio_get */ - .llong .sys_vperfctr_open /* 275 */ - .llong .sys_vperfctr_control - .llong .sys_vperfctr_write - .llong .sys_vperfctr_read diff -Naur linux-2.6.12-rc5-mm1/arch/ppc64/kernel/process.c linux-2.6.12-rc5-mm1-plug/arch/ppc64/kernel/process.c --- linux-2.6.12-rc5-mm1/arch/ppc64/kernel/process.c 2005-05-25 16:23:26.767552096 -0700 +++ linux-2.6.12-rc5-mm1-plug/arch/ppc64/kernel/process.c 2005-05-25 17:04:43.805985016 -0700 @@ -36,7 +36,6 @@ #include #include #include -#include #include #include @@ -218,9 +217,7 @@ local_irq_save(flags); - perfctr_suspend_thread(&prev->thread); last = _switch(old_thread, new_thread); - perfctr_resume_thread(¤t->thread); local_irq_restore(flags); @@ -318,7 +315,6 @@ last_task_used_altivec = NULL; #endif /* CONFIG_ALTIVEC */ #endif /* CONFIG_SMP */ - perfctr_exit_thread(¤t->thread); } void flush_thread(void) @@ -421,8 +417,6 @@ */ kregs->nip = *((unsigned long *)ret_from_fork); - perfctr_copy_task(p, regs); - return 0; } diff -Naur linux-2.6.12-rc5-mm1/arch/x86_64/Kconfig 
linux-2.6.12-rc5-mm1-plug/arch/x86_64/Kconfig --- linux-2.6.12-rc5-mm1/arch/x86_64/Kconfig 2005-05-25 16:23:27.397456336 -0700 +++ linux-2.6.12-rc5-mm1-plug/arch/x86_64/Kconfig 2005-05-25 17:04:43.738995200 -0700 @@ -508,8 +508,6 @@ depends on IA32_EMULATION default y -source "drivers/perfctr/Kconfig" - endmenu source drivers/Kconfig diff -Naur linux-2.6.12-rc5-mm1/arch/x86_64/ia32/ia32entry.S linux-2.6.12-rc5-mm1-plug/arch/x86_64/ia32/ia32entry.S --- linux-2.6.12-rc5-mm1/arch/x86_64/ia32/ia32entry.S 2005-05-25 16:23:27.394456792 -0700 +++ linux-2.6.12-rc5-mm1-plug/arch/x86_64/ia32/ia32entry.S 2005-05-25 17:04:43.737995352 -0700 @@ -595,12 +595,6 @@ .quad sys_add_key .quad sys_request_key .quad sys_keyctl - .quad quiet_ni_syscall /* sys_ioprio_set */ - .quad quiet_ni_syscall /* sys_ioprio_get */ /* 290 */ - .quad sys_vperfctr_open - .quad sys_vperfctr_control - .quad sys_vperfctr_write - .quad sys_vperfctr_read /* don't forget to change IA32_NR_syscalls */ ia32_syscall_end: .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8 diff -Naur linux-2.6.12-rc5-mm1/arch/x86_64/kernel/entry.S linux-2.6.12-rc5-mm1-plug/arch/x86_64/kernel/entry.S --- linux-2.6.12-rc5-mm1/arch/x86_64/kernel/entry.S 2005-05-25 16:23:27.426451928 -0700 +++ linux-2.6.12-rc5-mm1-plug/arch/x86_64/kernel/entry.S 2005-05-25 17:04:43.739995048 -0700 @@ -554,11 +554,6 @@ apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt #endif -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PERFCTR) -ENTRY(perfctr_interrupt) - apicinterrupt LOCAL_PERFCTR_VECTOR,smp_perfctr_interrupt -#endif - /* * Exception entry points. 
*/ diff -Naur linux-2.6.12-rc5-mm1/arch/x86_64/kernel/i8259.c linux-2.6.12-rc5-mm1-plug/arch/x86_64/kernel/i8259.c --- linux-2.6.12-rc5-mm1/arch/x86_64/kernel/i8259.c 2005-05-25 16:23:27.430451320 -0700 +++ linux-2.6.12-rc5-mm1-plug/arch/x86_64/kernel/i8259.c 2005-05-25 17:04:43.740994896 -0700 @@ -24,7 +24,6 @@ #include #include #include -#include #include @@ -580,8 +579,6 @@ set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); #endif - perfctr_vector_init(); - /* * Set the clock to HZ Hz, we already have a valid * vector now: diff -Naur linux-2.6.12-rc5-mm1/arch/x86_64/kernel/process.c linux-2.6.12-rc5-mm1-plug/arch/x86_64/kernel/process.c --- linux-2.6.12-rc5-mm1/arch/x86_64/kernel/process.c 2005-05-25 16:23:27.448448584 -0700 +++ linux-2.6.12-rc5-mm1-plug/arch/x86_64/kernel/process.c 2005-05-25 17:04:43.741994744 -0700 @@ -33,7 +33,6 @@ #include #include #include -#include #include #include @@ -162,8 +161,6 @@ */ void cpu_idle (void) { - set_tsk_need_resched(current); - /* endless idle loop with no priority at all */ while (1) { while (!need_resched()) { @@ -308,7 +305,6 @@ t->io_bitmap_max = 0; put_cpu(); } - perfctr_exit_thread(&me->thread); } void flush_thread(void) @@ -411,8 +407,6 @@ asm("mov %%es,%0" : "=m" (p->thread.es)); asm("mov %%ds,%0" : "=m" (p->thread.ds)); - perfctr_copy_task(p, regs); - if (unlikely(me->thread.io_bitmap_ptr != NULL)) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { @@ -583,8 +577,6 @@ disable_tsc(prev_p->thread_info, next_p->thread_info); - perfctr_resume_thread(next); - return prev_p; } diff -Naur linux-2.6.12-rc5-mm1/drivers/Makefile linux-2.6.12-rc5-mm1-plug/drivers/Makefile --- linux-2.6.12-rc5-mm1/drivers/Makefile 2005-05-25 16:23:32.900619728 -0700 +++ linux-2.6.12-rc5-mm1-plug/drivers/Makefile 2005-05-25 17:04:42.719150240 -0700 @@ -62,7 +62,6 @@ obj-$(CONFIG_EISA) += eisa/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_MMC) += mmc/ -obj-$(CONFIG_PERFCTR) += perfctr/ 
obj-$(CONFIG_INFINIBAND) += infiniband/ obj-$(CONFIG_SGI_IOC4) += sn/ obj-y += firmware/ diff -Naur linux-2.6.12-rc5-mm1/drivers/i2c/chips/ds1337.c.orig linux-2.6.12-rc5-mm1-plug/drivers/i2c/chips/ds1337.c.orig --- linux-2.6.12-rc5-mm1/drivers/i2c/chips/ds1337.c.orig 2005-05-25 16:23:31.735796808 -0700 +++ linux-2.6.12-rc5-mm1-plug/drivers/i2c/chips/ds1337.c.orig 1969-12-31 16:00:00.000000000 -0800 @@ -1,386 +0,0 @@ -/* - * linux/drivers/i2c/chips/ds1337.c - * - * Copyright (C) 2005 James Chapman - * - * based on linux/drivers/acorn/char/pcf8583.c - * Copyright (C) 2000 Russell King - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Driver for Dallas Semiconductor DS1337 and DS1339 real time clock chip - */ - -#include -#include -#include -#include -#include -#include -#include -#include /* get the user-level API */ -#include -#include - -/* Device registers */ -#define DS1337_REG_HOUR 2 -#define DS1337_REG_DAY 3 -#define DS1337_REG_DATE 4 -#define DS1337_REG_MONTH 5 -#define DS1337_REG_CONTROL 14 -#define DS1337_REG_STATUS 15 - -/* FIXME - how do we export these interface constants? 
*/ -#define DS1337_GET_DATE 0 -#define DS1337_SET_DATE 1 - -/* - * Functions declaration - */ -static unsigned short normal_i2c[] = { 0x68, I2C_CLIENT_END }; -static unsigned int normal_isa[] = { I2C_CLIENT_ISA_END }; - -SENSORS_INSMOD_1(ds1337); - -static int ds1337_attach_adapter(struct i2c_adapter *adapter); -static int ds1337_detect(struct i2c_adapter *adapter, int address, int kind); -static void ds1337_init_client(struct i2c_client *client); -static int ds1337_detach_client(struct i2c_client *client); -static int ds1337_command(struct i2c_client *client, unsigned int cmd, - void *arg); - -/* - * Driver data (common to all clients) - */ -static struct i2c_driver ds1337_driver = { - .owner = THIS_MODULE, - .name = "ds1337", - .flags = I2C_DF_NOTIFY, - .attach_adapter = ds1337_attach_adapter, - .detach_client = ds1337_detach_client, - .command = ds1337_command, -}; - -/* - * Client data (each client gets its own) - */ -struct ds1337_data { - struct i2c_client client; - struct list_head list; -}; - -/* - * Internal variables - */ -static LIST_HEAD(ds1337_clients); - -static inline int ds1337_read(struct i2c_client *client, u8 reg, u8 *value) -{ - s32 tmp = i2c_smbus_read_byte_data(client, reg); - - if (tmp < 0) - return -EIO; - - *value = tmp; - - return 0; -} - -/* - * Chip access functions - */ -static int ds1337_get_datetime(struct i2c_client *client, struct rtc_time *dt) -{ - int result; - u8 buf[7]; - u8 val; - struct i2c_msg msg[2]; - u8 offs = 0; - - if (!dt) { - dev_dbg(&client->dev, "%s: EINVAL: dt=NULL\n", __FUNCTION__); - return -EINVAL; - } - - msg[0].addr = client->addr; - msg[0].flags = 0; - msg[0].len = 1; - msg[0].buf = &offs; - - msg[1].addr = client->addr; - msg[1].flags = I2C_M_RD; - msg[1].len = sizeof(buf); - msg[1].buf = &buf[0]; - - result = i2c_transfer(client->adapter, msg, 2); - - dev_dbg(&client->dev, "%s: [%d] %02x %02x %02x %02x %02x %02x %02x\n", - __FUNCTION__, result, buf[0], buf[1], buf[2], buf[3], - buf[4], buf[5], buf[6]); - - 
if (result == 2) { - dt->tm_sec = BCD2BIN(buf[0]); - dt->tm_min = BCD2BIN(buf[1]); - val = buf[2] & 0x3f; - dt->tm_hour = BCD2BIN(val); - dt->tm_wday = BCD2BIN(buf[3]) - 1; - dt->tm_mday = BCD2BIN(buf[4]); - val = buf[5] & 0x7f; - dt->tm_mon = BCD2BIN(val) - 1; - dt->tm_year = BCD2BIN(buf[6]); - if (buf[5] & 0x80) - dt->tm_year += 100; - - dev_dbg(&client->dev, "%s: secs=%d, mins=%d, " - "hours=%d, mday=%d, mon=%d, year=%d, wday=%d\n", - __FUNCTION__, dt->tm_sec, dt->tm_min, - dt->tm_hour, dt->tm_mday, - dt->tm_mon, dt->tm_year, dt->tm_wday); - - return 0; - } - - dev_err(&client->dev, "error reading data! %d\n", result); - return -EIO; -} - -static int ds1337_set_datetime(struct i2c_client *client, struct rtc_time *dt) -{ - int result; - u8 buf[8]; - u8 val; - struct i2c_msg msg[1]; - - if (!dt) { - dev_dbg(&client->dev, "%s: EINVAL: dt=NULL\n", __FUNCTION__); - return -EINVAL; - } - - dev_dbg(&client->dev, "%s: secs=%d, mins=%d, hours=%d, " - "mday=%d, mon=%d, year=%d, wday=%d\n", __FUNCTION__, - dt->tm_sec, dt->tm_min, dt->tm_hour, - dt->tm_mday, dt->tm_mon, dt->tm_year, dt->tm_wday); - - buf[0] = 0; /* reg offset */ - buf[1] = BIN2BCD(dt->tm_sec); - buf[2] = BIN2BCD(dt->tm_min); - buf[3] = BIN2BCD(dt->tm_hour) | (1 << 6); - buf[4] = BIN2BCD(dt->tm_wday) + 1; - buf[5] = BIN2BCD(dt->tm_mday); - buf[6] = BIN2BCD(dt->tm_mon) + 1; - val = dt->tm_year; - if (val >= 100) { - val -= 100; - buf[6] |= (1 << 7); - } - buf[7] = BIN2BCD(val); - - msg[0].addr = client->addr; - msg[0].flags = 0; - msg[0].len = sizeof(buf); - msg[0].buf = &buf[0]; - - result = i2c_transfer(client->adapter, msg, 1); - if (result == 1) - return 0; - - dev_err(&client->dev, "error writing data! 
%d\n", result); - return -EIO; -} - -static int ds1337_command(struct i2c_client *client, unsigned int cmd, - void *arg) -{ - dev_dbg(&client->dev, "%s: cmd=%d\n", __FUNCTION__, cmd); - - switch (cmd) { - case DS1337_GET_DATE: - return ds1337_get_datetime(client, arg); - - case DS1337_SET_DATE: - return ds1337_set_datetime(client, arg); - - default: - return -EINVAL; - } -} - -/* - * Public API for access to specific device. Useful for low-level - * RTC access from kernel code. - */ -int ds1337_do_command(int bus, int cmd, void *arg) -{ - struct list_head *walk; - struct list_head *tmp; - struct ds1337_data *data; - - list_for_each_safe(walk, tmp, &ds1337_clients) { - data = list_entry(walk, struct ds1337_data, list); - if (data->client.adapter->nr == bus) - return ds1337_command(&data->client, cmd, arg); - } - - return -ENODEV; -} - -static int ds1337_attach_adapter(struct i2c_adapter *adapter) -{ - return i2c_detect(adapter, &addr_data, ds1337_detect); -} - -/* - * The following function does more than just detection. If detection - * succeeds, it also registers the new chip. - */ -static int ds1337_detect(struct i2c_adapter *adapter, int address, int kind) -{ - struct i2c_client *new_client; - struct ds1337_data *data; - int err = 0; - const char *name = ""; - - if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE_DATA | - I2C_FUNC_I2C)) - goto exit; - - if (!(data = kmalloc(sizeof(struct ds1337_data), GFP_KERNEL))) { - err = -ENOMEM; - goto exit; - } - memset(data, 0, sizeof(struct ds1337_data)); - INIT_LIST_HEAD(&data->list); - - /* The common I2C client data is placed right before the - * DS1337-specific data. - */ - new_client = &data->client; - i2c_set_clientdata(new_client, data); - new_client->addr = address; - new_client->adapter = adapter; - new_client->driver = &ds1337_driver; - new_client->flags = 0; - - /* - * Now we do the remaining detection. 
A negative kind means that - * the driver was loaded with no force parameter (default), so we - * must both detect and identify the chip. A zero kind means that - * the driver was loaded with the force parameter, the detection - * step shall be skipped. A positive kind means that the driver - * was loaded with the force parameter and a given kind of chip is - * requested, so both the detection and the identification steps - * are skipped. - * - * For detection, we read registers that are most likely to cause - * detection failure, i.e. those that have more bits with fixed - * or reserved values. - */ - - /* Default to an DS1337 if forced */ - if (kind == 0) - kind = ds1337; - - if (kind < 0) { /* detection and identification */ - u8 data; - - /* Check that status register bits 6-2 are zero */ - if ((ds1337_read(new_client, DS1337_REG_STATUS, &data) < 0) || - (data & 0x7c)) - goto exit_free; - - /* Check for a valid day register value */ - if ((ds1337_read(new_client, DS1337_REG_DAY, &data) < 0) || - (data == 0) || (data & 0xf8)) - goto exit_free; - - /* Check for a valid date register value */ - if ((ds1337_read(new_client, DS1337_REG_DATE, &data) < 0) || - (data == 0) || (data & 0xc0) || ((data & 0x0f) > 9) || - (data >= 0x32)) - goto exit_free; - - /* Check for a valid month register value */ - if ((ds1337_read(new_client, DS1337_REG_MONTH, &data) < 0) || - (data == 0) || (data & 0x60) || ((data & 0x0f) > 9) || - ((data >= 0x13) && (data <= 0x19))) - goto exit_free; - - /* Check that control register bits 6-5 are zero */ - if ((ds1337_read(new_client, DS1337_REG_CONTROL, &data) < 0) || - (data & 0x60)) - goto exit_free; - - kind = ds1337; - } - - if (kind == ds1337) - name = "ds1337"; - - /* We can fill in the remaining client fields */ - strlcpy(new_client->name, name, I2C_NAME_SIZE); - - /* Tell the I2C layer a new client has arrived */ - if ((err = i2c_attach_client(new_client))) - goto exit_free; - - /* Initialize the DS1337 chip */ - 
ds1337_init_client(new_client); - - /* Add client to local list */ - list_add(&data->list, &ds1337_clients); - - return 0; - -exit_free: - kfree(data); -exit: - return err; -} - -static void ds1337_init_client(struct i2c_client *client) -{ - s32 val; - - /* Ensure that device is set in 24-hour mode */ - val = i2c_smbus_read_byte_data(client, DS1337_REG_HOUR); - if ((val >= 0) && (val & (1 << 6)) == 0) - i2c_smbus_write_byte_data(client, DS1337_REG_HOUR, - val | (1 << 6)); -} - -static int ds1337_detach_client(struct i2c_client *client) -{ - int err; - struct ds1337_data *data = i2c_get_clientdata(client); - - if ((err = i2c_detach_client(client))) { - dev_err(&client->dev, "Client deregistration failed, " - "client not detached.\n"); - return err; - } - - list_del(&data->list); - kfree(data); - return 0; -} - -static int __init ds1337_init(void) -{ - return i2c_add_driver(&ds1337_driver); -} - -static void __exit ds1337_exit(void) -{ - i2c_del_driver(&ds1337_driver); -} - -MODULE_AUTHOR("James Chapman "); -MODULE_DESCRIPTION("DS1337 RTC driver"); -MODULE_LICENSE("GPL"); - -EXPORT_SYMBOL_GPL(ds1337_do_command); - -module_init(ds1337_init); -module_exit(ds1337_exit); diff -Naur linux-2.6.12-rc5-mm1/drivers/perfctr/Kconfig linux-2.6.12-rc5-mm1-plug/drivers/perfctr/Kconfig --- linux-2.6.12-rc5-mm1/drivers/perfctr/Kconfig 2005-05-25 16:23:37.404934968 -0700 +++ linux-2.6.12-rc5-mm1-plug/drivers/perfctr/Kconfig 1969-12-31 16:00:00.000000000 -0800 @@ -1,64 +0,0 @@ -# $Id: Kconfig,v 1.10 2004/05/24 11:00:55 mikpe Exp $ -# Performance-monitoring counters driver configuration -# - -menu "Performance-monitoring counters support" - -config PERFCTR - bool "Performance monitoring counters support" - help - This driver provides access to the performance-monitoring counter - registers available in some (but not all) modern processors. 
- These special-purpose registers can be programmed to count low-level - performance-related events which occur during program execution, - such as cache misses, pipeline stalls, etc. - - You can safely say Y here, even if you intend to run the kernel - on a processor without performance-monitoring counters. - - At you can find - the corresponding user-space components, as well as other - versions of this package. A mailing list is also available, at - . - -config PERFCTR_INIT_TESTS - bool "Init-time hardware tests" - depends on PERFCTR - default n - help - This option makes the driver perform additional hardware tests - during initialisation, and log their results in the kernel's - message buffer. For most supported processors, these tests simply - measure the runtime overheads of performance counter operations. - - If you have a less well-known processor (one not listed in the - etc/costs/ directory in the user-space package), you should enable - this option and email the results to the perfctr developers. - - If unsure, say N. - -config PERFCTR_VIRTUAL - bool "Virtual performance counters support" - depends on PERFCTR - default y - help - The processor's performance-monitoring counters are special-purpose - global registers. This option adds support for virtual per-process - performance-monitoring counters which only run when the process - to which they belong is executing. This improves the accuracy of - performance measurements by reducing "noise" from other processes. - - Say Y. 
- -config PERFCTR_INTERRUPT_SUPPORT - prompt "Performance counter overflow interrupt support" if PPC - bool - depends on PERFCTR - default y if X86_LOCAL_APIC - -config PERFCTR_CPUS_FORBIDDEN_MASK - bool - depends on PERFCTR - default y if X86 && SMP - -endmenu diff -Naur linux-2.6.12-rc5-mm1/drivers/perfctr/Makefile linux-2.6.12-rc5-mm1-plug/drivers/perfctr/Makefile --- linux-2.6.12-rc5-mm1/drivers/perfctr/Makefile 2005-05-25 16:23:37.404934968 -0700 +++ linux-2.6.12-rc5-mm1-plug/drivers/perfctr/Makefile 1969-12-31 16:00:00.000000000 -0800 @@ -1,19 +0,0 @@ -# $Id: Makefile,v 1.27 2005/03/23 01:29:34 mikpe Exp $ -# Makefile for the Performance-monitoring counters driver. - -# This also covers x86_64. -perfctr-objs-$(CONFIG_X86) := x86.o -tests-objs-$(CONFIG_X86) := x86_tests.o - -perfctr-objs-$(CONFIG_PPC32) := ppc.o -tests-objs-$(CONFIG_PPC32) := ppc_tests.o - -perfctr-objs-$(CONFIG_PPC64) := ppc64.o -tests-objs-$(CONFIG_PPC64) := ppc64_tests.o - -perfctr-objs-y += init.o -perfctr-objs-$(CONFIG_PERFCTR_INIT_TESTS) += $(tests-objs-y) -perfctr-objs-$(CONFIG_PERFCTR_VIRTUAL) += virtual.o - -perfctr-objs := $(perfctr-objs-y) -obj-$(CONFIG_PERFCTR) := perfctr.o diff -Naur linux-2.6.12-rc5-mm1/drivers/perfctr/cpumask.h linux-2.6.12-rc5-mm1-plug/drivers/perfctr/cpumask.h --- linux-2.6.12-rc5-mm1/drivers/perfctr/cpumask.h 2005-05-25 16:23:37.402935272 -0700 +++ linux-2.6.12-rc5-mm1-plug/drivers/perfctr/cpumask.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,25 +0,0 @@ -/* $Id: cpumask.h,v 1.7 2004/05/12 19:59:01 mikpe Exp $ - * Performance-monitoring counters driver. - * Partial simulation of cpumask_t on non-cpumask_t kernels. - * Extension to allow inspecting a cpumask_t as array of ulong. - * Appropriate definition of perfctr_cpus_forbidden_mask. 
- * - * Copyright (C) 2003-2004 Mikael Pettersson - */ - -#ifdef CPU_ARRAY_SIZE -#define PERFCTR_CPUMASK_NRLONGS CPU_ARRAY_SIZE -#else -#define PERFCTR_CPUMASK_NRLONGS 1 -#endif - -/* CPUs in `perfctr_cpus_forbidden_mask' must not use the - performance-monitoring counters. TSC use is unrestricted. - This is needed to prevent resource conflicts on hyper-threaded P4s. */ -#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK -extern cpumask_t perfctr_cpus_forbidden_mask; -#define perfctr_cpu_is_forbidden(cpu) cpu_isset((cpu), perfctr_cpus_forbidden_mask) -#else -#define perfctr_cpus_forbidden_mask CPU_MASK_NONE -#define perfctr_cpu_is_forbidden(cpu) 0 /* cpu_isset() needs an lvalue :-( */ -#endif diff -Naur linux-2.6.12-rc5-mm1/drivers/perfctr/init.c linux-2.6.12-rc5-mm1-plug/drivers/perfctr/init.c --- linux-2.6.12-rc5-mm1/drivers/perfctr/init.c 2005-05-25 16:23:37.403935120 -0700 +++ linux-2.6.12-rc5-mm1-plug/drivers/perfctr/init.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,115 +0,0 @@ -/* $Id: init.c,v 1.81 2005/03/17 23:49:07 mikpe Exp $ - * Performance-monitoring counters driver. - * Top-level initialisation code. 
- * - * Copyright (C) 1999-2005 Mikael Pettersson - */ -#include -#include -#include -#include -#include -#include - -#include "cpumask.h" -#include "virtual.h" -#include "version.h" - -struct perfctr_info perfctr_info; - -static ssize_t -driver_version_show(struct class *class, char *buf) -{ - return sprintf(buf, "%s\n", VERSION); -} - -static ssize_t -cpu_features_show(struct class *class, char *buf) -{ - return sprintf(buf, "%#x\n", perfctr_info.cpu_features); -} - -static ssize_t -cpu_khz_show(struct class *class, char *buf) -{ - return sprintf(buf, "%u\n", perfctr_info.cpu_khz); -} - -static ssize_t -tsc_to_cpu_mult_show(struct class *class, char *buf) -{ - return sprintf(buf, "%u\n", perfctr_info.tsc_to_cpu_mult); -} - -static ssize_t -state_user_offset_show(struct class *class, char *buf) -{ - return sprintf(buf, "%u\n", (unsigned int)offsetof(struct perfctr_cpu_state, user)); -} - -static ssize_t -cpus_online_show(struct class *class, char *buf) -{ - int ret = cpumask_scnprintf(buf, PAGE_SIZE-1, cpu_online_map); - buf[ret++] = '\n'; - return ret; -} - -static ssize_t -cpus_forbidden_show(struct class *class, char *buf) -{ - int ret = cpumask_scnprintf(buf, PAGE_SIZE-1, perfctr_cpus_forbidden_mask); - buf[ret++] = '\n'; - return ret; -} - -static struct class_attribute perfctr_class_attrs[] = { - __ATTR_RO(driver_version), - __ATTR_RO(cpu_features), - __ATTR_RO(cpu_khz), - __ATTR_RO(tsc_to_cpu_mult), - __ATTR_RO(state_user_offset), - __ATTR_RO(cpus_online), - __ATTR_RO(cpus_forbidden), - __ATTR_NULL -}; - -static struct class perfctr_class = { - .name = "perfctr", - .class_attrs = perfctr_class_attrs, -}; - -char *perfctr_cpu_name __initdata; - -static int __init perfctr_init(void) -{ - int err; - - err = perfctr_cpu_init(); - if (err) { - printk(KERN_INFO "perfctr: not supported by this processor\n"); - return err; - } - err = vperfctr_init(); - if (err) - return err; - err = class_register(&perfctr_class); - if (err) { - printk(KERN_ERR "perfctr: class 
initialisation failed\n"); - return err; - } - printk(KERN_INFO "perfctr: driver %s, cpu type %s at %u kHz\n", - VERSION, - perfctr_cpu_name, - perfctr_info.cpu_khz); - return 0; -} - -static void __exit perfctr_exit(void) -{ - vperfctr_exit(); - perfctr_cpu_exit(); -} - -module_init(perfctr_init) -module_exit(perfctr_exit) diff -Naur linux-2.6.12-rc5-mm1/drivers/perfctr/ppc.c linux-2.6.12-rc5-mm1-plug/drivers/perfctr/ppc.c --- linux-2.6.12-rc5-mm1/drivers/perfctr/ppc.c 2005-05-25 16:23:37.411933904 -0700 +++ linux-2.6.12-rc5-mm1-plug/drivers/perfctr/ppc.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,1090 +0,0 @@ -/* $Id: ppc.c,v 1.39 2005/04/08 14:36:49 mikpe Exp $ - * PPC32 performance-monitoring counters driver. - * - * Copyright (C) 2004-2005 Mikael Pettersson - */ -#include -#include -#include -#include -#include -#include -#include /* tb_ticks_per_jiffy, get_tbl() */ - -#include "ppc_tests.h" - -/* Support for lazy evntsel and perfctr SPR updates. */ -struct per_cpu_cache { /* roughly a subset of perfctr_cpu_state */ - unsigned int id; /* cache owner id */ - /* Physically indexed cache of the MMCRs. */ - unsigned int ppc_mmcr[3]; -}; -static DEFINE_PER_CPU(struct per_cpu_cache, per_cpu_cache); -#define __get_cpu_cache(cpu) (&per_cpu(per_cpu_cache, cpu)) -#define get_cpu_cache() (&__get_cpu_var(per_cpu_cache)) - -/* Structure for counter snapshots, as 32-bit values. */ -struct perfctr_low_ctrs { - unsigned int tsc; - unsigned int pmc[6]; -}; - -enum pm_type { - PM_NONE, - PM_604, - PM_604e, - PM_750, /* XXX: Minor event set diffs between IBM and Moto. 
*/ - PM_7400, - PM_7450, -}; -static enum pm_type pm_type; - -static unsigned int new_id(void) -{ - static DEFINE_SPINLOCK(lock); - static unsigned int counter; - int id; - - spin_lock(&lock); - id = ++counter; - spin_unlock(&lock); - return id; -} - -#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT -static void perfctr_default_ihandler(unsigned long pc) -{ -} - -static perfctr_ihandler_t perfctr_ihandler = perfctr_default_ihandler; - -void do_perfctr_interrupt(struct pt_regs *regs) -{ - preempt_disable(); - (*perfctr_ihandler)(instruction_pointer(regs)); - preempt_enable_no_resched(); -} - -void perfctr_cpu_set_ihandler(perfctr_ihandler_t ihandler) -{ - perfctr_ihandler = ihandler ? ihandler : perfctr_default_ihandler; -} - -#else -#define perfctr_cstatus_has_ictrs(cstatus) 0 -#endif - -#if defined(CONFIG_SMP) && defined(CONFIG_PERFCTR_INTERRUPT_SUPPORT) - -static inline void -set_isuspend_cpu(struct perfctr_cpu_state *state, int cpu) -{ - state->isuspend_cpu = cpu; -} - -static inline int -is_isuspend_cpu(const struct perfctr_cpu_state *state, int cpu) -{ - return state->isuspend_cpu == cpu; -} - -static inline void clear_isuspend_cpu(struct perfctr_cpu_state *state) -{ - state->isuspend_cpu = NR_CPUS; -} - -#else -static inline void set_isuspend_cpu(struct perfctr_cpu_state *state, int cpu) { } -static inline int is_isuspend_cpu(const struct perfctr_cpu_state *state, int cpu) { return 1; } -static inline void clear_isuspend_cpu(struct perfctr_cpu_state *state) { } -#endif - -/* The ppc driver internally uses cstatus & (1<<30) to record that - a context has an asynchronously changing MMCR0. */ -static inline unsigned int perfctr_cstatus_set_mmcr0_quirk(unsigned int cstatus) -{ - return cstatus | (1 << 30); -} - -static inline int perfctr_cstatus_has_mmcr0_quirk(unsigned int cstatus) -{ - return cstatus & (1 << 30); -} - -/**************************************************************** - * * - * Driver procedures. 
* - * * - ****************************************************************/ - -/* - * The PowerPC 604/750/74xx family. - * - * Common features - * --------------- - * - Per counter event selection data in subfields of control registers. - * MMCR0 contains both global control and PMC1/PMC2 event selectors. - * - Overflow interrupt support is present in all processors, but an - * erratum makes it difficult to use in 750/7400/7410 processors. - * - There is no concept of per-counter qualifiers: - * - User-mode/supervisor-mode restrictions are global. - * - Two groups of counters, PMC1 and PMC2-PMC. Each group - * has a single overflow interrupt/event enable/disable flag. - * - The instructions used to read (mfspr) and write (mtspr) the control - * and counter registers (SPRs) only support hardcoded register numbers. - * There is no support for accessing an SPR via a runtime value. - * - Each counter supports its own unique set of events. However, events - * 0-1 are common for PMC1-PMC4, and events 2-4 are common for PMC1-PMC4. - * - There is no separate high-resolution core clock counter. - * The time-base counter is available, but it typically runs an order of - * magnitude slower than the core clock. - * Any performance counter can be programmed to count core clocks, but - * doing this (a) reserves one PMC, and (b) needs indirect accesses - * since the SPR number in general isn't known at compile-time. - * - * 604 - * --- - * 604 has MMCR0, PMC1, PMC2, SIA, and SDA. - * - * MMCR0[THRESHOLD] is not automatically multiplied. - * - * On the 604, software must always reset MMCR0[ENINT] after - * taking a PMI. This is not the case for the 604e. - * - * 604e - * ---- - * 604e adds MMCR1, PMC3, and PMC4. - * Bus-to-core multiplier is available via HID1[PLL_CFG]. - * - * MMCR0[THRESHOLD] is automatically multiplied by 4. - * - * When the 604e vectors to the PMI handler, it automatically - * clears any pending PMIs. 
Unlike the 604, the 604e does not - * require MMCR0[ENINT] to be cleared (and possibly reset) - * before external interrupts can be re-enabled. - * - * 750 - * --- - * 750 adds user-readable MMCRn/PMCn/SIA registers, and removes SDA. - * - * MMCR0[THRESHOLD] is not automatically multiplied. - * - * Motorola MPC750UM.pdf, page C-78, states: "The performance monitor - * of the MPC755 functions the same as that of the MPC750, (...), except - * that for both the MPC750 and MPC755, no combination of the thermal - * assist unit, the decrementer register, and the performance monitor - * can be used at any one time. If exceptions for any two of these - * functional blocks are enabled together, multiple exceptions caused - * by any of these three blocks cause unpredictable results." - * - * IBM 750CXe_Err_DD2X.pdf, Erratum #13, states that a PMI which - * occurs immediately after a delayed decrementer exception can - * corrupt SRR0, causing the processor to hang. It also states that - * PMIs via TB bit transitions can be used to simulate the decrementer. - * - * 750FX adds dual-PLL support and programmable core frequency switching. - * - * 750FX DD2.3 fixed the DEC/PMI SRR0 corruption erratum. - * - * 74xx - * ---- - * 7400 adds MMCR2 and BAMR. - * - * MMCR0[THRESHOLD] is multiplied by 2 or 32, as specified - * by MMCR2[THRESHMULT]. - * - * 74xx changes the semantics of several MMCR0 control bits, - * compared to 604/750. - * - * PPC7410 Erratum No. 10: Like the MPC750 TAU/DECR/PMI erratum. - * Erratum No. 14 marks TAU as unsupported in 7410, but this leaves - * perfmon and decrementer interrupts as being mutually exclusive. - * Affects PPC7410 1.0-1.2 (PVR 0x800C1100-0x800C1102). 1.3 and up - * (PVR 0x800C1103 up) are Ok. - * - * 7450 adds PMC5 and PMC6. 
- * - * 7455/7445 V3.3 (PVR 80010303) and later use the 7457 PLL table, - * earlier revisions use the 7450 PLL table - */ - -static inline unsigned int read_pmc(unsigned int pmc) -{ - switch (pmc) { - default: /* impossible, but silences gcc warning */ - case 0: - return mfspr(SPRN_PMC1); - case 1: - return mfspr(SPRN_PMC2); - case 2: - return mfspr(SPRN_PMC3); - case 3: - return mfspr(SPRN_PMC4); - case 4: - return mfspr(SPRN_PMC5); - case 5: - return mfspr(SPRN_PMC6); - } -} - -static void ppc_read_counters(struct perfctr_cpu_state *state, - struct perfctr_low_ctrs *ctrs) -{ - unsigned int cstatus, nrctrs, i; - - cstatus = state->user.cstatus; - if (perfctr_cstatus_has_tsc(cstatus)) - ctrs->tsc = get_tbl(); - nrctrs = perfctr_cstatus_nractrs(cstatus); - for(i = 0; i < nrctrs; ++i) { - unsigned int pmc = state->control.pmc_map[i]; - ctrs->pmc[i] = read_pmc(pmc); - } -} - -static unsigned int pmc_max_event(unsigned int pmc) -{ - switch (pmc) { - default: /* impossible, but silences gcc warning */ - case 0: - return 127; - case 1: - return 63; - case 2: - return 31; - case 3: - return 31; - case 4: - return 31; - case 5: - return 63; - } -} - -static unsigned int get_nr_pmcs(void) -{ - switch (pm_type) { - case PM_7450: - return 6; - case PM_7400: - case PM_750: - case PM_604e: - return 4; - case PM_604: - return 2; - default: /* PM_NONE, but silences gcc warning */ - return 0; - } -} - -static int ppc_check_control(struct perfctr_cpu_state *state) -{ - unsigned int i, nractrs, nrctrs, pmc_mask, pmi_mask, pmc; - unsigned int nr_pmcs, evntsel[6]; - - nr_pmcs = get_nr_pmcs(); - nractrs = state->control.header.nractrs; - nrctrs = nractrs + state->control.header.nrictrs; - if (nrctrs < nractrs || nrctrs > nr_pmcs) - return -EINVAL; - - pmc_mask = 0; - pmi_mask = 0; - evntsel[1-1] = (state->control.mmcr0 >> (31-25)) & 0x7F; - evntsel[2-1] = (state->control.mmcr0 >> (31-31)) & 0x3F; - evntsel[3-1] = (state->control.mmcr1 >> (31- 4)) & 0x1F; - evntsel[4-1] = 
(state->control.mmcr1 >> (31- 9)) & 0x1F; - evntsel[5-1] = (state->control.mmcr1 >> (31-14)) & 0x1F; - evntsel[6-1] = (state->control.mmcr1 >> (31-20)) & 0x3F; - - for(i = 0; i < nrctrs; ++i) { - pmc = state->control.pmc_map[i]; - if (pmc >= nr_pmcs || (pmc_mask & (1<= nractrs) - pmi_mask |= (1< pmc_max_event(pmc)) - return -EINVAL; - } - - /* unused event selectors must be zero */ - for(i = 0; i < ARRAY_SIZE(evntsel); ++i) - if (!(pmc_mask & (1<control.mmcr2 & MMCR2_RESERVED) - return -EINVAL; - break; - default: - if (state->control.mmcr2) - return -EINVAL; - } - - /* check MMCR1; non-existent event selectors are taken care of - by the "unused event selectors must be zero" check above */ - if (state->control.mmcr1 & MMCR1__RESERVED) - return -EINVAL; - - /* We do not yet handle TBEE as the only exception cause, - so PMXE requires at least one interrupt-mode counter. */ - if ((state->control.mmcr0 & MMCR0_PMXE) && !state->control.header.nrictrs) - return -EINVAL; - - state->id = new_id(); - - /* - * MMCR0[FC] and MMCR0[TRIGGER] may change on 74xx if FCECE or - * TRIGGER is set. At suspends we must read MMCR0 back into - * the state and the cache and then freeze the counters, and - * at resumes we must unfreeze the counters and reload MMCR0. - */ - switch (pm_type) { - case PM_7450: - case PM_7400: - if (state->control.mmcr0 & (MMCR0_FCECE | MMCR0_TRIGGER)) - state->user.cstatus = perfctr_cstatus_set_mmcr0_quirk(state->user.cstatus); - default: - ; - } - - /* The MMCR0 handling for FCECE and TRIGGER is also needed for PMXE. 
*/ - if (state->control.mmcr0 & (MMCR0_PMXE | MMCR0_FCECE | MMCR0_TRIGGER)) - state->user.cstatus = perfctr_cstatus_set_mmcr0_quirk(state->user.cstatus); - - return 0; -} - -#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT -/* PRE: perfctr_cstatus_has_ictrs(state->cstatus) != 0 */ -/* PRE: counters frozen */ -static void ppc_isuspend(struct perfctr_cpu_state *state) -{ - struct per_cpu_cache *cache; - unsigned int cstatus, nrctrs, i; - int cpu; - - cpu = smp_processor_id(); - set_isuspend_cpu(state, cpu); /* early to limit cpu's live range */ - cache = __get_cpu_cache(cpu); - cstatus = state->user.cstatus; - nrctrs = perfctr_cstatus_nrctrs(cstatus); - for(i = perfctr_cstatus_nractrs(cstatus); i < nrctrs; ++i) { - unsigned int pmc = state->control.pmc_map[i]; - unsigned int now = read_pmc(pmc); - state->user.pmc[i].sum += now - state->user.pmc[i].start; - state->user.pmc[i].start = now; - } - /* cache->id is still == state->id */ -} - -static void ppc_iresume(const struct perfctr_cpu_state *state) -{ - struct per_cpu_cache *cache; - unsigned int cstatus, nrctrs, i; - int cpu; - unsigned int pmc[6]; - - cpu = smp_processor_id(); - cache = __get_cpu_cache(cpu); - if (cache->id == state->id) { - /* Clearing cache->id to force write_control() - to unfreeze MMCR0 would be done here, but it - is subsumed by resume()'s MMCR0 reload logic. */ - if (is_isuspend_cpu(state, cpu)) - return; /* skip reload of PMCs */ - } - /* - * The CPU state wasn't ours. - * - * The counters must be frozen before being reinitialised, - * to prevent unexpected increments and missed overflows. - * - * All unused counters must be reset to a non-overflow state. 
- */ - if (!(cache->ppc_mmcr[0] & MMCR0_FC)) { - cache->ppc_mmcr[0] |= MMCR0_FC; - mtspr(SPRN_MMCR0, cache->ppc_mmcr[0]); - } - memset(&pmc[0], 0, sizeof pmc); - cstatus = state->user.cstatus; - nrctrs = perfctr_cstatus_nrctrs(cstatus); - for(i = perfctr_cstatus_nractrs(cstatus); i < nrctrs; ++i) - pmc[state->control.pmc_map[i]] = state->user.pmc[i].start; - - switch (pm_type) { - case PM_7450: - mtspr(SPRN_PMC6, pmc[6-1]); - mtspr(SPRN_PMC5, pmc[5-1]); - case PM_7400: - case PM_750: - case PM_604e: - mtspr(SPRN_PMC4, pmc[4-1]); - mtspr(SPRN_PMC3, pmc[3-1]); - case PM_604: - mtspr(SPRN_PMC2, pmc[2-1]); - mtspr(SPRN_PMC1, pmc[1-1]); - case PM_NONE: - ; - } - /* cache->id remains != state->id */ -} -#endif - -static void ppc_write_control(const struct perfctr_cpu_state *state) -{ - struct per_cpu_cache *cache; - unsigned int value; - - cache = get_cpu_cache(); - if (cache->id == state->id) - return; - /* - * Order matters here: update threshmult and event - * selectors before updating global control, which - * potentially enables PMIs. - * - * Since mtspr doesn't accept a runtime value for the - * SPR number, unroll the loop so each mtspr targets - * a constant SPR. - * - * For processors without MMCR2, we ensure that the - * cache and the state indicate the same value for it, - * preventing any actual mtspr to it. Ditto for MMCR1. 
- */ - value = state->control.mmcr2; - if (value != cache->ppc_mmcr[2]) { - cache->ppc_mmcr[2] = value; - mtspr(SPRN_MMCR2, value); - } - value = state->control.mmcr1; - if (value != cache->ppc_mmcr[1]) { - cache->ppc_mmcr[1] = value; - mtspr(SPRN_MMCR1, value); - } - value = state->control.mmcr0; - if (value != cache->ppc_mmcr[0]) { - cache->ppc_mmcr[0] = value; - mtspr(SPRN_MMCR0, value); - } - cache->id = state->id; -} - -static void ppc_clear_counters(void) -{ - switch (pm_type) { - case PM_7450: - case PM_7400: - mtspr(SPRN_MMCR2, 0); - mtspr(SPRN_BAMR, 0); - case PM_750: - case PM_604e: - mtspr(SPRN_MMCR1, 0); - case PM_604: - mtspr(SPRN_MMCR0, 0); - case PM_NONE: - ; - } - switch (pm_type) { - case PM_7450: - mtspr(SPRN_PMC6, 0); - mtspr(SPRN_PMC5, 0); - case PM_7400: - case PM_750: - case PM_604e: - mtspr(SPRN_PMC4, 0); - mtspr(SPRN_PMC3, 0); - case PM_604: - mtspr(SPRN_PMC2, 0); - mtspr(SPRN_PMC1, 0); - case PM_NONE: - ; - } -} - -/* - * Driver methods, internal and exported. - */ - -static void perfctr_cpu_write_control(const struct perfctr_cpu_state *state) -{ - return ppc_write_control(state); -} - -static void perfctr_cpu_read_counters(struct perfctr_cpu_state *state, - struct perfctr_low_ctrs *ctrs) -{ - return ppc_read_counters(state, ctrs); -} - -#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT -static void perfctr_cpu_isuspend(struct perfctr_cpu_state *state) -{ - return ppc_isuspend(state); -} - -static void perfctr_cpu_iresume(const struct perfctr_cpu_state *state) -{ - return ppc_iresume(state); -} - -/* Call perfctr_cpu_ireload() just before perfctr_cpu_resume() to - bypass internal caching and force a reload if the I-mode PMCs. 
*/ -void perfctr_cpu_ireload(struct perfctr_cpu_state *state) -{ - state->control.mmcr0 |= MMCR0_PMXE; -#ifdef CONFIG_SMP - clear_isuspend_cpu(state); -#else - get_cpu_cache()->id = 0; -#endif -} - -/* PRE: the counters have been suspended and sampled by perfctr_cpu_suspend() */ -unsigned int perfctr_cpu_identify_overflow(struct perfctr_cpu_state *state) -{ - unsigned int cstatus, nrctrs, i, pmc_mask; - - cstatus = state->user.cstatus; - nrctrs = perfctr_cstatus_nrctrs(cstatus); - pmc_mask = 0; - for(i = perfctr_cstatus_nractrs(cstatus); i < nrctrs; ++i) { - if ((int)state->user.pmc[i].start < 0) { /* PPC-specific */ - unsigned int pmc = state->control.pmc_map[i]; - /* XXX: "+=" to correct for overshots */ - state->user.pmc[i].start = state->control.ireset[pmc]; - pmc_mask |= (1 << i); - } - } - if (!pmc_mask && (state->control.mmcr0 & MMCR0_TBEE)) - pmc_mask = (1<<8); /* fake TB bit flip indicator */ - return pmc_mask; -} - -static inline int check_ireset(struct perfctr_cpu_state *state) -{ - unsigned int nrctrs, i; - - i = state->control.header.nractrs; - nrctrs = i + state->control.header.nrictrs; - for(; i < nrctrs; ++i) { - unsigned int pmc = state->control.pmc_map[i]; - if ((int)state->control.ireset[pmc] < 0) /* PPC-specific */ - return -EINVAL; - state->user.pmc[i].start = state->control.ireset[pmc]; - } - return 0; -} - -#else /* CONFIG_PERFCTR_INTERRUPT_SUPPORT */ -static inline void perfctr_cpu_isuspend(struct perfctr_cpu_state *state) { } -static inline void perfctr_cpu_iresume(const struct perfctr_cpu_state *state) { } -static inline int check_ireset(struct perfctr_cpu_state *state) { return 0; } -#endif /* CONFIG_PERFCTR_INTERRUPT_SUPPORT */ - -static int check_control(struct perfctr_cpu_state *state) -{ - return ppc_check_control(state); -} - -int perfctr_cpu_update_control(struct perfctr_cpu_state *state, int is_global) -{ - int err; - - clear_isuspend_cpu(state); - state->user.cstatus = 0; - - /* disallow i-mode counters if we cannot catch the 
interrupts */ - if (!(perfctr_info.cpu_features & PERFCTR_FEATURE_PCINT) - && state->control.header.nrictrs) - return -EPERM; - - err = check_control(state); /* may initialise state->cstatus */ - if (err < 0) - return err; - err = check_ireset(state); - if (err < 0) { - state->user.cstatus = 0; - return err; - } - state->user.cstatus |= perfctr_mk_cstatus(state->control.header.tsc_on, - state->control.header.nractrs, - state->control.header.nrictrs); - return 0; -} - -/* - * get_reg_offset() maps SPR numbers to offsets into struct perfctr_cpu_control, - * suitable for accessing control data of type unsigned int. - */ -static const struct { - unsigned int spr; - unsigned int offset; -} reg_offsets[] = { - { SPRN_MMCR0, offsetof(struct perfctr_cpu_control, mmcr0) }, - { SPRN_MMCR1, offsetof(struct perfctr_cpu_control, mmcr1) }, - { SPRN_MMCR2, offsetof(struct perfctr_cpu_control, mmcr2) }, - { SPRN_PMC1, offsetof(struct perfctr_cpu_control, ireset[1-1]) }, - { SPRN_PMC2, offsetof(struct perfctr_cpu_control, ireset[2-1]) }, - { SPRN_PMC3, offsetof(struct perfctr_cpu_control, ireset[3-1]) }, - { SPRN_PMC4, offsetof(struct perfctr_cpu_control, ireset[4-1]) }, - { SPRN_PMC5, offsetof(struct perfctr_cpu_control, ireset[5-1]) }, - { SPRN_PMC6, offsetof(struct perfctr_cpu_control, ireset[6-1]) }, -}; - -static int get_reg_offset(unsigned int spr) -{ - unsigned int i; - - for(i = 0; i < ARRAY_SIZE(reg_offsets); ++i) - if (spr == reg_offsets[i].spr) - return reg_offsets[i].offset; - return -1; -} - -static int access_regs(struct perfctr_cpu_control *control, - void *argp, unsigned int argbytes, int do_write) -{ - struct perfctr_cpu_reg *regs; - unsigned int i, nr_regs, *where; - int offset; - - nr_regs = argbytes / sizeof(struct perfctr_cpu_reg); - if (nr_regs * sizeof(struct perfctr_cpu_reg) != argbytes) - return -EINVAL; - regs = (struct perfctr_cpu_reg*)argp; - - for(i = 0; i < nr_regs; ++i) { - offset = get_reg_offset(regs[i].nr); - if (offset < 0) - return -EINVAL; - 
where = (unsigned int*)((char*)control + offset); - if (do_write) - *where = regs[i].value; - else - regs[i].value = *where; - } - return argbytes; -} - -int perfctr_cpu_control_write(struct perfctr_cpu_control *control, unsigned int domain, - const void *srcp, unsigned int srcbytes) -{ - if (domain != PERFCTR_DOMAIN_CPU_REGS) - return -EINVAL; - return access_regs(control, (void*)srcp, srcbytes, 1); -} - -int perfctr_cpu_control_read(const struct perfctr_cpu_control *control, unsigned int domain, - void *dstp, unsigned int dstbytes) -{ - if (domain != PERFCTR_DOMAIN_CPU_REGS) - return -EINVAL; - return access_regs((struct perfctr_cpu_control*)control, dstp, dstbytes, 0); -} - -void perfctr_cpu_suspend(struct perfctr_cpu_state *state) -{ - unsigned int i, cstatus, nractrs; - struct perfctr_low_ctrs now; - - if (perfctr_cstatus_has_mmcr0_quirk(state->user.cstatus)) { - unsigned int mmcr0 = mfspr(SPRN_MMCR0); - mtspr(SPRN_MMCR0, mmcr0 | MMCR0_FC); - get_cpu_cache()->ppc_mmcr[0] = mmcr0 | MMCR0_FC; - state->control.mmcr0 = mmcr0; - } - if (perfctr_cstatus_has_ictrs(state->user.cstatus)) - perfctr_cpu_isuspend(state); - perfctr_cpu_read_counters(state, &now); - cstatus = state->user.cstatus; - if (perfctr_cstatus_has_tsc(cstatus)) - state->user.tsc_sum += now.tsc - state->user.tsc_start; - nractrs = perfctr_cstatus_nractrs(cstatus); - for(i = 0; i < nractrs; ++i) - state->user.pmc[i].sum += now.pmc[i] - state->user.pmc[i].start; -} - -void perfctr_cpu_resume(struct perfctr_cpu_state *state) -{ - if (perfctr_cstatus_has_ictrs(state->user.cstatus)) - perfctr_cpu_iresume(state); - if (perfctr_cstatus_has_mmcr0_quirk(state->user.cstatus)) - get_cpu_cache()->id = 0; /* force reload of MMCR0 */ - perfctr_cpu_write_control(state); - //perfctr_cpu_read_counters(state, &state->start); - { - struct perfctr_low_ctrs now; - unsigned int i, cstatus, nrctrs; - perfctr_cpu_read_counters(state, &now); - cstatus = state->user.cstatus; - if (perfctr_cstatus_has_tsc(cstatus)) - 
state->user.tsc_start = now.tsc; - nrctrs = perfctr_cstatus_nractrs(cstatus); - for(i = 0; i < nrctrs; ++i) - state->user.pmc[i].start = now.pmc[i]; - } - ++state->user.samplecnt; -} - -void perfctr_cpu_sample(struct perfctr_cpu_state *state) -{ - unsigned int i, cstatus, nractrs; - struct perfctr_low_ctrs now; - - perfctr_cpu_read_counters(state, &now); - cstatus = state->user.cstatus; - if (perfctr_cstatus_has_tsc(cstatus)) { - state->user.tsc_sum += now.tsc - state->user.tsc_start; - state->user.tsc_start = now.tsc; - } - nractrs = perfctr_cstatus_nractrs(cstatus); - for(i = 0; i < nractrs; ++i) { - state->user.pmc[i].sum += now.pmc[i] - state->user.pmc[i].start; - state->user.pmc[i].start = now.pmc[i]; - } - ++state->user.samplecnt; -} - -static void perfctr_cpu_clear_counters(void) -{ - struct per_cpu_cache *cache; - - cache = get_cpu_cache(); - memset(cache, 0, sizeof *cache); - cache->id = -1; - - ppc_clear_counters(); -} - -/**************************************************************** - * * - * Processor detection and initialisation procedures. * - * * - ****************************************************************/ - -/* Derive CPU core frequency from TB frequency and PLL_CFG. */ - -enum pll_type { - PLL_NONE, /* for e.g. 604 which has no HID1[PLL_CFG] */ - PLL_604e, - PLL_750, - PLL_750FX, - PLL_7400, - PLL_7450, - PLL_7457, -}; - -/* These are the known bus-to-core ratios, indexed by PLL_CFG. - Multiplied by 2 since half-multiplier steps are present. 
*/ - -static unsigned char cfg_ratio_604e[16] __initdata = { // *2 - 2, 2, 14, 2, 4, 13, 5, 9, - 6, 11, 8, 10, 3, 12, 7, 0 -}; - -static unsigned char cfg_ratio_750[16] __initdata = { // *2 - 5, 15, 14, 2, 4, 13, 20, 9, // 0b0110 is 18 if L1_TSTCLK=0, but that is abnormal - 6, 11, 8, 10, 16, 12, 7, 0 -}; - -static unsigned char cfg_ratio_750FX[32] __initdata = { // *2 - 0, 0, 2, 2, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 22, 24, 26, - 28, 30, 32, 34, 36, 38, 40, 0 -}; - -static unsigned char cfg_ratio_7400[16] __initdata = { // *2 - 18, 15, 14, 2, 4, 13, 5, 9, - 6, 11, 8, 10, 16, 12, 7, 0 -}; - -static unsigned char cfg_ratio_7450[32] __initdata = { // *2 - 1, 0, 15, 30, 14, 0, 2, 0, - 4, 0, 13, 26, 5, 0, 9, 18, - 6, 0, 11, 22, 8, 20, 10, 24, - 16, 28, 12, 32, 7, 0, 0, 0 -}; - -static unsigned char cfg_ratio_7457[32] __initdata = { // *2 - 23, 34, 15, 30, 14, 36, 2, 40, - 4, 42, 13, 26, 17, 48, 19, 18, - 6, 21, 11, 22, 8, 20, 10, 24, - 16, 28, 12, 32, 27, 56, 0, 25 -}; - -static unsigned int __init tb_to_core_ratio(enum pll_type pll_type) -{ - unsigned char *cfg_ratio; - unsigned int shift = 28, mask = 0xF, hid1, pll_cfg, ratio; - - switch (pll_type) { - case PLL_604e: - cfg_ratio = cfg_ratio_604e; - break; - case PLL_750: - cfg_ratio = cfg_ratio_750; - break; - case PLL_750FX: - cfg_ratio = cfg_ratio_750FX; - hid1 = mfspr(SPRN_HID1); - switch ((hid1 >> 16) & 0x3) { /* HID1[PI0,PS] */ - case 0: /* PLL0 with external config */ - shift = 31-4; /* access HID1[PCE] */ - break; - case 2: /* PLL0 with internal config */ - shift = 31-20; /* access HID1[PC0] */ - break; - case 1: case 3: /* PLL1 */ - shift = 31-28; /* access HID1[PC1] */ - break; - } - mask = 0x1F; - break; - case PLL_7400: - cfg_ratio = cfg_ratio_7400; - break; - case PLL_7450: - cfg_ratio = cfg_ratio_7450; - shift = 12; - mask = 0x1F; - break; - case PLL_7457: - cfg_ratio = cfg_ratio_7457; - shift = 12; - mask = 0x1F; - break; - default: - return 0; - } - hid1 = 
mfspr(SPRN_HID1); - pll_cfg = (hid1 >> shift) & mask; - ratio = cfg_ratio[pll_cfg]; - if (!ratio) - printk(KERN_WARNING "perfctr: unknown PLL_CFG 0x%x\n", pll_cfg); - return (4/2) * ratio; -} - -static unsigned int __init pll_to_core_khz(enum pll_type pll_type) -{ - unsigned int tb_to_core = tb_to_core_ratio(pll_type); - perfctr_info.tsc_to_cpu_mult = tb_to_core; - return tb_ticks_per_jiffy * tb_to_core * (HZ/10) / (1000/10); -} - -/* Extract core and timebase frequencies from Open Firmware. */ - -static unsigned int __init of_to_core_khz(void) -{ - struct device_node *cpu; - unsigned int *fp, core, tb; - - cpu = find_type_devices("cpu"); - if (!cpu) - return 0; - fp = (unsigned int*)get_property(cpu, "clock-frequency", NULL); - if (!fp || !(core = *fp)) - return 0; - fp = (unsigned int*)get_property(cpu, "timebase-frequency", NULL); - if (!fp || !(tb = *fp)) - return 0; - perfctr_info.tsc_to_cpu_mult = core / tb; - return core / 1000; -} - -static unsigned int __init detect_cpu_khz(enum pll_type pll_type) -{ - unsigned int khz; - - khz = pll_to_core_khz(pll_type); - if (khz) - return khz; - - khz = of_to_core_khz(); - if (khz) - return khz; - - printk(KERN_WARNING "perfctr: unable to determine CPU speed\n"); - return 0; -} - -static int __init known_init(void) -{ - static char known_name[] __initdata = "PowerPC 60x/7xx/74xx"; - unsigned int features; - enum pll_type pll_type; - unsigned int pvr; - int have_mmcr1; - - features = PERFCTR_FEATURE_RDTSC | PERFCTR_FEATURE_RDPMC; - have_mmcr1 = 1; - pvr = mfspr(SPRN_PVR); - switch (PVR_VER(pvr)) { - case 0x0004: /* 604 */ - pm_type = PM_604; - pll_type = PLL_NONE; - features = PERFCTR_FEATURE_RDTSC; - have_mmcr1 = 0; - break; - case 0x0009: /* 604e; */ - case 0x000A: /* 604ev */ - pm_type = PM_604e; - pll_type = PLL_604e; - features = PERFCTR_FEATURE_RDTSC; - break; - case 0x0008: /* 750/740 */ - pm_type = PM_750; - pll_type = PLL_750; - break; - case 0x7000: case 0x7001: /* IBM750FX */ - if ((pvr & 0xFF0F) >= 0x0203) - 
features |= PERFCTR_FEATURE_PCINT; - pm_type = PM_750; - pll_type = PLL_750FX; - break; - case 0x7002: /* IBM750GX */ - features |= PERFCTR_FEATURE_PCINT; - pm_type = PM_750; - pll_type = PLL_750FX; - break; - case 0x000C: /* 7400 */ - pm_type = PM_7400; - pll_type = PLL_7400; - break; - case 0x800C: /* 7410 */ - if ((pvr & 0xFFFF) >= 0x1103) - features |= PERFCTR_FEATURE_PCINT; - pm_type = PM_7400; - pll_type = PLL_7400; - break; - case 0x8000: /* 7451/7441 */ - features |= PERFCTR_FEATURE_PCINT; - pm_type = PM_7450; - pll_type = PLL_7450; - break; - case 0x8001: /* 7455/7445 */ - features |= PERFCTR_FEATURE_PCINT; - pm_type = PM_7450; - pll_type = ((pvr & 0xFFFF) < 0x0303) ? PLL_7450 : PLL_7457; - break; - case 0x8002: /* 7457/7447 */ - case 0x8003: /* 7447A */ - features |= PERFCTR_FEATURE_PCINT; - pm_type = PM_7450; - pll_type = PLL_7457; - break; - case 0x8004: /* 7448 */ - features |= PERFCTR_FEATURE_PCINT; - pm_type = PM_7450; - pll_type = PLL_NONE; /* known to differ from 7447A, no details yet */ - break; - default: - return -ENODEV; - } - perfctr_info.cpu_features = features; - perfctr_cpu_name = known_name; - perfctr_info.cpu_khz = detect_cpu_khz(pll_type); - perfctr_ppc_init_tests(have_mmcr1); - return 0; -} - -static int __init unknown_init(void) -{ - static char unknown_name[] __initdata = "Generic PowerPC with TB"; - unsigned int khz; - - khz = detect_cpu_khz(PLL_NONE); - if (!khz) - return -ENODEV; - perfctr_info.cpu_features = PERFCTR_FEATURE_RDTSC; - perfctr_cpu_name = unknown_name; - perfctr_info.cpu_khz = khz; - pm_type = PM_NONE; - return 0; -} - -static void perfctr_cpu_clear_one(void *ignore) -{ - /* PREEMPT note: when called via on_each_cpu(), - this is in IRQ context with preemption disabled. 
*/ - perfctr_cpu_clear_counters(); -} - -static void perfctr_cpu_reset(void) -{ - on_each_cpu(perfctr_cpu_clear_one, NULL, 1, 1); - perfctr_cpu_set_ihandler(NULL); -} - -static int init_done; - -int __init perfctr_cpu_init(void) -{ - int err; - - perfctr_info.cpu_features = 0; - - err = known_init(); - if (err) { - err = unknown_init(); - if (err) - goto out; - } - - perfctr_cpu_reset(); - init_done = 1; - out: - return err; -} - -void __exit perfctr_cpu_exit(void) -{ - perfctr_cpu_reset(); -} - -/**************************************************************** - * * - * Hardware reservation. * - * * - ****************************************************************/ - -static DECLARE_MUTEX(mutex); -static const char *current_service = 0; - -const char *perfctr_cpu_reserve(const char *service) -{ - const char *ret; - - if (!init_done) - return "unsupported hardware"; - down(&mutex); - ret = current_service; - if (!ret) - current_service = service; - up(&mutex); - return ret; -} - -void perfctr_cpu_release(const char *service) -{ - down(&mutex); - if (service != current_service) { - printk(KERN_ERR "%s: attempt by %s to release while reserved by %s\n", - __FUNCTION__, service, current_service); - } else { - /* power down the counters */ - perfctr_cpu_reset(); - current_service = 0; - } - up(&mutex); -} diff -Naur linux-2.6.12-rc5-mm1/drivers/perfctr/ppc64.c linux-2.6.12-rc5-mm1-plug/drivers/perfctr/ppc64.c --- linux-2.6.12-rc5-mm1/drivers/perfctr/ppc64.c 2005-05-25 16:23:37.405934816 -0700 +++ linux-2.6.12-rc5-mm1-plug/drivers/perfctr/ppc64.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,743 +0,0 @@ -/* - * PPC64 performance-monitoring counters driver. - * - * based on Mikael Pettersson's 32 bit ppc code - * Copyright (C) 2004 David Gibson, IBM Corporation. 
- * Copyright (C) 2004 Mikael Pettersson - */ - -#include -#include -#include -#include -#include -#include -#include /* tb_ticks_per_jiffy */ -#include -#include - -#include "ppc64_tests.h" - -extern void ppc64_enable_pmcs(void); - -/* Support for lazy perfctr SPR updates. */ -struct per_cpu_cache { /* roughly a subset of perfctr_cpu_state */ - unsigned int id; /* cache owner id */ - /* Physically indexed cache of the MMCRs. */ - unsigned long ppc64_mmcr0, ppc64_mmcr1, ppc64_mmcra; -}; -static DEFINE_PER_CPU(struct per_cpu_cache, per_cpu_cache); -#define __get_cpu_cache(cpu) (&per_cpu(per_cpu_cache, cpu)) -#define get_cpu_cache() (&__get_cpu_var(per_cpu_cache)) - -/* Structure for counter snapshots, as 32-bit values. */ -struct perfctr_low_ctrs { - u64 tsc; - u32 pmc[8]; -}; - -static unsigned int new_id(void) -{ - static DEFINE_SPINLOCK(lock); - static unsigned int counter; - int id; - - spin_lock(&lock); - id = ++counter; - spin_unlock(&lock); - return id; -} - -static inline u32 read_pmc(int pmc) -{ - switch (pmc) { - case 0: - return mfspr(SPRN_PMC1); - break; - case 1: - return mfspr(SPRN_PMC2); - break; - case 2: - return mfspr(SPRN_PMC3); - break; - case 3: - return mfspr(SPRN_PMC4); - break; - case 4: - return mfspr(SPRN_PMC5); - break; - case 5: - return mfspr(SPRN_PMC6); - break; - case 6: - return mfspr(SPRN_PMC7); - break; - case 7: - return mfspr(SPRN_PMC8); - break; - - default: - return -EINVAL; - } -} - -static inline void write_pmc(int pmc, u32 val) -{ - switch (pmc) { - case 0: - mtspr(SPRN_PMC1, val); - break; - case 1: - mtspr(SPRN_PMC2, val); - break; - case 2: - mtspr(SPRN_PMC3, val); - break; - case 3: - mtspr(SPRN_PMC4, val); - break; - case 4: - mtspr(SPRN_PMC5, val); - break; - case 5: - mtspr(SPRN_PMC6, val); - break; - case 6: - mtspr(SPRN_PMC7, val); - break; - case 7: - mtspr(SPRN_PMC8, val); - break; - } -} - -#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT -static void perfctr_default_ihandler(unsigned long pc) -{ - unsigned int mmcr0 = 
mfspr(SPRN_MMCR0); - - mmcr0 &= ~MMCR0_PMXE; - mtspr(SPRN_MMCR0, mmcr0); -} - -static perfctr_ihandler_t perfctr_ihandler = perfctr_default_ihandler; - -void do_perfctr_interrupt(struct pt_regs *regs) -{ - unsigned long mmcr0; - - /* interrupts are disabled here, so we don't need to - * preempt_disable() */ - - (*perfctr_ihandler)(instruction_pointer(regs)); - - /* clear PMAO so the interrupt doesn't reassert immediately */ - mmcr0 = mfspr(SPRN_MMCR0) & ~MMCR0_PMAO; - mtspr(SPRN_MMCR0, mmcr0); -} - -void perfctr_cpu_set_ihandler(perfctr_ihandler_t ihandler) -{ - perfctr_ihandler = ihandler ? ihandler : perfctr_default_ihandler; -} - -#else -#define perfctr_cstatus_has_ictrs(cstatus) 0 -#endif - - -#if defined(CONFIG_SMP) && defined(CONFIG_PERFCTR_INTERRUPT_SUPPORT) - -static inline void -set_isuspend_cpu(struct perfctr_cpu_state *state, int cpu) -{ - state->isuspend_cpu = cpu; -} - -static inline int -is_isuspend_cpu(const struct perfctr_cpu_state *state, int cpu) -{ - return state->isuspend_cpu == cpu; -} - -static inline void clear_isuspend_cpu(struct perfctr_cpu_state *state) -{ - state->isuspend_cpu = NR_CPUS; -} - -#else -static inline void set_isuspend_cpu(struct perfctr_cpu_state *state, int cpu) { } -static inline int is_isuspend_cpu(const struct perfctr_cpu_state *state, int cpu) { return 1; } -static inline void clear_isuspend_cpu(struct perfctr_cpu_state *state) { } -#endif - - -static void ppc64_clear_counters(void) -{ - mtspr(SPRN_MMCR0, 0); - mtspr(SPRN_MMCR1, 0); - mtspr(SPRN_MMCRA, 0); - - mtspr(SPRN_PMC1, 0); - mtspr(SPRN_PMC2, 0); - mtspr(SPRN_PMC3, 0); - mtspr(SPRN_PMC4, 0); - mtspr(SPRN_PMC5, 0); - mtspr(SPRN_PMC6, 0); - - if (cpu_has_feature(CPU_FTR_PMC8)) { - mtspr(SPRN_PMC7, 0); - mtspr(SPRN_PMC8, 0); - } -} - -/* - * Driver methods, internal and exported. 
- */ - -static void perfctr_cpu_write_control(const struct perfctr_cpu_state *state) -{ - struct per_cpu_cache *cache; - unsigned long long value; - - cache = get_cpu_cache(); - /* - * Order matters here: update threshmult and event - * selectors before updating global control, which - * potentially enables PMIs. - * - * Since mtspr doesn't accept a runtime value for the - * SPR number, unroll the loop so each mtspr targets - * a constant SPR. - * - * For processors without MMCR2, we ensure that the - * cache and the state indicate the same value for it, - * preventing any actual mtspr to it. Ditto for MMCR1. - */ - value = state->control.mmcra; - if (value != cache->ppc64_mmcra) { - cache->ppc64_mmcra = value; - mtspr(SPRN_MMCRA, value); - } - value = state->control.mmcr1; - if (value != cache->ppc64_mmcr1) { - cache->ppc64_mmcr1 = value; - mtspr(SPRN_MMCR1, value); - } - value = state->control.mmcr0; - if (perfctr_cstatus_has_ictrs(state->user.cstatus)) - value |= MMCR0_PMXE; - if (value != cache->ppc64_mmcr0) { - cache->ppc64_mmcr0 = value; - mtspr(SPRN_MMCR0, value); - } - cache->id = state->id; -} - -static void perfctr_cpu_read_counters(struct perfctr_cpu_state *state, - struct perfctr_low_ctrs *ctrs) -{ - unsigned int cstatus, i, pmc; - - cstatus = state->user.cstatus; - if (perfctr_cstatus_has_tsc(cstatus)) - ctrs->tsc = mftb(); - - for (i = 0; i < perfctr_cstatus_nractrs(cstatus); ++i) { - pmc = state->control.pmc_map[i]; - ctrs->pmc[i] = read_pmc(pmc); - } -} - -#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT -static void perfctr_cpu_isuspend(struct perfctr_cpu_state *state) -{ - unsigned int cstatus, nrctrs, i; - int cpu; - - cpu = smp_processor_id(); - set_isuspend_cpu(state, cpu); /* early to limit cpu's live range */ - cstatus = state->user.cstatus; - nrctrs = perfctr_cstatus_nrctrs(cstatus); - for (i = perfctr_cstatus_nractrs(cstatus); i < nrctrs; ++i) { - int pmc = state->control.pmc_map[i]; - u32 now = read_pmc(pmc); - - state->user.pmc[i].sum += 
(u32)(now-state->user.pmc[i].start); - state->user.pmc[i].start = now; - } -} - -static void perfctr_cpu_iresume(const struct perfctr_cpu_state *state) -{ - struct per_cpu_cache *cache; - unsigned int cstatus, nrctrs, i; - int cpu; - - cpu = smp_processor_id(); - cache = __get_cpu_cache(cpu); - if (cache->id == state->id) { - /* Clearing cache->id to force write_control() - to unfreeze MMCR0 would be done here, but it - is subsumed by resume()'s MMCR0 reload logic. */ - if (is_isuspend_cpu(state, cpu)) { - return; /* skip reload of PMCs */ - } - } - /* - * The CPU state wasn't ours. - * - * The counters must be frozen before being reinitialised, - * to prevent unexpected increments and missed overflows. - * - * All unused counters must be reset to a non-overflow state. - */ - if (!(cache->ppc64_mmcr0 & MMCR0_FC)) { - cache->ppc64_mmcr0 |= MMCR0_FC; - mtspr(SPRN_MMCR0, cache->ppc64_mmcr0); - } - cstatus = state->user.cstatus; - nrctrs = perfctr_cstatus_nrctrs(cstatus); - for (i = perfctr_cstatus_nractrs(cstatus); i < nrctrs; ++i) { - write_pmc(state->control.pmc_map[i], state->user.pmc[i].start); - } -} - -/* Call perfctr_cpu_ireload() just before perfctr_cpu_resume() to - bypass internal caching and force a reload if the I-mode PMCs. */ -void perfctr_cpu_ireload(struct perfctr_cpu_state *state) -{ -#ifdef CONFIG_SMP - clear_isuspend_cpu(state); -#else - get_cpu_cache()->id = 0; -#endif -} - -/* PRE: the counters have been suspended and sampled by perfctr_cpu_suspend() */ -unsigned int perfctr_cpu_identify_overflow(struct perfctr_cpu_state *state) -{ - unsigned int cstatus, nractrs, nrctrs, i; - unsigned int pmc_mask = 0; - int nr_pmcs = 6; - - if (cpu_has_feature(CPU_FTR_PMC8)) - nr_pmcs = 8; - - cstatus = state->user.cstatus; - nractrs = perfctr_cstatus_nractrs(cstatus); - nrctrs = perfctr_cstatus_nrctrs(cstatus); - - /* Ickity, ickity, ick. We don't have fine enough interrupt - * control to disable interrupts on all the counters we're not - * interested in. 
So, we have to deal with overflows on actrs - * amd unused PMCs as well as the ones we actually care - * about. */ - for (i = 0; i < nractrs; ++i) { - int pmc = state->control.pmc_map[i]; - u32 val = read_pmc(pmc); - - /* For actrs, force a sample if they overflowed */ - - if ((s32)val < 0) { - state->user.pmc[i].sum += (u32)(val - state->user.pmc[i].start); - state->user.pmc[i].start = 0; - write_pmc(pmc, 0); - } - } - for (; i < nrctrs; ++i) { - if ((s32)state->user.pmc[i].start < 0) { /* PPC64-specific */ - int pmc = state->control.pmc_map[i]; - /* XXX: "+=" to correct for overshots */ - state->user.pmc[i].start = state->control.ireset[pmc]; - pmc_mask |= (1 << i); - } - } - - /* Clear any unused overflowed counters, so we don't loop on - * the interrupt */ - for (i = 0; i < nr_pmcs; ++i) { - if (! (state->unused_pmcs & (1<control.header.nractrs; - nrctrs = i + state->control.header.nrictrs; - for(; i < nrctrs; ++i) { - unsigned int pmc = state->control.pmc_map[i]; - if ((int)state->control.ireset[pmc] < 0) /* PPC64-specific */ - return -EINVAL; - state->user.pmc[i].start = state->control.ireset[pmc]; - } - return 0; -} - -#else /* CONFIG_PERFCTR_INTERRUPT_SUPPORT */ -static inline void perfctr_cpu_isuspend(struct perfctr_cpu_state *state) { } -static inline void perfctr_cpu_iresume(const struct perfctr_cpu_state *state) { } -static inline int check_ireset(struct perfctr_cpu_state *state) { return 0; } -#endif /* CONFIG_PERFCTR_INTERRUPT_SUPPORT */ - -static int check_control(struct perfctr_cpu_state *state) -{ - unsigned int i, nractrs, nrctrs, pmc_mask, pmc; - unsigned int nr_pmcs = 6; - - if (cpu_has_feature(CPU_FTR_PMC8)) - nr_pmcs = 8; - - nractrs = state->control.header.nractrs; - nrctrs = nractrs + state->control.header.nrictrs; - if (nrctrs < nractrs || nrctrs > nr_pmcs) - return -EINVAL; - - pmc_mask = 0; - for (i = 0; i < nrctrs; ++i) { - pmc = state->control.pmc_map[i]; - if (pmc >= nr_pmcs || (pmc_mask & (1<control.mmcr0 & MMCR0_PMXE) - || 
(state->control.mmcr0 & MMCR0_PMAO) - || (state->control.mmcr0 & MMCR0_TBEE) ) - return -EINVAL; - - state->unused_pmcs = ((1 << nr_pmcs)-1) & ~pmc_mask; - - state->id = new_id(); - - return 0; -} - -int perfctr_cpu_update_control(struct perfctr_cpu_state *state, int is_global) -{ - int err; - - clear_isuspend_cpu(state); - state->user.cstatus = 0; - - /* disallow i-mode counters if we cannot catch the interrupts */ - if (!(perfctr_info.cpu_features & PERFCTR_FEATURE_PCINT) - && state->control.header.nrictrs) - return -EPERM; - - err = check_control(state); /* may initialise state->cstatus */ - if (err < 0) - return err; - err = check_ireset(state); - if (err < 0) - return err; - state->user.cstatus |= perfctr_mk_cstatus(state->control.header.tsc_on, - state->control.header.nractrs, - state->control.header.nrictrs); - return 0; -} - -/* - * get_reg_offset() maps SPR numbers to offsets into struct perfctr_cpu_control. - */ -static const struct { - unsigned int spr; - unsigned int offset; - unsigned int size; -} reg_offsets[] = { - { SPRN_MMCR0, offsetof(struct perfctr_cpu_control, mmcr0), sizeof(long) }, - { SPRN_MMCR1, offsetof(struct perfctr_cpu_control, mmcr1), sizeof(long) }, - { SPRN_MMCRA, offsetof(struct perfctr_cpu_control, mmcra), sizeof(long) }, - { SPRN_PMC1, offsetof(struct perfctr_cpu_control, ireset[1-1]), sizeof(int) }, - { SPRN_PMC2, offsetof(struct perfctr_cpu_control, ireset[2-1]), sizeof(int) }, - { SPRN_PMC3, offsetof(struct perfctr_cpu_control, ireset[3-1]), sizeof(int) }, - { SPRN_PMC4, offsetof(struct perfctr_cpu_control, ireset[4-1]), sizeof(int) }, - { SPRN_PMC5, offsetof(struct perfctr_cpu_control, ireset[5-1]), sizeof(int) }, - { SPRN_PMC6, offsetof(struct perfctr_cpu_control, ireset[6-1]), sizeof(int) }, - { SPRN_PMC7, offsetof(struct perfctr_cpu_control, ireset[7-1]), sizeof(int) }, - { SPRN_PMC8, offsetof(struct perfctr_cpu_control, ireset[8-1]), sizeof(int) }, -}; - -static int get_reg_offset(unsigned int spr, unsigned int *size) -{ - 
unsigned int i; - - for(i = 0; i < ARRAY_SIZE(reg_offsets); ++i) - if (spr == reg_offsets[i].spr) { - *size = reg_offsets[i].size; - return reg_offsets[i].offset; - } - return -1; -} - -static int access_regs(struct perfctr_cpu_control *control, - void *argp, unsigned int argbytes, int do_write) -{ - struct perfctr_cpu_reg *regs; - unsigned int i, nr_regs, size; - int offset; - - nr_regs = argbytes / sizeof(struct perfctr_cpu_reg); - if (nr_regs * sizeof(struct perfctr_cpu_reg) != argbytes) - return -EINVAL; - regs = (struct perfctr_cpu_reg*)argp; - - for(i = 0; i < nr_regs; ++i) { - offset = get_reg_offset(regs[i].nr, &size); - if (offset < 0) - return -EINVAL; - if (size == sizeof(long)) { - unsigned long *where = (unsigned long*)((char*)control + offset); - if (do_write) - *where = regs[i].value; - else - regs[i].value = *where; - } else { - unsigned int *where = (unsigned int*)((char*)control + offset); - if (do_write) - *where = regs[i].value; - else - regs[i].value = *where; - } - } - return argbytes; -} - -int perfctr_cpu_control_write(struct perfctr_cpu_control *control, unsigned int domain, - const void *srcp, unsigned int srcbytes) -{ - if (domain != PERFCTR_DOMAIN_CPU_REGS) - return -EINVAL; - return access_regs(control, (void*)srcp, srcbytes, 1); -} - -int perfctr_cpu_control_read(const struct perfctr_cpu_control *control, unsigned int domain, - void *dstp, unsigned int dstbytes) -{ - if (domain != PERFCTR_DOMAIN_CPU_REGS) - return -EINVAL; - return access_regs((struct perfctr_cpu_control*)control, dstp, dstbytes, 0); -} - -void perfctr_cpu_suspend(struct perfctr_cpu_state *state) -{ - unsigned int i, cstatus; - struct perfctr_low_ctrs now; - - /* quiesce the counters */ - mtspr(SPRN_MMCR0, MMCR0_FC); - get_cpu_cache()->ppc64_mmcr0 = MMCR0_FC; - - if (perfctr_cstatus_has_ictrs(state->user.cstatus)) - perfctr_cpu_isuspend(state); - - perfctr_cpu_read_counters(state, &now); - cstatus = state->user.cstatus; - if (perfctr_cstatus_has_tsc(cstatus)) - 
state->user.tsc_sum += now.tsc - state->user.tsc_start; - - for (i = 0; i < perfctr_cstatus_nractrs(cstatus); ++i) - state->user.pmc[i].sum += (u32)(now.pmc[i]-state->user.pmc[i].start); -} - -void perfctr_cpu_resume(struct perfctr_cpu_state *state) -{ - struct perfctr_low_ctrs now; - unsigned int i, cstatus; - - if (perfctr_cstatus_has_ictrs(state->user.cstatus)) - perfctr_cpu_iresume(state); - perfctr_cpu_write_control(state); - - perfctr_cpu_read_counters(state, &now); - cstatus = state->user.cstatus; - if (perfctr_cstatus_has_tsc(cstatus)) - state->user.tsc_start = now.tsc; - - for (i = 0; i < perfctr_cstatus_nractrs(cstatus); ++i) - state->user.pmc[i].start = now.pmc[i]; - - ++state->user.samplecnt; -} - -void perfctr_cpu_sample(struct perfctr_cpu_state *state) -{ - unsigned int i, cstatus, nractrs; - struct perfctr_low_ctrs now; - - perfctr_cpu_read_counters(state, &now); - cstatus = state->user.cstatus; - if (perfctr_cstatus_has_tsc(cstatus)) { - state->user.tsc_sum += now.tsc - state->user.tsc_start; - state->user.tsc_start = now.tsc; - } - nractrs = perfctr_cstatus_nractrs(cstatus); - for(i = 0; i < nractrs; ++i) { - state->user.pmc[i].sum += (u32)(now.pmc[i]-state->user.pmc[i].start); - state->user.pmc[i].start = now.pmc[i]; - } - ++state->user.samplecnt; -} - -static void perfctr_cpu_clear_counters(void) -{ - struct per_cpu_cache *cache; - - cache = get_cpu_cache(); - memset(cache, 0, sizeof *cache); - cache->id = 0; - - ppc64_clear_counters(); -} - -/**************************************************************** - * * - * Processor detection and initialisation procedures. * - * * - ****************************************************************/ - -static void ppc64_cpu_setup(void) -{ - /* allow user to initialize these???? 
*/ - - unsigned long long mmcr0 = mfspr(SPRN_MMCR0); - unsigned long long mmcra = mfspr(SPRN_MMCRA); - - - ppc64_enable_pmcs(); - - mmcr0 |= MMCR0_FC; - mtspr(SPRN_MMCR0, mmcr0); - - mmcr0 |= MMCR0_FCM1|MMCR0_PMXE|MMCR0_FCECE; - mmcr0 |= MMCR0_PMC1CE|MMCR0_PMCjCE; - mtspr(SPRN_MMCR0, mmcr0); - - mmcra |= MMCRA_SAMPLE_ENABLE; - mtspr(SPRN_MMCRA, mmcra); - - printk("setup on cpu %d, mmcr0 %lx\n", smp_processor_id(), - mfspr(SPRN_MMCR0)); - printk("setup on cpu %d, mmcr1 %lx\n", smp_processor_id(), - mfspr(SPRN_MMCR1)); - printk("setup on cpu %d, mmcra %lx\n", smp_processor_id(), - mfspr(SPRN_MMCRA)); - -/* mtmsrd(mfmsr() | MSR_PMM); */ - - ppc64_clear_counters(); - - mmcr0 = mfspr(SPRN_MMCR0); - mmcr0 &= ~MMCR0_PMAO; - mmcr0 &= ~MMCR0_FC; - mtspr(SPRN_MMCR0, mmcr0); - - printk("start on cpu %d, mmcr0 %llx\n", smp_processor_id(), mmcr0); -} - - -static void perfctr_cpu_clear_one(void *ignore) -{ - /* PREEMPT note: when called via on_each_cpu(), - this is in IRQ context with preemption disabled. */ - perfctr_cpu_clear_counters(); -} - -static void perfctr_cpu_reset(void) -{ - on_each_cpu(perfctr_cpu_clear_one, NULL, 1, 1); - perfctr_cpu_set_ihandler(NULL); -} - -int __init perfctr_cpu_init(void) -{ - extern unsigned long ppc_proc_freq; - extern unsigned long ppc_tb_freq; - - perfctr_info.cpu_features = PERFCTR_FEATURE_RDTSC - | PERFCTR_FEATURE_RDPMC | PERFCTR_FEATURE_PCINT; - - perfctr_cpu_name = "PowerPC64"; - - perfctr_info.cpu_khz = ppc_proc_freq / 1000; - /* We need to round here rather than truncating, because in a - * few cases the raw ratio can end up being 7.9999 or - * suchlike */ - perfctr_info.tsc_to_cpu_mult = - (ppc_proc_freq + ppc_tb_freq - 1) / ppc_tb_freq; - - on_each_cpu((void *)ppc64_cpu_setup, NULL, 0, 1); - - perfctr_ppc64_init_tests(); - - perfctr_cpu_reset(); - return 0; -} - -void __exit perfctr_cpu_exit(void) -{ - perfctr_cpu_reset(); -} - -/**************************************************************** - * * - * Hardware reservation. 
* - * * - ****************************************************************/ - -static spinlock_t service_mutex = SPIN_LOCK_UNLOCKED; -static const char *current_service = NULL; - -const char *perfctr_cpu_reserve(const char *service) -{ - const char *ret; - - spin_lock(&service_mutex); - - ret = current_service; - if (ret) - goto out; - - ret = "unknown driver (oprofile?)"; - if (reserve_pmc_hardware(do_perfctr_interrupt) != 0) - goto out; - - current_service = service; - ret = NULL; - - out: - spin_unlock(&service_mutex); - return ret; -} - -void perfctr_cpu_release(const char *service) -{ - spin_lock(&service_mutex); - - if (service != current_service) { - printk(KERN_ERR "%s: attempt by %s to release while reserved by %s\n", - __FUNCTION__, service, current_service); - goto out; - } - - /* power down the counters */ - perfctr_cpu_reset(); - current_service = NULL; - release_pmc_hardware(); - - out: - spin_unlock(&service_mutex); -} diff -Naur linux-2.6.12-rc5-mm1/drivers/perfctr/ppc64_tests.c linux-2.6.12-rc5-mm1-plug/drivers/perfctr/ppc64_tests.c --- linux-2.6.12-rc5-mm1/drivers/perfctr/ppc64_tests.c 2005-05-25 16:23:37.406934664 -0700 +++ linux-2.6.12-rc5-mm1-plug/drivers/perfctr/ppc64_tests.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,322 +0,0 @@ -/* - * Performance-monitoring counters driver. - * Optional PPC64-specific init-time tests. - * - * Copyright (C) 2004 David Gibson, IBM Corporation. 
- * Copyright (C) 2004 Mikael Pettersson - */ -#include -#include -#include -#include -#include -#include -#include /* for tb_ticks_per_jiffy */ -#include "ppc64_tests.h" - -#define NITER 256 -#define X2(S) S"; "S -#define X8(S) X2(X2(X2(S))) - -static void __init do_read_tbl(unsigned int unused) -{ - unsigned int i, dummy; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mftbl %0") : "=r"(dummy)); -} - -static void __init do_read_pmc1(unsigned int unused) -{ - unsigned int i, dummy; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mfspr %0," __stringify(SPRN_PMC1)) : "=r"(dummy)); -} - -static void __init do_read_pmc2(unsigned int unused) -{ - unsigned int i, dummy; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mfspr %0," __stringify(SPRN_PMC2)) : "=r"(dummy)); -} - -static void __init do_read_pmc3(unsigned int unused) -{ - unsigned int i, dummy; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mfspr %0," __stringify(SPRN_PMC3)) : "=r"(dummy)); -} - -static void __init do_read_pmc4(unsigned int unused) -{ - unsigned int i, dummy; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mfspr %0," __stringify(SPRN_PMC4)) : "=r"(dummy)); -} - -static void __init do_read_mmcr0(unsigned int unused) -{ - unsigned int i, dummy; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mfspr %0," __stringify(SPRN_MMCR0)) : "=r"(dummy)); -} - -static void __init do_read_mmcr1(unsigned int unused) -{ - unsigned int i, dummy; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mfspr %0," __stringify(SPRN_MMCR1)) : "=r"(dummy)); -} - -static void __init do_write_pmc2(unsigned int arg) -{ - unsigned int i; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mtspr " __stringify(SPRN_PMC2) ",%0") : : "r"(arg)); -} - -static void __init do_write_pmc3(unsigned int arg) -{ - unsigned int i; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mtspr " __stringify(SPRN_PMC3) ",%0") : : "r"(arg)); -} - -static void __init 
do_write_pmc4(unsigned int arg) -{ - unsigned int i; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mtspr " __stringify(SPRN_PMC4) ",%0") : : "r"(arg)); -} - -static void __init do_write_mmcr1(unsigned int arg) -{ - unsigned int i; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mtspr " __stringify(SPRN_MMCR1) ",%0") : : "r"(arg)); -} - -static void __init do_write_mmcr0(unsigned int arg) -{ - unsigned int i; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mtspr " __stringify(SPRN_MMCR0) ",%0") : : "r"(arg)); -} - -static void __init do_empty_loop(unsigned int unused) -{ - unsigned i; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__("" : : ); -} - -static unsigned __init run(void (*doit)(unsigned int), unsigned int arg) -{ - unsigned int start, stop; - start = mfspr(SPRN_PMC1); - (*doit)(arg); /* should take < 2^32 cycles to complete */ - stop = mfspr(SPRN_PMC1); - return stop - start; -} - -static void __init init_tests_message(void) -{ -#if 0 - printk(KERN_INFO "Please email the following PERFCTR INIT lines " - "to mikpe@csd.uu.se\n" - KERN_INFO "To remove this message, rebuild the driver " - "with CONFIG_PERFCTR_INIT_TESTS=n\n"); - printk(KERN_INFO "PERFCTR INIT: PVR 0x%08x, CPU clock %u kHz, TB clock %lu kHz\n", - pvr, - perfctr_info.cpu_khz, - tb_ticks_per_jiffy*(HZ/10)/(1000/10)); -#endif -} - -static void __init clear(void) -{ - mtspr(SPRN_MMCR0, 0); - mtspr(SPRN_MMCR1, 0); - mtspr(SPRN_MMCRA, 0); - mtspr(SPRN_PMC1, 0); - mtspr(SPRN_PMC2, 0); - mtspr(SPRN_PMC3, 0); - mtspr(SPRN_PMC4, 0); - mtspr(SPRN_PMC5, 0); - mtspr(SPRN_PMC6, 0); - mtspr(SPRN_PMC7, 0); - mtspr(SPRN_PMC8, 0); -} - -static void __init check_fcece(unsigned int pmc1ce) -{ - unsigned int mmcr0; - unsigned int pmc1; - int x = 0; - - /* JHE check out section 1.6.6.2 of the POWER5 pdf */ - - /* - * This test checks if MMCR0[FC] is set after PMC1 overflows - * when MMCR0[FCECE] is set. 
- * 74xx documentation states this behaviour, while documentation - * for 604/750 processors doesn't mention this at all. - * - * Also output the value of PMC1 shortly after the overflow. - * This tells us if PMC1 really was frozen. On 604/750, it may not - * freeze since we don't enable PMIs. [No freeze confirmed on 750.] - * - * When pmc1ce == 0, MMCR0[PMC1CE] is zero. It's unclear whether - * this masks all PMC1 overflow events or just PMC1 PMIs. - * - * PMC1 counts processor cycles, with 100 to go before overflowing. - * FCECE is set. - * PMC1CE is clear if !pmc1ce, otherwise set. - */ - pmc1 = mfspr(SPRN_PMC1); - - mtspr(SPRN_PMC1, 0x80000000-100); - mmcr0 = MMCR0_FCECE | MMCR0_SHRFC; - - if (pmc1ce) - mmcr0 |= MMCR0_PMC1CE; - - mtspr(SPRN_MMCR0, mmcr0); - - pmc1 = mfspr(SPRN_PMC1); - - do { - do_empty_loop(0); - - pmc1 = mfspr(SPRN_PMC1); - if (x++ > 20000000) { - break; - } - } while (!(mfspr(SPRN_PMC1) & 0x80000000)); - do_empty_loop(0); - - printk(KERN_INFO "PERFCTR INIT: %s(%u): MMCR0[FC] is %u, PMC1 is %#lx\n", - __FUNCTION__, pmc1ce, - !!(mfspr(SPRN_MMCR0) & MMCR0_FC), mfspr(SPRN_PMC1)); - mtspr(SPRN_MMCR0, 0); - mtspr(SPRN_PMC1, 0); -} - -static void __init check_trigger(unsigned int pmc1ce) -{ - unsigned int mmcr0; - unsigned int pmc1; - int x = 0; - - /* - * This test checks if MMCR0[TRIGGER] is reset after PMC1 overflows. - * 74xx documentation states this behaviour, while documentation - * for 604/750 processors doesn't mention this at all. - * [No reset confirmed on 750.] - * - * Also output the values of PMC1 and PMC2 shortly after the overflow. - * PMC2 should be equal to PMC1-0x80000000. - * - * When pmc1ce == 0, MMCR0[PMC1CE] is zero. It's unclear whether - * this masks all PMC1 overflow events or just PMC1 PMIs. - * - * PMC1 counts processor cycles, with 100 to go before overflowing. - * PMC2 counts processor cycles, starting from 0. - * TRIGGER is set, so PMC2 doesn't start until PMC1 overflows. - * PMC1CE is clear if !pmc1ce, otherwise set. 
- */ - mtspr(SPRN_PMC2, 0); - mtspr(SPRN_PMC1, 0x80000000-100); - mmcr0 = MMCR0_TRIGGER | MMCR0_SHRFC | MMCR0_FCHV; - - if (pmc1ce) - mmcr0 |= MMCR0_PMC1CE; - - mtspr(SPRN_MMCR0, mmcr0); - do { - do_empty_loop(0); - pmc1 = mfspr(SPRN_PMC1); - if (x++ > 20000000) { - break; - } - - } while (!(mfspr(SPRN_PMC1) & 0x80000000)); - do_empty_loop(0); - printk(KERN_INFO "PERFCTR INIT: %s(%u): MMCR0[TRIGGER] is %u, PMC1 is %#lx, PMC2 is %#lx\n", - __FUNCTION__, pmc1ce, - !!(mfspr(SPRN_MMCR0) & MMCR0_TRIGGER), mfspr(SPRN_PMC1), mfspr(SPRN_PMC2)); - mtspr(SPRN_MMCR0, 0); - mtspr(SPRN_PMC1, 0); - mtspr(SPRN_PMC2, 0); -} - -static void __init measure_overheads(void) -{ - int i; - unsigned int mmcr0, loop, ticks[12]; - const char *name[12]; - - clear(); - - /* PMC1 = "processor cycles", - PMC2 = "completed instructions", - not disabled in any mode, - no interrupts */ - /* mmcr0 = (0x01 << 6) | (0x02 << 0); */ - mmcr0 = MMCR0_SHRFC | MMCR0_FCWAIT; - mtspr(SPRN_MMCR0, mmcr0); - - name[0] = "mftbl"; - ticks[0] = run(do_read_tbl, 0); - name[1] = "mfspr (pmc1)"; - ticks[1] = run(do_read_pmc1, 0); - name[2] = "mfspr (pmc2)"; - ticks[2] = run(do_read_pmc2, 0); - name[3] = "mfspr (pmc3)"; - ticks[3] = run(do_read_pmc3, 0); - name[4] = "mfspr (pmc4)"; - ticks[4] = run(do_read_pmc4, 0); - name[5] = "mfspr (mmcr0)"; - ticks[5] = run(do_read_mmcr0, 0); - name[6] = "mfspr (mmcr1)"; - ticks[6] = run(do_read_mmcr1, 0); - name[7] = "mtspr (pmc2)"; - ticks[7] = run(do_write_pmc2, 0); - name[8] = "mtspr (pmc3)"; - ticks[8] = run(do_write_pmc3, 0); - name[9] = "mtspr (pmc4)"; - ticks[9] = run(do_write_pmc4, 0); - name[10] = "mtspr (mmcr1)"; - ticks[10] = run(do_write_mmcr1, 0); - name[11] = "mtspr (mmcr0)"; - ticks[11] = run(do_write_mmcr0, mmcr0); - - loop = run(do_empty_loop, 0); - - clear(); - - init_tests_message(); - printk(KERN_INFO "PERFCTR INIT: NITER == %u\n", NITER); - printk(KERN_INFO "PERFCTR INIT: loop overhead is %u cycles\n", loop); - for(i = 0; i < ARRAY_SIZE(ticks); ++i) { - 
unsigned int x; - if (!ticks[i]) - continue; - x = ((ticks[i] - loop) * 10) / NITER; - printk(KERN_INFO "PERFCTR INIT: %s cost is %u.%u cycles (%u total)\n", - name[i], x/10, x%10, ticks[i]); - } - - check_fcece(0); -#if 0 - check_fcece(1); - check_trigger(0); - check_trigger(1); -#endif -} - -void __init perfctr_ppc64_init_tests(void) -{ - preempt_disable(); - measure_overheads(); - preempt_enable(); -} diff -Naur linux-2.6.12-rc5-mm1/drivers/perfctr/ppc64_tests.h linux-2.6.12-rc5-mm1-plug/drivers/perfctr/ppc64_tests.h --- linux-2.6.12-rc5-mm1/drivers/perfctr/ppc64_tests.h 2005-05-25 16:23:37.407934512 -0700 +++ linux-2.6.12-rc5-mm1-plug/drivers/perfctr/ppc64_tests.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,12 +0,0 @@ -/* - * Performance-monitoring counters driver. - * Optional PPC32-specific init-time tests. - * - * Copyright (C) 2004 Mikael Pettersson - */ - -#ifdef CONFIG_PERFCTR_INIT_TESTS -extern void perfctr_ppc64_init_tests(void); -#else -static inline void perfctr_ppc64_init_tests(void) { } -#endif diff -Naur linux-2.6.12-rc5-mm1/drivers/perfctr/ppc_tests.c linux-2.6.12-rc5-mm1-plug/drivers/perfctr/ppc_tests.c --- linux-2.6.12-rc5-mm1/drivers/perfctr/ppc_tests.c 2005-05-25 16:23:37.413933600 -0700 +++ linux-2.6.12-rc5-mm1-plug/drivers/perfctr/ppc_tests.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,288 +0,0 @@ -/* $Id: ppc_tests.c,v 1.4 2004/05/21 16:57:53 mikpe Exp $ - * Performance-monitoring counters driver. - * Optional PPC32-specific init-time tests. 
- * - * Copyright (C) 2004 Mikael Pettersson - */ -#include -#include -#include -#include -#include -#include -#include /* for tb_ticks_per_jiffy */ -#include "ppc_tests.h" - -#define NITER 256 -#define X2(S) S"; "S -#define X8(S) X2(X2(X2(S))) - -static void __init do_read_tbl(unsigned int unused) -{ - unsigned int i, dummy; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mftbl %0") : "=r"(dummy)); -} - -static void __init do_read_pmc1(unsigned int unused) -{ - unsigned int i, dummy; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mfspr %0," __stringify(SPRN_PMC1)) : "=r"(dummy)); -} - -static void __init do_read_pmc2(unsigned int unused) -{ - unsigned int i, dummy; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mfspr %0," __stringify(SPRN_PMC2)) : "=r"(dummy)); -} - -static void __init do_read_pmc3(unsigned int unused) -{ - unsigned int i, dummy; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mfspr %0," __stringify(SPRN_PMC3)) : "=r"(dummy)); -} - -static void __init do_read_pmc4(unsigned int unused) -{ - unsigned int i, dummy; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mfspr %0," __stringify(SPRN_PMC4)) : "=r"(dummy)); -} - -static void __init do_read_mmcr0(unsigned int unused) -{ - unsigned int i, dummy; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mfspr %0," __stringify(SPRN_MMCR0)) : "=r"(dummy)); -} - -static void __init do_read_mmcr1(unsigned int unused) -{ - unsigned int i, dummy; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mfspr %0," __stringify(SPRN_MMCR1)) : "=r"(dummy)); -} - -static void __init do_write_pmc2(unsigned int arg) -{ - unsigned int i; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mtspr " __stringify(SPRN_PMC2) ",%0") : : "r"(arg)); -} - -static void __init do_write_pmc3(unsigned int arg) -{ - unsigned int i; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mtspr " __stringify(SPRN_PMC3) ",%0") : : "r"(arg)); -} - -static void __init 
do_write_pmc4(unsigned int arg) -{ - unsigned int i; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mtspr " __stringify(SPRN_PMC4) ",%0") : : "r"(arg)); -} - -static void __init do_write_mmcr1(unsigned int arg) -{ - unsigned int i; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mtspr " __stringify(SPRN_MMCR1) ",%0") : : "r"(arg)); -} - -static void __init do_write_mmcr0(unsigned int arg) -{ - unsigned int i; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("mtspr " __stringify(SPRN_MMCR0) ",%0") : : "r"(arg)); -} - -static void __init do_empty_loop(unsigned int unused) -{ - unsigned i; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__("" : : ); -} - -static unsigned __init run(void (*doit)(unsigned int), unsigned int arg) -{ - unsigned int start, stop; - start = mfspr(SPRN_PMC1); - (*doit)(arg); /* should take < 2^32 cycles to complete */ - stop = mfspr(SPRN_PMC1); - return stop - start; -} - -static void __init init_tests_message(void) -{ - unsigned int pvr = mfspr(SPRN_PVR); - printk(KERN_INFO "Please email the following PERFCTR INIT lines " - "to mikpe@csd.uu.se\n" - KERN_INFO "To remove this message, rebuild the driver " - "with CONFIG_PERFCTR_INIT_TESTS=n\n"); - printk(KERN_INFO "PERFCTR INIT: PVR 0x%08x, CPU clock %u kHz, TB clock %u kHz\n", - pvr, - perfctr_info.cpu_khz, - tb_ticks_per_jiffy*(HZ/10)/(1000/10)); -} - -static void __init clear(int have_mmcr1) -{ - mtspr(SPRN_MMCR0, 0); - mtspr(SPRN_PMC1, 0); - mtspr(SPRN_PMC2, 0); - if (have_mmcr1) { - mtspr(SPRN_MMCR1, 0); - mtspr(SPRN_PMC3, 0); - mtspr(SPRN_PMC4, 0); - } -} - -static void __init check_fcece(unsigned int pmc1ce) -{ - unsigned int mmcr0; - - /* - * This test checks if MMCR0[FC] is set after PMC1 overflows - * when MMCR0[FCECE] is set. - * 74xx documentation states this behaviour, while documentation - * for 604/750 processors doesn't mention this at all. - * - * Also output the value of PMC1 shortly after the overflow. 
- * This tells us if PMC1 really was frozen. On 604/750, it may not - * freeze since we don't enable PMIs. [No freeze confirmed on 750.] - * - * When pmc1ce == 0, MMCR0[PMC1CE] is zero. It's unclear whether - * this masks all PMC1 overflow events or just PMC1 PMIs. - * - * PMC1 counts processor cycles, with 100 to go before overflowing. - * FCECE is set. - * PMC1CE is clear if !pmc1ce, otherwise set. - */ - mtspr(SPRN_PMC1, 0x80000000-100); - mmcr0 = (1<<(31-6)) | (0x01 << 6); - if (pmc1ce) - mmcr0 |= (1<<(31-16)); - mtspr(SPRN_MMCR0, mmcr0); - do { - do_empty_loop(0); - } while (!(mfspr(SPRN_PMC1) & 0x80000000)); - do_empty_loop(0); - printk(KERN_INFO "PERFCTR INIT: %s(%u): MMCR0[FC] is %u, PMC1 is %#x\n", - __FUNCTION__, pmc1ce, - !!(mfspr(SPRN_MMCR0) & (1<<(31-0))), mfspr(SPRN_PMC1)); - mtspr(SPRN_MMCR0, 0); - mtspr(SPRN_PMC1, 0); -} - -static void __init check_trigger(unsigned int pmc1ce) -{ - unsigned int mmcr0; - - /* - * This test checks if MMCR0[TRIGGER] is reset after PMC1 overflows. - * 74xx documentation states this behaviour, while documentation - * for 604/750 processors doesn't mention this at all. - * [No reset confirmed on 750.] - * - * Also output the values of PMC1 and PMC2 shortly after the overflow. - * PMC2 should be equal to PMC1-0x80000000. - * - * When pmc1ce == 0, MMCR0[PMC1CE] is zero. It's unclear whether - * this masks all PMC1 overflow events or just PMC1 PMIs. - * - * PMC1 counts processor cycles, with 100 to go before overflowing. - * PMC2 counts processor cycles, starting from 0. - * TRIGGER is set, so PMC2 doesn't start until PMC1 overflows. - * PMC1CE is clear if !pmc1ce, otherwise set. 
- */ - mtspr(SPRN_PMC2, 0); - mtspr(SPRN_PMC1, 0x80000000-100); - mmcr0 = (1<<(31-18)) | (0x01 << 6) | (0x01 << 0); - if (pmc1ce) - mmcr0 |= (1<<(31-16)); - mtspr(SPRN_MMCR0, mmcr0); - do { - do_empty_loop(0); - } while (!(mfspr(SPRN_PMC1) & 0x80000000)); - do_empty_loop(0); - printk(KERN_INFO "PERFCTR INIT: %s(%u): MMCR0[TRIGGER] is %u, PMC1 is %#x, PMC2 is %#x\n", - __FUNCTION__, pmc1ce, - !!(mfspr(SPRN_MMCR0) & (1<<(31-18))), mfspr(SPRN_PMC1), mfspr(SPRN_PMC2)); - mtspr(SPRN_MMCR0, 0); - mtspr(SPRN_PMC1, 0); - mtspr(SPRN_PMC2, 0); -} - -static void __init -measure_overheads(int have_mmcr1) -{ - int i; - unsigned int mmcr0, loop, ticks[12]; - const char *name[12]; - - clear(have_mmcr1); - - /* PMC1 = "processor cycles", - PMC2 = "completed instructions", - not disabled in any mode, - no interrupts */ - mmcr0 = (0x01 << 6) | (0x02 << 0); - mtspr(SPRN_MMCR0, mmcr0); - - name[0] = "mftbl"; - ticks[0] = run(do_read_tbl, 0); - name[1] = "mfspr (pmc1)"; - ticks[1] = run(do_read_pmc1, 0); - name[2] = "mfspr (pmc2)"; - ticks[2] = run(do_read_pmc2, 0); - name[3] = "mfspr (pmc3)"; - ticks[3] = have_mmcr1 ? run(do_read_pmc3, 0) : 0; - name[4] = "mfspr (pmc4)"; - ticks[4] = have_mmcr1 ? run(do_read_pmc4, 0) : 0; - name[5] = "mfspr (mmcr0)"; - ticks[5] = run(do_read_mmcr0, 0); - name[6] = "mfspr (mmcr1)"; - ticks[6] = have_mmcr1 ? run(do_read_mmcr1, 0) : 0; - name[7] = "mtspr (pmc2)"; - ticks[7] = run(do_write_pmc2, 0); - name[8] = "mtspr (pmc3)"; - ticks[8] = have_mmcr1 ? run(do_write_pmc3, 0) : 0; - name[9] = "mtspr (pmc4)"; - ticks[9] = have_mmcr1 ? run(do_write_pmc4, 0) : 0; - name[10] = "mtspr (mmcr1)"; - ticks[10] = have_mmcr1 ? 
run(do_write_mmcr1, 0) : 0; - name[11] = "mtspr (mmcr0)"; - ticks[11] = run(do_write_mmcr0, mmcr0); - - loop = run(do_empty_loop, 0); - - clear(have_mmcr1); - - init_tests_message(); - printk(KERN_INFO "PERFCTR INIT: NITER == %u\n", NITER); - printk(KERN_INFO "PERFCTR INIT: loop overhead is %u cycles\n", loop); - for(i = 0; i < ARRAY_SIZE(ticks); ++i) { - unsigned int x; - if (!ticks[i]) - continue; - x = ((ticks[i] - loop) * 10) / NITER; - printk(KERN_INFO "PERFCTR INIT: %s cost is %u.%u cycles (%u total)\n", - name[i], x/10, x%10, ticks[i]); - } - check_fcece(0); - check_fcece(1); - check_trigger(0); - check_trigger(1); -} - -void __init perfctr_ppc_init_tests(int have_mmcr1) -{ - preempt_disable(); - measure_overheads(have_mmcr1); - preempt_enable(); -} diff -Naur linux-2.6.12-rc5-mm1/drivers/perfctr/ppc_tests.h linux-2.6.12-rc5-mm1-plug/drivers/perfctr/ppc_tests.h --- linux-2.6.12-rc5-mm1/drivers/perfctr/ppc_tests.h 2005-05-25 16:23:37.414933448 -0700 +++ linux-2.6.12-rc5-mm1-plug/drivers/perfctr/ppc_tests.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,12 +0,0 @@ -/* $Id: ppc_tests.h,v 1.1 2004/01/12 01:59:11 mikpe Exp $ - * Performance-monitoring counters driver. - * Optional PPC32-specific init-time tests. 
- * - * Copyright (C) 2004 Mikael Pettersson - */ - -#ifdef CONFIG_PERFCTR_INIT_TESTS -extern void perfctr_ppc_init_tests(int have_mmcr1); -#else -static inline void perfctr_ppc_init_tests(int have_mmcr1) { } -#endif diff -Naur linux-2.6.12-rc5-mm1/drivers/perfctr/version.h linux-2.6.12-rc5-mm1-plug/drivers/perfctr/version.h --- linux-2.6.12-rc5-mm1/drivers/perfctr/version.h 2005-05-25 16:23:37.414933448 -0700 +++ linux-2.6.12-rc5-mm1-plug/drivers/perfctr/version.h 1969-12-31 16:00:00.000000000 -0800 @@ -1 +0,0 @@ -#define VERSION "2.7.15" diff -Naur linux-2.6.12-rc5-mm1/drivers/perfctr/virtual.c linux-2.6.12-rc5-mm1-plug/drivers/perfctr/virtual.c --- linux-2.6.12-rc5-mm1/drivers/perfctr/virtual.c 2005-05-25 16:23:37.416933144 -0700 +++ linux-2.6.12-rc5-mm1-plug/drivers/perfctr/virtual.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,1253 +0,0 @@ -/* $Id: virtual.c,v 1.115 2005/03/28 22:39:02 mikpe Exp $ - * Virtual per-process performance counters. - * - * Copyright (C) 1999-2005 Mikael Pettersson - */ -#include -#include -#include /* for unlikely() in 2.4.18 and older */ -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "cpumask.h" -#include "virtual.h" - -/**************************************************************** - * * - * Data types and macros. 
* - * * - ****************************************************************/ - -struct vperfctr { -/* User-visible fields: (must be first for mmap()) */ - struct perfctr_cpu_state cpu_state; -/* Kernel-private fields: */ - int si_signo; - atomic_t count; - spinlock_t owner_lock; - struct task_struct *owner; - /* sampling_timer and bad_cpus_allowed are frequently - accessed, so they get to share a cache line */ - unsigned int sampling_timer ____cacheline_aligned; -#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK - atomic_t bad_cpus_allowed; -#endif - unsigned int preserve; - unsigned int resume_cstatus; -#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT - unsigned int ireload_needed; /* only valid if resume_cstatus != 0 */ -#endif - /* children_lock protects inheritance_id and children, - when parent is not the one doing release_task() */ - spinlock_t children_lock; - unsigned long long inheritance_id; - struct perfctr_sum_ctrs children; - /* schedule_work() data for when an operation cannot be - done in the current context due to locking rules */ - struct work_struct work; - struct task_struct *parent_tsk; -}; -#define IS_RUNNING(perfctr) perfctr_cstatus_enabled((perfctr)->cpu_state.user.cstatus) - -#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT - -static void vperfctr_ihandler(unsigned long pc); -static void vperfctr_handle_overflow(struct task_struct*, struct vperfctr*); - -static inline void vperfctr_set_ihandler(void) -{ - perfctr_cpu_set_ihandler(vperfctr_ihandler); -} - -#else -static inline void vperfctr_set_ihandler(void) { } -#endif - -#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK - -static inline void vperfctr_init_bad_cpus_allowed(struct vperfctr *perfctr) -{ - atomic_set(&perfctr->bad_cpus_allowed, 0); -} - -#else /* !CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK */ -static inline void vperfctr_init_bad_cpus_allowed(struct vperfctr *perfctr) { } -#endif /* !CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK */ - -/**************************************************************** - * * - * Resource management. 
* - * * - ****************************************************************/ - -/* XXX: perhaps relax this to number of _live_ perfctrs */ -static DECLARE_MUTEX(nrctrs_mutex); -static int nrctrs; -static const char this_service[] = __FILE__; - -static int inc_nrctrs(void) -{ - const char *other; - - other = NULL; - down(&nrctrs_mutex); - if (++nrctrs == 1) { - other = perfctr_cpu_reserve(this_service); - if (other) - nrctrs = 0; - } - up(&nrctrs_mutex); - if (other) { - printk(KERN_ERR __FILE__ - ": cannot operate, perfctr hardware taken by '%s'\n", - other); - return -EBUSY; - } - vperfctr_set_ihandler(); - return 0; -} - -static void dec_nrctrs(void) -{ - down(&nrctrs_mutex); - if (--nrctrs == 0) - perfctr_cpu_release(this_service); - up(&nrctrs_mutex); -} - -/* Allocate a `struct vperfctr'. Claim and reserve - an entire page so that it can be mmap():ed. */ -static struct vperfctr *vperfctr_alloc(void) -{ - unsigned long page; - - if (inc_nrctrs() != 0) - return ERR_PTR(-EBUSY); - page = get_zeroed_page(GFP_KERNEL); - if (!page) { - dec_nrctrs(); - return ERR_PTR(-ENOMEM); - } - SetPageReserved(virt_to_page(page)); - return (struct vperfctr*) page; -} - -static void vperfctr_free(struct vperfctr *perfctr) -{ - ClearPageReserved(virt_to_page(perfctr)); - free_page((unsigned long)perfctr); - dec_nrctrs(); -} - -static struct vperfctr *get_empty_vperfctr(void) -{ - struct vperfctr *perfctr = vperfctr_alloc(); - if (!IS_ERR(perfctr)) { - atomic_set(&perfctr->count, 1); - vperfctr_init_bad_cpus_allowed(perfctr); - spin_lock_init(&perfctr->owner_lock); - spin_lock_init(&perfctr->children_lock); - } - return perfctr; -} - -static void put_vperfctr(struct vperfctr *perfctr) -{ - if (atomic_dec_and_test(&perfctr->count)) - vperfctr_free(perfctr); -} - -static void scheduled_vperfctr_free(void *perfctr) -{ - vperfctr_free((struct vperfctr*)perfctr); -} - -static void schedule_put_vperfctr(struct vperfctr *perfctr) -{ - if (!atomic_dec_and_test(&perfctr->count)) - return; - 
INIT_WORK(&perfctr->work, scheduled_vperfctr_free, perfctr); - schedule_work(&perfctr->work); -} - -static unsigned long long new_inheritance_id(void) -{ - static spinlock_t lock = SPIN_LOCK_UNLOCKED; - static unsigned long long counter; - unsigned long long id; - - spin_lock(&lock); - id = ++counter; - spin_unlock(&lock); - return id; -} - -/**************************************************************** - * * - * Basic counter operations. * - * These must all be called by the owner process only. * - * These must all be called with preemption disabled. * - * * - ****************************************************************/ - -/* PRE: IS_RUNNING(perfctr) - * Suspend the counters. - */ -static inline void vperfctr_suspend(struct vperfctr *perfctr) -{ - perfctr_cpu_suspend(&perfctr->cpu_state); -} - -static inline void vperfctr_reset_sampling_timer(struct vperfctr *perfctr) -{ - /* XXX: base the value on perfctr_info.cpu_khz instead! */ - perfctr->sampling_timer = HZ/2; -} - -/* PRE: perfctr == current->thread.perfctr && IS_RUNNING(perfctr) - * Restart the counters. - */ -static inline void vperfctr_resume(struct vperfctr *perfctr) -{ - perfctr_cpu_resume(&perfctr->cpu_state); - vperfctr_reset_sampling_timer(perfctr); -} - -static inline void vperfctr_resume_with_overflow_check(struct vperfctr *perfctr) -{ -#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT - if (perfctr_cpu_has_pending_interrupt(&perfctr->cpu_state)) { - vperfctr_handle_overflow(current, perfctr); - return; - } -#endif - vperfctr_resume(perfctr); -} - -/* Sample the counters but do not suspend them. */ -static void vperfctr_sample(struct vperfctr *perfctr) -{ - if (IS_RUNNING(perfctr)) { - perfctr_cpu_sample(&perfctr->cpu_state); - vperfctr_reset_sampling_timer(perfctr); - } -} - -#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT -/* vperfctr interrupt handler (XXX: add buffering support) */ -/* PREEMPT note: called in IRQ context with preemption disabled. 
*/ -static void vperfctr_ihandler(unsigned long pc) -{ - struct task_struct *tsk = current; - struct vperfctr *perfctr; - - perfctr = tsk->thread.perfctr; - if (!perfctr) { - printk(KERN_ERR "%s: BUG! pid %d has no vperfctr\n", - __FUNCTION__, tsk->pid); - return; - } - if (!perfctr_cstatus_has_ictrs(perfctr->cpu_state.user.cstatus)) { - printk(KERN_ERR "%s: BUG! vperfctr has cstatus %#x (pid %d, comm %s)\n", - __FUNCTION__, perfctr->cpu_state.user.cstatus, tsk->pid, tsk->comm); - return; - } - vperfctr_suspend(perfctr); - vperfctr_handle_overflow(tsk, perfctr); -} - -static void vperfctr_handle_overflow(struct task_struct *tsk, - struct vperfctr *perfctr) -{ - unsigned int pmc_mask; - siginfo_t si; - sigset_t old_blocked; - - pmc_mask = perfctr_cpu_identify_overflow(&perfctr->cpu_state); - if (!pmc_mask) { -#ifdef CONFIG_PPC64 - /* On some hardware (ppc64, in particular) it's - * impossible to control interrupts finely enough to - * eliminate overflows on counters we don't care - * about. So in this case just restart the counters - * and keep going. */ - vperfctr_resume(perfctr); -#else - printk(KERN_ERR "%s: BUG! 
pid %d has unidentifiable overflow source\n", - __FUNCTION__, tsk->pid); -#endif - return; - } - perfctr->ireload_needed = 1; - /* suspend a-mode and i-mode PMCs, leaving only TSC on */ - /* XXX: some people also want to suspend the TSC */ - perfctr->resume_cstatus = perfctr->cpu_state.user.cstatus; - if (perfctr_cstatus_has_tsc(perfctr->resume_cstatus)) { - perfctr->cpu_state.user.cstatus = perfctr_mk_cstatus(1, 0, 0); - vperfctr_resume(perfctr); - } else - perfctr->cpu_state.user.cstatus = 0; - si.si_signo = perfctr->si_signo; - si.si_errno = 0; - si.si_code = SI_PMC_OVF; - si.si_pmc_ovf_mask = pmc_mask; - - /* deliver signal without waking up the receiver */ - spin_lock_irq(&tsk->sighand->siglock); - old_blocked = tsk->blocked; - sigaddset(&tsk->blocked, si.si_signo); - spin_unlock_irq(&tsk->sighand->siglock); - - if (!send_sig_info(si.si_signo, &si, tsk)) - send_sig(si.si_signo, tsk, 1); - - spin_lock_irq(&tsk->sighand->siglock); - tsk->blocked = old_blocked; - recalc_sigpending(); - spin_unlock_irq(&tsk->sighand->siglock); -} -#endif - -/**************************************************************** - * * - * Process management operations. * - * These must all, with the exception of vperfctr_unlink() * - * and __vperfctr_set_cpus_allowed(), be called by the owner * - * process only. * - * * - ****************************************************************/ - -/* do_fork() -> copy_process() -> copy_thread() -> __vperfctr_copy(). - * Inherit the parent's perfctr settings to the child. - * PREEMPT note: do_fork() etc do not run with preemption disabled. -*/ -void __vperfctr_copy(struct task_struct *child_tsk, struct pt_regs *regs) -{ - struct vperfctr *parent_perfctr; - struct vperfctr *child_perfctr; - - /* Do not inherit perfctr settings to kernel-generated - threads, like those created by kmod. */ - child_perfctr = NULL; - if (!user_mode(regs)) - goto out; - - /* Allocation may sleep. Do it before the critical region. 
*/ - child_perfctr = get_empty_vperfctr(); - if (IS_ERR(child_perfctr)) { - child_perfctr = NULL; - goto out; - } - - /* Although we're executing in the parent, if it is scheduled - then a remote monitor may attach and change the perfctr - pointer or the object it points to. This may already have - occurred when we get here, so the old copy of the pointer - in the child cannot be trusted. */ - preempt_disable(); - parent_perfctr = current->thread.perfctr; - if (parent_perfctr) { - child_perfctr->cpu_state.control = parent_perfctr->cpu_state.control; - child_perfctr->si_signo = parent_perfctr->si_signo; - child_perfctr->inheritance_id = parent_perfctr->inheritance_id; - } - preempt_enable(); - if (!parent_perfctr) { - put_vperfctr(child_perfctr); - child_perfctr = NULL; - goto out; - } - (void)perfctr_cpu_update_control(&child_perfctr->cpu_state, 0); - child_perfctr->owner = child_tsk; - out: - child_tsk->thread.perfctr = child_perfctr; -} - -/* Called from exit_thread() or do_vperfctr_unlink(). - * If the counters are running, stop them and sample their final values. - * Mark the vperfctr object as dead. - * Optionally detach the vperfctr object from its owner task. - * PREEMPT note: exit_thread() does not run with preemption disabled. 
- */ -static void vperfctr_unlink(struct task_struct *owner, struct vperfctr *perfctr, int do_unlink) -{ - /* this synchronises with sys_vperfctr() */ - spin_lock(&perfctr->owner_lock); - perfctr->owner = NULL; - spin_unlock(&perfctr->owner_lock); - - /* perfctr suspend+detach must be atomic wrt process suspend */ - /* this also synchronises with perfctr_set_cpus_allowed() */ - task_lock(owner); - if (IS_RUNNING(perfctr) && owner == current) - vperfctr_suspend(perfctr); - if (do_unlink) - owner->thread.perfctr = NULL; - task_unlock(owner); - - perfctr->cpu_state.user.cstatus = 0; - perfctr->resume_cstatus = 0; - if (do_unlink) - put_vperfctr(perfctr); -} - -void __vperfctr_exit(struct vperfctr *perfctr) -{ - vperfctr_unlink(current, perfctr, 0); -} - -/* release_task() -> perfctr_release_task() -> __vperfctr_release(). - * A task is being released. If it inherited its perfctr settings - * from its parent, then merge its final counts back into the parent. - * Then unlink the child's perfctr. - * PRE: caller has write_lock_irq(&tasklist_lock). - * PREEMPT note: preemption is disabled due to tasklist_lock. - * - * When current == parent_tsk, the child's counts can be merged - * into the parent's immediately. This is the common case. - * - * When current != parent_tsk, the parent must be task_lock()ed - * before its perfctr state can be accessed. task_lock() is illegal - * here due to the write_lock_irq(&tasklist_lock) in release_task(), - * so the operation is done via schedule_work(). 
- */ -static void do_vperfctr_release(struct vperfctr *child_perfctr, struct task_struct *parent_tsk) -{ - struct vperfctr *parent_perfctr; - unsigned int cstatus, nrctrs, i; - - parent_perfctr = parent_tsk->thread.perfctr; - if (parent_perfctr && child_perfctr) { - spin_lock(&parent_perfctr->children_lock); - if (parent_perfctr->inheritance_id == child_perfctr->inheritance_id) { - cstatus = parent_perfctr->cpu_state.user.cstatus; - if (perfctr_cstatus_has_tsc(cstatus)) - parent_perfctr->children.tsc += - child_perfctr->cpu_state.user.tsc_sum + - child_perfctr->children.tsc; - nrctrs = perfctr_cstatus_nrctrs(cstatus); - for(i = 0; i < nrctrs; ++i) - parent_perfctr->children.pmc[i] += - child_perfctr->cpu_state.user.pmc[i].sum + - child_perfctr->children.pmc[i]; - } - spin_unlock(&parent_perfctr->children_lock); - } - schedule_put_vperfctr(child_perfctr); -} - -static void scheduled_release(void *data) -{ - struct vperfctr *child_perfctr = data; - struct task_struct *parent_tsk = child_perfctr->parent_tsk; - - task_lock(parent_tsk); - do_vperfctr_release(child_perfctr, parent_tsk); - task_unlock(parent_tsk); - put_task_struct(parent_tsk); -} - -void __vperfctr_release(struct task_struct *child_tsk) -{ - struct task_struct *parent_tsk = child_tsk->parent; - struct vperfctr *child_perfctr = child_tsk->thread.perfctr; - - child_tsk->thread.perfctr = NULL; - if (parent_tsk == current) - do_vperfctr_release(child_perfctr, parent_tsk); - else { - get_task_struct(parent_tsk); - INIT_WORK(&child_perfctr->work, scheduled_release, child_perfctr); - child_perfctr->parent_tsk = parent_tsk; - schedule_work(&child_perfctr->work); - } -} - -/* schedule() --> switch_to() --> .. --> __vperfctr_suspend(). - * If the counters are running, suspend them. - * PREEMPT note: switch_to() runs with preemption disabled. - */ -void __vperfctr_suspend(struct vperfctr *perfctr) -{ - if (IS_RUNNING(perfctr)) - vperfctr_suspend(perfctr); -} - -/* schedule() --> switch_to() --> .. 
--> __vperfctr_resume(). - * PRE: perfctr == current->thread.perfctr - * If the counters are runnable, resume them. - * PREEMPT note: switch_to() runs with preemption disabled. - */ -void __vperfctr_resume(struct vperfctr *perfctr) -{ - if (IS_RUNNING(perfctr)) { -#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK - if (unlikely(atomic_read(&perfctr->bad_cpus_allowed)) && - perfctr_cstatus_nrctrs(perfctr->cpu_state.user.cstatus)) { - perfctr->cpu_state.user.cstatus = 0; - perfctr->resume_cstatus = 0; - BUG_ON(current->state != TASK_RUNNING); - send_sig(SIGILL, current, 1); - return; - } -#endif - vperfctr_resume_with_overflow_check(perfctr); - } -} - -/* Called from update_one_process() [triggered by timer interrupt]. - * PRE: perfctr == current->thread.perfctr. - * Sample the counters but do not suspend them. - * Needed to avoid precision loss due to multiple counter - * wraparounds between resume/suspend for CPU-bound processes. - * PREEMPT note: called in IRQ context with preemption disabled. - */ -void __vperfctr_sample(struct vperfctr *perfctr) -{ - if (--perfctr->sampling_timer == 0) - vperfctr_sample(perfctr); -} - -#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK -/* Called from set_cpus_allowed(). - * PRE: current holds task_lock(owner) - * PRE: owner->thread.perfctr == perfctr - */ -void __vperfctr_set_cpus_allowed(struct task_struct *owner, - struct vperfctr *perfctr, - cpumask_t new_mask) -{ - if (cpus_intersects(new_mask, perfctr_cpus_forbidden_mask)) { - atomic_set(&perfctr->bad_cpus_allowed, 1); - if (printk_ratelimit()) - printk(KERN_WARNING "perfctr: process %d (comm %s) issued unsafe" - " set_cpus_allowed() on process %d (comm %s)\n", - current->pid, current->comm, owner->pid, owner->comm); - } else - atomic_set(&perfctr->bad_cpus_allowed, 0); -} -#endif - -/**************************************************************** - * * - * Virtual perfctr system calls implementation. 
* - * These can be called by the owner process (tsk == current), * - * a monitor process which has the owner under ptrace ATTACH * - * control (tsk && tsk != current), or anyone with a handle to * - * an unlinked perfctr (!tsk). * - * * - ****************************************************************/ - -static int do_vperfctr_write(struct vperfctr *perfctr, - unsigned int domain, - const void __user *srcp, - unsigned int srcbytes, - struct task_struct *tsk) -{ - void *tmp; - int err; - - if (!tsk) - return -ESRCH; /* attempt to update unlinked perfctr */ - - if (srcbytes > PAGE_SIZE) /* primitive sanity check */ - return -EINVAL; - tmp = kmalloc(srcbytes, GFP_USER); - if (!tmp) - return -ENOMEM; - err = -EFAULT; - if (copy_from_user(tmp, srcp, srcbytes)) - goto out_kfree; - - /* PREEMPT note: preemption is disabled over the entire - region since we're updating an active perfctr. */ - preempt_disable(); - if (IS_RUNNING(perfctr)) { - if (tsk == current) - vperfctr_suspend(perfctr); - perfctr->cpu_state.user.cstatus = 0; - perfctr->resume_cstatus = 0; - } - - switch (domain) { - case VPERFCTR_DOMAIN_CONTROL: { - struct vperfctr_control control; - - err = -EINVAL; - if (srcbytes > sizeof(control)) - break; - control.si_signo = perfctr->si_signo; - control.preserve = perfctr->preserve; - memcpy(&control, tmp, srcbytes); - /* XXX: validate si_signo? 
*/ - perfctr->si_signo = control.si_signo; - perfctr->preserve = control.preserve; - err = 0; - break; - } - case PERFCTR_DOMAIN_CPU_CONTROL: - err = -EINVAL; - if (srcbytes > sizeof(perfctr->cpu_state.control.header)) - break; - memcpy(&perfctr->cpu_state.control.header, tmp, srcbytes); - err = 0; - break; - case PERFCTR_DOMAIN_CPU_MAP: - err = -EINVAL; - if (srcbytes > sizeof(perfctr->cpu_state.control.pmc_map)) - break; - memcpy(perfctr->cpu_state.control.pmc_map, tmp, srcbytes); - err = 0; - break; - default: - err = perfctr_cpu_control_write(&perfctr->cpu_state.control, - domain, tmp, srcbytes); - } - - preempt_enable(); - out_kfree: - kfree(tmp); - return err; -} - -static int vperfctr_enable_control(struct vperfctr *perfctr, struct task_struct *tsk) -{ - int err; - unsigned int next_cstatus; - unsigned int nrctrs, i; - - if (perfctr->cpu_state.control.header.nractrs || - perfctr->cpu_state.control.header.nrictrs) { - cpumask_t old_mask, new_mask; - - old_mask = tsk->cpus_allowed; - cpus_andnot(new_mask, old_mask, perfctr_cpus_forbidden_mask); - - if (cpus_empty(new_mask)) - return -EINVAL; - if (!cpus_equal(new_mask, old_mask)) - set_cpus_allowed(tsk, new_mask); - } - - perfctr->cpu_state.user.cstatus = 0; - perfctr->resume_cstatus = 0; - - /* remote access note: perfctr_cpu_update_control() is ok */ - err = perfctr_cpu_update_control(&perfctr->cpu_state, 0); - if (err < 0) - return err; - next_cstatus = perfctr->cpu_state.user.cstatus; - if (!perfctr_cstatus_enabled(next_cstatus)) - return 0; - - if (!perfctr_cstatus_has_tsc(next_cstatus)) - perfctr->cpu_state.user.tsc_sum = 0; - - nrctrs = perfctr_cstatus_nrctrs(next_cstatus); - for(i = 0; i < nrctrs; ++i) - if (!(perfctr->preserve & (1<cpu_state.user.pmc[i].sum = 0; - - spin_lock(&perfctr->children_lock); - perfctr->inheritance_id = new_inheritance_id(); - memset(&perfctr->children, 0, sizeof perfctr->children); - spin_unlock(&perfctr->children_lock); - - return 0; -} - -static inline void 
vperfctr_ireload(struct vperfctr *perfctr) -{ -#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT - if (perfctr->ireload_needed) { - perfctr->ireload_needed = 0; - /* remote access note: perfctr_cpu_ireload() is ok */ - perfctr_cpu_ireload(&perfctr->cpu_state); - } -#endif -} - -static int do_vperfctr_resume(struct vperfctr *perfctr, struct task_struct *tsk) -{ - unsigned int resume_cstatus; - int ret; - - if (!tsk) - return -ESRCH; /* attempt to update unlinked perfctr */ - - /* PREEMPT note: preemption is disabled over the entire - region because we're updating an active perfctr. */ - preempt_disable(); - - if (IS_RUNNING(perfctr) && tsk == current) - vperfctr_suspend(perfctr); - - resume_cstatus = perfctr->resume_cstatus; - if (perfctr_cstatus_enabled(resume_cstatus)) { - perfctr->cpu_state.user.cstatus = resume_cstatus; - perfctr->resume_cstatus = 0; - vperfctr_ireload(perfctr); - ret = 0; - } else { - ret = vperfctr_enable_control(perfctr, tsk); - resume_cstatus = perfctr->cpu_state.user.cstatus; - } - - if (ret >= 0 && perfctr_cstatus_enabled(resume_cstatus) && tsk == current) - vperfctr_resume(perfctr); - - preempt_enable(); - - return ret; -} - -static int do_vperfctr_suspend(struct vperfctr *perfctr, struct task_struct *tsk) -{ - if (!tsk) - return -ESRCH; /* attempt to update unlinked perfctr */ - - /* PREEMPT note: preemption is disabled over the entire - region since we're updating an active perfctr. 
*/ - preempt_disable(); - - if (IS_RUNNING(perfctr)) { - if (tsk == current) - vperfctr_suspend(perfctr); - perfctr->resume_cstatus = perfctr->cpu_state.user.cstatus; - perfctr->cpu_state.user.cstatus = 0; - } - - preempt_enable(); - - return 0; -} - -static int do_vperfctr_unlink(struct vperfctr *perfctr, struct task_struct *tsk) -{ - if (tsk) - vperfctr_unlink(tsk, perfctr, 1); - return 0; -} - -static int do_vperfctr_clear(struct vperfctr *perfctr, struct task_struct *tsk) -{ - if (!tsk) - return -ESRCH; /* attempt to update unlinked perfctr */ - - /* PREEMPT note: preemption is disabled over the entire - region because we're updating an active perfctr. */ - preempt_disable(); - - if (IS_RUNNING(perfctr) && tsk == current) - vperfctr_suspend(perfctr); - - memset(&perfctr->cpu_state, 0, sizeof perfctr->cpu_state); - perfctr->resume_cstatus = 0; - - spin_lock(&perfctr->children_lock); - perfctr->inheritance_id = 0; - memset(&perfctr->children, 0, sizeof perfctr->children); - spin_unlock(&perfctr->children_lock); - - preempt_enable(); - - return 0; -} - -static int do_vperfctr_control(struct vperfctr *perfctr, - unsigned int cmd, - struct task_struct *tsk) -{ - switch (cmd) { - case VPERFCTR_CONTROL_UNLINK: - return do_vperfctr_unlink(perfctr, tsk); - case VPERFCTR_CONTROL_SUSPEND: - return do_vperfctr_suspend(perfctr, tsk); - case VPERFCTR_CONTROL_RESUME: - return do_vperfctr_resume(perfctr, tsk); - case VPERFCTR_CONTROL_CLEAR: - return do_vperfctr_clear(perfctr, tsk); - default: - return -EINVAL; - } -} - -static int do_vperfctr_read(struct vperfctr *perfctr, - unsigned int domain, - void __user *dstp, - unsigned int dstbytes, - struct task_struct *tsk) -{ - union { - struct perfctr_sum_ctrs sum; - struct vperfctr_control control; - struct perfctr_sum_ctrs children; - } *tmp; - unsigned int tmpbytes; - int ret; - - tmpbytes = dstbytes; - if (tmpbytes > PAGE_SIZE) /* primitive sanity check */ - return -EINVAL; - if (tmpbytes < sizeof(*tmp)) - tmpbytes = 
sizeof(*tmp); - tmp = kmalloc(tmpbytes, GFP_USER); - if (!tmp) - return -ENOMEM; - - /* PREEMPT note: While we're reading our own control, another - process may ptrace ATTACH to us and update our control. - Disable preemption to ensure we get a consistent copy. - Not needed for other cases since the perfctr is either - unlinked or its owner is ptrace ATTACH suspended by us. */ - if (tsk == current) - preempt_disable(); - - switch (domain) { - case VPERFCTR_DOMAIN_SUM: { - int j; - - vperfctr_sample(perfctr); - tmp->sum.tsc = perfctr->cpu_state.user.tsc_sum; - for(j = 0; j < ARRAY_SIZE(tmp->sum.pmc); ++j) - tmp->sum.pmc[j] = perfctr->cpu_state.user.pmc[j].sum; - ret = sizeof(tmp->sum); - break; - } - case VPERFCTR_DOMAIN_CONTROL: - tmp->control.si_signo = perfctr->si_signo; - tmp->control.preserve = perfctr->preserve; - ret = sizeof(tmp->control); - break; - case VPERFCTR_DOMAIN_CHILDREN: - if (tsk) - spin_lock(&perfctr->children_lock); - tmp->children = perfctr->children; - if (tsk) - spin_unlock(&perfctr->children_lock); - ret = sizeof(tmp->children); - break; - case PERFCTR_DOMAIN_CPU_CONTROL: - if (tmpbytes > sizeof(perfctr->cpu_state.control.header)) - tmpbytes = sizeof(perfctr->cpu_state.control.header); - memcpy(tmp, &perfctr->cpu_state.control.header, tmpbytes); - ret = tmpbytes; - break; - case PERFCTR_DOMAIN_CPU_MAP: - if (tmpbytes > sizeof(perfctr->cpu_state.control.pmc_map)) - tmpbytes = sizeof(perfctr->cpu_state.control.pmc_map); - memcpy(tmp, perfctr->cpu_state.control.pmc_map, tmpbytes); - ret = tmpbytes; - break; - default: - ret = -EFAULT; - if (copy_from_user(tmp, dstp, dstbytes) == 0) - ret = perfctr_cpu_control_read(&perfctr->cpu_state.control, - domain, tmp, dstbytes); - } - - if (tsk == current) - preempt_enable(); - - if (ret > 0) { - if (ret > dstbytes) - ret = dstbytes; - if (ret > 0 && copy_to_user(dstp, tmp, ret)) - ret = -EFAULT; - } - kfree(tmp); - return ret; -} - -/**************************************************************** - * * 
- * Virtual perfctr file operations. * - * * - ****************************************************************/ - -static int vperfctr_mmap(struct file *filp, struct vm_area_struct *vma) -{ - struct vperfctr *perfctr; - - /* Only allow read-only mapping of first page. */ - if ((vma->vm_end - vma->vm_start) != PAGE_SIZE || - vma->vm_pgoff != 0 || - (pgprot_val(vma->vm_page_prot) & _PAGE_RW) || - (vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) - return -EPERM; - perfctr = filp->private_data; - if (!perfctr) - return -EPERM; - return remap_pfn_range(vma, vma->vm_start, - virt_to_phys(perfctr) >> PAGE_SHIFT, - PAGE_SIZE, vma->vm_page_prot); -} - -static int vperfctr_release(struct inode *inode, struct file *filp) -{ - struct vperfctr *perfctr = filp->private_data; - filp->private_data = NULL; - if (perfctr) - put_vperfctr(perfctr); - return 0; -} - -static struct file_operations vperfctr_file_ops = { - .mmap = vperfctr_mmap, - .release = vperfctr_release, -}; - -/**************************************************************** - * * - * File system for virtual perfctrs. Based on pipefs. * - * * - ****************************************************************/ - -#define VPERFCTRFS_MAGIC (('V'<<24)|('P'<<16)|('M'<<8)|('C')) - -/* The code to set up a `struct file_system_type' for a pseudo fs - is unfortunately not the same in 2.4 and 2.6. 
*/ -#include /* needed for 2.6, included by fs.h in 2.4 */ - -static struct super_block * -vperfctrfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return get_sb_pseudo(fs_type, "vperfctr:", NULL, VPERFCTRFS_MAGIC); -} - -static struct file_system_type vperfctrfs_type = { - .name = "vperfctrfs", - .get_sb = vperfctrfs_get_sb, - .kill_sb = kill_anon_super, -}; - -/* XXX: check if s/vperfctr_mnt/vperfctrfs_type.kern_mnt/ would work */ -static struct vfsmount *vperfctr_mnt; -#define vperfctr_fs_init_done() (vperfctr_mnt != NULL) - -static int __init vperfctrfs_init(void) -{ - int err = register_filesystem(&vperfctrfs_type); - if (!err) { - vperfctr_mnt = kern_mount(&vperfctrfs_type); - if (!IS_ERR(vperfctr_mnt)) - return 0; - err = PTR_ERR(vperfctr_mnt); - unregister_filesystem(&vperfctrfs_type); - vperfctr_mnt = NULL; - } - return err; -} - -static void __exit vperfctrfs_exit(void) -{ - unregister_filesystem(&vperfctrfs_type); - mntput(vperfctr_mnt); -} - -static struct inode *vperfctr_get_inode(void) -{ - struct inode *inode; - - inode = new_inode(vperfctr_mnt->mnt_sb); - if (!inode) - return NULL; - inode->i_fop = &vperfctr_file_ops; - inode->i_state = I_DIRTY; - inode->i_mode = S_IFCHR | S_IRUSR | S_IWUSR; - inode->i_uid = current->fsuid; - inode->i_gid = current->fsgid; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->i_blksize = 0; - return inode; -} - -static int vperfctrfs_delete_dentry(struct dentry *dentry) -{ - return 1; -} - -static struct dentry_operations vperfctrfs_dentry_operations = { - .d_delete = vperfctrfs_delete_dentry, -}; - -static struct dentry *vperfctr_d_alloc_root(struct inode *inode) -{ - struct qstr this; - char name[32]; - struct dentry *dentry; - - sprintf(name, "[%lu]", inode->i_ino); - this.name = name; - this.len = strlen(name); - this.hash = inode->i_ino; /* will go */ - dentry = d_alloc(vperfctr_mnt->mnt_sb->s_root, &this); - if (dentry) { - dentry->d_op = 
&vperfctrfs_dentry_operations; - d_add(dentry, inode); - } - return dentry; -} - -static struct file *vperfctr_get_filp(void) -{ - struct file *filp; - struct inode *inode; - struct dentry *dentry; - - filp = get_empty_filp(); - if (!filp) - goto out; - inode = vperfctr_get_inode(); - if (!inode) - goto out_filp; - dentry = vperfctr_d_alloc_root(inode); - if (!dentry) - goto out_inode; - - filp->f_vfsmnt = mntget(vperfctr_mnt); - filp->f_dentry = dentry; - filp->f_mapping = dentry->d_inode->i_mapping; - - filp->f_pos = 0; - filp->f_flags = 0; - filp->f_op = &vperfctr_file_ops; /* fops_get() if MODULE */ - filp->f_mode = FMODE_READ; - filp->f_version = 0; - - return filp; - - out_inode: - iput(inode); - out_filp: - put_filp(filp); /* doesn't run ->release() like fput() does */ - out: - return NULL; -} - -/**************************************************************** - * * - * Virtual perfctr actual system calls. * - * * - ****************************************************************/ - -/* tid is the actual task/thread id (née pid, stored as ->pid), - pid/tgid is that 2.6 thread group id crap (stored as ->tgid) */ -asmlinkage long sys_vperfctr_open(int tid, int creat) -{ - struct file *filp; - struct task_struct *tsk; - struct vperfctr *perfctr; - int err; - int fd; - - if (!vperfctr_fs_init_done()) - return -ENODEV; - filp = vperfctr_get_filp(); - if (!filp) - return -ENOMEM; - err = fd = get_unused_fd(); - if (err < 0) - goto err_filp; - perfctr = NULL; - if (creat) { - perfctr = get_empty_vperfctr(); /* may sleep */ - if (IS_ERR(perfctr)) { - err = PTR_ERR(perfctr); - goto err_fd; - } - } - tsk = current; - if (tid != 0 && tid != tsk->pid) { /* remote? 
*/ - read_lock(&tasklist_lock); - tsk = find_task_by_pid(tid); - if (tsk) - get_task_struct(tsk); - read_unlock(&tasklist_lock); - err = -ESRCH; - if (!tsk) - goto err_perfctr; - err = ptrace_check_attach(tsk, 0); - if (err < 0) - goto err_tsk; - } - if (creat) { - /* check+install must be atomic to prevent remote-control races */ - task_lock(tsk); - if (!tsk->thread.perfctr) { - perfctr->owner = tsk; - tsk->thread.perfctr = perfctr; - err = 0; - } else - err = -EEXIST; - task_unlock(tsk); - if (err) - goto err_tsk; - } else { - perfctr = tsk->thread.perfctr; - /* XXX: Old API needed to allow NULL perfctr here. - Do we want to keep or change that rule? */ - } - filp->private_data = perfctr; - if (perfctr) - atomic_inc(&perfctr->count); - if (tsk != current) - put_task_struct(tsk); - fd_install(fd, filp); - return fd; - err_tsk: - if (tsk != current) - put_task_struct(tsk); - err_perfctr: - if (perfctr) /* can only occur if creat != 0 */ - put_vperfctr(perfctr); - err_fd: - put_unused_fd(fd); - err_filp: - fput(filp); - return err; -} - -static struct vperfctr *fd_get_vperfctr(int fd) -{ - struct vperfctr *perfctr; - struct file *filp; - int err; - - err = -EBADF; - filp = fget(fd); - if (!filp) - goto out; - err = -EINVAL; - if (filp->f_op != &vperfctr_file_ops) - goto out_filp; - perfctr = filp->private_data; - if (!perfctr) - goto out_filp; - atomic_inc(&perfctr->count); - fput(filp); - return perfctr; - out_filp: - fput(filp); - out: - return ERR_PTR(err); -} - -static struct task_struct *vperfctr_get_tsk(struct vperfctr *perfctr) -{ - struct task_struct *tsk; - - tsk = current; - if (perfctr != current->thread.perfctr) { - /* this synchronises with vperfctr_unlink() and itself */ - spin_lock(&perfctr->owner_lock); - tsk = perfctr->owner; - if (tsk) - get_task_struct(tsk); - spin_unlock(&perfctr->owner_lock); - if (tsk) { - int ret = ptrace_check_attach(tsk, 0); - if (ret < 0) { - put_task_struct(tsk); - return ERR_PTR(ret); - } - } - } - return tsk; -} - 
-static void vperfctr_put_tsk(struct task_struct *tsk) -{ - if (tsk && tsk != current) - put_task_struct(tsk); -} - -asmlinkage long sys_vperfctr_write(int fd, unsigned int domain, - const void __user *argp, - unsigned int argbytes) -{ - struct vperfctr *perfctr; - struct task_struct *tsk; - int ret; - - perfctr = fd_get_vperfctr(fd); - if (IS_ERR(perfctr)) - return PTR_ERR(perfctr); - tsk = vperfctr_get_tsk(perfctr); - if (IS_ERR(tsk)) { - ret = PTR_ERR(tsk); - goto out; - } - ret = do_vperfctr_write(perfctr, domain, argp, argbytes, tsk); - vperfctr_put_tsk(tsk); - out: - put_vperfctr(perfctr); - return ret; -} - -asmlinkage long sys_vperfctr_control(int fd, unsigned int cmd) -{ - struct vperfctr *perfctr; - struct task_struct *tsk; - int ret; - - perfctr = fd_get_vperfctr(fd); - if (IS_ERR(perfctr)) - return PTR_ERR(perfctr); - tsk = vperfctr_get_tsk(perfctr); - if (IS_ERR(tsk)) { - ret = PTR_ERR(tsk); - goto out; - } - ret = do_vperfctr_control(perfctr, cmd, tsk); - vperfctr_put_tsk(tsk); - out: - put_vperfctr(perfctr); - return ret; -} - -asmlinkage long sys_vperfctr_read(int fd, unsigned int domain, - void __user *argp, unsigned int argbytes) -{ - struct vperfctr *perfctr; - struct task_struct *tsk; - int ret; - - perfctr = fd_get_vperfctr(fd); - if (IS_ERR(perfctr)) - return PTR_ERR(perfctr); - tsk = vperfctr_get_tsk(perfctr); - if (IS_ERR(tsk)) { - ret = PTR_ERR(tsk); - goto out; - } - ret = do_vperfctr_read(perfctr, domain, argp, argbytes, tsk); - vperfctr_put_tsk(tsk); - out: - put_vperfctr(perfctr); - return ret; -} - -/**************************************************************** - * * - * module_init/exit * - * * - ****************************************************************/ - -int __init vperfctr_init(void) -{ - return vperfctrfs_init(); -} - -void __exit vperfctr_exit(void) -{ - vperfctrfs_exit(); -} diff -Naur linux-2.6.12-rc5-mm1/drivers/perfctr/virtual.h linux-2.6.12-rc5-mm1-plug/drivers/perfctr/virtual.h --- 
linux-2.6.12-rc5-mm1/drivers/perfctr/virtual.h 2005-05-25 16:23:37.417932992 -0700 +++ linux-2.6.12-rc5-mm1-plug/drivers/perfctr/virtual.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,13 +0,0 @@ -/* $Id: virtual.h,v 1.13 2004/05/31 18:18:55 mikpe Exp $ - * Virtual per-process performance counters. - * - * Copyright (C) 1999-2004 Mikael Pettersson - */ - -#ifdef CONFIG_PERFCTR_VIRTUAL -extern int vperfctr_init(void); -extern void vperfctr_exit(void); -#else -static inline int vperfctr_init(void) { return 0; } -static inline void vperfctr_exit(void) { } -#endif diff -Naur linux-2.6.12-rc5-mm1/drivers/perfctr/x86.c linux-2.6.12-rc5-mm1-plug/drivers/perfctr/x86.c --- linux-2.6.12-rc5-mm1/drivers/perfctr/x86.c 2005-05-25 16:23:37.445928736 -0700 +++ linux-2.6.12-rc5-mm1-plug/drivers/perfctr/x86.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,1795 +0,0 @@ -/* $Id: x86.c,v 1.158 2005/04/08 14:36:49 mikpe Exp $ - * x86/x86_64 performance-monitoring counters driver. - * - * Copyright (C) 1999-2005 Mikael Pettersson - */ -#include -#include -#include -#include -#include - -#include -#undef MSR_P6_PERFCTR0 -#undef MSR_IA32_MISC_ENABLE -#include -#include -struct hw_interrupt_type; -#include -#include /* cpu_khz */ - -#include "cpumask.h" -#include "x86_tests.h" - -/* Support for lazy evntsel and perfctr MSR updates. */ -struct per_cpu_cache { /* roughly a subset of perfctr_cpu_state */ - unsigned int id; /* cache owner id */ -#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT - unsigned int interrupts_masked; -#endif - struct { - /* NOTE: these caches have physical indices, not virtual */ - unsigned int evntsel[18]; - unsigned int escr[0x3E2-0x3A0]; - unsigned int pebs_enable; - unsigned int pebs_matrix_vert; - } control; -}; -static DEFINE_PER_CPU(struct per_cpu_cache, per_cpu_cache); -#define __get_cpu_cache(cpu) (&per_cpu(per_cpu_cache, cpu)) -#define get_cpu_cache() (&__get_cpu_var(per_cpu_cache)) - -/* Structure for counter snapshots, as 32-bit values. 
*/ -struct perfctr_low_ctrs { - unsigned int tsc; - unsigned int pmc[18]; -}; - -/* Intel P5, Cyrix 6x86MX/MII/III, Centaur WinChip C6/2/3 */ -#define MSR_P5_CESR 0x11 -#define MSR_P5_CTR0 0x12 /* .. 0x13 */ -#define P5_CESR_CPL 0x00C0 -#define P5_CESR_RESERVED (~0x01FF) -#define MII_CESR_RESERVED (~0x05FF) -#define C6_CESR_RESERVED (~0x00FF) - -/* Intel P6, VIA C3 */ -#define MSR_P6_PERFCTR0 0xC1 /* .. 0xC2 */ -#define MSR_P6_EVNTSEL0 0x186 /* .. 0x187 */ -#define P6_EVNTSEL_ENABLE 0x00400000 -#define P6_EVNTSEL_INT 0x00100000 -#define P6_EVNTSEL_CPL 0x00030000 -#define P6_EVNTSEL_RESERVED 0x00280000 -#define VC3_EVNTSEL1_RESERVED (~0x1FF) - -/* AMD K7 */ -#define MSR_K7_EVNTSEL0 0xC0010000 /* .. 0xC0010003 */ -#define MSR_K7_PERFCTR0 0xC0010004 /* .. 0xC0010007 */ - -/* AMD K8 */ -#define IS_K8_NB_EVENT(EVNTSEL) ((((EVNTSEL) >> 5) & 0x7) == 0x7) - -/* Intel P4, Intel Pentium M */ -#define MSR_IA32_MISC_ENABLE 0x1A0 -#define MSR_IA32_MISC_ENABLE_PERF_AVAIL (1<<7) /* read-only status bit */ -#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL (1<<12) /* read-only status bit */ - -/* Intel P4 */ -#define MSR_P4_PERFCTR0 0x300 /* .. 0x311 */ -#define MSR_P4_CCCR0 0x360 /* .. 0x371 */ -#define MSR_P4_ESCR0 0x3A0 /* .. 
0x3E1, with some gaps */ - -#define MSR_P4_PEBS_ENABLE 0x3F1 -#define P4_PE_REPLAY_TAG_BITS 0x00000607 -#define P4_PE_UOP_TAG 0x01000000 -#define P4_PE_RESERVED 0xFEFFF9F8 /* only allow ReplayTagging */ - -#define MSR_P4_PEBS_MATRIX_VERT 0x3F2 -#define P4_PMV_REPLAY_TAG_BITS 0x00000003 -#define P4_PMV_RESERVED 0xFFFFFFFC - -#define P4_CCCR_OVF 0x80000000 -#define P4_CCCR_CASCADE 0x40000000 -#define P4_CCCR_OVF_PMI_T1 0x08000000 -#define P4_CCCR_OVF_PMI_T0 0x04000000 -#define P4_CCCR_FORCE_OVF 0x02000000 -#define P4_CCCR_ACTIVE_THREAD 0x00030000 -#define P4_CCCR_ENABLE 0x00001000 -#define P4_CCCR_ESCR_SELECT(X) (((X) >> 13) & 0x7) -#define P4_CCCR_EXTENDED_CASCADE 0x00000800 -#define P4_CCCR_RESERVED (0x300007FF|P4_CCCR_OVF|P4_CCCR_OVF_PMI_T1) - -#define P4_ESCR_CPL_T1 0x00000003 -#define P4_ESCR_CPL_T0 0x0000000C -#define P4_ESCR_TAG_ENABLE 0x00000010 -#define P4_ESCR_RESERVED (0x80000000) - -#define P4_FAST_RDPMC 0x80000000 -#define P4_MASK_FAST_RDPMC 0x0000001F /* we only need low 5 bits */ - -/* missing from */ -#define cpu_has_msr boot_cpu_has(X86_FEATURE_MSR) - -#define rdmsr_low(msr,low) \ - __asm__ __volatile__("rdmsr" : "=a"(low) : "c"(msr) : "edx") -#define rdpmc_low(ctr,low) \ - __asm__ __volatile__("rdpmc" : "=a"(low) : "c"(ctr) : "edx") - -static void clear_msr_range(unsigned int base, unsigned int n) -{ - unsigned int i; - - for(i = 0; i < n; ++i) - wrmsr(base+i, 0, 0); -} - -static inline void set_in_cr4_local(unsigned int mask) -{ - write_cr4(read_cr4() | mask); -} - -static inline void clear_in_cr4_local(unsigned int mask) -{ - write_cr4(read_cr4() & ~mask); -} - -static unsigned int new_id(void) -{ - static DEFINE_SPINLOCK(lock); - static unsigned int counter; - int id; - - spin_lock(&lock); - id = ++counter; - spin_unlock(&lock); - return id; -} - -#ifdef CONFIG_X86_LOCAL_APIC -static void perfctr_default_ihandler(unsigned long pc) -{ -} - -static perfctr_ihandler_t perfctr_ihandler = perfctr_default_ihandler; - -asmlinkage void 
smp_perfctr_interrupt(struct pt_regs *regs) -{ - /* PREEMPT note: invoked via an interrupt gate, which - masks interrupts. We're still on the originating CPU. */ - /* XXX: recursive interrupts? delay the ACK, mask LVTPC, or queue? */ - ack_APIC_irq(); - if (get_cpu_cache()->interrupts_masked) - return; - irq_enter(); - (*perfctr_ihandler)(instruction_pointer(regs)); - irq_exit(); -} - -void perfctr_cpu_set_ihandler(perfctr_ihandler_t ihandler) -{ - perfctr_ihandler = ihandler ? ihandler : perfctr_default_ihandler; -} - -static inline void perfctr_cpu_mask_interrupts(struct per_cpu_cache *cache) -{ - cache->interrupts_masked = 1; -} - -static inline void perfctr_cpu_unmask_interrupts(struct per_cpu_cache *cache) -{ - cache->interrupts_masked = 0; -} - -#else -#define perfctr_cstatus_has_ictrs(cstatus) 0 -#undef cpu_has_apic -#define cpu_has_apic 0 -#undef apic_write -#define apic_write(reg,vector) do{}while(0) -#endif - -#if defined(CONFIG_SMP) - -static inline void -set_isuspend_cpu(struct perfctr_cpu_state *state, int cpu) -{ - state->isuspend_cpu = cpu; -} - -static inline int -is_isuspend_cpu(const struct perfctr_cpu_state *state, int cpu) -{ - return state->isuspend_cpu == cpu; -} - -static inline void clear_isuspend_cpu(struct perfctr_cpu_state *state) -{ - state->isuspend_cpu = NR_CPUS; -} - -#else -static inline void set_isuspend_cpu(struct perfctr_cpu_state *state, int cpu) { } -static inline int is_isuspend_cpu(const struct perfctr_cpu_state *state, int cpu) { return 1; } -static inline void clear_isuspend_cpu(struct perfctr_cpu_state *state) { } -#endif - -/**************************************************************** - * * - * Driver procedures. * - * * - ****************************************************************/ - -/* - * Intel P5 family (Pentium, family code 5). - * - One TSC and two 40-bit PMCs. - * - A single 32-bit CESR (MSR 0x11) controls both PMCs. - * CESR has two halves, each controlling one PMC. 
- * - Overflow interrupts are not available. - * - Pentium MMX added the RDPMC instruction. RDPMC has lower - * overhead than RDMSR and it can be used in user-mode code. - * - The MMX events are not symmetric: some events are only available - * for some PMC, and some event codes denote different events - * depending on which PMCs they control. - */ - -/* shared with MII and C6 */ -static int p5_like_check_control(struct perfctr_cpu_state *state, - unsigned int reserved_bits, int is_c6) -{ - unsigned short cesr_half[2]; - unsigned int pmc, evntsel, i; - - if (state->control.header.nrictrs != 0 || state->control.header.nractrs > 2) - return -EINVAL; - cesr_half[0] = 0; - cesr_half[1] = 0; - for(i = 0; i < state->control.header.nractrs; ++i) { - pmc = state->control.pmc_map[i]; - if (pmc > 1 || cesr_half[pmc] != 0) - return -EINVAL; - evntsel = state->control.evntsel[0]; - if (pmc == 0) - evntsel &= 0xffff; - else - evntsel >>= 16; - /* protect reserved bits */ - if ((evntsel & reserved_bits) != 0) - return -EPERM; - /* the CPL field (if defined) must be non-zero */ - if (!is_c6 && !(evntsel & P5_CESR_CPL)) - return -EINVAL; - cesr_half[pmc] = evntsel; - } - state->id = (cesr_half[1] << 16) | cesr_half[0]; - return 0; -} - -static int p5_check_control(struct perfctr_cpu_state *state, int is_global) -{ - return p5_like_check_control(state, P5_CESR_RESERVED, 0); -} - -/* shared with MII but not C6 */ -static void p5_write_control(const struct perfctr_cpu_state *state) -{ - struct per_cpu_cache *cache; - unsigned int cesr; - - cesr = state->id; - if (!cesr) /* no PMC is on (this test doesn't work on C6) */ - return; - cache = get_cpu_cache(); - if (cache->id != cesr) { - cache->id = cesr; - wrmsr(MSR_P5_CESR, cesr, 0); - } -} - -static void p5_read_counters(const struct perfctr_cpu_state *state, - struct perfctr_low_ctrs *ctrs) -{ - unsigned int cstatus, nrctrs, i; - - /* The P5 doesn't allocate a cache line on a write miss, so do - a dummy read to avoid a write miss 
here _and_ a read miss - later in our caller. */ - asm("" : : "r"(ctrs->tsc)); - - cstatus = state->user.cstatus; - if (perfctr_cstatus_has_tsc(cstatus)) - rdtscl(ctrs->tsc); - nrctrs = perfctr_cstatus_nractrs(cstatus); - for(i = 0; i < nrctrs; ++i) { - unsigned int pmc = state->control.pmc_map[i]; - rdmsr_low(MSR_P5_CTR0+pmc, ctrs->pmc[i]); - } -} - -/* used by all except pre-MMX P5 */ -static void rdpmc_read_counters(const struct perfctr_cpu_state *state, - struct perfctr_low_ctrs *ctrs) -{ - unsigned int cstatus, nrctrs, i; - - cstatus = state->user.cstatus; - if (perfctr_cstatus_has_tsc(cstatus)) - rdtscl(ctrs->tsc); - nrctrs = perfctr_cstatus_nractrs(cstatus); - for(i = 0; i < nrctrs; ++i) { - unsigned int pmc = state->control.pmc_map[i]; - rdpmc_low(pmc, ctrs->pmc[i]); - } -} - -/* shared with MII and C6 */ -static void p5_clear_counters(void) -{ - clear_msr_range(MSR_P5_CESR, 1+2); -} - -/* - * Cyrix 6x86/MII/III. - * - Same MSR assignments as P5 MMX. Has RDPMC and two 48-bit PMCs. - * - Event codes and CESR formatting as in the plain P5 subset. - * - Many but not all P5 MMX event codes are implemented. - * - Cyrix adds a few more event codes. The event code is widened - * to 7 bits, and Cyrix puts the high bit in CESR bit 10 - * (and CESR bit 26 for PMC1). - */ - -static int mii_check_control(struct perfctr_cpu_state *state, int is_global) -{ - return p5_like_check_control(state, MII_CESR_RESERVED, 0); -} - -/* - * Centaur WinChip C6/2/3. - * - Same MSR assignments as P5 MMX. Has RDPMC and two 40-bit PMCs. - * - CESR is formatted with two halves, like P5. However, there - * are no defined control fields for e.g. CPL selection, and - * there is no defined method for stopping the counters. - * - Only a few event codes are defined. - * - The 64-bit TSC is synthesised from the low 32 bits of the - * two PMCs, and CESR has to be set up appropriately. - * Reprogramming CESR causes RDTSC to yield invalid results. 
- * (The C6 may also hang in this case, due to C6 erratum I-13.) - * Therefore, using the PMCs on any of these processors requires - * that the TSC is not accessed at all: - * 1. The kernel must be configured or a TSC-less processor, i.e. - * generic 586 or less. - * 2. The "notsc" boot parameter must be passed to the kernel. - * 3. User-space libraries and code must also be configured and - * compiled for a generic 586 or less. - */ - -#if !defined(CONFIG_X86_TSC) -static int c6_check_control(struct perfctr_cpu_state *state, int is_global) -{ - if (state->control.header.tsc_on) - return -EINVAL; - return p5_like_check_control(state, C6_CESR_RESERVED, 1); -} - -static void c6_write_control(const struct perfctr_cpu_state *state) -{ - struct per_cpu_cache *cache; - unsigned int cesr; - - if (perfctr_cstatus_nractrs(state->user.cstatus) == 0) /* no PMC is on */ - return; - cache = get_cpu_cache(); - cesr = state->id; - if (cache->id != cesr) { - cache->id = cesr; - wrmsr(MSR_P5_CESR, cesr, 0); - } -} -#endif - -/* - * Intel P6 family (Pentium Pro, Pentium II, and Pentium III cores, - * and Xeon and Celeron versions of Pentium II and III cores). - * - One TSC and two 40-bit PMCs. - * - One 32-bit EVNTSEL MSR for each PMC. - * - EVNTSEL0 contains a global enable/disable bit. - * That bit is reserved in EVNTSEL1. - * - Each EVNTSEL contains a CPL field. - * - Overflow interrupts are possible, but requires that the - * local APIC is available. Some Mobile P6s have no local APIC. - * - The PMCs cannot be initialised with arbitrary values, since - * wrmsr fills the high bits by sign-extending from bit 31. - * - Most events are symmetric, but a few are not. 
- */ - -static int k8_is_multicore; /* affects northbridge events */ - -/* shared with K7 */ -static int p6_like_check_control(struct perfctr_cpu_state *state, int is_k7, int is_global) -{ - unsigned int evntsel, i, nractrs, nrctrs, pmc_mask, pmc; - - nractrs = state->control.header.nractrs; - nrctrs = nractrs + state->control.header.nrictrs; - if (nrctrs < nractrs || nrctrs > (is_k7 ? 4 : 2)) - return -EINVAL; - - pmc_mask = 0; - for(i = 0; i < nrctrs; ++i) { - pmc = state->control.pmc_map[i]; - if (pmc >= (is_k7 ? 4 : 2) || (pmc_mask & (1<control.evntsel[pmc]; - /* prevent the K8 multicore NB event clobber erratum */ - if (!is_global && k8_is_multicore && IS_K8_NB_EVENT(evntsel)) - return -EPERM; - /* protect reserved bits */ - if (evntsel & P6_EVNTSEL_RESERVED) - return -EPERM; - /* check ENable bit */ - if (is_k7) { - /* ENable bit must be set in each evntsel */ - if (!(evntsel & P6_EVNTSEL_ENABLE)) - return -EINVAL; - } else { - /* only evntsel[0] has the ENable bit */ - if (evntsel & P6_EVNTSEL_ENABLE) { - if (pmc > 0) - return -EPERM; - } else { - if (pmc == 0) - return -EINVAL; - } - } - /* the CPL field must be non-zero */ - if (!(evntsel & P6_EVNTSEL_CPL)) - return -EINVAL; - /* INT bit must be off for a-mode and on for i-mode counters */ - if (evntsel & P6_EVNTSEL_INT) { - if (i < nractrs) - return -EINVAL; - } else { - if (i >= nractrs) - return -EINVAL; - } - } - state->id = new_id(); - return 0; -} - -static int p6_check_control(struct perfctr_cpu_state *state, int is_global) -{ - return p6_like_check_control(state, 0, is_global); -} - -#ifdef CONFIG_X86_LOCAL_APIC -/* PRE: perfctr_cstatus_has_ictrs(state->cstatus) != 0 */ -/* shared with K7 and P4 */ -static void p6_like_isuspend(struct perfctr_cpu_state *state, - unsigned int msr_evntsel0) -{ - struct per_cpu_cache *cache; - unsigned int cstatus, nrctrs, i; - int cpu; - unsigned int pending = 0; - - cpu = smp_processor_id(); - set_isuspend_cpu(state, cpu); /* early to limit cpu's live range */ - 
cache = __get_cpu_cache(cpu); - perfctr_cpu_mask_interrupts(cache); - cstatus = state->user.cstatus; - nrctrs = perfctr_cstatus_nrctrs(cstatus); - for(i = perfctr_cstatus_nractrs(cstatus); i < nrctrs; ++i) { - unsigned int pmc_raw, pmc_idx, now; - pmc_raw = state->control.pmc_map[i]; - /* Note: P4_MASK_FAST_RDPMC is a no-op for P6 and K7. - We don't need to make it into a parameter. */ - pmc_idx = pmc_raw & P4_MASK_FAST_RDPMC; - cache->control.evntsel[pmc_idx] = 0; - /* On P4 this intensionally also clears the CCCR.OVF flag. */ - wrmsr(msr_evntsel0+pmc_idx, 0, 0); - /* P4 erratum N17 does not apply since we read only low 32 bits. */ - rdpmc_low(pmc_raw, now); - state->user.pmc[i].sum += now - state->user.pmc[i].start; - state->user.pmc[i].start = now; - if ((int)now >= 0) - ++pending; - } - state->pending_interrupt = pending; - /* cache->id is still == state->id */ -} - -/* PRE: perfctr_cstatus_has_ictrs(state->cstatus) != 0 */ -/* shared with K7 and P4 */ -static void p6_like_iresume(const struct perfctr_cpu_state *state, - unsigned int msr_evntsel0, - unsigned int msr_perfctr0) -{ - struct per_cpu_cache *cache; - unsigned int cstatus, nrctrs, i; - int cpu; - - cpu = smp_processor_id(); - cache = __get_cpu_cache(cpu); - perfctr_cpu_unmask_interrupts(cache); - if (cache->id == state->id) { - cache->id = 0; /* force reload of cleared EVNTSELs */ - if (is_isuspend_cpu(state, cpu)) - return; /* skip reload of PERFCTRs */ - } - cstatus = state->user.cstatus; - nrctrs = perfctr_cstatus_nrctrs(cstatus); - for(i = perfctr_cstatus_nractrs(cstatus); i < nrctrs; ++i) { - /* Note: P4_MASK_FAST_RDPMC is a no-op for P6 and K7. - We don't need to make it into a parameter. */ - unsigned int pmc = state->control.pmc_map[i] & P4_MASK_FAST_RDPMC; - /* If the control wasn't ours we must disable the evntsels - before reinitialising the counters, to prevent unexpected - counter increments and missed overflow interrupts. 
*/ - if (cache->control.evntsel[pmc]) { - cache->control.evntsel[pmc] = 0; - wrmsr(msr_evntsel0+pmc, 0, 0); - } - /* P4 erratum N15 does not apply since the CCCR is disabled. */ - wrmsr(msr_perfctr0+pmc, (unsigned int)state->user.pmc[i].start, -1); - } - /* cache->id remains != state->id */ -} - -static void p6_isuspend(struct perfctr_cpu_state *state) -{ - p6_like_isuspend(state, MSR_P6_EVNTSEL0); -} - -static void p6_iresume(const struct perfctr_cpu_state *state) -{ - p6_like_iresume(state, MSR_P6_EVNTSEL0, MSR_P6_PERFCTR0); -} -#endif /* CONFIG_X86_LOCAL_APIC */ - -/* shared with K7 and VC3 */ -static void p6_like_write_control(const struct perfctr_cpu_state *state, - unsigned int msr_evntsel0) -{ - struct per_cpu_cache *cache; - unsigned int nrctrs, i; - - cache = get_cpu_cache(); - if (cache->id == state->id) - return; - nrctrs = perfctr_cstatus_nrctrs(state->user.cstatus); - for(i = 0; i < nrctrs; ++i) { - unsigned int pmc = state->control.pmc_map[i]; - unsigned int evntsel = state->control.evntsel[pmc]; - if (evntsel != cache->control.evntsel[pmc]) { - cache->control.evntsel[pmc] = evntsel; - wrmsr(msr_evntsel0+pmc, evntsel, 0); - } - } - cache->id = state->id; -} - -/* shared with VC3, Generic*/ -static void p6_write_control(const struct perfctr_cpu_state *state) -{ - p6_like_write_control(state, MSR_P6_EVNTSEL0); -} - -static void p6_clear_counters(void) -{ - clear_msr_range(MSR_P6_EVNTSEL0, 2); - clear_msr_range(MSR_P6_PERFCTR0, 2); -} - -/* - * AMD K7 family (Athlon, Duron). - * - Somewhat similar to the Intel P6 family. - * - Four 48-bit PMCs. - * - Four 32-bit EVNTSEL MSRs with similar layout as in P6. - * - Completely different MSR assignments :-( - * - Fewer countable events defined :-( - * - The events appear to be completely symmetric. - * - The EVNTSEL MSRs are symmetric since each has its own enable bit. - * - Publicly available documentation is incomplete. - * - K7 model 1 does not have a local APIC. 
AMD Document #22007 - * Revision J hints that it may use debug interrupts instead. - * - * The K8 has the same hardware layout as the K7. It also has - * better documentation and a different set of available events. - */ - -static int k7_check_control(struct perfctr_cpu_state *state, int is_global) -{ - return p6_like_check_control(state, 1, is_global); -} - -#ifdef CONFIG_X86_LOCAL_APIC -static void k7_isuspend(struct perfctr_cpu_state *state) -{ - p6_like_isuspend(state, MSR_K7_EVNTSEL0); -} - -static void k7_iresume(const struct perfctr_cpu_state *state) -{ - p6_like_iresume(state, MSR_K7_EVNTSEL0, MSR_K7_PERFCTR0); -} -#endif /* CONFIG_X86_LOCAL_APIC */ - -static void k7_write_control(const struct perfctr_cpu_state *state) -{ - p6_like_write_control(state, MSR_K7_EVNTSEL0); -} - -static void k7_clear_counters(void) -{ - clear_msr_range(MSR_K7_EVNTSEL0, 4+4); -} - -/* - * VIA C3 family. - * - A Centaur design somewhat similar to the P6/Celeron. - * - PERFCTR0 is an alias for the TSC, and EVNTSEL0 is read-only. - * - PERFCTR1 is 32 bits wide. - * - EVNTSEL1 has no defined control fields, and there is no - * defined method for stopping the counter. - * - According to testing, the reserved fields in EVNTSEL1 have - * no function. We always fill them with zeroes. - * - Only a few event codes are defined. - * - No local APIC or interrupt-mode support. - * - pmc_map[0] must be 1, if nractrs == 1. - */ -static int vc3_check_control(struct perfctr_cpu_state *state, int is_global) -{ - if (state->control.header.nrictrs || state->control.header.nractrs > 1) - return -EINVAL; - if (state->control.header.nractrs == 1) { - if (state->control.pmc_map[0] != 1) - return -EINVAL; - if (state->control.evntsel[1] & VC3_EVNTSEL1_RESERVED) - return -EPERM; - state->id = state->control.evntsel[1]; - } else - state->id = 0; - return 0; -} - -static void vc3_clear_counters(void) -{ - /* Not documented, but seems to be default after boot. 
*/ - wrmsr(MSR_P6_EVNTSEL0+1, 0x00070079, 0); -} - -/* - * Intel Pentium 4. - * Current implementation restrictions: - * - No DS/PEBS support. - * - * Known quirks: - * - OVF_PMI+FORCE_OVF counters must have an ireset value of -1. - * This allows the regular overflow check to also handle FORCE_OVF - * counters. Not having this restriction would lead to MAJOR - * complications in the driver's "detect overflow counters" code. - * There is no loss of functionality since the ireset value doesn't - * affect the counter's PMI rate for FORCE_OVF counters. - * - In experiments with FORCE_OVF counters, and regular OVF_PMI - * counters with small ireset values between -8 and -1, it appears - * that the faulting instruction is subjected to a new PMI before - * it can complete, ad infinitum. This occurs even though the driver - * clears the CCCR (and in testing also the ESCR) and invokes a - * user-space signal handler before restoring the CCCR and resuming - * the instruction. - */ - -/* - * Table 15-4 in the IA32 Volume 3 manual contains a 18x8 entry mapping - * from counter/CCCR number (0-17) and ESCR SELECT value (0-7) to the - * actual ESCR MSR number. This mapping contains some repeated patterns, - * so we can compact it to a 4x8 table of MSR offsets: - * - * 1. CCCRs 16 and 17 are mapped just like CCCRs 13 and 14, respectively. - * Thus, we only consider the 16 CCCRs 0-15. - * 2. The CCCRs are organised in pairs, and both CCCRs in a pair use the - * same mapping. Thus, we only consider the 8 pairs 0-7. - * 3. In each pair of pairs, the second odd-numbered pair has the same domain - * as the first even-numbered pair, and the range is 1+ the range of the - * the first even-numbered pair. For example, CCCR(0) and (1) map ESCR - * SELECT(7) to 0x3A0, and CCCR(2) and (3) map it to 0x3A1. - * The only exception is that pair (7) [CCCRs 14 and 15] does not have - * ESCR SELECT(3) in its domain, like pair (6) [CCCRs 12 and 13] has. 
- * NOTE: Revisions of IA32 Volume 3 older than #245472-007 had an error - * in this table: CCCRs 12, 13, and 16 had their mappings for ESCR SELECT - * values 2 and 3 swapped. - * 4. All MSR numbers are on the form 0x3??. Instead of storing these as - * 16-bit numbers, the table only stores the 8-bit offsets from 0x300. - */ - -static const unsigned char p4_cccr_escr_map[4][8] = { - /* 0x00 and 0x01 as is, 0x02 and 0x03 are +1 */ - [0x00/4] { [7] 0xA0, - [6] 0xA2, - [2] 0xAA, - [4] 0xAC, - [0] 0xB2, - [1] 0xB4, - [3] 0xB6, - [5] 0xC8, }, - /* 0x04 and 0x05 as is, 0x06 and 0x07 are +1 */ - [0x04/4] { [0] 0xC0, - [2] 0xC2, - [1] 0xC4, }, - /* 0x08 and 0x09 as is, 0x0A and 0x0B are +1 */ - [0x08/4] { [1] 0xA4, - [0] 0xA6, - [5] 0xA8, - [2] 0xAE, - [3] 0xB0, }, - /* 0x0C, 0x0D, and 0x10 as is, - 0x0E, 0x0F, and 0x11 are +1 except [3] is not in the domain */ - [0x0C/4] { [4] 0xB8, - [5] 0xCC, - [6] 0xE0, - [0] 0xBA, - [2] 0xBC, - [3] 0xBE, - [1] 0xCA, }, -}; - -static unsigned int p4_escr_addr(unsigned int pmc, unsigned int cccr_val) -{ - unsigned int escr_select, pair, escr_offset; - - escr_select = P4_CCCR_ESCR_SELECT(cccr_val); - if (pmc > 0x11) - return 0; /* pmc range error */ - if (pmc > 0x0F) - pmc -= 3; /* 0 <= pmc <= 0x0F */ - pair = pmc / 2; /* 0 <= pair <= 7 */ - escr_offset = p4_cccr_escr_map[pair / 2][escr_select]; - if (!escr_offset || (pair == 7 && escr_select == 3)) - return 0; /* ESCR SELECT range error */ - return escr_offset + (pair & 1) + 0x300; -}; - -static int p4_IQ_ESCR_ok; /* only models <= 2 can use IQ_ESCR{0,1} */ -static int p4_is_ht; /* affects several CCCR & ESCR fields */ -static int p4_extended_cascade_ok; /* only models >= 2 can use extended cascading */ - -static int p4_check_control(struct perfctr_cpu_state *state, int is_global) -{ - unsigned int i, nractrs, nrctrs, pmc_mask; - - nractrs = state->control.header.nractrs; - nrctrs = nractrs + state->control.header.nrictrs; - if (nrctrs < nractrs || nrctrs > 18) - return -EINVAL; - - 
pmc_mask = 0; - for(i = 0; i < nrctrs; ++i) { - unsigned int pmc, cccr_val, escr_val, escr_addr; - /* check that pmc_map[] is well-defined; - pmc_map[i] is what we pass to RDPMC, the PMC itself - is extracted by masking off the FAST_RDPMC flag */ - pmc = state->control.pmc_map[i] & ~P4_FAST_RDPMC; - if (pmc >= 18 || (pmc_mask & (1<control.evntsel[pmc]; - if (cccr_val & P4_CCCR_RESERVED) - return -EPERM; - if (cccr_val & P4_CCCR_EXTENDED_CASCADE) { - if (!p4_extended_cascade_ok) - return -EPERM; - if (!(pmc == 12 || pmc >= 15)) - return -EPERM; - } - if ((cccr_val & P4_CCCR_ACTIVE_THREAD) != P4_CCCR_ACTIVE_THREAD && !p4_is_ht) - return -EINVAL; - if (!(cccr_val & (P4_CCCR_ENABLE | P4_CCCR_CASCADE | P4_CCCR_EXTENDED_CASCADE))) - return -EINVAL; - if (cccr_val & P4_CCCR_OVF_PMI_T0) { - if (i < nractrs) - return -EINVAL; - if ((cccr_val & P4_CCCR_FORCE_OVF) && - state->control.ireset[pmc] != -1) - return -EINVAL; - } else { - if (i >= nractrs) - return -EINVAL; - } - /* compute and cache ESCR address */ - escr_addr = p4_escr_addr(pmc, cccr_val); - if (!escr_addr) - return -EINVAL; /* ESCR SELECT range error */ - /* IQ_ESCR0 and IQ_ESCR1 only exist in models <= 2 */ - if ((escr_addr & ~0x001) == 0x3BA && !p4_IQ_ESCR_ok) - return -EINVAL; - /* XXX: Two counters could map to the same ESCR. Should we - check that they use the same ESCR value? 
*/ - state->p4_escr_map[i] = escr_addr - MSR_P4_ESCR0; - /* check ESCR contents */ - escr_val = state->control.p4.escr[escr_addr - MSR_P4_ESCR0]; - if (escr_val & P4_ESCR_RESERVED) - return -EPERM; - if ((escr_val & P4_ESCR_CPL_T1) && (!p4_is_ht || !is_global)) - return -EINVAL; - } - /* check ReplayTagging control (PEBS_ENABLE and PEBS_MATRIX_VERT) */ - if (state->control.p4.pebs_enable) { - if (!nrctrs) - return -EPERM; - if (state->control.p4.pebs_enable & P4_PE_RESERVED) - return -EPERM; - if (!(state->control.p4.pebs_enable & P4_PE_UOP_TAG)) - return -EINVAL; - if (!(state->control.p4.pebs_enable & P4_PE_REPLAY_TAG_BITS)) - return -EINVAL; - if (state->control.p4.pebs_matrix_vert & P4_PMV_RESERVED) - return -EPERM; - if (!(state->control.p4.pebs_matrix_vert & P4_PMV_REPLAY_TAG_BITS)) - return -EINVAL; - } else if (state->control.p4.pebs_matrix_vert) - return -EPERM; - state->id = new_id(); - return 0; -} - -#ifdef CONFIG_X86_LOCAL_APIC -static void p4_isuspend(struct perfctr_cpu_state *state) -{ - return p6_like_isuspend(state, MSR_P4_CCCR0); -} - -static void p4_iresume(const struct perfctr_cpu_state *state) -{ - return p6_like_iresume(state, MSR_P4_CCCR0, MSR_P4_PERFCTR0); -} -#endif /* CONFIG_X86_LOCAL_APIC */ - -static void p4_write_control(const struct perfctr_cpu_state *state) -{ - struct per_cpu_cache *cache; - unsigned int nrctrs, i; - - cache = get_cpu_cache(); - if (cache->id == state->id) - return; - nrctrs = perfctr_cstatus_nrctrs(state->user.cstatus); - for(i = 0; i < nrctrs; ++i) { - unsigned int escr_val, escr_off, cccr_val, pmc; - escr_off = state->p4_escr_map[i]; - escr_val = state->control.p4.escr[escr_off]; - if (escr_val != cache->control.escr[escr_off]) { - cache->control.escr[escr_off] = escr_val; - wrmsr(MSR_P4_ESCR0+escr_off, escr_val, 0); - } - pmc = state->control.pmc_map[i] & P4_MASK_FAST_RDPMC; - cccr_val = state->control.evntsel[pmc]; - if (cccr_val != cache->control.evntsel[pmc]) { - cache->control.evntsel[pmc] = cccr_val; - 
wrmsr(MSR_P4_CCCR0+pmc, cccr_val, 0); - } - } - if (state->control.p4.pebs_enable != cache->control.pebs_enable) { - cache->control.pebs_enable = state->control.p4.pebs_enable; - wrmsr(MSR_P4_PEBS_ENABLE, state->control.p4.pebs_enable, 0); - } - if (state->control.p4.pebs_matrix_vert != cache->control.pebs_matrix_vert) { - cache->control.pebs_matrix_vert = state->control.p4.pebs_matrix_vert; - wrmsr(MSR_P4_PEBS_MATRIX_VERT, state->control.p4.pebs_matrix_vert, 0); - } - cache->id = state->id; -} - -static void p4_clear_counters(void) -{ - /* MSR 0x3F0 seems to have a default value of 0xFC00, but current - docs doesn't fully define it, so leave it alone for now. */ - /* clear PEBS_ENABLE and PEBS_MATRIX_VERT; they handle both PEBS - and ReplayTagging, and should exist even if PEBS is disabled */ - clear_msr_range(0x3F1, 2); - clear_msr_range(0x3A0, 26); - if (p4_IQ_ESCR_ok) - clear_msr_range(0x3BA, 2); - clear_msr_range(0x3BC, 3); - clear_msr_range(0x3C0, 6); - clear_msr_range(0x3C8, 6); - clear_msr_range(0x3E0, 2); - clear_msr_range(MSR_P4_CCCR0, 18); - clear_msr_range(MSR_P4_PERFCTR0, 18); -} - -/* - * Generic driver for any x86 with a working TSC. - */ - -static int generic_check_control(struct perfctr_cpu_state *state, int is_global) -{ - if (state->control.header.nractrs || state->control.header.nrictrs) - return -EINVAL; - return 0; -} - -static void generic_clear_counters(void) -{ -} - -/* - * Driver methods, internal and exported. - * - * Frequently called functions (write_control, read_counters, - * isuspend and iresume) are back-patched to invoke the correct - * processor-specific methods directly, thereby saving the - * overheads of indirect function calls. - * - * Backpatchable call sites must have been "finalised" after - * initialisation. The reason for this is that unsynchronised code - * modification doesn't work in multiprocessor systems, due to - * Intel P6 errata. Consequently, all backpatchable call sites - * must be known and local to this file. 
- * - * Backpatchable calls must initially be to 'noinline' stubs. - * Otherwise the compiler may inline the stubs, which breaks - * redirect_call() and finalise_backpatching(). - */ - -static int redirect_call_disable; - -static noinline void redirect_call(void *ra, void *to) -{ - /* XXX: make this function __init later */ - if (redirect_call_disable) - printk(KERN_ERR __FILE__ ":%s: unresolved call to %p at %p\n", - __FUNCTION__, to, ra); - /* we can only redirect `call near relative' instructions */ - if (*((unsigned char*)ra - 5) != 0xE8) { - printk(KERN_WARNING __FILE__ ":%s: unable to redirect caller %p to %p\n", - __FUNCTION__, ra, to); - return; - } - *(int*)((char*)ra - 4) = (char*)to - (char*)ra; -} - -static void (*write_control)(const struct perfctr_cpu_state*); -static noinline void perfctr_cpu_write_control(const struct perfctr_cpu_state *state) -{ - redirect_call(__builtin_return_address(0), write_control); - return write_control(state); -} - -static void (*read_counters)(const struct perfctr_cpu_state*, - struct perfctr_low_ctrs*); -static noinline void perfctr_cpu_read_counters(const struct perfctr_cpu_state *state, - struct perfctr_low_ctrs *ctrs) -{ - redirect_call(__builtin_return_address(0), read_counters); - return read_counters(state, ctrs); -} - -#ifdef CONFIG_X86_LOCAL_APIC -static void (*cpu_isuspend)(struct perfctr_cpu_state*); -static noinline void perfctr_cpu_isuspend(struct perfctr_cpu_state *state) -{ - redirect_call(__builtin_return_address(0), cpu_isuspend); - return cpu_isuspend(state); -} - -static void (*cpu_iresume)(const struct perfctr_cpu_state*); -static noinline void perfctr_cpu_iresume(const struct perfctr_cpu_state *state) -{ - redirect_call(__builtin_return_address(0), cpu_iresume); - return cpu_iresume(state); -} - -/* Call perfctr_cpu_ireload() just before perfctr_cpu_resume() to - bypass internal caching and force a reload if the I-mode PMCs. 
*/ -void perfctr_cpu_ireload(struct perfctr_cpu_state *state) -{ -#ifdef CONFIG_SMP - clear_isuspend_cpu(state); -#else - get_cpu_cache()->id = 0; -#endif -} - -/* PRE: the counters have been suspended and sampled by perfctr_cpu_suspend() */ -static int lvtpc_reinit_needed; -unsigned int perfctr_cpu_identify_overflow(struct perfctr_cpu_state *state) -{ - unsigned int cstatus, nrctrs, i, pmc_mask; - - cstatus = state->user.cstatus; - nrctrs = perfctr_cstatus_nrctrs(cstatus); - state->pending_interrupt = 0; - pmc_mask = 0; - for(i = perfctr_cstatus_nractrs(cstatus); i < nrctrs; ++i) { - if ((int)state->user.pmc[i].start >= 0) { /* XXX: ">" ? */ - unsigned int pmc = state->control.pmc_map[i] & P4_MASK_FAST_RDPMC; - /* XXX: "+=" to correct for overshots */ - state->user.pmc[i].start = state->control.ireset[pmc]; - pmc_mask |= (1 << i); - /* On a P4 we should now clear the OVF flag in the - counter's CCCR. However, p4_isuspend() already - did that as a side-effect of clearing the CCCR - in order to stop the i-mode counters. 
*/ - } - } - if (lvtpc_reinit_needed) - apic_write(APIC_LVTPC, LOCAL_PERFCTR_VECTOR); - return pmc_mask; -} - -static inline int check_ireset(struct perfctr_cpu_state *state) -{ - unsigned int nrctrs, i; - - i = state->control.header.nractrs; - nrctrs = i + state->control.header.nrictrs; - for(; i < nrctrs; ++i) { - unsigned int pmc = state->control.pmc_map[i] & P4_MASK_FAST_RDPMC; - if ((int)state->control.ireset[pmc] >= 0) - return -EINVAL; - state->user.pmc[i].start = state->control.ireset[pmc]; - } - return 0; -} - -#else /* CONFIG_X86_LOCAL_APIC */ -static inline void perfctr_cpu_isuspend(struct perfctr_cpu_state *state) { } -static inline void perfctr_cpu_iresume(const struct perfctr_cpu_state *state) { } -static inline int check_ireset(struct perfctr_cpu_state *state) { return 0; } -#endif /* CONFIG_X86_LOCAL_APIC */ - -static int (*check_control)(struct perfctr_cpu_state*, int); -int perfctr_cpu_update_control(struct perfctr_cpu_state *state, int is_global) -{ - int err; - - clear_isuspend_cpu(state); - state->user.cstatus = 0; - - /* disallow i-mode counters if we cannot catch the interrupts */ - if (!(perfctr_info.cpu_features & PERFCTR_FEATURE_PCINT) - && state->control.header.nrictrs) - return -EPERM; - - err = check_control(state, is_global); - if (err < 0) - return err; - err = check_ireset(state); - if (err < 0) - return err; - state->user.cstatus = perfctr_mk_cstatus(state->control.header.tsc_on, - state->control.header.nractrs, - state->control.header.nrictrs); - return 0; -} - -/* - * get_reg_offset() maps MSR numbers to offsets into struct perfctr_cpu_control, - * suitable for accessing control data of type unsigned int. 
- */ -static int p5_reg_offset(unsigned int msr) -{ - if (msr == MSR_P5_CESR) - return offsetof(struct perfctr_cpu_control, evntsel[0]); - return -1; -} - -static int p6_reg_offset(unsigned int msr) -{ - if (msr - MSR_P6_EVNTSEL0 < 2) - return offsetof(struct perfctr_cpu_control, evntsel[msr - MSR_P6_EVNTSEL0]); - if (msr - MSR_P6_PERFCTR0 < 2) - return offsetof(struct perfctr_cpu_control, ireset[msr - MSR_P6_PERFCTR0]); - return -1; -} - -static int k7_reg_offset(unsigned int msr) -{ - if (msr - MSR_K7_EVNTSEL0 < 4) - return offsetof(struct perfctr_cpu_control, evntsel[msr - MSR_K7_EVNTSEL0]); - if (msr - MSR_K7_PERFCTR0 < 4) - return offsetof(struct perfctr_cpu_control, ireset[msr - MSR_K7_PERFCTR0]); - return -1; -} - -static int p4_reg_offset(unsigned int msr) -{ - if (msr - MSR_P4_CCCR0 < 18) - return offsetof(struct perfctr_cpu_control, evntsel[msr - MSR_P4_CCCR0]); - if (msr - MSR_P4_PERFCTR0 < 18) - return offsetof(struct perfctr_cpu_control, ireset[msr - MSR_P4_PERFCTR0]); - if (msr - MSR_P4_ESCR0 < 0x3E2 - 0x3A0) - return offsetof(struct perfctr_cpu_control, p4.escr[msr - MSR_P4_ESCR0]); - if (msr == MSR_P4_PEBS_ENABLE) - return offsetof(struct perfctr_cpu_control, p4.pebs_enable); - if (msr == MSR_P4_PEBS_MATRIX_VERT) - return offsetof(struct perfctr_cpu_control, p4.pebs_matrix_vert); - return -1; -} - -static int generic_reg_offset(unsigned int msr) -{ - return -1; -} - -static int (*get_reg_offset)(unsigned int); - -static int access_regs(struct perfctr_cpu_control *control, - void *argp, unsigned int argbytes, int do_write) -{ - struct perfctr_cpu_reg *regs; - unsigned int i, nr_regs, *where; - int offset; - - nr_regs = argbytes / sizeof(struct perfctr_cpu_reg); - if (nr_regs * sizeof(struct perfctr_cpu_reg) != argbytes) - return -EINVAL; - regs = (struct perfctr_cpu_reg*)argp; - - for(i = 0; i < nr_regs; ++i) { - offset = get_reg_offset(regs[i].nr); - if (offset < 0) - return -EINVAL; - where = (unsigned int*)((char*)control + offset); - if 
(do_write) - *where = regs[i].value; - else - regs[i].value = *where; - } - return argbytes; -} - -int perfctr_cpu_control_write(struct perfctr_cpu_control *control, unsigned int domain, - const void *srcp, unsigned int srcbytes) -{ - if (domain != PERFCTR_DOMAIN_CPU_REGS) - return -EINVAL; - return access_regs(control, (void*)srcp, srcbytes, 1); -} - -int perfctr_cpu_control_read(const struct perfctr_cpu_control *control, unsigned int domain, - void *dstp, unsigned int dstbytes) -{ - if (domain != PERFCTR_DOMAIN_CPU_REGS) - return -EINVAL; - return access_regs((struct perfctr_cpu_control*)control, dstp, dstbytes, 0); -} - -void perfctr_cpu_suspend(struct perfctr_cpu_state *state) -{ - unsigned int i, cstatus, nractrs; - struct perfctr_low_ctrs now; - - if (perfctr_cstatus_has_ictrs(state->user.cstatus)) - perfctr_cpu_isuspend(state); - perfctr_cpu_read_counters(state, &now); - cstatus = state->user.cstatus; - if (perfctr_cstatus_has_tsc(cstatus)) - state->user.tsc_sum += now.tsc - state->user.tsc_start; - nractrs = perfctr_cstatus_nractrs(cstatus); - for(i = 0; i < nractrs; ++i) - state->user.pmc[i].sum += now.pmc[i] - state->user.pmc[i].start; - /* perfctr_cpu_disable_rdpmc(); */ /* not for x86 */ -} - -void perfctr_cpu_resume(struct perfctr_cpu_state *state) -{ - if (perfctr_cstatus_has_ictrs(state->user.cstatus)) - perfctr_cpu_iresume(state); - /* perfctr_cpu_enable_rdpmc(); */ /* not for x86 or global-mode */ - perfctr_cpu_write_control(state); - //perfctr_cpu_read_counters(state, &state->start); - { - struct perfctr_low_ctrs now; - unsigned int i, cstatus, nrctrs; - perfctr_cpu_read_counters(state, &now); - cstatus = state->user.cstatus; - if (perfctr_cstatus_has_tsc(cstatus)) - state->user.tsc_start = now.tsc; - nrctrs = perfctr_cstatus_nractrs(cstatus); - for(i = 0; i < nrctrs; ++i) - state->user.pmc[i].start = now.pmc[i]; - } - /* XXX: if (SMP && start.tsc == now.tsc) ++now.tsc; */ -} - -void perfctr_cpu_sample(struct perfctr_cpu_state *state) -{ - 
unsigned int i, cstatus, nractrs; - struct perfctr_low_ctrs now; - - perfctr_cpu_read_counters(state, &now); - cstatus = state->user.cstatus; - if (perfctr_cstatus_has_tsc(cstatus)) { - state->user.tsc_sum += now.tsc - state->user.tsc_start; - state->user.tsc_start = now.tsc; - } - nractrs = perfctr_cstatus_nractrs(cstatus); - for(i = 0; i < nractrs; ++i) { - state->user.pmc[i].sum += now.pmc[i] - state->user.pmc[i].start; - state->user.pmc[i].start = now.pmc[i]; - } -} - -static void (*clear_counters)(void); -static void perfctr_cpu_clear_counters(void) -{ - return clear_counters(); -} - -/**************************************************************** - * * - * Processor detection and initialisation procedures. * - * * - ****************************************************************/ - -static inline void clear_perfctr_cpus_forbidden_mask(void) -{ -#if !defined(perfctr_cpus_forbidden_mask) - cpus_clear(perfctr_cpus_forbidden_mask); -#endif -} - -static inline void set_perfctr_cpus_forbidden_mask(cpumask_t mask) -{ -#if !defined(perfctr_cpus_forbidden_mask) - perfctr_cpus_forbidden_mask = mask; -#endif -} - -/* see comment above at redirect_call() */ -static void __init finalise_backpatching(void) -{ - struct per_cpu_cache *cache; - struct perfctr_cpu_state state; - cpumask_t old_mask; - - old_mask = perfctr_cpus_forbidden_mask; - clear_perfctr_cpus_forbidden_mask(); - - cache = get_cpu_cache(); - memset(cache, 0, sizeof *cache); - memset(&state, 0, sizeof state); - if (perfctr_info.cpu_features & PERFCTR_FEATURE_PCINT) { - state.user.cstatus = __perfctr_mk_cstatus(0, 1, 0, 0); - perfctr_cpu_sample(&state); - perfctr_cpu_resume(&state); - perfctr_cpu_suspend(&state); - } - state.user.cstatus = 0; - perfctr_cpu_sample(&state); - perfctr_cpu_resume(&state); - perfctr_cpu_suspend(&state); - - set_perfctr_cpus_forbidden_mask(old_mask); - - redirect_call_disable = 1; -} - -#ifdef CONFIG_SMP - -cpumask_t perfctr_cpus_forbidden_mask; - -static void __init 
p4_ht_mask_setup_cpu(void *forbidden) -{ - unsigned int local_apic_physical_id = cpuid_ebx(1) >> 24; - unsigned int logical_processor_id = local_apic_physical_id & 1; - if (logical_processor_id != 0) - /* We rely on cpu_set() being atomic! */ - cpu_set(smp_processor_id(), *(cpumask_t*)forbidden); -} - -static int __init p4_ht_smp_init(void) -{ - cpumask_t forbidden; - unsigned int cpu; - - cpus_clear(forbidden); - smp_call_function(p4_ht_mask_setup_cpu, &forbidden, 1, 1); - p4_ht_mask_setup_cpu(&forbidden); - if (cpus_empty(forbidden)) - return 0; - perfctr_cpus_forbidden_mask = forbidden; - printk(KERN_INFO "perfctr/x86.c: hyper-threaded P4s detected:" - " restricting access for CPUs"); - for(cpu = 0; cpu < NR_CPUS; ++cpu) - if (cpu_isset(cpu, forbidden)) - printk(" %u", cpu); - printk("\n"); - return 0; -} -#else /* SMP */ -#define p4_ht_smp_init() (0) -#endif /* SMP */ - -static int __init p4_ht_init(void) -{ - unsigned int nr_siblings; - - if (!cpu_has_ht) - return 0; - nr_siblings = (cpuid_ebx(1) >> 16) & 0xFF; - if (nr_siblings > 2) { - printk(KERN_WARNING "perfctr/x86.c: hyper-threaded P4s detected:" - " unsupported number of siblings: %u -- bailing out\n", - nr_siblings); - return -ENODEV; - } - if (nr_siblings < 2) - return 0; - p4_is_ht = 1; /* needed even in a UP kernel */ - return p4_ht_smp_init(); -} - -static int __init intel_init(void) -{ - static char p5_name[] __initdata = "Intel P5"; - static char p6_name[] __initdata = "Intel P6"; - static char p4_name[] __initdata = "Intel P4"; - unsigned int misc_enable; - - if (!cpu_has_tsc) - return -ENODEV; - switch (current_cpu_data.x86) { - case 5: - if (cpu_has_mmx) { - read_counters = rdpmc_read_counters; - - /* Avoid Pentium Erratum 74. 
*/ - if (current_cpu_data.x86_model == 4 && - (current_cpu_data.x86_mask == 4 || - (current_cpu_data.x86_mask == 3 && - ((cpuid_eax(1) >> 12) & 0x3) == 1))) - perfctr_info.cpu_features &= ~PERFCTR_FEATURE_RDPMC; - } else { - perfctr_info.cpu_features &= ~PERFCTR_FEATURE_RDPMC; - read_counters = p5_read_counters; - } - perfctr_set_tests_type(PTT_P5); - perfctr_cpu_name = p5_name; - write_control = p5_write_control; - check_control = p5_check_control; - clear_counters = p5_clear_counters; - get_reg_offset = p5_reg_offset; - return 0; - case 6: - if (current_cpu_data.x86_model == 9 || - current_cpu_data.x86_model == 13) { /* Pentium M */ - /* Pentium M added the MISC_ENABLE MSR from P4. */ - rdmsr_low(MSR_IA32_MISC_ENABLE, misc_enable); - if (!(misc_enable & MSR_IA32_MISC_ENABLE_PERF_AVAIL)) - break; - /* Erratum Y3 probably does not apply since we - read only the low 32 bits. */ - } else if (current_cpu_data.x86_model < 3) { /* Pentium Pro */ - /* Avoid Pentium Pro Erratum 26. */ - if (current_cpu_data.x86_mask < 9) - perfctr_info.cpu_features &= ~PERFCTR_FEATURE_RDPMC; - } - perfctr_set_tests_type(PTT_P6); - perfctr_cpu_name = p6_name; - read_counters = rdpmc_read_counters; - write_control = p6_write_control; - check_control = p6_check_control; - clear_counters = p6_clear_counters; - get_reg_offset = p6_reg_offset; -#ifdef CONFIG_X86_LOCAL_APIC - if (cpu_has_apic) { - perfctr_info.cpu_features |= PERFCTR_FEATURE_PCINT; - cpu_isuspend = p6_isuspend; - cpu_iresume = p6_iresume; - /* P-M apparently inherited P4's LVTPC auto-masking :-( */ - if (current_cpu_data.x86_model == 9 || - current_cpu_data.x86_model == 13) - lvtpc_reinit_needed = 1; - } -#endif - return 0; - case 15: /* Pentium 4 */ - rdmsr_low(MSR_IA32_MISC_ENABLE, misc_enable); - if (!(misc_enable & MSR_IA32_MISC_ENABLE_PERF_AVAIL)) - break; - if (p4_ht_init() != 0) - break; - if (current_cpu_data.x86_model <= 2) - p4_IQ_ESCR_ok = 1; - if (current_cpu_data.x86_model >= 2) - p4_extended_cascade_ok = 1; - 
perfctr_set_tests_type(PTT_P4); - perfctr_cpu_name = p4_name; - read_counters = rdpmc_read_counters; - write_control = p4_write_control; - check_control = p4_check_control; - clear_counters = p4_clear_counters; - get_reg_offset = p4_reg_offset; -#ifdef CONFIG_X86_LOCAL_APIC - if (cpu_has_apic) { - perfctr_info.cpu_features |= PERFCTR_FEATURE_PCINT; - cpu_isuspend = p4_isuspend; - cpu_iresume = p4_iresume; - lvtpc_reinit_needed = 1; - } -#endif - return 0; - } - return -ENODEV; -} - -/* - * Multicore K8s have issues with northbridge events: - * 1. The NB is shared between the cores, so two different cores - * in the same node cannot count NB events simultaneously. - * This can be handled by using perfctr_cpus_forbidden_mask to - * restrict NB-using threads to core0 of all nodes. - * 2. The initial multicore chips (Revision E) have an erratum - * which causes the NB counters to be reset when either core - * reprograms its evntsels (even for non-NB events). - * This is only an issue because of scheduling of threads, so - * we restrict NB events to the non thread-centric API. - * - * For now we only implement the workaround for issue 2, as this - * also handles issue 1. - * - * TODO: Detect post Revision E chips and implement a weaker - * workaround for them. 
- */ -#ifdef CONFIG_SMP -static void __init k8_multicore_init(void) -{ - cpumask_t non0cores; - int i; - - cpus_clear(non0cores); - for(i = 0; i < NR_CPUS; ++i) { - cpumask_t cores = cpu_core_map[i]; - int core0 = first_cpu(cores); - if (core0 >= NR_CPUS) - continue; - cpu_clear(core0, cores); - cpus_or(non0cores, non0cores, cores); - } - if (cpus_empty(non0cores)) - return; - k8_is_multicore = 1; - printk(KERN_INFO "perfctr/x86.c: multi-core K8s detected:" - " restricting access to northbridge events\n"); -} -#else -#define k8_multicore_init() do{}while(0) -#endif - -static int __init amd_init(void) -{ - static char amd_name[] __initdata = "AMD K7/K8"; - - if (!cpu_has_tsc) - return -ENODEV; - switch (current_cpu_data.x86) { - case 6: /* K7 */ - break; - case 15: /* K8. Like a K7 with a different event set. */ - k8_multicore_init(); - break; - default: - return -ENODEV; - } - perfctr_set_tests_type(PTT_AMD); - perfctr_cpu_name = amd_name; - read_counters = rdpmc_read_counters; - write_control = k7_write_control; - check_control = k7_check_control; - clear_counters = k7_clear_counters; - get_reg_offset = k7_reg_offset; -#ifdef CONFIG_X86_LOCAL_APIC - if (cpu_has_apic) { - perfctr_info.cpu_features |= PERFCTR_FEATURE_PCINT; - cpu_isuspend = k7_isuspend; - cpu_iresume = k7_iresume; - } -#endif - return 0; -} - -static int __init cyrix_init(void) -{ - static char mii_name[] __initdata = "Cyrix 6x86MX/MII/III"; - if (!cpu_has_tsc) - return -ENODEV; - switch (current_cpu_data.x86) { - case 6: /* 6x86MX, MII, or III */ - perfctr_set_tests_type(PTT_P5); - perfctr_cpu_name = mii_name; - read_counters = rdpmc_read_counters; - write_control = p5_write_control; - check_control = mii_check_control; - clear_counters = p5_clear_counters; - get_reg_offset = p5_reg_offset; - return 0; - } - return -ENODEV; -} - -static int __init centaur_init(void) -{ -#if !defined(CONFIG_X86_TSC) - static char winchip_name[] __initdata = "WinChip C6/2/3"; -#endif - static char vc3_name[] 
__initdata = "VIA C3"; - switch (current_cpu_data.x86) { -#if !defined(CONFIG_X86_TSC) - case 5: - switch (current_cpu_data.x86_model) { - case 4: /* WinChip C6 */ - case 8: /* WinChip 2, 2A, or 2B */ - case 9: /* WinChip 3, a 2A with larger cache and lower voltage */ - break; - default: - return -ENODEV; - } - perfctr_set_tests_type(PTT_WINCHIP); - perfctr_cpu_name = winchip_name; - /* - * TSC must be inaccessible for perfctrs to work. - */ - if (!(read_cr4() & X86_CR4_TSD) || cpu_has_tsc) - return -ENODEV; - perfctr_info.cpu_features &= ~PERFCTR_FEATURE_RDTSC; - read_counters = rdpmc_read_counters; - write_control = c6_write_control; - check_control = c6_check_control; - clear_counters = p5_clear_counters; - get_reg_offset = p5_reg_offset; - return 0; -#endif - case 6: /* VIA C3 */ - if (!cpu_has_tsc) - return -ENODEV; - switch (current_cpu_data.x86_model) { - case 6: /* Cyrix III */ - case 7: /* Samuel 2, Ezra (steppings >= 8) */ - case 8: /* Ezra-T */ - case 9: /* Antaur/Nehemiah */ - break; - default: - return -ENODEV; - } - perfctr_set_tests_type(PTT_VC3); - perfctr_cpu_name = vc3_name; - read_counters = rdpmc_read_counters; - write_control = p6_write_control; - check_control = vc3_check_control; - clear_counters = vc3_clear_counters; - get_reg_offset = p6_reg_offset; - return 0; - } - return -ENODEV; -} - -static int __init generic_init(void) -{ - static char generic_name[] __initdata = "Generic x86 with TSC"; - if (!cpu_has_tsc) - return -ENODEV; - perfctr_info.cpu_features &= ~PERFCTR_FEATURE_RDPMC; - perfctr_set_tests_type(PTT_GENERIC); - perfctr_cpu_name = generic_name; - check_control = generic_check_control; - write_control = p6_write_control; - read_counters = rdpmc_read_counters; - clear_counters = generic_clear_counters; - get_reg_offset = generic_reg_offset; - return 0; -} - -static void perfctr_cpu_invalidate_cache(void) -{ - /* - * per_cpu_cache[] is initialised to contain "impossible" - * evntsel values guaranteed to differ from anything 
accepted - * by perfctr_cpu_update_control(). - * All-bits-one works for all currently supported processors. - * The memset also sets the ids to -1, which is intentional. - */ - memset(get_cpu_cache(), ~0, sizeof(struct per_cpu_cache)); -} - -static void perfctr_cpu_init_one(void *ignore) -{ - /* PREEMPT note: when called via smp_call_function(), - this is in IRQ context with preemption disabled. */ - perfctr_cpu_clear_counters(); - perfctr_cpu_invalidate_cache(); - if (cpu_has_apic) - apic_write(APIC_LVTPC, LOCAL_PERFCTR_VECTOR); - if (perfctr_info.cpu_features & PERFCTR_FEATURE_RDPMC) - set_in_cr4_local(X86_CR4_PCE); -} - -static void perfctr_cpu_exit_one(void *ignore) -{ - /* PREEMPT note: when called via smp_call_function(), - this is in IRQ context with preemption disabled. */ - perfctr_cpu_clear_counters(); - perfctr_cpu_invalidate_cache(); - if (cpu_has_apic) - apic_write(APIC_LVTPC, APIC_DM_NMI | APIC_LVT_MASKED); - if (perfctr_info.cpu_features & PERFCTR_FEATURE_RDPMC) - clear_in_cr4_local(X86_CR4_PCE); -} - -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PM) - -static void perfctr_pm_suspend(void) -{ - /* XXX: clear control registers */ - printk("perfctr/x86: PM suspend\n"); -} - -static void perfctr_pm_resume(void) -{ - /* XXX: reload control registers */ - printk("perfctr/x86: PM resume\n"); -} - -#include - -static int perfctr_device_suspend(struct sys_device *dev, u32 state) -{ - perfctr_pm_suspend(); - return 0; -} - -static int perfctr_device_resume(struct sys_device *dev) -{ - perfctr_pm_resume(); - return 0; -} - -static struct sysdev_class perfctr_sysclass = { - set_kset_name("perfctr"), - .resume = perfctr_device_resume, - .suspend = perfctr_device_suspend, -}; - -static struct sys_device device_perfctr = { - .id = 0, - .cls = &perfctr_sysclass, -}; - -static void x86_pm_init(void) -{ - if (sysdev_class_register(&perfctr_sysclass) == 0) - sysdev_register(&device_perfctr); -} - -static void x86_pm_exit(void) -{ - 
sysdev_unregister(&device_perfctr); - sysdev_class_unregister(&perfctr_sysclass); -} - -#else - -static inline void x86_pm_init(void) { } -static inline void x86_pm_exit(void) { } - -#endif /* CONFIG_X86_LOCAL_APIC && CONFIG_PM */ - -#if !defined(CONFIG_X86_LOCAL_APIC) -static inline int reserve_lapic_nmi(void) { return 0; } -static inline void release_lapic_nmi(void) { } -#endif - -static void do_init_tests(void) -{ -#ifdef CONFIG_PERFCTR_INIT_TESTS - if (reserve_lapic_nmi() >= 0) { - perfctr_x86_init_tests(); - release_lapic_nmi(); - } -#endif -} - -static int init_done; - -int __init perfctr_cpu_init(void) -{ - int err = -ENODEV; - - preempt_disable(); - - /* RDPMC and RDTSC are on by default. They will be disabled - by the init procedures if necessary. */ - perfctr_info.cpu_features = PERFCTR_FEATURE_RDPMC | PERFCTR_FEATURE_RDTSC; - - if (cpu_has_msr) { - switch (current_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - err = intel_init(); - break; - case X86_VENDOR_AMD: - err = amd_init(); - break; - case X86_VENDOR_CYRIX: - err = cyrix_init(); - break; - case X86_VENDOR_CENTAUR: - err = centaur_init(); - } - } - if (err) { - err = generic_init(); /* last resort */ - if (err) - goto out; - } - do_init_tests(); - finalise_backpatching(); - - perfctr_info.cpu_khz = cpu_khz; - perfctr_info.tsc_to_cpu_mult = 1; - init_done = 1; - - out: - preempt_enable(); - return err; -} - -void __exit perfctr_cpu_exit(void) -{ -} - -/**************************************************************** - * * - * Hardware reservation. 
* - * * - ****************************************************************/ - -static DECLARE_MUTEX(mutex); -static const char *current_service = 0; - -const char *perfctr_cpu_reserve(const char *service) -{ - const char *ret; - - if (!init_done) - return "unsupported hardware"; - down(&mutex); - ret = current_service; - if (ret) - goto out_up; - ret = "unknown driver (oprofile?)"; - if (reserve_lapic_nmi() < 0) - goto out_up; - current_service = service; - if (perfctr_info.cpu_features & PERFCTR_FEATURE_RDPMC) - mmu_cr4_features |= X86_CR4_PCE; - on_each_cpu(perfctr_cpu_init_one, NULL, 1, 1); - perfctr_cpu_set_ihandler(NULL); - x86_pm_init(); - ret = NULL; - out_up: - up(&mutex); - return ret; -} - -void perfctr_cpu_release(const char *service) -{ - down(&mutex); - if (service != current_service) { - printk(KERN_ERR "%s: attempt by %s to release while reserved by %s\n", - __FUNCTION__, service, current_service); - goto out_up; - } - /* power down the counters */ - if (perfctr_info.cpu_features & PERFCTR_FEATURE_RDPMC) - mmu_cr4_features &= ~X86_CR4_PCE; - on_each_cpu(perfctr_cpu_exit_one, NULL, 1, 1); - perfctr_cpu_set_ihandler(NULL); - x86_pm_exit(); - current_service = 0; - release_lapic_nmi(); - out_up: - up(&mutex); -} diff -Naur linux-2.6.12-rc5-mm1/drivers/perfctr/x86_tests.c linux-2.6.12-rc5-mm1-plug/drivers/perfctr/x86_tests.c --- linux-2.6.12-rc5-mm1/drivers/perfctr/x86_tests.c 2005-05-25 16:23:37.446928584 -0700 +++ linux-2.6.12-rc5-mm1-plug/drivers/perfctr/x86_tests.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,308 +0,0 @@ -/* $Id: x86_tests.c,v 1.34 2004/08/08 19:54:40 mikpe Exp $ - * Performance-monitoring counters driver. - * Optional x86/x86_64-specific init-time tests. 
- * - * Copyright (C) 1999-2004 Mikael Pettersson - */ -#include -#include -#include -#include -#include -#include -#undef MSR_P6_PERFCTR0 -#undef MSR_P4_IQ_CCCR0 -#undef MSR_P4_CRU_ESCR0 -#include -#include -#include /* cpu_khz */ -#include "x86_tests.h" - -#define MSR_P5_CESR 0x11 -#define MSR_P5_CTR0 0x12 -#define P5_CESR_VAL (0x16 | (3<<6)) -#define MSR_P6_PERFCTR0 0xC1 -#define MSR_P6_EVNTSEL0 0x186 -#define P6_EVNTSEL0_VAL (0xC0 | (3<<16) | (1<<22)) -#define MSR_K7_EVNTSEL0 0xC0010000 -#define MSR_K7_PERFCTR0 0xC0010004 -#define K7_EVNTSEL0_VAL (0xC0 | (3<<16) | (1<<22)) -#define VC3_EVNTSEL1_VAL 0xC0 -#define MSR_P4_IQ_COUNTER0 0x30C -#define MSR_P4_IQ_CCCR0 0x36C -#define MSR_P4_CRU_ESCR0 0x3B8 -#define P4_CRU_ESCR0_VAL ((2<<25) | (1<<9) | (0x3<<2)) -#define P4_IQ_CCCR0_VAL ((0x3<<16) | (4<<13) | (1<<12)) - -#define NITER 64 -#define X2(S) S";"S -#define X8(S) X2(X2(X2(S))) - -#ifdef __x86_64__ -#define CR4MOV "movq" -#else -#define CR4MOV "movl" -#endif - -#ifndef CONFIG_X86_LOCAL_APIC -#undef apic_write -#define apic_write(reg,vector) do{}while(0) -#endif - -#if !defined(__x86_64__) -/* Avoid speculative execution by the CPU */ -extern inline void sync_core(void) -{ - int tmp; - asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory"); -} -#endif - -static void __init do_rdpmc(unsigned pmc, unsigned unused2) -{ - unsigned i; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("rdpmc") : : "c"(pmc) : "eax", "edx"); -} - -static void __init do_rdmsr(unsigned msr, unsigned unused2) -{ - unsigned i; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("rdmsr") : : "c"(msr) : "eax", "edx"); -} - -static void __init do_wrmsr(unsigned msr, unsigned data) -{ - unsigned i; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("wrmsr") : : "c"(msr), "a"(data), "d"(0)); -} - -static void __init do_rdcr4(unsigned unused1, unsigned unused2) -{ - unsigned i; - unsigned long dummy; - for(i = 0; i < NITER/8; ++i) - __asm__ 
__volatile__(X8(CR4MOV" %%cr4,%0") : "=r"(dummy)); -} - -static void __init do_wrcr4(unsigned cr4, unsigned unused2) -{ - unsigned i; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8(CR4MOV" %0,%%cr4") : : "r"((long)cr4)); -} - -static void __init do_rdtsc(unsigned unused1, unsigned unused2) -{ - unsigned i; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__(X8("rdtsc") : : : "eax", "edx"); -} - -static void __init do_wrlvtpc(unsigned val, unsigned unused2) -{ - unsigned i; - for(i = 0; i < NITER/8; ++i) { - apic_write(APIC_LVTPC, val); - apic_write(APIC_LVTPC, val); - apic_write(APIC_LVTPC, val); - apic_write(APIC_LVTPC, val); - apic_write(APIC_LVTPC, val); - apic_write(APIC_LVTPC, val); - apic_write(APIC_LVTPC, val); - apic_write(APIC_LVTPC, val); - } -} - -static void __init do_sync_core(unsigned unused1, unsigned unused2) -{ - unsigned i; - for(i = 0; i < NITER/8; ++i) { - sync_core(); - sync_core(); - sync_core(); - sync_core(); - sync_core(); - sync_core(); - sync_core(); - sync_core(); - } -} - -static void __init do_empty_loop(unsigned unused1, unsigned unused2) -{ - unsigned i; - for(i = 0; i < NITER/8; ++i) - __asm__ __volatile__("" : : "c"(0)); -} - -static unsigned __init run(void (*doit)(unsigned, unsigned), - unsigned arg1, unsigned arg2) -{ - unsigned start, dummy, stop; - sync_core(); - rdtsc(start, dummy); - (*doit)(arg1, arg2); /* should take < 2^32 cycles to complete */ - sync_core(); - rdtsc(stop, dummy); - return stop - start; -} - -static void __init init_tests_message(void) -{ - printk(KERN_INFO "Please email the following PERFCTR INIT lines " - "to mikpe@csd.uu.se\n" - KERN_INFO "To remove this message, rebuild the driver " - "with CONFIG_PERFCTR_INIT_TESTS=n\n"); - printk(KERN_INFO "PERFCTR INIT: vendor %u, family %u, model %u, stepping %u, clock %u kHz\n", - current_cpu_data.x86_vendor, - current_cpu_data.x86, - current_cpu_data.x86_model, - current_cpu_data.x86_mask, - (unsigned int)cpu_khz); -} - -static void __init 
-measure_overheads(unsigned msr_evntsel0, unsigned evntsel0, unsigned msr_perfctr0, - unsigned msr_cccr, unsigned cccr_val) -{ - int i; - unsigned int loop, ticks[13]; - const char *name[13]; - - if (msr_evntsel0) - wrmsr(msr_evntsel0, 0, 0); - if (msr_cccr) - wrmsr(msr_cccr, 0, 0); - - name[0] = "rdtsc"; - ticks[0] = run(do_rdtsc, 0, 0); - name[1] = "rdpmc"; - ticks[1] = (perfctr_info.cpu_features & PERFCTR_FEATURE_RDPMC) - ? run(do_rdpmc,1,0) : 0; - name[2] = "rdmsr (counter)"; - ticks[2] = msr_perfctr0 ? run(do_rdmsr, msr_perfctr0, 0) : 0; - name[3] = msr_cccr ? "rdmsr (escr)" : "rdmsr (evntsel)"; - ticks[3] = msr_evntsel0 ? run(do_rdmsr, msr_evntsel0, 0) : 0; - name[4] = "wrmsr (counter)"; - ticks[4] = msr_perfctr0 ? run(do_wrmsr, msr_perfctr0, 0) : 0; - name[5] = msr_cccr ? "wrmsr (escr)" : "wrmsr (evntsel)"; - ticks[5] = msr_evntsel0 ? run(do_wrmsr, msr_evntsel0, evntsel0) : 0; - name[6] = "read cr4"; - ticks[6] = run(do_rdcr4, 0, 0); - name[7] = "write cr4"; - ticks[7] = run(do_wrcr4, read_cr4(), 0); - name[8] = "rdpmc (fast)"; - ticks[8] = msr_cccr ? run(do_rdpmc, 0x80000001, 0) : 0; - name[9] = "rdmsr (cccr)"; - ticks[9] = msr_cccr ? run(do_rdmsr, msr_cccr, 0) : 0; - name[10] = "wrmsr (cccr)"; - ticks[10] = msr_cccr ? run(do_wrmsr, msr_cccr, cccr_val) : 0; - name[11] = "write LVTPC"; - ticks[11] = (perfctr_info.cpu_features & PERFCTR_FEATURE_PCINT) - ? 
run(do_wrlvtpc, APIC_DM_NMI|APIC_LVT_MASKED, 0) : 0; - name[12] = "sync_core"; - ticks[12] = run(do_sync_core, 0, 0); - - loop = run(do_empty_loop, 0, 0); - - if (msr_evntsel0) - wrmsr(msr_evntsel0, 0, 0); - if (msr_cccr) - wrmsr(msr_cccr, 0, 0); - - init_tests_message(); - printk(KERN_INFO "PERFCTR INIT: NITER == %u\n", NITER); - printk(KERN_INFO "PERFCTR INIT: loop overhead is %u cycles\n", loop); - for(i = 0; i < ARRAY_SIZE(ticks); ++i) { - unsigned int x; - if (!ticks[i]) - continue; - x = ((ticks[i] - loop) * 10) / NITER; - printk(KERN_INFO "PERFCTR INIT: %s cost is %u.%u cycles (%u total)\n", - name[i], x/10, x%10, ticks[i]); - } -} - -#ifndef __x86_64__ -static inline void perfctr_p5_init_tests(void) -{ - measure_overheads(MSR_P5_CESR, P5_CESR_VAL, MSR_P5_CTR0, 0, 0); -} - -static inline void perfctr_p6_init_tests(void) -{ - measure_overheads(MSR_P6_EVNTSEL0, P6_EVNTSEL0_VAL, MSR_P6_PERFCTR0, 0, 0); -} - -#if !defined(CONFIG_X86_TSC) -static inline void perfctr_c6_init_tests(void) -{ - unsigned int cesr, dummy; - - rdmsr(MSR_P5_CESR, cesr, dummy); - init_tests_message(); - printk(KERN_INFO "PERFCTR INIT: boot CESR == %#08x\n", cesr); -} -#endif - -static inline void perfctr_vc3_init_tests(void) -{ - measure_overheads(MSR_P6_EVNTSEL0+1, VC3_EVNTSEL1_VAL, MSR_P6_PERFCTR0+1, 0, 0); -} -#endif /* !__x86_64__ */ - -static inline void perfctr_p4_init_tests(void) -{ - measure_overheads(MSR_P4_CRU_ESCR0, P4_CRU_ESCR0_VAL, MSR_P4_IQ_COUNTER0, - MSR_P4_IQ_CCCR0, P4_IQ_CCCR0_VAL); -} - -static inline void perfctr_k7_init_tests(void) -{ - measure_overheads(MSR_K7_EVNTSEL0, K7_EVNTSEL0_VAL, MSR_K7_PERFCTR0, 0, 0); -} - -static inline void perfctr_generic_init_tests(void) -{ - measure_overheads(0, 0, 0, 0, 0); -} - -enum perfctr_x86_tests_type perfctr_x86_tests_type __initdata = PTT_UNKNOWN; - -void __init perfctr_x86_init_tests(void) -{ - switch (perfctr_x86_tests_type) { -#ifndef __x86_64__ - case PTT_P5: /* Intel P5, P5MMX; Cyrix 6x86MX, MII, III */ - 
perfctr_p5_init_tests(); - break; - case PTT_P6: /* Intel PPro, PII, PIII, PENTM */ - perfctr_p6_init_tests(); - break; -#if !defined(CONFIG_X86_TSC) - case PTT_WINCHIP: /* WinChip C6, 2, 3 */ - perfctr_c6_init_tests(); - break; -#endif - case PTT_VC3: /* VIA C3 */ - perfctr_vc3_init_tests(); - break; -#endif /* !__x86_64__ */ - case PTT_P4: /* Intel P4 */ - perfctr_p4_init_tests(); - break; - case PTT_AMD: /* AMD K7, K8 */ - perfctr_k7_init_tests(); - break; - case PTT_GENERIC: - perfctr_generic_init_tests(); - break; - default: - printk(KERN_INFO "%s: unknown CPU type %u\n", - __FUNCTION__, perfctr_x86_tests_type); - break; - } -} diff -Naur linux-2.6.12-rc5-mm1/drivers/perfctr/x86_tests.h linux-2.6.12-rc5-mm1-plug/drivers/perfctr/x86_tests.h --- linux-2.6.12-rc5-mm1/drivers/perfctr/x86_tests.h 2005-05-25 16:23:37.447928432 -0700 +++ linux-2.6.12-rc5-mm1-plug/drivers/perfctr/x86_tests.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,30 +0,0 @@ -/* $Id: x86_tests.h,v 1.10 2004/05/22 20:48:57 mikpe Exp $ - * Performance-monitoring counters driver. - * Optional x86/x86_64-specific init-time tests. - * - * Copyright (C) 1999-2004 Mikael Pettersson - */ - -/* 'enum perfctr_x86_tests_type' classifies CPUs according - to relevance for perfctr_x86_init_tests(). 
*/ -enum perfctr_x86_tests_type { - PTT_UNKNOWN, - PTT_GENERIC, - PTT_P5, - PTT_P6, - PTT_P4, - PTT_AMD, - PTT_WINCHIP, - PTT_VC3, -}; - -extern enum perfctr_x86_tests_type perfctr_x86_tests_type; - -static inline void perfctr_set_tests_type(enum perfctr_x86_tests_type t) -{ -#ifdef CONFIG_PERFCTR_INIT_TESTS - perfctr_x86_tests_type = t; -#endif -} - -extern void perfctr_x86_init_tests(void); diff -Naur linux-2.6.12-rc5-mm1/fs/proc/array.c linux-2.6.12-rc5-mm1-plug/fs/proc/array.c --- linux-2.6.12-rc5-mm1/fs/proc/array.c 2005-05-25 16:20:56.000000000 -0700 +++ linux-2.6.12-rc5-mm1-plug/fs/proc/array.c 2005-05-25 17:05:49.542991472 -0700 @@ -163,7 +163,6 @@ read_lock(&tasklist_lock); buffer += sprintf(buffer, "State:\t%s\n" - "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -171,7 +170,6 @@ "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1020000000/1024), p->tgid, p->pid, pid_alive(p) ? p->group_leader->real_parent->tgid : 0, pid_alive(p) && p->ptrace ? 
p->parent->pid : 0, diff -Naur linux-2.6.12-rc5-mm1/fs/proc/base.c linux-2.6.12-rc5-mm1-plug/fs/proc/base.c --- linux-2.6.12-rc5-mm1/fs/proc/base.c 2005-05-25 16:23:43.223050480 -0700 +++ linux-2.6.12-rc5-mm1-plug/fs/proc/base.c 2005-05-25 17:05:49.545991016 -0700 @@ -68,6 +68,7 @@ #include #include #include +#include #include #include "internal.h" @@ -146,6 +147,10 @@ #ifdef CONFIG_CPUSETS PROC_TID_CPUSET, #endif +#ifdef CONFIG_CPUSCHED_SPA + PROC_TID_CPU_RATE_CAP, + PROC_TID_CPU_RATE_HARD_CAP, +#endif #ifdef CONFIG_SECURITY PROC_TID_ATTR, PROC_TID_ATTR_CURRENT, @@ -243,6 +248,10 @@ #ifdef CONFIG_AUDITSYSCALL E(PROC_TID_LOGINUID, "loginuid", S_IFREG|S_IWUSR|S_IRUGO), #endif +#ifdef CONFIG_CPUSCHED_SPA + E(PROC_TID_CPU_RATE_CAP, "cpu_rate_cap", S_IFREG|S_IRUGO|S_IWUSR), + E(PROC_TID_CPU_RATE_HARD_CAP, "cpu_rate_hard_cap", S_IFREG|S_IRUGO|S_IWUSR), +#endif {0,0,NULL,0} }; @@ -947,6 +956,100 @@ }; #endif /* CONFIG_SECCOMP */ +#ifdef CONFIG_CPUSCHED_SPA +static ssize_t cpu_rate_cap_read(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = PROC_I(file->f_dentry->d_inode)->task; + char buffer[64]; + size_t len; + unsigned int cppt = get_cpu_rate_cap(task); + + if (*ppos) + return 0; + *ppos = len = sprintf(buffer, "%u\n", cppt); + if (copy_to_user(buf, buffer, len)) + return -EFAULT; + + return len; +} + +static ssize_t cpu_rate_cap_write(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = PROC_I(file->f_dentry->d_inode)->task; + char buffer[128] = ""; + char *endptr = NULL; + unsigned long hcppt; + int res; + + + if ((count > 63) || *ppos) + return -EFBIG; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + hcppt = simple_strtoul(buffer, &endptr, 0); + if ((endptr == buffer) || (hcppt == ULONG_MAX)) + return -EINVAL; + + if ((res = set_cpu_rate_cap(task, hcppt)) != 0) + return res; + + return count; +} + +struct file_operations proc_cpu_rate_cap_operations = { + read: 
cpu_rate_cap_read, + write: cpu_rate_cap_write, +}; + +ssize_t cpu_rate_hard_cap_read(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = PROC_I(file->f_dentry->d_inode)->task; + char buffer[64]; + size_t len; + unsigned int hcppt = get_cpu_rate_hard_cap(task); + + if (*ppos) + return 0; + *ppos = len = sprintf(buffer, "%u\n", hcppt); + if (copy_to_user(buf, buffer, len)) + return -EFAULT; + + return len; +} + +ssize_t cpu_rate_hard_cap_write(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = PROC_I(file->f_dentry->d_inode)->task; + char buffer[128] = ""; + char *endptr = NULL; + unsigned long long hcppt; + int res; + + + if ((count > 63) || *ppos) + return -EFBIG; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + hcppt = simple_strtoul(buffer, &endptr, 0); + if ((endptr == buffer) || (hcppt == ULONG_MAX)) + return -EINVAL; + + if ((res = set_cpu_rate_hard_cap(task, hcppt)) != 0) + return res; + + return count; +} + +struct file_operations proc_cpu_rate_hard_cap_operations = { + read: cpu_rate_hard_cap_read, + write: cpu_rate_hard_cap_write, +}; +#endif + static int proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; @@ -1657,6 +1760,14 @@ inode->i_fop = &proc_loginuid_operations; break; #endif +#ifdef CONFIG_CPUSCHED_SPA + case PROC_TID_CPU_RATE_CAP: + inode->i_fop = &proc_cpu_rate_cap_operations; + break; + case PROC_TID_CPU_RATE_HARD_CAP: + inode->i_fop = &proc_cpu_rate_hard_cap_operations; + break; +#endif default: printk("procfs: impossible type (%d)",p->type); iput(inode); diff -Naur linux-2.6.12-rc5-mm1/fs/proc/proc_misc.c linux-2.6.12-rc5-mm1-plug/fs/proc/proc_misc.c --- linux-2.6.12-rc5-mm1/fs/proc/proc_misc.c 2005-05-25 16:23:43.225050176 -0700 +++ linux-2.6.12-rc5-mm1-plug/fs/proc/proc_misc.c 2005-05-25 17:06:16.298923952 -0700 @@ -44,6 +44,7 @@ #include #include #include +#include #include #include 
#include @@ -243,6 +244,17 @@ return proc_calc_metrics(page, start, off, count, eof, len); } +static int scheduler_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + + strcpy(page, sched_drvp->name); + strcat(page, "\n"); + len = strlen(page); + return proc_calc_metrics(page, start, off, count, eof, len); +} + extern struct seq_operations cpuinfo_op; static int cpuinfo_open(struct inode *inode, struct file *file) { @@ -646,6 +658,7 @@ {"cmdline", cmdline_read_proc}, {"locks", locks_read_proc}, {"execdomains", execdomains_read_proc}, + {"scheduler", scheduler_read_proc}, {NULL,} }; for (p = simple_ones; p->name; p++) diff -Naur linux-2.6.12-rc5-mm1/include/asm-arm/system.h linux-2.6.12-rc5-mm1-plug/include/asm-arm/system.h --- linux-2.6.12-rc5-mm1/include/asm-arm/system.h 2005-05-25 16:23:44.807809560 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-arm/system.h 2005-05-25 17:02:50.603194456 -0700 @@ -144,12 +144,34 @@ #define set_wmb(var, value) do { var = value; wmb(); } while (0) #define nop() __asm__ __volatile__("mov\tr0,r0\t@ nop\n\t"); +#ifdef CONFIG_SMP /* - * switch_mm() may do a full cache flush over the context switch, - * so enable interrupts over the context switch to avoid high - * latency. + * Define our own context switch locking. This allows us to enable + * interrupts over the context switch, otherwise we end up with high + * interrupt latency. The real problem area is switch_mm() which may + * do a full cache flush. */ -#define __ARCH_WANT_INTERRUPTS_ON_CTXSW +#define prepare_arch_switch(rq,next) \ +do { \ + spin_lock(&(next)->switch_lock); \ + spin_unlock_irq(&(rq)->lock); \ +} while (0) + +#define finish_arch_switch(rq,prev) \ + spin_unlock(&(prev)->switch_lock) + +#define task_running(rq,p) \ + ((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock)) +#else +/* + * Our UP-case is more simple, but we assume knowledge of how + * spin_unlock_irq() and friends are implemented. 
This avoids + * us needlessly decrementing and incrementing the preempt count. + */ +#define prepare_arch_switch(rq,next) local_irq_enable() +#define finish_arch_switch(rq,prev) spin_unlock(&(rq)->lock) +#define task_running(rq,p) ((rq)->curr == (p)) +#endif /* * switch_to(prev, next) should switch from task `prev' to `next' diff -Naur linux-2.6.12-rc5-mm1/include/asm-i386/mach-default/irq_vectors.h linux-2.6.12-rc5-mm1-plug/include/asm-i386/mach-default/irq_vectors.h --- linux-2.6.12-rc5-mm1/include/asm-i386/mach-default/irq_vectors.h 2005-05-25 16:23:44.879798616 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-i386/mach-default/irq_vectors.h 2005-05-25 17:04:43.729996568 -0700 @@ -56,15 +56,14 @@ * sources per level' errata. */ #define LOCAL_TIMER_VECTOR 0xef -#define LOCAL_PERFCTR_VECTOR 0xee /* - * First APIC vector available to drivers: (vectors 0x30-0xed) + * First APIC vector available to drivers: (vectors 0x30-0xee) * we start at 0x31 to spread out vectors evenly between priority * levels. (0x80 is the syscall vector) */ #define FIRST_DEVICE_VECTOR 0x31 -#define FIRST_SYSTEM_VECTOR 0xee +#define FIRST_SYSTEM_VECTOR 0xef #define TIMER_IRQ 0 diff -Naur linux-2.6.12-rc5-mm1/include/asm-i386/mach-visws/irq_vectors.h linux-2.6.12-rc5-mm1-plug/include/asm-i386/mach-visws/irq_vectors.h --- linux-2.6.12-rc5-mm1/include/asm-i386/mach-visws/irq_vectors.h 2005-05-25 16:23:45.454711216 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-i386/mach-visws/irq_vectors.h 2005-05-25 17:04:43.730996416 -0700 @@ -35,15 +35,14 @@ * sources per level' errata. */ #define LOCAL_TIMER_VECTOR 0xef -#define LOCAL_PERFCTR_VECTOR 0xee /* - * First APIC vector available to drivers: (vectors 0x30-0xed) + * First APIC vector available to drivers: (vectors 0x30-0xee) * we start at 0x31 to spread out vectors evenly between priority * levels. 
(0x80 is the syscall vector) */ #define FIRST_DEVICE_VECTOR 0x31 -#define FIRST_SYSTEM_VECTOR 0xee +#define FIRST_SYSTEM_VECTOR 0xef #define TIMER_IRQ 0 diff -Naur linux-2.6.12-rc5-mm1/include/asm-i386/perfctr.h linux-2.6.12-rc5-mm1-plug/include/asm-i386/perfctr.h --- linux-2.6.12-rc5-mm1/include/asm-i386/perfctr.h 2005-05-25 16:23:45.484706656 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-i386/perfctr.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,197 +0,0 @@ -/* $Id: perfctr.h,v 1.63 2005/04/08 14:36:49 mikpe Exp $ - * x86/x86_64 Performance-Monitoring Counters driver - * - * Copyright (C) 1999-2005 Mikael Pettersson - */ -#ifndef _ASM_I386_PERFCTR_H -#define _ASM_I386_PERFCTR_H - -#include - -struct perfctr_sum_ctrs { - __u64 tsc; - __u64 pmc[18]; /* the size is not part of the user ABI */ -}; - -struct perfctr_cpu_control_header { - __u32 tsc_on; - __u32 nractrs; /* number of accumulation-mode counters */ - __u32 nrictrs; /* number of interrupt-mode counters */ -}; - -struct perfctr_cpu_state_user { - __u32 cstatus; - __u32 _filler; - __u64 tsc_start; - __u64 tsc_sum; - struct { - __u64 start; - __u64 sum; - } pmc[18]; /* the size is not part of the user ABI */ -}; - -/* cstatus is a re-encoding of control.tsc_on/nractrs/nrictrs - which should have less overhead in most cases */ - -static inline -unsigned int __perfctr_mk_cstatus(unsigned int tsc_on, unsigned int have_ictrs, - unsigned int nrictrs, unsigned int nractrs) -{ - return (tsc_on<<31) | (have_ictrs<<16) | ((nractrs+nrictrs)<<8) | nractrs; -} - -static inline -unsigned int perfctr_mk_cstatus(unsigned int tsc_on, unsigned int nractrs, - unsigned int nrictrs) -{ - return __perfctr_mk_cstatus(tsc_on, nrictrs, nrictrs, nractrs); -} - -static inline unsigned int perfctr_cstatus_enabled(unsigned int cstatus) -{ - return cstatus; -} - -static inline int perfctr_cstatus_has_tsc(unsigned int cstatus) -{ - return (int)cstatus < 0; /* test and jump on sign */ -} - -static inline unsigned int 
perfctr_cstatus_nractrs(unsigned int cstatus) -{ - return cstatus & 0x7F; /* and with imm8 */ -} - -static inline unsigned int perfctr_cstatus_nrctrs(unsigned int cstatus) -{ - return (cstatus >> 8) & 0x7F; -} - -static inline unsigned int perfctr_cstatus_has_ictrs(unsigned int cstatus) -{ - return cstatus & (0x7F << 16); -} - -/* - * 'struct siginfo' support for perfctr overflow signals. - * In unbuffered mode, si_code is set to SI_PMC_OVF and a bitmask - * describing which perfctrs overflowed is put in si_pmc_ovf_mask. - * A bitmask is used since more than one perfctr can have overflowed - * by the time the interrupt handler runs. - * - * glibc's doesn't seem to define __SI_FAULT or __SI_CODE(), - * and including as well may cause redefinition errors, - * so the user and kernel values are different #defines here. - */ -#ifdef __KERNEL__ -#define SI_PMC_OVF (__SI_FAULT|'P') -#else -#define SI_PMC_OVF ('P') -#endif -#define si_pmc_ovf_mask _sifields._pad[0] /* XXX: use an unsigned field later */ - -#ifdef __KERNEL__ - -#if defined(CONFIG_PERFCTR) - -struct perfctr_cpu_control { - struct perfctr_cpu_control_header header; - unsigned int evntsel[18]; /* primary control registers, physical indices */ - unsigned int ireset[18]; /* >= 2^31, for i-mode counters, physical indices */ - struct { - unsigned int escr[0x3E2-0x3A0]; /* secondary controls, physical indices */ - unsigned int pebs_enable; /* for replay tagging */ - unsigned int pebs_matrix_vert; /* for replay tagging */ - } p4; - unsigned int pmc_map[18]; /* virtual to physical (rdpmc) index map */ -}; - -struct perfctr_cpu_state { - /* Don't change field order here without first considering the number - of cache lines touched during sampling and context switching. 
*/ - unsigned int id; - int isuspend_cpu; - struct perfctr_cpu_state_user user; - struct perfctr_cpu_control control; - unsigned int p4_escr_map[18]; -#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT - unsigned int pending_interrupt; -#endif -}; - -/* Driver init/exit. */ -extern int perfctr_cpu_init(void); -extern void perfctr_cpu_exit(void); - -/* CPU type name. */ -extern char *perfctr_cpu_name; - -/* Hardware reservation. */ -extern const char *perfctr_cpu_reserve(const char *service); -extern void perfctr_cpu_release(const char *service); - -/* PRE: state has no running interrupt-mode counters. - Check that the new control data is valid. - Update the driver's private control data. - is_global should be zero for per-process counters and non-zero - for global-mode counters. This matters for HT P4s, alas. - Returns a negative error code if the control data is invalid. */ -extern int perfctr_cpu_update_control(struct perfctr_cpu_state *state, int is_global); - -/* Parse and update control for the given domain. */ -extern int perfctr_cpu_control_write(struct perfctr_cpu_control *control, - unsigned int domain, - const void *srcp, unsigned int srcbytes); - -/* Retrieve and format control for the given domain. - Returns number of bytes written. */ -extern int perfctr_cpu_control_read(const struct perfctr_cpu_control *control, - unsigned int domain, - void *dstp, unsigned int dstbytes); - -/* Read a-mode counters. Subtract from start and accumulate into sums. - Must be called with preemption disabled. */ -extern void perfctr_cpu_suspend(struct perfctr_cpu_state *state); - -/* Write control registers. Read a-mode counters into start. - Must be called with preemption disabled. */ -extern void perfctr_cpu_resume(struct perfctr_cpu_state *state); - -/* Perform an efficient combined suspend/resume operation. - Must be called with preemption disabled. */ -extern void perfctr_cpu_sample(struct perfctr_cpu_state *state); - -/* The type of a perfctr overflow interrupt handler. 
- It will be called in IRQ context, with preemption disabled. */ -typedef void (*perfctr_ihandler_t)(unsigned long pc); - -/* Operations related to overflow interrupt handling. */ -#ifdef CONFIG_X86_LOCAL_APIC -extern void perfctr_cpu_set_ihandler(perfctr_ihandler_t); -extern void perfctr_cpu_ireload(struct perfctr_cpu_state*); -extern unsigned int perfctr_cpu_identify_overflow(struct perfctr_cpu_state*); -static inline int perfctr_cpu_has_pending_interrupt(const struct perfctr_cpu_state *state) -{ - return state->pending_interrupt; -} -#else -static inline void perfctr_cpu_set_ihandler(perfctr_ihandler_t x) { } -static inline int perfctr_cpu_has_pending_interrupt(const struct perfctr_cpu_state *state) -{ - return 0; -} -#endif - -#endif /* CONFIG_PERFCTR */ - -#if defined(CONFIG_PERFCTR) && defined(CONFIG_X86_LOCAL_APIC) -asmlinkage void perfctr_interrupt(struct pt_regs*); -#define perfctr_vector_init() \ - set_intr_gate(LOCAL_PERFCTR_VECTOR, perfctr_interrupt) -#else -#define perfctr_vector_init() do{}while(0) -#endif - -#endif /* __KERNEL__ */ - -#endif /* _ASM_I386_PERFCTR_H */ diff -Naur linux-2.6.12-rc5-mm1/include/asm-i386/processor.h linux-2.6.12-rc5-mm1-plug/include/asm-i386/processor.h --- linux-2.6.12-rc5-mm1/include/asm-i386/processor.h 2005-05-25 16:23:45.487706200 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-i386/processor.h 2005-05-25 17:04:43.734995808 -0700 @@ -456,8 +456,6 @@ unsigned long *io_bitmap_ptr; /* max allowed port in the bitmap, in bytes: */ unsigned long io_bitmap_max; -/* performance counters */ - struct vperfctr *perfctr; }; #define INIT_THREAD { \ diff -Naur linux-2.6.12-rc5-mm1/include/asm-i386/system.h linux-2.6.12-rc5-mm1-plug/include/asm-i386/system.h --- linux-2.6.12-rc5-mm1/include/asm-i386/system.h 2005-05-25 16:23:45.510702704 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-i386/system.h 2005-05-25 17:04:43.736995504 -0700 @@ -14,7 +14,6 @@ #define switch_to(prev,next,last) do { \ unsigned long esi,edi; \ - 
perfctr_suspend_thread(&(prev)->thread); \ asm volatile("pushfl\n\t" \ "pushl %%ebp\n\t" \ "movl %%esp,%0\n\t" /* save ESP */ \ diff -Naur linux-2.6.12-rc5-mm1/include/asm-i386/topology.h linux-2.6.12-rc5-mm1-plug/include/asm-i386/topology.h --- linux-2.6.12-rc5-mm1/include/asm-i386/topology.h 2005-05-25 16:23:45.514702096 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-i386/topology.h 2005-05-25 17:02:50.613192936 -0700 @@ -74,14 +74,11 @@ .imbalance_pct = 125, \ .cache_hot_time = (10*1000000), \ .cache_nice_tries = 1, \ - .busy_idx = 3, \ - .idle_idx = 1, \ - .newidle_idx = 2, \ - .wake_idx = 1, \ .per_cpu_gain = 100, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ - | SD_BALANCE_FORK \ + | SD_BALANCE_NEWIDLE \ + | SD_WAKE_IDLE \ | SD_WAKE_BALANCE, \ .last_balance = jiffies, \ .balance_interval = 1, \ diff -Naur linux-2.6.12-rc5-mm1/include/asm-i386/unistd.h linux-2.6.12-rc5-mm1-plug/include/asm-i386/unistd.h --- linux-2.6.12-rc5-mm1/include/asm-i386/unistd.h 2005-05-25 16:23:45.517701640 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-i386/unistd.h 2005-05-25 17:04:43.735995656 -0700 @@ -296,12 +296,8 @@ #define __NR_keyctl 288 #define __NR_ioprio_set 289 #define __NR_ioprio_get 290 -#define __NR_vperfctr_open 291 -#define __NR_vperfctr_control (__NR_vperfctr_open+1) -#define __NR_vperfctr_write (__NR_vperfctr_open+2) -#define __NR_vperfctr_read (__NR_vperfctr_open+3) -#define NR_syscalls 295 +#define NR_syscalls 291 /* * user-visible error numbers are in the range -1 - -128: see diff -Naur linux-2.6.12-rc5-mm1/include/asm-ia64/system.h linux-2.6.12-rc5-mm1-plug/include/asm-ia64/system.h --- linux-2.6.12-rc5-mm1/include/asm-ia64/system.h 2005-05-25 16:23:45.563694648 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-ia64/system.h 2005-05-25 17:02:50.613192936 -0700 @@ -183,6 +183,8 @@ #ifdef __KERNEL__ +#define prepare_to_switch() do { } while(0) + #ifdef CONFIG_IA32_SUPPORT # define IS_IA32_PROCESS(regs) (ia64_psr(regs)->is != 0) #else @@ -272,7 +274,13 @@ * of 
that CPU which will not be released, because there we wait for the * tasklist_lock to become available. */ -#define __ARCH_WANT_UNLOCKED_CTXSW +#define prepare_arch_switch(rq, next) \ +do { \ + spin_lock(&(next)->switch_lock); \ + spin_unlock(&(rq)->lock); \ +} while (0) +#define finish_arch_switch(rq, prev) spin_unlock_irq(&(prev)->switch_lock) +#define task_running(rq, p) ((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock)) #define ia64_platform_is(x) (strcmp(x, platform_name) == 0) diff -Naur linux-2.6.12-rc5-mm1/include/asm-ia64/topology.h linux-2.6.12-rc5-mm1-plug/include/asm-ia64/topology.h --- linux-2.6.12-rc5-mm1/include/asm-ia64/topology.h 2005-05-25 16:23:45.565694344 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-ia64/topology.h 2005-05-25 17:02:50.614192784 -0700 @@ -42,54 +42,25 @@ void build_cpu_to_node_map(void); -#define SD_CPU_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .groups = NULL, \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_hot_time = (10*1000000), \ - .per_cpu_gain = 100, \ - .cache_nice_tries = 2, \ - .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 2, \ - .wake_idx = 1, \ - .forkexec_idx = 1, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_NEWIDLE \ - | SD_BALANCE_EXEC \ - | SD_WAKE_AFFINE, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .nr_balance_failed = 0, \ -} - /* sched_domains SD_NODE_INIT for IA64 NUMA machines */ #define SD_NODE_INIT (struct sched_domain) { \ .span = CPU_MASK_NONE, \ .parent = NULL, \ .groups = NULL, \ - .min_interval = 8, \ - .max_interval = 8*(min(num_online_cpus(), 32)), \ - .busy_factor = 64, \ + .min_interval = 80, \ + .max_interval = 320, \ + .busy_factor = 320, \ .imbalance_pct = 125, \ .cache_hot_time = (10*1000000), \ - .cache_nice_tries = 2, \ - .busy_idx = 3, \ - .idle_idx = 2, \ - .newidle_idx = 0, /* unused */ \ - .wake_idx = 1, \ - .forkexec_idx = 1, \ + .cache_nice_tries = 1, \ 
.per_cpu_gain = 100, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ - | SD_BALANCE_FORK \ + | SD_BALANCE_NEWIDLE \ + | SD_WAKE_IDLE \ | SD_WAKE_BALANCE, \ .last_balance = jiffies, \ - .balance_interval = 64, \ + .balance_interval = 1, \ .nr_balance_failed = 0, \ } @@ -98,21 +69,17 @@ .span = CPU_MASK_NONE, \ .parent = NULL, \ .groups = NULL, \ - .min_interval = 64, \ - .max_interval = 64*num_online_cpus(), \ - .busy_factor = 128, \ - .imbalance_pct = 133, \ + .min_interval = 80, \ + .max_interval = 320, \ + .busy_factor = 320, \ + .imbalance_pct = 125, \ .cache_hot_time = (10*1000000), \ .cache_nice_tries = 1, \ - .busy_idx = 3, \ - .idle_idx = 3, \ - .newidle_idx = 0, /* unused */ \ - .wake_idx = 0, /* unused */ \ - .forkexec_idx = 0, /* unused */ \ .per_cpu_gain = 100, \ - .flags = SD_LOAD_BALANCE, \ + .flags = SD_LOAD_BALANCE \ + | SD_BALANCE_EXEC, \ .last_balance = jiffies, \ - .balance_interval = 64, \ + .balance_interval = 100*(63+num_online_cpus())/64, \ .nr_balance_failed = 0, \ } diff -Naur linux-2.6.12-rc5-mm1/include/asm-mips/system.h linux-2.6.12-rc5-mm1-plug/include/asm-mips/system.h --- linux-2.6.12-rc5-mm1/include/asm-mips/system.h 2005-05-25 16:23:45.636683552 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-mips/system.h 2005-05-25 17:02:50.615192632 -0700 @@ -422,10 +422,16 @@ extern int stop_a_enabled; /* - * See include/asm-ia64/system.h; prevents deadlock on SMP + * Taken from include/asm-ia64/system.h; prevents deadlock on SMP * systems. 
*/ -#define __ARCH_WANT_UNLOCKED_CTXSW +#define prepare_arch_switch(rq, next) \ +do { \ + spin_lock(&(next)->switch_lock); \ + spin_unlock(&(rq)->lock); \ +} while (0) +#define finish_arch_switch(rq, prev) spin_unlock_irq(&(prev)->switch_lock) +#define task_running(rq, p) ((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock)) #define arch_align_stack(x) (x) diff -Naur linux-2.6.12-rc5-mm1/include/asm-ppc/perfctr.h linux-2.6.12-rc5-mm1-plug/include/asm-ppc/perfctr.h --- linux-2.6.12-rc5-mm1/include/asm-ppc/perfctr.h 2005-05-25 16:23:45.836653152 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-ppc/perfctr.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,176 +0,0 @@ -/* $Id: perfctr.h,v 1.19 2005/04/08 14:36:49 mikpe Exp $ - * PPC32 Performance-Monitoring Counters driver - * - * Copyright (C) 2004-2005 Mikael Pettersson - */ -#ifndef _ASM_PPC_PERFCTR_H -#define _ASM_PPC_PERFCTR_H - -#include - -struct perfctr_sum_ctrs { - __u64 tsc; - __u64 pmc[8]; /* the size is not part of the user ABI */ -}; - -struct perfctr_cpu_control_header { - __u32 tsc_on; - __u32 nractrs; /* number of accumulation-mode counters */ - __u32 nrictrs; /* number of interrupt-mode counters */ -}; - -struct perfctr_cpu_state_user { - __u32 cstatus; - /* 'samplecnt' is incremented every time the 'start' - fields have been updated by a sampling operation. - Unfortunately the PPC timebase (tsc_start) has too - low frequency for it to be a reliable context-switch - indicator for user-space. 
*/ - __u32 samplecnt; - __u64 tsc_start; - __u64 tsc_sum; - struct { - __u64 start; - __u64 sum; - } pmc[8]; /* the size is not part of the user ABI */ -}; - -/* cstatus is a re-encoding of control.tsc_on/nractrs/nrictrs - which should have less overhead in most cases */ -/* XXX: ppc driver internally also uses cstatus&(1<<30) */ - -static inline -unsigned int perfctr_mk_cstatus(unsigned int tsc_on, unsigned int nractrs, - unsigned int nrictrs) -{ - return (tsc_on<<31) | (nrictrs<<16) | ((nractrs+nrictrs)<<8) | nractrs; -} - -static inline unsigned int perfctr_cstatus_enabled(unsigned int cstatus) -{ - return cstatus; -} - -static inline int perfctr_cstatus_has_tsc(unsigned int cstatus) -{ - return (int)cstatus < 0; /* test and jump on sign */ -} - -static inline unsigned int perfctr_cstatus_nractrs(unsigned int cstatus) -{ - return cstatus & 0x7F; /* and with imm8 */ -} - -static inline unsigned int perfctr_cstatus_nrctrs(unsigned int cstatus) -{ - return (cstatus >> 8) & 0x7F; -} - -static inline unsigned int perfctr_cstatus_has_ictrs(unsigned int cstatus) -{ - return cstatus & (0x7F << 16); -} - -/* - * 'struct siginfo' support for perfctr overflow signals. - * In unbuffered mode, si_code is set to SI_PMC_OVF and a bitmask - * describing which perfctrs overflowed is put in si_pmc_ovf_mask. - * A bitmask is used since more than one perfctr can have overflowed - * by the time the interrupt handler runs. - * - * glibc's doesn't seem to define __SI_FAULT or __SI_CODE(), - * and including as well may cause redefinition errors, - * so the user and kernel values are different #defines here. 
- */ -#ifdef __KERNEL__ -#define SI_PMC_OVF (__SI_FAULT|'P') -#else -#define SI_PMC_OVF ('P') -#endif -#define si_pmc_ovf_mask _sifields._pad[0] /* XXX: use an unsigned field later */ - -#ifdef __KERNEL__ - -#if defined(CONFIG_PERFCTR) - -struct perfctr_cpu_control { - struct perfctr_cpu_control_header header; - unsigned int mmcr0; - unsigned int mmcr1; - unsigned int mmcr2; - /* IABR/DABR/BAMR not supported */ - unsigned int ireset[8]; /* [0,0x7fffffff], for i-mode counters, physical indices */ - unsigned int pmc_map[8]; /* virtual to physical index map */ -}; - -struct perfctr_cpu_state { - /* Don't change field order here without first considering the number - of cache lines touched during sampling and context switching. */ - unsigned int id; - int isuspend_cpu; - struct perfctr_cpu_state_user user; - struct perfctr_cpu_control control; -}; - -/* Driver init/exit. */ -extern int perfctr_cpu_init(void); -extern void perfctr_cpu_exit(void); - -/* CPU type name. */ -extern char *perfctr_cpu_name; - -/* Hardware reservation. */ -extern const char *perfctr_cpu_reserve(const char *service); -extern void perfctr_cpu_release(const char *service); - -/* PRE: state has no running interrupt-mode counters. - Check that the new control data is valid. - Update the driver's private control data. - Returns a negative error code if the control data is invalid. */ -extern int perfctr_cpu_update_control(struct perfctr_cpu_state *state, int is_global); - -/* Parse and update control for the given domain. */ -extern int perfctr_cpu_control_write(struct perfctr_cpu_control *control, - unsigned int domain, - const void *srcp, unsigned int srcbytes); - -/* Retrieve and format control for the given domain. - Returns number of bytes written. */ -extern int perfctr_cpu_control_read(const struct perfctr_cpu_control *control, - unsigned int domain, - void *dstp, unsigned int dstbytes); - -/* Read a-mode counters. Subtract from start and accumulate into sums. 
- Must be called with preemption disabled. */ -extern void perfctr_cpu_suspend(struct perfctr_cpu_state *state); - -/* Write control registers. Read a-mode counters into start. - Must be called with preemption disabled. */ -extern void perfctr_cpu_resume(struct perfctr_cpu_state *state); - -/* Perform an efficient combined suspend/resume operation. - Must be called with preemption disabled. */ -extern void perfctr_cpu_sample(struct perfctr_cpu_state *state); - -/* The type of a perfctr overflow interrupt handler. - It will be called in IRQ context, with preemption disabled. */ -typedef void (*perfctr_ihandler_t)(unsigned long pc); - -/* Operations related to overflow interrupt handling. */ -#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT -extern void perfctr_cpu_set_ihandler(perfctr_ihandler_t); -extern void perfctr_cpu_ireload(struct perfctr_cpu_state*); -extern unsigned int perfctr_cpu_identify_overflow(struct perfctr_cpu_state*); -#else -static inline void perfctr_cpu_set_ihandler(perfctr_ihandler_t x) { } -#endif -static inline int perfctr_cpu_has_pending_interrupt(const struct perfctr_cpu_state *state) -{ - return 0; -} - -#endif /* CONFIG_PERFCTR */ - -#endif /* __KERNEL__ */ - -#endif /* _ASM_PPC_PERFCTR_H */ diff -Naur linux-2.6.12-rc5-mm1/include/asm-ppc/processor.h linux-2.6.12-rc5-mm1-plug/include/asm-ppc/processor.h --- linux-2.6.12-rc5-mm1/include/asm-ppc/processor.h 2005-05-25 16:23:45.842652240 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-ppc/processor.h 2005-05-25 17:04:43.757992312 -0700 @@ -122,9 +122,6 @@ unsigned long spefscr; /* SPE & eFP status */ int used_spe; /* set if process has used spe */ #endif /* CONFIG_SPE */ -#ifdef CONFIG_PERFCTR_VIRTUAL - struct vperfctr *perfctr; /* performance counters */ -#endif }; #define ARCH_MIN_TASKALIGN 16 diff -Naur linux-2.6.12-rc5-mm1/include/asm-ppc/reg.h linux-2.6.12-rc5-mm1-plug/include/asm-ppc/reg.h --- linux-2.6.12-rc5-mm1/include/asm-ppc/reg.h 2005-05-25 16:23:45.843652088 -0700 +++ 
linux-2.6.12-rc5-mm1-plug/include/asm-ppc/reg.h 2005-05-25 17:04:43.758992160 -0700 @@ -274,14 +274,22 @@ #define SPRN_LDSTCR 0x3f8 /* Load/Store control register */ #define SPRN_LDSTDB 0x3f4 /* */ #define SPRN_LR 0x008 /* Link Register */ +#define SPRN_MMCR0 0x3B8 /* Monitor Mode Control Register 0 */ +#define SPRN_MMCR1 0x3BC /* Monitor Mode Control Register 1 */ #ifndef SPRN_PIR #define SPRN_PIR 0x3FF /* Processor Identification Register */ #endif +#define SPRN_PMC1 0x3B9 /* Performance Counter Register 1 */ +#define SPRN_PMC2 0x3BA /* Performance Counter Register 2 */ +#define SPRN_PMC3 0x3BD /* Performance Counter Register 3 */ +#define SPRN_PMC4 0x3BE /* Performance Counter Register 4 */ #define SPRN_PTEHI 0x3D5 /* 981 7450 PTE HI word (S/W TLB load) */ #define SPRN_PTELO 0x3D6 /* 982 7450 PTE LO word (S/W TLB load) */ #define SPRN_PVR 0x11F /* Processor Version Register */ #define SPRN_RPA 0x3D6 /* Required Physical Address Register */ +#define SPRN_SDA 0x3BF /* Sampled Data Address Register */ #define SPRN_SDR1 0x019 /* MMU Hash Base Register */ +#define SPRN_SIA 0x3BB /* Sampled Instruction Address Register */ #define SPRN_SPRG0 0x110 /* Special Purpose Register General 0 */ #define SPRN_SPRG1 0x111 /* Special Purpose Register General 1 */ #define SPRN_SPRG2 0x112 /* Special Purpose Register General 2 */ @@ -308,79 +316,16 @@ #define SPRN_THRM3 0x3FE /* Thermal Management Register 3 */ #define THRM3_E (1<<0) #define SPRN_TLBMISS 0x3D4 /* 980 7450 TLB Miss Register */ +#define SPRN_UMMCR0 0x3A8 /* User Monitor Mode Control Register 0 */ +#define SPRN_UMMCR1 0x3AC /* User Monitor Mode Control Register 0 */ +#define SPRN_UPMC1 0x3A9 /* User Performance Counter Register 1 */ +#define SPRN_UPMC2 0x3AA /* User Performance Counter Register 2 */ +#define SPRN_UPMC3 0x3AD /* User Performance Counter Register 3 */ +#define SPRN_UPMC4 0x3AE /* User Performance Counter Register 4 */ +#define SPRN_USIA 0x3AB /* User Sampled Instruction Address Register */ #define 
SPRN_VRSAVE 0x100 /* Vector Register Save Register */ #define SPRN_XER 0x001 /* Fixed Point Exception Register */ -/* Performance-monitoring control and counter registers */ -#define SPRN_MMCR0 0x3B8 /* Monitor Mode Control Register 0 (604 and up) */ -#define SPRN_MMCR1 0x3BC /* Monitor Mode Control Register 1 (604e and up) */ -#define SPRN_MMCR2 0x3B0 /* Monitor Mode Control Register 2 (7400 and up) */ -#define SPRN_PMC1 0x3B9 /* Performance Counter Register 1 (604 and up) */ -#define SPRN_PMC2 0x3BA /* Performance Counter Register 2 (604 and up) */ -#define SPRN_PMC3 0x3BD /* Performance Counter Register 3 (604e and up) */ -#define SPRN_PMC4 0x3BE /* Performance Counter Register 4 (604e and up) */ -#define SPRN_PMC5 0x3B1 /* Performance Counter Register 5 (7450 and up) */ -#define SPRN_PMC6 0x3B2 /* Performance Counter Register 6 (7450 and up) */ -#define SPRN_SIA 0x3BB /* Sampled Instruction Address Register (604 and up) */ -#define SPRN_SDA 0x3BF /* Sampled Data Address Register (604/604e only) */ -#define SPRN_BAMR 0x3B7 /* Breakpoint Address Mask Register (7400 and up) */ - -#define SPRN_UMMCR0 0x3A8 /* User Monitor Mode Control Register 0 (750 and up) */ -#define SPRN_UMMCR1 0x3AC /* User Monitor Mode Control Register 0 (750 and up) */ -#define SPRN_UMMCR2 0x3A0 /* User Monitor Mode Control Register 0 (7400 and up) */ -#define SPRN_UPMC1 0x3A9 /* User Performance Counter Register 1 (750 and up) */ -#define SPRN_UPMC2 0x3AA /* User Performance Counter Register 2 (750 and up) */ -#define SPRN_UPMC3 0x3AD /* User Performance Counter Register 3 (750 and up) */ -#define SPRN_UPMC4 0x3AE /* User Performance Counter Register 4 (750 and up) */ -#define SPRN_UPMC5 0x3A1 /* User Performance Counter Register 5 (7450 and up) */ -#define SPRN_UPMC6 0x3A2 /* User Performance Counter Register 5 (7450 and up) */ -#define SPRN_USIA 0x3AB /* User Sampled Instruction Address Register (750 and up) */ -#define SPRN_UBAMR 0x3A7 /* User Breakpoint Address Mask Register (7400 and 
up) */ - -/* MMCR0 layout (74xx terminology) */ -#define MMCR0_FC 0x80000000 /* Freeze counters unconditionally. */ -#define MMCR0_FCS 0x40000000 /* Freeze counters while MSR[PR]=0 (supervisor mode). */ -#define MMCR0_FCP 0x20000000 /* Freeze counters while MSR[PR]=1 (user mode). */ -#define MMCR0_FCM1 0x10000000 /* Freeze counters while MSR[PM]=1. */ -#define MMCR0_FCM0 0x08000000 /* Freeze counters while MSR[PM]=0. */ -#define MMCR0_PMXE 0x04000000 /* Enable performance monitor exceptions. - * Cleared by hardware when a PM exception occurs. - * 604: PMXE is not cleared by hardware. - */ -#define MMCR0_FCECE 0x02000000 /* Freeze counters on enabled condition or event. - * FCECE is treated as 0 if TRIGGER is 1. - * 74xx: FC is set when the event occurs. - * 604/750: ineffective when PMXE=0. - */ -#define MMCR0_TBSEL 0x01800000 /* Time base lower (TBL) bit selector. - * 00: bit 31, 01: bit 23, 10: bit 19, 11: bit 15. - */ -#define MMCR0_TBEE 0x00400000 /* Enable event on TBL bit transition from 0 to 1. */ -#define MMCR0_THRESHOLD 0x003F0000 /* Threshold value for certain events. */ -#define MMCR0_PMC1CE 0x00008000 /* Enable event on PMC1 overflow. */ -#define MMCR0_PMCjCE 0x00004000 /* Enable event on PMC2-PMC6 overflow. - * 604/750: Overrides FCECE (DISCOUNT). - */ -#define MMCR0_TRIGGER 0x00002000 /* Disable PMC2-PMC6 until PMC1 overflow or other event. - * 74xx: cleared by hardware when the event occurs. - */ -#define MMCR0_PMC1SEL 0x00001FB0 /* PMC1 event selector, 7 bits. */ -#define MMCR0_PMC2SEL 0x0000003F /* PMC2 event selector, 6 bits. */ - -/* MMCR1 layout (604e-7457) */ -#define MMCR1_PMC3SEL 0xF8000000 /* PMC3 event selector, 5 bits. */ -#define MMCR1_PMC4SEL 0x07B00000 /* PMC4 event selector, 5 bits. */ -#define MMCR1_PMC5SEL 0x003E0000 /* PMC5 event selector, 5 bits. (745x only) */ -#define MMCR1_PMC6SEL 0x0001F800 /* PMC6 event selector, 6 bits. 
(745x only) */ -#define MMCR1__RESERVED 0x000007FF /* should be zero */ - -/* MMCR2 layout (7400-7457) */ -#define MMCR2_THRESHMULT 0x80000000 /* MMCR0[THRESHOLD] multiplier. */ -#define MMCR2_SMCNTEN 0x40000000 /* 7400/7410 only, should be zero. */ -#define MMCR2_SMINTEN 0x20000000 /* 7400/7410 only, should be zero. */ -#define MMCR2__RESERVED 0x1FFFFFFF /* should be zero */ -#define MMCR2_RESERVED (MMCR2_SMCNTEN | MMCR2_SMINTEN | MMCR2__RESERVED) - /* Bit definitions for MMCR0 and PMC1 / PMC2. */ #define MMCR0_PMC1_CYCLES (1 << 7) #define MMCR0_PMC1_ICACHEMISS (5 << 7) @@ -389,6 +334,7 @@ #define MMCR0_PMC2_CYCLES 0x1 #define MMCR0_PMC2_ITLB 0x7 #define MMCR0_PMC2_LOADMISSTIME 0x5 +#define MMCR0_PMXE (1 << 26) /* Processor Version Register */ diff -Naur linux-2.6.12-rc5-mm1/include/asm-ppc/unistd.h linux-2.6.12-rc5-mm1-plug/include/asm-ppc/unistd.h --- linux-2.6.12-rc5-mm1/include/asm-ppc/unistd.h 2005-05-25 16:23:45.864648896 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-ppc/unistd.h 2005-05-25 17:04:43.760991856 -0700 @@ -279,12 +279,8 @@ #define __NR_waitid 272 #define __NR_ioprio_set 273 #define __NR_ioprio_get 274 -#define __NR_vperfctr_open 275 -#define __NR_vperfctr_control (__NR_vperfctr_open+1) -#define __NR_vperfctr_write (__NR_vperfctr_open+2) -#define __NR_vperfctr_read (__NR_vperfctr_open+3) -#define __NR_syscalls 279 +#define __NR_syscalls 275 #define __NR(n) #n diff -Naur linux-2.6.12-rc5-mm1/include/asm-ppc64/perfctr.h linux-2.6.12-rc5-mm1-plug/include/asm-ppc64/perfctr.h --- linux-2.6.12-rc5-mm1/include/asm-ppc64/perfctr.h 2005-05-25 16:23:45.726669872 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-ppc64/perfctr.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,169 +0,0 @@ -/* - * PPC64 Performance-Monitoring Counters driver - * - * Copyright (C) 2004 David Gibson, IBM Corporation. 
- * Copyright (C) 2004 Mikael Pettersson - */ -#ifndef _ASM_PPC64_PERFCTR_H -#define _ASM_PPC64_PERFCTR_H - -#include - -struct perfctr_sum_ctrs { - __u64 tsc; - __u64 pmc[8]; /* the size is not part of the user ABI */ -}; - -struct perfctr_cpu_control_header { - __u32 tsc_on; - __u32 nractrs; /* number of accumulation-mode counters */ - __u32 nrictrs; /* number of interrupt-mode counters */ -}; - -struct perfctr_cpu_state_user { - __u32 cstatus; - /* 'samplecnt' is incremented every time the 'start' - fields have been updated by a sampling operation. - Unfortunately the PPC timebase (tsc_start) has too - low frequency for it to be a reliable context-switch - indicator for user-space. */ - __u32 samplecnt; - __u64 tsc_start; - __u64 tsc_sum; - struct { - __u64 start; - __u64 sum; - } pmc[8]; /* the size is not part of the user ABI */ -}; - -/* cstatus is a re-encoding of control.tsc_on/nractrs/nrictrs - which should have less overhead in most cases */ -/* XXX: ppc driver internally also uses cstatus&(1<<30) */ - -static inline -unsigned int perfctr_mk_cstatus(unsigned int tsc_on, unsigned int nractrs, - unsigned int nrictrs) -{ - return (tsc_on<<31) | (nrictrs<<16) | ((nractrs+nrictrs)<<8) | nractrs; -} - -static inline unsigned int perfctr_cstatus_enabled(unsigned int cstatus) -{ - return cstatus; -} - -static inline int perfctr_cstatus_has_tsc(unsigned int cstatus) -{ - return (int)cstatus < 0; /* test and jump on sign */ -} - -static inline unsigned int perfctr_cstatus_nractrs(unsigned int cstatus) -{ - return cstatus & 0x7F; /* and with imm8 */ -} - -static inline unsigned int perfctr_cstatus_nrctrs(unsigned int cstatus) -{ - return (cstatus >> 8) & 0x7F; -} - -static inline unsigned int perfctr_cstatus_has_ictrs(unsigned int cstatus) -{ - return cstatus & (0x7F << 16); -} - -/* - * 'struct siginfo' support for perfctr overflow signals. 
- * In unbuffered mode, si_code is set to SI_PMC_OVF and a bitmask - * describing which perfctrs overflowed is put in si_pmc_ovf_mask. - * A bitmask is used since more than one perfctr can have overflowed - * by the time the interrupt handler runs. - */ -#define SI_PMC_OVF -8 -#define si_pmc_ovf_mask _sifields._pad[0] /* XXX: use an unsigned field later */ - -#ifdef __KERNEL__ - -#if defined(CONFIG_PERFCTR) - -struct perfctr_cpu_control { - struct perfctr_cpu_control_header header; - u64 mmcr0; - u64 mmcr1; - u64 mmcra; - unsigned int ireset[8]; /* [0,0x7fffffff], for i-mode counters, physical indices */ - unsigned int pmc_map[8]; /* virtual to physical index map */ -}; - -struct perfctr_cpu_state { - /* Don't change field order here without first considering the number - of cache lines touched during sampling and context switching. */ - unsigned int id; - int isuspend_cpu; - struct perfctr_cpu_state_user user; - unsigned int unused_pmcs; - struct perfctr_cpu_control control; -}; - -/* Driver init/exit. */ -extern int perfctr_cpu_init(void); -extern void perfctr_cpu_exit(void); - -/* CPU type name. */ -extern char *perfctr_cpu_name; - -/* Hardware reservation. */ -extern const char *perfctr_cpu_reserve(const char *service); -extern void perfctr_cpu_release(const char *service); - -/* PRE: state has no running interrupt-mode counters. - Check that the new control data is valid. - Update the driver's private control data. - Returns a negative error code if the control data is invalid. */ -extern int perfctr_cpu_update_control(struct perfctr_cpu_state *state, int is_global); - -/* Parse and update control for the given domain. */ -extern int perfctr_cpu_control_write(struct perfctr_cpu_control *control, - unsigned int domain, - const void *srcp, unsigned int srcbytes); - -/* Retrieve and format control for the given domain. - Returns number of bytes written. 
*/ -extern int perfctr_cpu_control_read(const struct perfctr_cpu_control *control, - unsigned int domain, - void *dstp, unsigned int dstbytes); - -/* Read a-mode counters. Subtract from start and accumulate into sums. - Must be called with preemption disabled. */ -extern void perfctr_cpu_suspend(struct perfctr_cpu_state *state); - -/* Write control registers. Read a-mode counters into start. - Must be called with preemption disabled. */ -extern void perfctr_cpu_resume(struct perfctr_cpu_state *state); - -/* Perform an efficient combined suspend/resume operation. - Must be called with preemption disabled. */ -extern void perfctr_cpu_sample(struct perfctr_cpu_state *state); - -/* The type of a perfctr overflow interrupt handler. - It will be called in IRQ context, with preemption disabled. */ -typedef void (*perfctr_ihandler_t)(unsigned long pc); - -/* Operations related to overflow interrupt handling. */ -#ifdef CONFIG_PERFCTR_INTERRUPT_SUPPORT -extern void perfctr_cpu_set_ihandler(perfctr_ihandler_t); -extern void perfctr_cpu_ireload(struct perfctr_cpu_state*); -extern unsigned int perfctr_cpu_identify_overflow(struct perfctr_cpu_state*); -#else -static inline void perfctr_cpu_set_ihandler(perfctr_ihandler_t x) { } -#endif -static inline int perfctr_cpu_has_pending_interrupt(const struct perfctr_cpu_state *state) -{ - return 0; -} - -#endif /* CONFIG_PERFCTR */ - -#endif /* __KERNEL__ */ - -#endif /* _ASM_PPC64_PERFCTR_H */ diff -Naur linux-2.6.12-rc5-mm1/include/asm-ppc64/processor.h linux-2.6.12-rc5-mm1-plug/include/asm-ppc64/processor.h --- linux-2.6.12-rc5-mm1/include/asm-ppc64/processor.h 2005-05-25 16:23:45.753665768 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-ppc64/processor.h 2005-05-25 17:04:43.806984864 -0700 @@ -574,8 +574,6 @@ unsigned long vrsave; int used_vr; /* set if process has used altivec */ #endif /* CONFIG_ALTIVEC */ - /* performance counters */ - struct vperfctr *perfctr; }; #define ARCH_MIN_TASKALIGN 16 diff -Naur 
linux-2.6.12-rc5-mm1/include/asm-ppc64/unistd.h linux-2.6.12-rc5-mm1-plug/include/asm-ppc64/unistd.h --- linux-2.6.12-rc5-mm1/include/asm-ppc64/unistd.h 2005-05-25 16:23:45.782661360 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-ppc64/unistd.h 2005-05-25 17:04:43.807984712 -0700 @@ -283,14 +283,8 @@ #define __NR_request_key 270 #define __NR_keyctl 271 #define __NR_waitid 272 -/* 273 is reserved for ioprio_set */ -/* 274 is reserved for ioprio_get */ -#define __NR_vperfctr_open 275 -#define __NR_vperfctr_control (__NR_vperfctr_open+1) -#define __NR_vperfctr_write (__NR_vperfctr_open+2) -#define __NR_vperfctr_read (__NR_vperfctr_open+3) -#define __NR_syscalls 279 +#define __NR_syscalls 273 #ifdef __KERNEL__ #define NR_syscalls __NR_syscalls #endif diff -Naur linux-2.6.12-rc5-mm1/include/asm-s390/system.h linux-2.6.12-rc5-mm1-plug/include/asm-s390/system.h --- linux-2.6.12-rc5-mm1/include/asm-s390/system.h 2005-05-25 16:23:45.886645552 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-s390/system.h 2005-05-25 17:02:50.616192480 -0700 @@ -103,18 +103,29 @@ prev = __switch_to(prev,next); \ } while (0) +#define prepare_arch_switch(rq, next) do { } while(0) +#define task_running(rq, p) ((rq)->curr == (p)) + #ifdef CONFIG_VIRT_CPU_ACCOUNTING extern void account_user_vtime(struct task_struct *); extern void account_system_vtime(struct task_struct *); -#else -#define account_system_vtime(prev) do { } while (0) -#endif #define finish_arch_switch(rq, prev) do { \ set_fs(current->thread.mm_segment); \ + spin_unlock(&(rq)->lock); \ account_system_vtime(prev); \ + local_irq_enable(); \ } while (0) +#else + +#define finish_arch_switch(rq, prev) do { \ + set_fs(current->thread.mm_segment); \ + spin_unlock_irq(&(rq)->lock); \ +} while (0) + +#endif + #define nop() __asm__ __volatile__ ("nop") #define xchg(ptr,x) \ diff -Naur linux-2.6.12-rc5-mm1/include/asm-sparc/system.h linux-2.6.12-rc5-mm1-plug/include/asm-sparc/system.h --- linux-2.6.12-rc5-mm1/include/asm-sparc/system.h 
2005-05-25 16:23:46.060619104 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-sparc/system.h 2005-05-25 17:02:50.617192328 -0700 @@ -101,7 +101,7 @@ * SWITCH_ENTER and SWITH_DO_LAZY_FPU do not work yet (e.g. SMP does not work) * XXX WTF is the above comment? Found in late teen 2.4.x. */ -#define prepare_arch_switch(next) do { \ +#define prepare_arch_switch(rq, next) do { \ __asm__ __volatile__( \ ".globl\tflush_patch_switch\nflush_patch_switch:\n\t" \ "save %sp, -0x40, %sp; save %sp, -0x40, %sp; save %sp, -0x40, %sp\n\t" \ @@ -109,6 +109,8 @@ "save %sp, -0x40, %sp\n\t" \ "restore; restore; restore; restore; restore; restore; restore"); \ } while(0) +#define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) +#define task_running(rq, p) ((rq)->curr == (p)) /* Much care has gone into this code, do not touch it. * diff -Naur linux-2.6.12-rc5-mm1/include/asm-sparc64/system.h linux-2.6.12-rc5-mm1-plug/include/asm-sparc64/system.h --- linux-2.6.12-rc5-mm1/include/asm-sparc64/system.h 2005-05-25 16:23:46.015625944 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-sparc64/system.h 2005-05-25 17:02:50.617192328 -0700 @@ -139,13 +139,19 @@ #define flush_user_windows flushw_user #define flush_register_windows flushw_all -/* Don't hold the runqueue lock over context switch */ -#define __ARCH_WANT_UNLOCKED_CTXSW -#define prepare_arch_switch(next) \ -do { \ +#define prepare_arch_switch(rq, next) \ +do { spin_lock(&(next)->switch_lock); \ + spin_unlock(&(rq)->lock); \ flushw_all(); \ } while (0) +#define finish_arch_switch(rq, prev) \ +do { spin_unlock_irq(&(prev)->switch_lock); \ +} while (0) + +#define task_running(rq, p) \ + ((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock)) + /* See what happens when you design the chip correctly? 
* * We tell gcc we clobber all non-fixed-usage registers except diff -Naur linux-2.6.12-rc5-mm1/include/asm-x86_64/hw_irq.h linux-2.6.12-rc5-mm1-plug/include/asm-x86_64/hw_irq.h --- linux-2.6.12-rc5-mm1/include/asm-x86_64/hw_irq.h 2005-05-25 16:23:46.123609528 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-x86_64/hw_irq.h 2005-05-25 17:04:43.742994592 -0700 @@ -65,15 +65,14 @@ * sources per level' errata. */ #define LOCAL_TIMER_VECTOR 0xef -#define LOCAL_PERFCTR_VECTOR 0xee /* - * First APIC vector available to drivers: (vectors 0x30-0xed) + * First APIC vector available to drivers: (vectors 0x30-0xee) * we start at 0x31 to spread out vectors evenly between priority * levels. (0x80 is the syscall vector) */ #define FIRST_DEVICE_VECTOR 0x31 -#define FIRST_SYSTEM_VECTOR 0xee /* duplicated in irq.h */ +#define FIRST_SYSTEM_VECTOR 0xef /* duplicated in irq.h */ #ifndef __ASSEMBLY__ diff -Naur linux-2.6.12-rc5-mm1/include/asm-x86_64/ia32_unistd.h linux-2.6.12-rc5-mm1-plug/include/asm-x86_64/ia32_unistd.h --- linux-2.6.12-rc5-mm1/include/asm-x86_64/ia32_unistd.h 2005-05-25 16:23:46.157604360 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-x86_64/ia32_unistd.h 2005-05-25 17:04:43.743994440 -0700 @@ -294,11 +294,7 @@ #define __NR_ia32_add_key 286 #define __NR_ia32_request_key 287 #define __NR_ia32_keyctl 288 -#define __NR_ia32_vperfctr_open 291 -#define __NR_ia32_vperfctr_control (__NR_ia32_vperfctr_open+1) -#define __NR_ia32_vperfctr_write (__NR_ia32_vperfctr_open+2) -#define __NR_ia32_vperfctr_read (__NR_ia32_vperfctr_open+3) -#define IA32_NR_syscalls 295 /* must be > than biggest syscall! */ +#define IA32_NR_syscalls 290 /* must be > than biggest syscall! 
*/ #endif /* _ASM_X86_64_IA32_UNISTD_H_ */ diff -Naur linux-2.6.12-rc5-mm1/include/asm-x86_64/irq.h linux-2.6.12-rc5-mm1-plug/include/asm-x86_64/irq.h --- linux-2.6.12-rc5-mm1/include/asm-x86_64/irq.h 2005-05-25 16:23:46.166602992 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-x86_64/irq.h 2005-05-25 17:04:43.745994136 -0700 @@ -29,7 +29,7 @@ */ #define NR_VECTORS 256 -#define FIRST_SYSTEM_VECTOR 0xee /* duplicated in hw_irq.h */ +#define FIRST_SYSTEM_VECTOR 0xef /* duplicated in hw_irq.h */ #ifdef CONFIG_PCI_MSI #define NR_IRQS FIRST_SYSTEM_VECTOR diff -Naur linux-2.6.12-rc5-mm1/include/asm-x86_64/perfctr.h linux-2.6.12-rc5-mm1-plug/include/asm-x86_64/perfctr.h --- linux-2.6.12-rc5-mm1/include/asm-x86_64/perfctr.h 2005-05-25 16:23:46.194598736 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-x86_64/perfctr.h 1969-12-31 16:00:00.000000000 -0800 @@ -1 +0,0 @@ -#include diff -Naur linux-2.6.12-rc5-mm1/include/asm-x86_64/processor.h linux-2.6.12-rc5-mm1-plug/include/asm-x86_64/processor.h --- linux-2.6.12-rc5-mm1/include/asm-x86_64/processor.h 2005-05-25 16:23:46.234592656 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-x86_64/processor.h 2005-05-25 17:04:43.746993984 -0700 @@ -251,8 +251,6 @@ unsigned io_bitmap_max; /* cached TLS descriptors. 
*/ u64 tls_array[GDT_ENTRY_TLS_ENTRIES]; -/* performance counters */ - struct vperfctr *perfctr; } __attribute__((aligned(16))); #define INIT_THREAD {} diff -Naur linux-2.6.12-rc5-mm1/include/asm-x86_64/system.h linux-2.6.12-rc5-mm1-plug/include/asm-x86_64/system.h --- linux-2.6.12-rc5-mm1/include/asm-x86_64/system.h 2005-05-25 16:23:46.256589312 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-x86_64/system.h 2005-05-25 17:05:49.566987824 -0700 @@ -26,14 +26,11 @@ #define __EXTRA_CLOBBER \ ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15" -#define switch_to(prev,next,last) do { \ - perfctr_suspend_thread(&(prev)->thread); \ +#define switch_to(prev,next,last) \ asm volatile(SAVE_CONTEXT \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ "call __switch_to\n\t" \ - ".globl thread_return\n" \ - "thread_return:\n\t" \ "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \ "movq %P[thread_info](%%rsi),%%r8\n\t" \ LOCK "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \ @@ -47,8 +44,7 @@ [tif_fork] "i" (TIF_FORK), \ [thread_info] "i" (offsetof(struct task_struct, thread_info)), \ [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \ - : "memory", "cc" __EXTRA_CLOBBER); \ -} while (0) + : "memory", "cc" __EXTRA_CLOBBER) extern void load_gs_index(unsigned); diff -Naur linux-2.6.12-rc5-mm1/include/asm-x86_64/topology.h linux-2.6.12-rc5-mm1-plug/include/asm-x86_64/topology.h --- linux-2.6.12-rc5-mm1/include/asm-x86_64/topology.h 2005-05-25 16:23:46.258589008 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-x86_64/topology.h 2005-05-25 17:02:50.618192176 -0700 @@ -39,16 +39,12 @@ .busy_factor = 32, \ .imbalance_pct = 125, \ .cache_hot_time = (10*1000000), \ - .cache_nice_tries = 2, \ - .busy_idx = 3, \ - .idle_idx = 2, \ - .newidle_idx = 0, \ - .wake_idx = 1, \ - .forkexec_idx = 1, \ + .cache_nice_tries = 1, \ .per_cpu_gain = 100, \ .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_FORK \ + | SD_BALANCE_NEWIDLE \ | 
SD_BALANCE_EXEC \ + | SD_WAKE_IDLE \ | SD_WAKE_BALANCE, \ .last_balance = jiffies, \ .balance_interval = 1, \ diff -Naur linux-2.6.12-rc5-mm1/include/asm-x86_64/unistd.h linux-2.6.12-rc5-mm1-plug/include/asm-x86_64/unistd.h --- linux-2.6.12-rc5-mm1/include/asm-x86_64/unistd.h 2005-05-25 16:23:46.266587792 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/asm-x86_64/unistd.h 2005-05-25 17:04:43.747993832 -0700 @@ -565,16 +565,8 @@ __SYSCALL(__NR_ioprio_set, sys_ioprio_set) #define __NR_ioprio_get 252 __SYSCALL(__NR_ioprio_get, sys_ioprio_get) -#define __NR_vperfctr_open 253 -__SYSCALL(__NR_vperfctr_open, sys_vperfctr_open) -#define __NR_vperfctr_control (__NR_vperfctr_open+1) -__SYSCALL(__NR_vperfctr_control, sys_vperfctr_control) -#define __NR_vperfctr_write (__NR_vperfctr_open+2) -__SYSCALL(__NR_vperfctr_write, sys_vperfctr_write) -#define __NR_vperfctr_read (__NR_vperfctr_open+3) -__SYSCALL(__NR_vperfctr_read, sys_vperfctr_read) -#define __NR_syscall_max __NR_vperfctr_read +#define __NR_syscall_max __NR_ioprio_get #ifndef __NO_STUBS /* user-visible error numbers are in the range -1 - -4095 */ diff -Naur linux-2.6.12-rc5-mm1/include/linux/init_task.h linux-2.6.12-rc5-mm1-plug/include/linux/init_task.h --- linux-2.6.12-rc5-mm1/include/linux/init_task.h 2005-05-25 16:23:46.561542952 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/linux/init_task.h 2005-05-25 17:07:13.625209032 -0700 @@ -74,15 +74,14 @@ .usage = ATOMIC_INIT(2), \ .flags = 0, \ .lock_depth = -1, \ - .prio = MAX_PRIO-20, \ - .static_prio = MAX_PRIO-20, \ + .prio = NICE_TO_PRIO(0), \ + .static_prio = NICE_TO_PRIO(0), \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ .ioprio = 0, \ - .time_slice = HZ, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ @@ -109,6 +108,7 @@ .blocked = {{0}}, \ .alloc_lock = SPIN_LOCK_UNLOCKED, \ 
.proc_lock = SPIN_LOCK_UNLOCKED, \ + .switch_lock = SPIN_LOCK_UNLOCKED, \ .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .fs_excl = ATOMIC_INIT(0), \ diff -Naur linux-2.6.12-rc5-mm1/include/linux/kernel.h linux-2.6.12-rc5-mm1-plug/include/linux/kernel.h --- linux-2.6.12-rc5-mm1/include/linux/kernel.h 2005-05-25 16:23:46.593538088 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/linux/kernel.h 2005-05-25 17:02:50.619192024 -0700 @@ -58,23 +58,15 @@ * be biten later when the calling function happens to sleep when it is not * supposed to. */ -#ifdef CONFIG_PREEMPT_VOLUNTARY -extern int cond_resched(void); -# define might_resched() cond_resched() -#else -# define might_resched() do { } while (0) -#endif - #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP - void __might_sleep(char *file, int line); -# define might_sleep() \ - do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) +#define might_sleep() __might_sleep(__FILE__, __LINE__) +#define might_sleep_if(cond) do { if (unlikely(cond)) might_sleep(); } while (0) +void __might_sleep(char *file, int line); #else -# define might_sleep() do { might_resched(); } while (0) +#define might_sleep() do {} while(0) +#define might_sleep_if(cond) do {} while (0) #endif -#define might_sleep_if(cond) do { if (unlikely(cond)) might_sleep(); } while (0) - #define abs(x) ({ \ int __x = (x); \ (__x < 0) ? 
-__x : __x; \ diff -Naur linux-2.6.12-rc5-mm1/include/linux/perfctr.h linux-2.6.12-rc5-mm1-plug/include/linux/perfctr.h --- linux-2.6.12-rc5-mm1/include/linux/perfctr.h 2005-05-25 16:23:46.938485648 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/linux/perfctr.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,159 +0,0 @@ -/* $Id: perfctr.h,v 1.91 2005/03/18 00:10:53 mikpe Exp $ - * Performance-Monitoring Counters driver - * - * Copyright (C) 1999-2005 Mikael Pettersson - */ -#ifndef _LINUX_PERFCTR_H -#define _LINUX_PERFCTR_H - -#ifdef CONFIG_PERFCTR /* don't break archs without */ - -#include - -/* cpu_features flag bits */ -#define PERFCTR_FEATURE_RDPMC 0x01 -#define PERFCTR_FEATURE_RDTSC 0x02 -#define PERFCTR_FEATURE_PCINT 0x04 - -/* virtual perfctr control object */ -struct vperfctr_control { - __s32 si_signo; - __u32 preserve; -}; - -/* commands for sys_vperfctr_control() */ -#define VPERFCTR_CONTROL_UNLINK 0x01 -#define VPERFCTR_CONTROL_SUSPEND 0x02 -#define VPERFCTR_CONTROL_RESUME 0x03 -#define VPERFCTR_CONTROL_CLEAR 0x04 - -/* common description of an arch-specific control register */ -struct perfctr_cpu_reg { - __u64 nr; - __u64 value; -}; - -/* state and control domain numbers - 0-127 are for architecture-neutral domains - 128-255 are for architecture-specific domains */ -#define VPERFCTR_DOMAIN_SUM 1 /* struct perfctr_sum_ctrs */ -#define VPERFCTR_DOMAIN_CONTROL 2 /* struct vperfctr_control */ -#define VPERFCTR_DOMAIN_CHILDREN 3 /* struct perfctr_sum_ctrs */ - -/* domain numbers for common arch-specific control data */ -#define PERFCTR_DOMAIN_CPU_CONTROL 128 /* struct perfctr_cpu_control_header */ -#define PERFCTR_DOMAIN_CPU_MAP 129 /* __u32[] */ -#define PERFCTR_DOMAIN_CPU_REGS 130 /* struct perfctr_cpu_reg[] */ - -#endif /* CONFIG_PERFCTR */ - -#ifdef __KERNEL__ - -/* - * The perfctr system calls. 
- */ -asmlinkage long sys_vperfctr_open(int tid, int creat); -asmlinkage long sys_vperfctr_control(int fd, unsigned int cmd); -asmlinkage long sys_vperfctr_write(int fd, unsigned int domain, - const void __user *argp, - unsigned int argbytes); -asmlinkage long sys_vperfctr_read(int fd, unsigned int domain, - void __user *argp, - unsigned int argbytes); - -struct perfctr_info { - unsigned int cpu_features; - unsigned int cpu_khz; - unsigned int tsc_to_cpu_mult; -}; - -extern struct perfctr_info perfctr_info; - -#ifdef CONFIG_PERFCTR_VIRTUAL - -/* - * Virtual per-process performance-monitoring counters. - */ -struct vperfctr; /* opaque */ - -/* process management operations */ -extern void __vperfctr_copy(struct task_struct*, struct pt_regs*); -extern void __vperfctr_release(struct task_struct*); -extern void __vperfctr_exit(struct vperfctr*); -extern void __vperfctr_suspend(struct vperfctr*); -extern void __vperfctr_resume(struct vperfctr*); -extern void __vperfctr_sample(struct vperfctr*); -extern void __vperfctr_set_cpus_allowed(struct task_struct*, struct vperfctr*, cpumask_t); - -static inline void perfctr_copy_task(struct task_struct *tsk, struct pt_regs *regs) -{ - if (tsk->thread.perfctr) - __vperfctr_copy(tsk, regs); -} - -static inline void perfctr_release_task(struct task_struct *tsk) -{ - if (tsk->thread.perfctr) - __vperfctr_release(tsk); -} - -static inline void perfctr_exit_thread(struct thread_struct *thread) -{ - struct vperfctr *perfctr; - perfctr = thread->perfctr; - if (perfctr) - __vperfctr_exit(perfctr); -} - -static inline void perfctr_suspend_thread(struct thread_struct *prev) -{ - struct vperfctr *perfctr; - perfctr = prev->perfctr; - if (perfctr) - __vperfctr_suspend(perfctr); -} - -static inline void perfctr_resume_thread(struct thread_struct *next) -{ - struct vperfctr *perfctr; - perfctr = next->perfctr; - if (perfctr) - __vperfctr_resume(perfctr); -} - -static inline void perfctr_sample_thread(struct thread_struct *thread) -{ - struct 
vperfctr *perfctr; - perfctr = thread->perfctr; - if (perfctr) - __vperfctr_sample(perfctr); -} - -static inline void perfctr_set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) -{ -#ifdef CONFIG_PERFCTR_CPUS_FORBIDDEN_MASK - struct vperfctr *perfctr; - - task_lock(p); - perfctr = p->thread.perfctr; - if (perfctr) - __vperfctr_set_cpus_allowed(p, perfctr, new_mask); - task_unlock(p); -#endif -} - -#else /* !CONFIG_PERFCTR_VIRTUAL */ - -static inline void perfctr_copy_task(struct task_struct *p, struct pt_regs *r) { } -static inline void perfctr_release_task(struct task_struct *p) { } -static inline void perfctr_exit_thread(struct thread_struct *t) { } -static inline void perfctr_suspend_thread(struct thread_struct *t) { } -static inline void perfctr_resume_thread(struct thread_struct *t) { } -static inline void perfctr_sample_thread(struct thread_struct *t) { } -static inline void perfctr_set_cpus_allowed(struct task_struct *p, cpumask_t m) { } - -#endif /* CONFIG_PERFCTR_VIRTUAL */ - -#endif /* __KERNEL__ */ - -#endif /* _LINUX_PERFCTR_H */ diff -Naur linux-2.6.12-rc5-mm1/include/linux/sched.h linux-2.6.12-rc5-mm1-plug/include/linux/sched.h --- linux-2.6.12-rc5-mm1/include/linux/sched.h 2005-05-25 16:23:46.998476528 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/linux/sched.h 2005-05-25 17:08:28.508824992 -0700 @@ -386,11 +386,6 @@ #endif }; -/* Context switch must be unlocked if interrupts are to be enabled */ -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW -# define __ARCH_WANT_UNLOCKED_CTXSW -#endif - /* * Bits in flags field of signal_struct. */ @@ -416,8 +411,6 @@ #define MAX_USER_RT_PRIO 100 #define MAX_RT_PRIO MAX_USER_RT_PRIO -#define MAX_PRIO (MAX_RT_PRIO + 40) - #define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO)) /* @@ -487,11 +480,10 @@ #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. 
*/ #define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ #define SD_BALANCE_EXEC 4 /* Balance on exec */ -#define SD_BALANCE_FORK 8 /* Balance on fork, clone */ -#define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */ -#define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ -#define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ -#define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ +#define SD_WAKE_IDLE 8 /* Wake to idle CPU on task wakeup */ +#define SD_WAKE_AFFINE 16 /* Wake task to waking CPU */ +#define SD_WAKE_BALANCE 32 /* Perform balancing at task wakeup */ +#define SD_SHARE_CPUPOWER 64 /* Domain members share cpu power */ struct sched_group { struct sched_group *next; /* Must be a circular list */ @@ -516,11 +508,6 @@ unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ - unsigned int busy_idx; - unsigned int idle_idx; - unsigned int newidle_idx; - unsigned int wake_idx; - unsigned int forkexec_idx; int flags; /* See SD_* */ /* Runtime fields. */ @@ -544,16 +531,10 @@ unsigned long alb_failed; unsigned long alb_pushed; - /* SD_BALANCE_EXEC stats */ - unsigned long sbe_cnt; - unsigned long sbe_balanced; + /* sched_balance_exec() stats */ + unsigned long sbe_attempts; unsigned long sbe_pushed; - /* SD_BALANCE_FORK stats */ - unsigned long sbf_cnt; - unsigned long sbf_balanced; - unsigned long sbf_pushed; - /* try_to_wake_up() stats */ unsigned long ttwu_wake_remote; unsigned long ttwu_move_affine; @@ -561,8 +542,6 @@ #endif }; -extern void partition_sched_domains(cpumask_t *partition1, - cpumask_t *partition2); #ifdef ARCH_HAS_SCHED_DOMAIN /* Useful helpers that arch setup code may use. 
Defined in kernel/sched.c */ extern cpumask_t cpu_isolated_map; @@ -614,6 +593,8 @@ struct audit_context; /* See audit.c */ struct mempolicy; +#include + struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ struct thread_info *thread_info; @@ -623,23 +604,18 @@ int lock_depth; /* BKL lock depth */ -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - int oncpu; -#endif int prio, static_prio; struct list_head run_list; - prio_array_t *array; + union sched_drv_task sdu; unsigned short ioprio; - unsigned long sleep_avg; unsigned long long timestamp, last_ran; unsigned long long sched_time; /* sched_clock time spent running */ - int activated; - unsigned long policy; + unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; + #ifdef CONFIG_SCHEDSTATS struct sched_info sched_info; @@ -750,6 +726,8 @@ spinlock_t alloc_lock; /* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */ spinlock_t proc_lock; +/* context-switch lock */ + spinlock_t switch_lock; /* journalling filesystem info */ void *journal_info; @@ -957,7 +935,7 @@ #else static inline void kick_process(struct task_struct *tsk) { } #endif -extern void FASTCALL(sched_fork(task_t * p, int clone_flags)); +extern void FASTCALL(sched_fork(task_t * p)); extern void FASTCALL(sched_exit(task_t * p)); extern int in_group_p(gid_t); @@ -1140,9 +1118,6 @@ * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. * - * Synchronises set_cpus_allowed(), unlink, and creat of ->thread.perfctr. - * [if CONFIG_PERFCTR_VIRTUAL] - * * Nests both inside and outside of read_lock(&tasklist_lock). * It must not be nested with write_lock_irq(&tasklist_lock), * neither inside nor outside. 
diff -Naur linux-2.6.12-rc5-mm1/include/linux/sched_cpustats.h linux-2.6.12-rc5-mm1-plug/include/linux/sched_cpustats.h --- linux-2.6.12-rc5-mm1/include/linux/sched_cpustats.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.12-rc5-mm1-plug/include/linux/sched_cpustats.h 2005-05-25 17:05:49.575986456 -0700 @@ -0,0 +1,156 @@ +#ifndef _LINUX_SCHED_CPUSTATS_H +#define _LINUX_SCHED_CPUSTATS_H + +#include + +/* + * Fixed denominator rational numbers for use by the CPU scheduler + */ +#define SCHED_AVG_OFFSET 4 +/* + * Get the rounded integer value of a scheduling statistic average field + * i.e. those fields whose names begin with avg_ + */ +#define SCHED_AVG_RND(x) \ + (((x) + (1 << (SCHED_AVG_OFFSET - 1))) >> (SCHED_AVG_OFFSET)) +#define SCHED_AVG_REAL(a) ((a) << SCHED_AVG_OFFSET) + +#define INITIAL_CPUSTATS_TIMESTAMP \ + ((unsigned long long)INITIAL_JIFFIES * (1000000000ULL / HZ)) + +struct runq_cpustats { + unsigned long long total_delay; + unsigned long long total_rt_delay; + unsigned long long total_intr_delay; + unsigned long long total_rt_intr_delay; + unsigned long long total_fork_delay; + unsigned long long total_sinbin; +}; + +extern DEFINE_PER_CPU(struct runq_cpustats, cpustats_runqs); + +/* + * Scheduling statistics for a task/thread + */ +struct task_cpustats { + unsigned long long total_wake_ups; + unsigned long long intr_wake_ups; + unsigned long long total_sleep; + unsigned long long avg_sleep_per_cycle; + unsigned long long total_cpu; + unsigned long long avg_cpu_per_cycle; + unsigned long long total_delay; + unsigned long long avg_delay_per_cycle; + unsigned long long total_sinbin; + unsigned long long avg_cycle_length; + unsigned long cpu_usage_rate; + unsigned int flags; +}; + +#define CPUSTATS_WOKEN_FOR_INTR_FL (1 << 0) +#define CPUSTATS_JUST_FORKED_FL (1 << 1) + +#define INIT_CPUSTATS \ + .cpustats = { 0, }, \ + .csrq = NULL + + +struct task_struct; + +extern void init_runq_cpustats(unsigned int cpu); +static inline struct runq_cpustats 
*cpu_runq_cpustats(unsigned int cpu) +{ + return &per_cpu(cpustats_runqs, cpu); +} +#ifdef CONFIG_SMP +extern unsigned long long adjusted_sched_clock(const struct task_struct *p); +#else +#define adjusted_sched_clock(p) sched_clock() +#endif + +extern void initialize_cpustats(struct task_struct *p, unsigned long long now); +extern void delta_sleep_cpustats(struct task_struct *p, unsigned long long now); +extern void delta_cpu_cpustats(struct task_struct *p, unsigned long long now); +extern void delta_delay_cpustats(struct task_struct *p, unsigned long long now); +extern void delta_rq_delay_cpustats(struct task_struct *p, unsigned long long delta); +extern void update_cpustats_at_wake_up(struct task_struct *p, unsigned long long now); +extern void update_cpustats_at_end_of_ts(struct task_struct *p, unsigned long long now); + +extern unsigned long long cpustats_avg_in_jiffies(unsigned long long avg); + +struct task_accrued_cpustats { + unsigned long long timestamp; + unsigned long long total_wake_ups; + unsigned long long intr_wake_ups; + unsigned long long total_sleep; + unsigned long long total_cpu; + unsigned long long total_delay; + unsigned long long total_sinbin; +}; + +/* + * Get "up to date" scheduling statistics for the given task. + * This function should be used if reliable scheduling statistics are required + * outside the scheduler itself as the relevant fields in the task structure + * are not "up to date". NB the possible difference between those in the task + * structure and the correct values could be quite large for sleeping tasks.
+ */ +extern int get_task_accrued_cpustats(struct task_struct*, struct task_accrued_cpustats*); + +/* + * Scheduling statistics for a CPU + */ +struct cpu_cpustats { + unsigned long long timestamp; + unsigned long long total_idle; + unsigned long long total_busy; + unsigned long long total_delay; + unsigned long long total_rt_delay; + unsigned long long total_intr_delay; + unsigned long long total_rt_intr_delay; + unsigned long long total_fork_delay; + unsigned long long total_sinbin; + unsigned long long nr_switches; +}; + +/* + * Get scheduling statistics for the nominated CPU + */ +extern int get_cpu_cpustats(unsigned int, struct cpu_cpustats*); + +/* + * Make scheduling statistics available via /proc + */ +extern int task_sched_cpustats(struct task_struct *p, char *buffer); +extern int cpustats_read_proc(char *page, char **start, off_t off, int count, + int *eof, void *data); + + +/* + * CPU rate statistics are estimated as proportions (i.e. real numbers in the + * range 0 to 1 inclusive) using fixed denominator rational numbers.
+ * The denominator (PROPORTION_ONE) must be less than 2^24 + */ +#define PROPORTION_OFFSET 23 +#define PROPORTION_ONE (1ULL << PROPORTION_OFFSET) +#define PROP_FM_PPT(a) (((unsigned long long)(a) * PROPORTION_ONE) / 1000) + +/* Require: a <= b */ +extern unsigned long calc_proportion(unsigned long long a, unsigned long long b); +extern unsigned long map_proportion(unsigned long prop, unsigned long range); +#define map_proportion_rnd(p, r) map_proportion((p) >> 1, ((r) << 1) + 1) +extern unsigned long proportion_to_ppt(unsigned long proportion); +extern unsigned long ppt_to_proportion(unsigned long ppt); + +extern unsigned long avg_cpu_usage_rate(const struct task_struct*); +extern unsigned long avg_sleep_rate(const struct task_struct*); +extern unsigned long avg_cpu_delay_rate(const struct task_struct*); +extern unsigned long delay_in_jiffies_for_usage(const struct task_struct*, unsigned long); + +extern int do_proc_proportion(ctl_table *ctp, int write, struct file *fp, + void __user *buffer, size_t *lenp, loff_t *ppos); + +#define TASK_CPUSTATS(p) (p)->sdu.spa.cpustats +#define RUNQ_CPUSTATS(p) (p)->sdu.spa.csrq + +#endif diff -Naur linux-2.6.12-rc5-mm1/include/linux/sched_drv.h linux-2.6.12-rc5-mm1-plug/include/linux/sched_drv.h --- linux-2.6.12-rc5-mm1/include/linux/sched_drv.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.12-rc5-mm1-plug/include/linux/sched_drv.h 2005-05-25 17:05:49.575986456 -0700 @@ -0,0 +1,61 @@ +#ifndef _LINUX_SCHED_DRV_H +#define _LINUX_SCHED_DRV_H +/* + * include/linux/sched_drv.h + * This contains the definition of the driver struct for all the exported per + * runqueue scheduler functions, and the private per scheduler data in + * struct task_struct. + */ +#include + +#include +#include + +/* + * This is the main scheduler driver struct.
+ */ +struct sched_drv { + const char *name; + void (*init_runqueue_queue)(union runqueue_queue *); + void (*set_oom_time_slice)(struct task_struct *, unsigned long); + unsigned int (*task_timeslice)(const task_t *); + void (*wake_up_task)(struct task_struct *, struct runqueue *, unsigned int, int); + void (*fork)(task_t *); + void (*wake_up_new_task)(task_t *, unsigned long); + void (*exit)(task_t *); +#ifdef CONFIG_SMP + int (*move_tasks)(runqueue_t *, int, runqueue_t *, unsigned long, + struct sched_domain *, enum idle_type); +#endif + void (*tick)(struct task_struct*, struct runqueue *, unsigned long long); +#ifdef CONFIG_SCHED_SMT + struct task_struct *(*head_of_queue)(union runqueue_queue *); + int (*dependent_sleeper_trumps)(const struct task_struct *, + const struct task_struct *, struct sched_domain *); +#endif + void (*schedule)(void); + void (*set_normal_task_nice)(task_t *, long); + void (*setscheduler)(task_t *, int, int); + long (*sys_yield)(void); + void (*yield)(void); + void (*init_idle)(task_t *, int); + void (*sched_init)(void); +#ifdef CONFIG_SMP + void (*migrate_queued_task)(struct task_struct *, int); +#ifdef CONFIG_HOTPLUG_CPU + void (*set_select_idle_first)(struct runqueue *); + void (*set_select_idle_last)(struct runqueue *); + void (*migrate_dead_tasks)(unsigned int); +#endif +#endif +#ifdef CONFIG_MAGIC_SYSRQ + void (*normalize_rt_task)(struct task_struct *); +#endif + struct attribute **attrs; +}; + +extern const struct sched_drv *sched_drvp; + +extern void sched_drv_sysfs_init(void); + +#endif diff -Naur linux-2.6.12-rc5-mm1/include/linux/sched_pvt.h linux-2.6.12-rc5-mm1-plug/include/linux/sched_pvt.h --- linux-2.6.12-rc5-mm1/include/linux/sched_pvt.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.12-rc5-mm1-plug/include/linux/sched_pvt.h 2005-05-25 17:05:49.576986304 -0700 @@ -0,0 +1,442 @@ +#ifndef _LINUX_SCHED_PVT_H +#define _LINUX_SCHED_PVT_H +/* + * include/linux/sched_pvt.h + * This contains the definition of the CPU scheduler 
macros and function + * prototypes that are only of interest to scheduler implementations. + */ + +#include + +#include + +extern DEFINE_PER_CPU(struct runqueue, runqueues); + +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() (&__get_cpu_var(runqueues)) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) + +/* + * Default context-switch locking: + */ +#ifndef prepare_arch_switch +# define prepare_arch_switch(rq, next) do { } while (0) +# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) +# define task_running(rq, p) ((rq)->curr == (p)) +#endif + +/* + * task_rq_lock - lock the runqueue a given task resides on and disable + * interrupts. Note the ordering: we can safely lookup the task_rq without + * explicitly disabling preemption. + */ +static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) + __acquires(rq->lock) +{ + struct runqueue *rq; + +repeat_lock_task: + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); + if (unlikely(rq != task_rq(p))) { + spin_unlock_irqrestore(&rq->lock, *flags); + goto repeat_lock_task; + } + return rq; +} + +static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) + __releases(rq->lock) +{ + spin_unlock_irqrestore(&rq->lock, *flags); +} + +/* + * this_rq_lock - lock the current CPU's runqueue and disable interrupts.
+ */ +static inline runqueue_t *this_rq_lock(void) + __acquires(rq->lock) +{ + runqueue_t *rq; + + local_irq_disable(); + rq = this_rq(); + spin_lock(&rq->lock); + + return rq; +} + +/* + * Place scheduler attributes in sysfs + */ +struct sched_drv_sysfs_entry { + struct attribute attr; + ssize_t (*show)(char *); + ssize_t (*store)(const char *, size_t); +}; + +#define to_sched_drv_sysfs_entry(a) container_of((a), struct sched_drv_sysfs_entry, attr) + +/* + * Macros to help define more common scheduler sysfs attribute types (sdse_vis: storage class of the generated entry, empty or static; stored values are clamped to [MINV, MAXV]) + */ +#define SCHED_DRV_SYSFS_UINT_RW_EV(sdse_vis, aname, conv_in, conv_out, MINV, MAXV) \ +static ssize_t show_ ## aname(char *page) \ +{ \ + unsigned long long val = conv_out(aname); \ + \ + return sprintf(page, "%llu\n", val); \ +} \ + \ +static ssize_t store_ ## aname(const char *page, size_t count) \ +{ \ + unsigned long long val; \ + char *end = NULL; \ + \ + val = simple_strtoull(page, &end, 10); \ + if ((end == page) || ((*end != '\0') && (*end != '\n'))) \ + return -EINVAL; \ + val = conv_in(val); \ + if (val < (MINV)) \ + val = (MINV); \ + else if (val > (MAXV)) \ + val = (MAXV); \ + \ + aname = val; \ + \ + return count; \ +} \ + \ +sdse_vis struct sched_drv_sysfs_entry aname ## _sdse = { \ + .attr = { .name = # aname, .mode = S_IRUGO | S_IWUSR }, \ + .show = show_ ## aname, \ + .store = store_ ## aname, \ +} +#define SCHED_DRV_SYSFS_UINT_RW(aname, conv_in, conv_out, MINV, MAXV) \ + SCHED_DRV_SYSFS_UINT_RW_EV(, aname, conv_in, conv_out, MINV, MAXV) +#define SCHED_DRV_SYSFS_UINT_RW_STATIC(aname, conv_in, conv_out, MINV, MAXV) \ + SCHED_DRV_SYSFS_UINT_RW_EV(static, aname, conv_in, conv_out, MINV, MAXV) + +#define SCHED_DRV_SYSFS_UINT_RO_EV(sdse_vis, ev, aname, conv_out) \ +static ssize_t show_ ## aname(char *page) \ +{ \ + unsigned long long val = conv_out(aname); \ + \ + return sprintf(page, "%llu\n", val); \ +} \ + \ +sdse_vis struct sched_drv_sysfs_entry aname ## _sdse = { \ + .attr = { .name = # aname, .mode = S_IRUGO }, \ + .show =
show_ ## aname, \ + .store = NULL, \ +} + +#define SCHED_DRV_SYSFS_UINT_RO(sdse_vis, ev, aname, conv_out) \ + SCHED_DRV_SYSFS_UINT_RO_EV(, ev, aname, conv_out) +#define SCHED_DRV_SYSFS_UINT_RO_STATIC(sdse_vis, ev, aname, conv_out) \ + SCHED_DRV_SYSFS_UINT_RO_EV(static, ev, aname, conv_out) + +#define SCHED_DRV_SYSFS_ATTR(aname) (aname ## _sdse.attr) +#define SCHED_DRV_DECLARE_SYSFS_ENTRY(aname) \ +extern struct sched_drv_sysfs_entry aname ## _sdse + +/** + * finish_task_switch - clean up after a task-switch + * @prev: the thread we just switched away from. + * + * We enter this with the runqueue still locked, and finish_arch_switch() + * will unlock it along with doing any other architecture-specific cleanup + * actions. + * + * Note that we may have delayed dropping an mm in context_switch(). If + * so, we finish that here outside of the runqueue lock. (Doing it + * with the lock held can cause deadlocks; see schedule() for + * details.) + */ +static inline void finish_task_switch(task_t *prev) + __releases(rq->lock) +{ + runqueue_t *rq = this_rq(); + struct mm_struct *mm = rq->prev_mm; + unsigned long prev_task_flags; + + rq->prev_mm = NULL; + + /* + * A task struct has one reference for the use as "current". + * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and + * calls schedule one last time. The schedule call will never return, + * and the scheduled task must drop that reference. + * The test for EXIT_ZOMBIE must occur while the runqueue locks are + * still held, otherwise prev could be scheduled on another cpu, die + * there before we look at prev->state, and then the reference would + * be dropped twice. + * Manfred Spraul + */ + prev_task_flags = prev->flags; + finish_arch_switch(rq, prev); + if (mm) + mmdrop(mm); + if (unlikely(prev_task_flags & PF_DEAD)) + put_task_struct(prev); +} + +/* + * context_switch - switch to the new MM and the new + * thread's register state. 
+ */ +static inline +task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) +{ + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; + + if (unlikely(!mm)) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next); + } else + switch_mm(oldmm, mm, next); + + if (unlikely(!prev->mm)) { + prev->active_mm = NULL; + WARN_ON(rq->prev_mm); + rq->prev_mm = oldmm; + } + + /* Here we just switch the register state and the stack. */ + switch_to(prev, next, prev); + + return prev; +} + +/* + * This is called on clock ticks and on context switches. + * Bank in p->sched_time the ns elapsed since the last tick or switch. + */ +static inline void update_cpu_clock(task_t *p, runqueue_t *rq, + unsigned long long now) +{ + unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); + p->sched_time += now - last; +} + +/* Actually do priority change: must hold rq lock. */ +void __setscheduler(struct task_struct *, int, int); + +#ifdef CONFIG_SMP +#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ + < (long long) (sd)->cache_hot_time) +extern void resched_task(task_t *p); +extern void idle_balance(int, runqueue_t *); +extern void rebalance_tick(int, runqueue_t *, enum idle_type); + +#ifdef CONFIG_SCHED_SMT +extern int cpu_and_siblings_are_idle(int cpu); +#else +#define cpu_and_siblings_are_idle(A) idle_cpu(A) +#endif + +/* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + */ +static inline +int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, + struct sched_domain *sd, enum idle_type idle) +{ + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 3) are cache-hot on their current CPU. 
+ */ + if (task_running(rq, p)) + return 0; + if (!cpu_isset(this_cpu, p->cpus_allowed)) + return 0; + + /* + * Aggressive migration if: + * 1) the [whole] cpu is idle, or + * 2) too many balance attempts have failed. + */ + + if (cpu_and_siblings_are_idle(this_cpu) || \ + sd->nr_balance_failed > sd->cache_nice_tries) + return 1; + + if (task_hot(p, rq->timestamp_last_tick, sd)) + return 0; + return 1; +} + +#ifdef CONFIG_HOTPLUG_CPU +extern void migrate_dead(unsigned int, task_t *); +#endif +#else +#define resched_task(p) set_tsk_need_resched(p) +/* + * on UP we do not need to balance between CPUs: + */ +static inline void idle_balance(int cpu, runqueue_t *rq) { } +static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) { } +#endif + +#ifdef CONFIG_SCHED_SMT +extern int wake_priority_sleeper(runqueue_t *); +extern void wake_sleeping_dependent(int, runqueue_t *); +extern int dependent_sleeper(int, runqueue_t *); +#else +static inline int wake_priority_sleeper(runqueue_t *rq) { return 0; } +static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) { } +static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) { return 0; } +#endif + +/* + * "Nice" biased load balancing + */ +#ifdef CONFIG_SMP +#define MAX_STATIC_PRIO (MAX_RT_PRIO + 40) +static inline void inc_prio_bias(runqueue_t *rq, int static_prio) +{ + rq->prio_bias += MAX_STATIC_PRIO - static_prio; +} + +static inline void dec_prio_bias(runqueue_t *rq, int static_prio) +{ + rq->prio_bias -= MAX_STATIC_PRIO - static_prio; +} +#else +static inline void inc_prio_bias(runqueue_t *rq, int static_prio) +{ +} + +static inline void dec_prio_bias(runqueue_t *rq, int static_prio) +{ +} +#endif + +static inline void inc_nr_running(task_t *p, runqueue_t *rq) +{ + rq->nr_running++; + inc_prio_bias(rq, p->static_prio); +} + +static inline void dec_nr_running(task_t *p, runqueue_t *rq) +{ + rq->nr_running--; + dec_prio_bias(rq, p->static_prio); +} + +#ifdef 
CONFIG_SCHEDSTATS +# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) + +/* + * Called when a process is dequeued from the active array and given + * the cpu. We should note that with the exception of interactive + * tasks, the expired queue will become the active queue after the active + * queue is empty, without explicitly dequeuing and requeuing tasks in the + * expired queue. (Interactive tasks may be requeued directly to the + * active queue, thus delaying tasks in the expired queue from running; + * see scheduler_tick()). + * + * This function is only called from sched_info_arrive(), rather than + * dequeue_task(). Even though a task may be queued and dequeued multiple + * times as it is shuffled about, we're really interested in knowing how + * long it was from the *first* time it was queued to the time that it + * finally hit a cpu. + */ +static inline void sched_info_dequeued(task_t *t) +{ + t->sched_info.last_queued = 0; +} + +/* + * Called when a task finally hits the cpu. We can now calculate how + * long it was waiting to run. We also note when it began so that we + * can keep stats on how long its timeslice is. + */ +static inline void sched_info_arrive(task_t *t) +{ + unsigned long now = jiffies, diff = 0; + struct runqueue *rq = task_rq(t); + + if (t->sched_info.last_queued) + diff = now - t->sched_info.last_queued; + sched_info_dequeued(t); + t->sched_info.run_delay += diff; + t->sched_info.last_arrival = now; + t->sched_info.pcnt++; + + if (!rq) + return; + + rq->rq_sched_info.run_delay += diff; + rq->rq_sched_info.pcnt++; +} + +/* + * Called when a process is queued into either the active or expired + * array. The time is noted and later used to determine how long we + * had to wait for us to reach the cpu. Since the expired queue will + * become the active queue after active queue is empty, without dequeuing + * and requeuing any tasks, we are interested in queuing to either. 
It + * is unusual but not impossible for tasks to be dequeued and immediately + * requeued in the same or another array: this can happen in sched_yield(), + * set_user_nice(), and even load_balance() as it moves tasks from runqueue + * to runqueue. + * + * This function is only called from enqueue_task(), but also only updates + * the timestamp if it is already not set. It's assumed that + * sched_info_dequeued() will clear that stamp when appropriate. + */ +static inline void sched_info_queued(task_t *t) +{ + if (!t->sched_info.last_queued) + t->sched_info.last_queued = jiffies; +} + +/* + * Called when a process ceases being the active-running process, either + * voluntarily or involuntarily. Now we can calculate how long we ran. + */ +static inline void sched_info_depart(task_t *t) +{ + struct runqueue *rq = task_rq(t); + unsigned long diff = jiffies - t->sched_info.last_arrival; + + t->sched_info.cpu_time += diff; + + if (rq) + rq->rq_sched_info.cpu_time += diff; +} + +/* + * Called when tasks are switched involuntarily due, typically, to expiring + * their time slice. (This may also be called when switching to or from + * the idle task.) We are only called when prev != next. + */ +static inline void sched_info_switch(task_t *prev, task_t *next) +{ + struct runqueue *rq = task_rq(prev); + + /* + * prev now departs the cpu. It's not interesting to record + * stats about how efficient we were at scheduling the idle + * process, however. 
+ */ + if (prev != rq->idle) + sched_info_depart(prev); + + if (next != rq->idle) + sched_info_arrive(next); +} +#else +# define schedstat_inc(rq, field) do { } while (0) +# define sched_info_queued(t) do { } while (0) +# define sched_info_switch(t, next) do { } while (0) +#endif /* CONFIG_SCHEDSTATS */ + +#endif diff -Naur linux-2.6.12-rc5-mm1/include/linux/sched_runq.h linux-2.6.12-rc5-mm1-plug/include/linux/sched_runq.h --- linux-2.6.12-rc5-mm1/include/linux/sched_runq.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.12-rc5-mm1-plug/include/linux/sched_runq.h 2005-05-25 17:05:49.588984480 -0700 @@ -0,0 +1,174 @@ +#ifndef _LINUX_SCHED_RUNQ_H +#define _LINUX_SCHED_RUNQ_H +/* + * include/linux/sched_runq.h + * This contains the definition of the CPU scheduler run queue type. + * Modified to allow each scheduler to have its own private run queue data. + */ + +/* + * These are the runqueue data structures: + */ +#ifdef CONFIG_CPUSCHED_INGO +#define INGO_MAX_PRIO (MAX_RT_PRIO + 40) + +#define INGO_BITMAP_SIZE ((((INGO_MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) + +struct prio_array { + unsigned int nr_active; + unsigned long bitmap[INGO_BITMAP_SIZE]; + struct list_head queue[INGO_MAX_PRIO]; +}; + +struct ingo_runqueue_queue { + prio_array_t *active, *expired, arrays[2]; + /* + set to 0 on init, become null or array switch + set to jiffies whenever an non-interactive job expires + reset to jiffies if expires + */ + unsigned long expired_timestamp; + int best_expired_prio; +}; +#endif + +#ifdef CONFIG_CPUSCHED_STAIRCASE +#define STAIRCASE_MAX_PRIO (MAX_RT_PRIO + 40) +#define STAIRCASE_NUM_PRIO_SLOTS (STAIRCASE_MAX_PRIO + 1) + +struct staircase_runqueue_queue { + DECLARE_BITMAP(bitmap, STAIRCASE_NUM_PRIO_SLOTS); + struct list_head queue[STAIRCASE_NUM_PRIO_SLOTS - 1]; + unsigned int cache_ticks; + unsigned int preempted; +}; +#endif + +#ifdef CONFIG_CPUSCHED_SPA +#ifdef CONFIG_CPUSCHED_ZAPHOD +#define SPA_IDLE_PRIO 159 +#else +#define SPA_IDLE_PRIO (MAX_RT_PRIO + 40 
+ 2) +#endif +#define SPA_NUM_PRIO_SLOTS (SPA_IDLE_PRIO + 1) + +struct spa_prio_slot { + unsigned int prio; + struct list_head list; +}; + +struct spa_runqueue_queue { + DECLARE_BITMAP(bitmap, SPA_NUM_PRIO_SLOTS); + struct spa_prio_slot queue[SPA_NUM_PRIO_SLOTS - 1]; + unsigned long next_prom_due; + unsigned long pcount; +}; +#endif + +#ifdef CONFIG_CPUSCHED_NICK +#define NICK_MAX_PRIO (MAX_RT_PRIO + 59) + +#define NICK_BITMAP_SIZE ((((NICK_MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) + +struct nick_prio_array { + int min_prio; + unsigned int nr_active; + unsigned long bitmap[NICK_BITMAP_SIZE]; + struct list_head queue[NICK_MAX_PRIO]; +}; + +struct nick_runqueue_queue { + struct nick_prio_array *active, *expired, arrays[2]; + /* + set to 0 on init, become null or array switch + set to jiffies whenever an non-interactive job expires + reset to jiffies if expires + */ + unsigned long array_sequence; +}; +#endif + +typedef struct runqueue runqueue_t; + +union runqueue_queue { +#ifdef CONFIG_CPUSCHED_INGO + struct ingo_runqueue_queue ingosched; +#endif +#ifdef CONFIG_CPUSCHED_STAIRCASE + struct staircase_runqueue_queue staircase; +#endif +#ifdef CONFIG_CPUSCHED_SPA + struct spa_runqueue_queue spa; +#endif +#ifdef CONFIG_CPUSCHED_NICK + struct nick_runqueue_queue nicksched; +#endif +}; + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct runqueue { + spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. + */ + unsigned long nr_running; +#ifdef CONFIG_SMP + unsigned long prio_bias; + unsigned long cpu_load; +#endif + unsigned long long nr_switches; + + /* + * This is part of a global counter where only the total sum + * over all CPUs matters. 
A task can increase this counter on + * one CPU and if it got migrated afterwards it may decrease + * it on another CPU. Always updated under the runqueue lock: + */ + unsigned long nr_uninterruptible; + union runqueue_queue qu; + unsigned long long timestamp_last_tick; + task_t *curr, *idle; + struct mm_struct *prev_mm; + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct sched_domain *sd; + + /* For active balancing */ + int active_balance; + int push_cpu; + + task_t *migration_thread; + struct list_head migration_queue; +#endif + +#ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + + /* sys_sched_yield() stats */ + unsigned long yld_exp_empty; + unsigned long yld_act_empty; + unsigned long yld_both_empty; + unsigned long yld_cnt; + + /* schedule() stats */ + unsigned long sched_switch; + unsigned long sched_cnt; + unsigned long sched_goidle; + + /* try_to_wake_up() stats */ + unsigned long ttwu_cnt; + unsigned long ttwu_local; +#endif +}; + +#endif diff -Naur linux-2.6.12-rc5-mm1/include/linux/sched_task.h linux-2.6.12-rc5-mm1-plug/include/linux/sched_task.h --- linux-2.6.12-rc5-mm1/include/linux/sched_task.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.12-rc5-mm1-plug/include/linux/sched_task.h 2005-05-25 17:05:49.588984480 -0700 @@ -0,0 +1,92 @@ +#ifndef _LINUX_SCHED_TASK_H +#define _LINUX_SCHED_TASK_H +/* + * include/linux/sched_task.h + */ + +/* + * Require that the relationship between 'nice' and 'static_prio' be the same + * for all schedulers. + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..(MAX_RT_PRIO + 39) ], + * and back. 
+ */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +#ifdef CONFIG_CPUSCHED_INGO +struct ingo_sched_drv_task { + struct prio_array *array; + unsigned int time_slice; + unsigned int first_time_slice; + unsigned long sleep_avg; + int activated; +}; +#endif + +#ifdef CONFIG_CPUSCHED_STAIRCASE +struct staircase_sched_drv_task { + unsigned long sflags; + unsigned long runtime, totalrun, ns_debit; + unsigned int burst; + unsigned int slice, time_slice; +}; +#endif + +#ifdef CONFIG_CPUSCHED_SPA +#include +#ifdef CONFIG_CPUSCHED_ZAPHOD +#include +#endif + +struct spa_sched_drv_task { + unsigned int time_slice; + struct task_cpustats cpustats; +#ifdef CONFIG_CPUSCHED_ZAPHOD + struct sched_zaphod zaphod; +#endif + unsigned long cpu_rate_cap, min_cpu_rate_cap; + unsigned long cpu_rate_hard_cap; + struct timer_list sinbin_timer; + unsigned int flags; +}; + +#define SPAF_SINBINNED (1 << 0) /* I am sinbinned */ +#define SPAF_UISLEEP (1 << 1) /* Uninterruptible sleep */ + +#define task_is_sinbinned(p) (unlikely(((p)->sdu.spa.flags & SPAF_SINBINNED) != 0)) + +/* set/get cpu rate caps in parts per thousand */ +extern int set_cpu_rate_cap(struct task_struct *p, unsigned long new_cap); +extern int set_cpu_rate_hard_cap(struct task_struct *p, unsigned long new_cap); +extern unsigned long get_cpu_rate_cap(struct task_struct *p); +extern unsigned long get_cpu_rate_hard_cap(struct task_struct *p); +#endif + +#ifdef CONFIG_CPUSCHED_NICK +struct nick_sched_drv_task { + struct nick_prio_array *array; + unsigned long array_sequence; + unsigned long total_time, sleep_time; + int used_slice; +}; +#endif + +union sched_drv_task { +#ifdef CONFIG_CPUSCHED_INGO + struct ingo_sched_drv_task ingosched; +#endif +#ifdef CONFIG_CPUSCHED_STAIRCASE + struct staircase_sched_drv_task staircase; +#endif +#ifdef CONFIG_CPUSCHED_SPA + struct spa_sched_drv_task spa; +#endif +#ifdef 
CONFIG_CPUSCHED_NICK + struct nick_sched_drv_task nicksched; +#endif +}; + +void set_oom_time_slice(struct task_struct *p, unsigned long t); +#endif diff -Naur linux-2.6.12-rc5-mm1/include/linux/sched_zaphod.h linux-2.6.12-rc5-mm1-plug/include/linux/sched_zaphod.h --- linux-2.6.12-rc5-mm1/include/linux/sched_zaphod.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.12-rc5-mm1-plug/include/linux/sched_zaphod.h 2005-05-25 17:05:49.589984328 -0700 @@ -0,0 +1,67 @@ +#ifndef _LINUX_SCHED_ZAPHOD_H +#define _LINUX_SCHED_ZAPHOD_H + +#include +#include + +/* + * Making ZAPHOD_IDLE_PRIO bigger than 159 would require modification of bitmaps + */ +#define ZAPHOD_IDLE_PRIO 159 +#define ZAPHOD_BGND_PRIO (ZAPHOD_IDLE_PRIO - 1) +#define ZAPHOD_MIN_NORMAL_PRIO MAX_RT_PRIO +#define ZAPHOD_MAX_PRIO (ZAPHOD_MIN_NORMAL_PRIO + 40) + +/* + * For entitlement based scheduling a task's shares will be determined from + * its "nice"ness + */ +#define EB_SHARES_PER_NICE 5 +#define DEFAULT_EB_SHARES (20 * EB_SHARES_PER_NICE) +#define MAX_EB_SHARES (DEFAULT_EB_SHARES * DEFAULT_EB_SHARES) + +struct sched_zaphod_runq_data { + unsigned long avg_nr_running; + atomic_t eb_yardstick; + atomic_t eb_ticks_to_decay; +}; + +extern void zaphod_init_cpu_runq_data(unsigned int cpu); +extern struct sched_zaphod_runq_data *zaphod_cpu_runq_data(unsigned int cpu); +extern void zaphod_runq_data_tick(unsigned int cpu, unsigned long numr); + +struct sched_zaphod { + unsigned int pre_bonus_priority; + unsigned int interactive_bonus; + unsigned int throughput_bonus; + unsigned int eb_shares; +}; + +#define ZAPHOD_TASK_DATA_INIT() \ + { .pre_bonus_priority = (ZAPHOD_BGND_PRIO - 20), \ + .eb_shares = DEFAULT_EB_SHARES, \ + .interactive_bonus = 0, \ + .throughput_bonus = 0, \ + } + +#define SCHED_ZAPHOD_INIT \ + .zrq = NULL, \ + .zaphod = ZAPHOD_TASK_DATA_INIT() + +static inline struct sched_zaphod zaphod_task_data_init(void) { + struct sched_zaphod ret = ZAPHOD_TASK_DATA_INIT(); + + return ret; +} + +struct task_struct; +
+extern void zaphod_fork(struct task_struct *p); +extern unsigned int zaphod_effective_prio(struct task_struct *p); +extern void zaphod_reassess_at_activation(struct task_struct *p); +extern void zaphod_reassess_at_end_of_ts(struct task_struct *p); +extern void zaphod_reassess_at_sinbin_release(struct task_struct *p); +extern void zaphod_reassess_at_renice(struct task_struct *p); +extern void zaphod_reassess_at_new_cap(struct task_struct *p); + +#endif diff -Naur linux-2.6.12-rc5-mm1/include/linux/topology.h linux-2.6.12-rc5-mm1-plug/include/linux/topology.h --- linux-2.6.12-rc5-mm1/include/linux/topology.h 2005-05-25 16:23:47.056467712 -0700 +++ linux-2.6.12-rc5-mm1-plug/include/linux/topology.h 2005-05-25 17:02:50.622191568 -0700 @@ -89,11 +89,6 @@ .cache_hot_time = 0, \ .cache_nice_tries = 0, \ .per_cpu_gain = 25, \ - .busy_idx = 0, \ - .idle_idx = 0, \ - .newidle_idx = 1, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ @@ -120,15 +115,12 @@ .cache_hot_time = (5*1000000/2), \ .cache_nice_tries = 1, \ .per_cpu_gain = 100, \ - .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 2, \ - .wake_idx = 1, \ - .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ - | SD_WAKE_AFFINE, \ + | SD_WAKE_AFFINE \ + | SD_WAKE_IDLE \ + | SD_WAKE_BALANCE, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ diff -Naur linux-2.6.12-rc5-mm1/init/Kconfig linux-2.6.12-rc5-mm1-plug/init/Kconfig --- linux-2.6.12-rc5-mm1/init/Kconfig 2005-05-25 16:23:47.205445064 -0700 +++ linux-2.6.12-rc5-mm1-plug/init/Kconfig 2005-05-25 17:05:49.598982960 -0700 @@ -238,6 +238,64 @@ Say N if unsure. +config PLUGSCHED + bool "Support for multiple cpu schedulers" + default y + help + Say Y here if you want to compile in support for multiple + cpu schedulers. The cpu scheduler may be selected at boot time + with the boot parameter "cpusched=". 
The choice of which cpu + schedulers to compile into the kernel can be made by enabling + "Configure standard kernel features" otherwise all cpu schedulers + supported will be compiled in. + +choice + prompt "Default cpu scheduler" + help + This option allows you to choose which cpu scheduler shall be + booted by default at startup if you have plugsched support, or + it will choose which is the only scheduler compiled in. + +config CPUSCHED_DEFAULT_INGO + bool "Ingosched cpu scheduler" + select CPUSCHED_INGO + ---help--- + This is the default cpu scheduler which is an O(1) dual priority + array scheduler with a hybrid interactive design. + +config CPUSCHED_DEFAULT_STAIRCASE + bool "Staircase cpu scheduler" + select CPUSCHED_STAIRCASE + ---help--- + This scheduler is an O(1) single priority array with a foreground- + background interactive design. + +config CPUSCHED_DEFAULT_SPA_NF + bool "Single priority array (SPA) cpu scheduler (no frills)" + select CPUSCHED_SPA_NF + ---help--- + This is a simple round robin scheduler with a O(1) single priority + array. + +config CPUSCHED_DEFAULT_ZAPHOD + bool "Zaphod cpu scheduler" + select CPUSCHED_ZAPHOD + ---help--- + This scheduler is an O(1) single priority array with interactive + bonus, throughput bonus, soft and hard CPU rate caps and a runtime + choice between priority based and entitlement based interpretation + of nice. + +config CPUSCHED_DEFAULT_NICK + bool "Nicksched cpu scheduler" + select CPUSCHED_NICK + ---help--- + This is the default cpu scheduler which is an O(1) dual priority + array scheduler with a hybrid interactive design as modified by + Nick Piggin. + +endchoice + menuconfig EMBEDDED bool "Configure standard kernel features (for small systems)" help @@ -246,6 +304,70 @@ environments which can tolerate a "non-standard" kernel. Only use this if you really know what you are doing. 
+config CPUSCHED_INGO + bool "Ingosched cpu scheduler" if EMBEDDED + depends on PLUGSCHED + default y + ---help--- + This is the default cpu scheduler which is an O(1) dual priority + array scheduler with a hybrid interactive design. + To boot this cpu scheduler, if it is not the default, use the + bootparam "cpusched=ingosched". + +config CPUSCHED_STAIRCASE + bool "Staircase cpu scheduler" if EMBEDDED + depends on PLUGSCHED + default y + ---help--- + This scheduler is an O(1) single priority array with a foreground- + background interactive design. + To boot this cpu scheduler, if it is not the default, use the + bootparam "cpusched=staircase". + +config CPUSCHED_SPA + bool "SPA cpu schedulers" if EMBEDDED + depends on PLUGSCHED + default y + ---help--- + Support for O(1) single priority array schedulers. + +config CPUSCHED_SPA_NF + bool "SPA cpu scheduler (no frills)" if EMBEDDED + depends on PLUGSCHED + select CPUSCHED_SPA + default y + ---help--- + This scheduler is a simple round robin O(1) single priority array + with NO extra scheduling "frills". This scheduler contains no extra + mechanisms for enhancing interactive response and is best suited for + server systems. + To boot this cpu scheduler, if it is not the default, use the + bootparam "cpusched=spa_no_frills". + +config CPUSCHED_ZAPHOD + bool "Zaphod cpu scheduler" if EMBEDDED + depends on PLUGSCHED + select CPUSCHED_SPA + default y + ---help--- + This scheduler is an O(1) single priority array with interactive + bonus, throughput bonus, soft and hard CPU rate caps and a runtime + choice between priority based and entitlement based interpretation + of nice. + To boot this cpu scheduler, if it is not the default, use the + bootparam "cpusched=zaphod". 
+ +config CPUSCHED_NICK + bool "Nicksched cpu scheduler" if EMBEDDED + depends on PLUGSCHED + default y + ---help--- + This is the default cpu scheduler which is an O(1) dual priority + array scheduler with a hybrid interactive design as modified by + Nick Piggin. + To boot this cpu scheduler, if it is not the default, use the + bootparam "cpusched=nicksched". + config KALLSYMS bool "Load all symbols for debugging/kksymoops" if EMBEDDED default y diff -Naur linux-2.6.12-rc5-mm1/init/main.c linux-2.6.12-rc5-mm1-plug/init/main.c --- linux-2.6.12-rc5-mm1/init/main.c 2005-05-25 16:23:47.207444760 -0700 +++ linux-2.6.12-rc5-mm1-plug/init/main.c 2005-05-25 17:09:09.885534776 -0700 @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -442,10 +443,19 @@ */ smp_prepare_boot_cpu(); + build_all_zonelists(); + page_alloc_init(); + printk(KERN_NOTICE "Kernel command line: %s\n", saved_command_line); + parse_early_param(); + parse_args("Booting kernel", command_line, __start___param, + __stop___param - __start___param, + &unknown_bootoption); /* * Set up the scheduler prior starting any interrupts (such as the * timer interrupt). Full topology setup happens at smp_init() * time - but meanwhile we still have a functioning scheduler. + * But defer until after boot command line is parsed to avoid doing + * this twice in the event that a different scheduler is selected. */ sched_init(); /* @@ -453,14 +463,7 @@ * fragile until we cpu_idle() for the first time. 
*/ preempt_disable(); - build_all_zonelists(); - page_alloc_init(); trap_init(); - printk(KERN_NOTICE "Kernel command line: %s\n", saved_command_line); - parse_early_param(); - parse_args("Booting kernel", command_line, __start___param, - __stop___param - __start___param, - &unknown_bootoption); sort_main_extable(); rcu_init(); init_IRQ(); @@ -523,6 +526,7 @@ acpi_early_init(); /* before LAPIC and SMP init */ + printk("Running with \"%s\" cpu scheduler.\n", sched_drvp->name); /* Do the rest non-__init'ed, we're now alive */ rest_init(); } @@ -592,6 +596,7 @@ #ifdef CONFIG_SYSCTL sysctl_init(); #endif + sched_drv_sysfs_init(); /* Networking initialization needs a process context */ sock_init(); diff -Naur linux-2.6.12-rc5-mm1/kernel/Kconfig.preempt linux-2.6.12-rc5-mm1-plug/kernel/Kconfig.preempt --- linux-2.6.12-rc5-mm1/kernel/Kconfig.preempt 2005-05-25 16:23:47.283433208 -0700 +++ linux-2.6.12-rc5-mm1-plug/kernel/Kconfig.preempt 2005-05-25 17:02:50.623191416 -0700 @@ -1,56 +1,15 @@ -choice - prompt "Preemption Model" - default PREEMPT_NONE - -config PREEMPT_NONE - bool "No Forced Preemption (Server)" - help - This is the traditional Linux preemption model, geared towards - throughput. It will still provide good latencies most of the - time, but there are no guarantees and occasional longer delays - are possible. - - Select this option if you are building a kernel for a server or - scientific/computation system, or if you want to maximize the - raw processing power of the kernel, irrespective of scheduling - latencies. - -config PREEMPT_VOLUNTARY - bool "Voluntary Kernel Preemption (Desktop)" - help - This option reduces the latency of the kernel by adding more - "explicit preemption points" to the kernel code. These new - preemption points have been selected to reduce the maximum - latency of rescheduling, providing faster application reactions, - at the cost of slighly lower throughput. 
- - This allows reaction to interactive events by allowing a - low priority process to voluntarily preempt itself even if it - is in kernel mode executing a system call. This allows - applications to run more 'smoothly' even when the system is - under load. - - Select this if you are building a kernel for a desktop system. - config PREEMPT - bool "Preemptible Kernel (Low-Latency Desktop)" + bool "Preemptible Kernel" help - This option reduces the latency of the kernel by making - all kernel code (that is not executing in a critical section) - preemptible. This allows reaction to interactive events by - permitting a low priority process to be preempted involuntarily - even if it is in kernel mode executing a system call and would - otherwise not be about to reach a natural preemption point. - This allows applications to run more 'smoothly' even when the - system is under load, at the cost of slighly lower throughput - and a slight runtime overhead to kernel code. - - Select this if you are building a kernel for a desktop or - embedded system with latency requirements in the milliseconds - range. + This option reduces the latency of the kernel when reacting to + real-time or interactive events by allowing a low priority process to + be preempted even if it is in kernel mode executing a system call. + This allows applications to run more reliably even when the system is + under load. -endchoice + Say Y here if you are building a kernel for a desktop, embedded + or real-time system. Say N if you are unsure. 
config PREEMPT_BKL bool "Preempt The Big Kernel Lock" diff -Naur linux-2.6.12-rc5-mm1/kernel/Makefile linux-2.6.12-rc5-mm1-plug/kernel/Makefile --- linux-2.6.12-rc5-mm1/kernel/Makefile 2005-05-25 16:23:47.300430624 -0700 +++ linux-2.6.12-rc5-mm1-plug/kernel/Makefile 2005-05-25 17:05:49.617980072 -0700 @@ -7,8 +7,13 @@ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o intermodule.o extable.o params.o posix-timers.o \ - kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o + kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o sched_drv.o +obj-$(CONFIG_CPUSCHED_INGO) += ingosched.o +obj-$(CONFIG_CPUSCHED_STAIRCASE) += staircase.o +obj-$(CONFIG_CPUSCHED_SPA) += sched_spa.o sched_cpustats.o +obj-$(CONFIG_CPUSCHED_ZAPHOD) += sched_zaphod.o +obj-$(CONFIG_CPUSCHED_NICK) += nicksched.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o diff -Naur linux-2.6.12-rc5-mm1/kernel/cpuset.c linux-2.6.12-rc5-mm1-plug/kernel/cpuset.c --- linux-2.6.12-rc5-mm1/kernel/cpuset.c 2005-05-25 16:23:47.218443088 -0700 +++ linux-2.6.12-rc5-mm1-plug/kernel/cpuset.c 2005-05-25 17:02:50.625191112 -0700 @@ -596,62 +596,10 @@ return 0; } -/* - * For a given cpuset cur, partition the system as follows - * a. All cpus in the parent cpuset's cpus_allowed that are not part of any - * exclusive child cpusets - * b. All cpus in the current cpuset's cpus_allowed that are not part of any - * exclusive child cpusets - * Build these two partitions by calling partition_sched_domains - * - * Call with cpuset_sem held. May nest a call to the - * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. 
- */ -static void update_cpu_domains(struct cpuset *cur) -{ - struct cpuset *c, *par = cur->parent; - cpumask_t pspan, cspan; - - if (par == NULL || cpus_empty(cur->cpus_allowed)) - return; - - /* - * Get all cpus from parent's cpus_allowed not part of exclusive - * children - */ - pspan = par->cpus_allowed; - list_for_each_entry(c, &par->children, sibling) { - if (is_cpu_exclusive(c)) - cpus_andnot(pspan, pspan, c->cpus_allowed); - } - if (is_removed(cur) || !is_cpu_exclusive(cur)) { - cpus_or(pspan, pspan, cur->cpus_allowed); - if (cpus_equal(pspan, cur->cpus_allowed)) - return; - cspan = CPU_MASK_NONE; - } else { - if (cpus_empty(pspan)) - return; - cspan = cur->cpus_allowed; - /* - * Get all cpus from current cpuset's cpus_allowed not part - * of exclusive children - */ - list_for_each_entry(c, &cur->children, sibling) { - if (is_cpu_exclusive(c)) - cpus_andnot(cspan, cspan, c->cpus_allowed); - } - } - - lock_cpu_hotplug(); - partition_sched_domains(&pspan, &cspan); - unlock_cpu_hotplug(); -} - static int update_cpumask(struct cpuset *cs, char *buf) { struct cpuset trialcs; - int retval, cpus_unchanged; + int retval; trialcs = *cs; retval = cpulist_parse(buf, trialcs.cpus_allowed); @@ -661,13 +609,9 @@ if (cpus_empty(trialcs.cpus_allowed)) return -ENOSPC; retval = validate_change(cs, &trialcs); - if (retval < 0) - return retval; - cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); - cs->cpus_allowed = trialcs.cpus_allowed; - if (is_cpu_exclusive(cs) && !cpus_unchanged) - update_cpu_domains(cs); - return 0; + if (retval == 0) + cs->cpus_allowed = trialcs.cpus_allowed; + return retval; } static int update_nodemask(struct cpuset *cs, char *buf) @@ -703,7 +647,7 @@ { int turning_on; struct cpuset trialcs; - int err, cpu_exclusive_changed; + int err; turning_on = (simple_strtoul(buf, NULL, 10) != 0); @@ -714,18 +658,13 @@ clear_bit(bit, &trialcs.flags); err = validate_change(cs, &trialcs); - if (err < 0) - return err; - cpu_exclusive_changed = - 
(is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); - if (turning_on) - set_bit(bit, &cs->flags); - else - clear_bit(bit, &cs->flags); - - if (cpu_exclusive_changed) - update_cpu_domains(cs); - return 0; + if (err == 0) { + if (turning_on) + set_bit(bit, &cs->flags); + else + clear_bit(bit, &cs->flags); + } + return err; } static int attach_task(struct cpuset *cs, char *buf) @@ -1371,14 +1310,12 @@ up(&cpuset_sem); return -EBUSY; } + spin_lock(&cs->dentry->d_lock); parent = cs->parent; set_bit(CS_REMOVED, &cs->flags); - if (is_cpu_exclusive(cs)) - update_cpu_domains(cs); list_del(&cs->sibling); /* delete my sibling from parent->children */ if (list_empty(&parent->children)) check_for_release(parent); - spin_lock(&cs->dentry->d_lock); d = dget(cs->dentry); cs->dentry = NULL; spin_unlock(&d->d_lock); diff -Naur linux-2.6.12-rc5-mm1/kernel/exit.c linux-2.6.12-rc5-mm1-plug/kernel/exit.c --- linux-2.6.12-rc5-mm1/kernel/exit.c 2005-05-25 16:23:47.221442632 -0700 +++ linux-2.6.12-rc5-mm1-plug/kernel/exit.c 2005-05-25 17:04:43.773989880 -0700 @@ -26,7 +26,6 @@ #include #include #include -#include #include #include @@ -101,7 +100,6 @@ zap_leader = (leader->exit_signal == -1); } - perfctr_release_task(p); sched_exit(p); write_unlock_irq(&tasklist_lock); spin_unlock(&p->proc_lock); diff -Naur linux-2.6.12-rc5-mm1/kernel/fork.c linux-2.6.12-rc5-mm1-plug/kernel/fork.c --- linux-2.6.12-rc5-mm1/kernel/fork.c 2005-05-25 16:23:47.238440048 -0700 +++ linux-2.6.12-rc5-mm1-plug/kernel/fork.c 2005-05-25 17:02:50.626190960 -0700 @@ -1003,6 +1003,9 @@ p->pdeath_signal = 0; p->exit_state = 0; + /* Perform scheduler related setup */ + sched_fork(p); + /* * Ok, make it visible to the rest of the system. * We dont wake it up yet. @@ -1011,24 +1014,18 @@ INIT_LIST_HEAD(&p->ptrace_children); INIT_LIST_HEAD(&p->ptrace_list); - /* Perform scheduler related setup. Assign this task to a CPU. */ - sched_fork(p, clone_flags); - /* Need tasklist lock for parent etc handling! 
*/ write_lock_irq(&tasklist_lock); /* - * The task hasn't been attached yet, so its cpus_allowed mask will - * not be changed, nor will its assigned CPU. - * - * The cpus_allowed mask of the parent may have changed after it was - * copied first time - so re-copy it here, then check the child's CPU - * to ensure it is on a valid CPU (and if not, just force it back to - * parent's CPU). This avoids alot of nasty races. + * The task hasn't been attached yet, so cpus_allowed mask cannot + * have changed. The cpus_allowed mask of the parent may have + * changed after it was copied first time, and it may then move to + * another CPU - so we re-copy it here and set the child's CPU to + * the parent's CPU. This avoids a lot of nasty races. */ p->cpus_allowed = current->cpus_allowed; - if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed))) - set_task_cpu(p, smp_processor_id()); + set_task_cpu(p, smp_processor_id()); /* * Check for pending SIGKILL! The new thread should not be allowed diff -Naur linux-2.6.12-rc5-mm1/kernel/ingosched.c linux-2.6.12-rc5-mm1-plug/kernel/ingosched.c --- linux-2.6.12-rc5-mm1/kernel/ingosched.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.12-rc5-mm1-plug/kernel/ingosched.c 2005-05-25 17:05:49.620979616 -0700 @@ -0,0 +1,1175 @@ +/* + * kernel/ingosched.c + * Copyright (C) 1991-2005 Linus Torvalds + * + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +static void ingo_init_runqueue_queue(union runqueue_queue *rqq) +{ + int j; + + rqq->ingosched.active = rqq->ingosched.arrays; + rqq->ingosched.expired = rqq->ingosched.arrays + 1; + rqq->ingosched.best_expired_prio = INGO_MAX_PRIO; + + for (j = 0; j < 2; j++) { + int k; + prio_array_t *array = rqq->ingosched.arrays + j; + + for (k = 0; k < INGO_MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(INGO_MAX_PRIO, array->bitmap); + array->nr_active = 0; + } + + rqq->ingosched.expired_timestamp = 0; +} + +static void ingo_set_oom_time_slice(struct task_struct *p, unsigned long t) +{ + p->sdu.ingosched.time_slice = t; +} + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define MAX_USER_PRIO (USER_PRIO(INGO_MAX_PRIO)) + +/* + * Some helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) + +/* + * These are the 'tuning knobs' of the scheduler: + * + * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), + * default timeslice is 100 msecs, maximum timeslice is 800 msecs. + * Timeslices get refilled after they expire. 
+ */ +#define MIN_TIMESLICE max(5 * HZ / 1000, 1) +#define DEF_TIMESLICE (100 * HZ / 1000) +#define ON_RUNQUEUE_WEIGHT 30 +#define CHILD_PENALTY 95 +#define PARENT_PENALTY 100 +#define EXIT_WEIGHT 3 +#define PRIO_BONUS_RATIO 25 +#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) +#define INTERACTIVE_DELTA 2 +#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) +#define STARVATION_LIMIT (MAX_SLEEP_AVG) +#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) + +/* + * If a task is 'interactive' then we reinsert it in the active + * array after it has expired its current timeslice. (it will not + * continue to run immediately, it will still roundrobin with + * other interactive tasks.) + * + * This part scales the interactivity limit depending on niceness. + * + * We scale it linearly, offset by the INTERACTIVE_DELTA delta. + * Here are a few examples of different nice levels: + * + * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] + * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] + * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] + * + * (the X axis represents the possible -5 ... 0 ... +5 dynamic + * priority range a task can explore, a value of '1' means the + * task is rated interactive.) + * + * Ie. nice +19 tasks can never get 'interactive' enough to be + * reinserted into the active array. And only heavily CPU-hog nice -20 + * tasks will be expired. Default nice 0 tasks are somewhere between, + * it takes some effort for them to get interactive, but it's not + * too hard. + */ + +#define CURRENT_BONUS(p) \ + (NS_TO_JIFFIES((p)->sdu.ingosched.sleep_avg) * MAX_BONUS / \ + MAX_SLEEP_AVG) + +#define GRANULARITY (10 * HZ / 1000 ? : 1) + +#ifdef CONFIG_SMP +#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? 
: 1) - 1)) * \ + num_online_cpus()) +#else +#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) +#endif + +#define SCALE(v1,v1_max,v2_max) \ + (v1) * (v2_max) / (v1_max) + +#define DELTA(p) \ + (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) + +#define TASK_INTERACTIVE(p) \ + ((p)->prio <= (p)->static_prio - DELTA(p)) + +#define INTERACTIVE_SLEEP(p) \ + (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ + (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) + +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) + +/* + * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] + * to time slice values: [800ms ... 100ms ... 5ms] + * + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. + */ + +#define SCALE_PRIO(x, prio) \ + max(x * (INGO_MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) + +static inline unsigned int task_timeslice(const task_t *p) +{ + if (p->static_prio < NICE_TO_PRIO(0)) + return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); + else + return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); +} + +/* + * Adding/removing a task to/from a priority array: + */ +static void dequeue_task(struct task_struct *p, prio_array_t *array) +{ + array->nr_active--; + list_del_init(&p->run_list); + if (list_empty(array->queue + p->prio)) + __clear_bit(p->prio, array->bitmap); +} + +static void enqueue_task(struct task_struct *p, prio_array_t *array) +{ + sched_info_queued(p); + list_add_tail(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->sdu.ingosched.array = array; +} + +/* + * Put task to the end of the run list without the overhead of dequeue + * followed by enqueue. 
+ */ +static void requeue_task(struct task_struct *p, prio_array_t *array) +{ + list_move_tail(&p->run_list, array->queue + p->prio); +} + +static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) +{ + list_add(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->sdu.ingosched.array = array; +} + +/* + * effective_prio - return the priority that is based on the static + * priority but is modified by bonuses/penalties. + * + * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] + * into the -5 ... 0 ... +5 bonus/penalty range. + * + * We use 25% of the full 0...39 priority range so that: + * + * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. + * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. + * + * Both properties are important to certain workloads. + */ +static int effective_prio(task_t *p) +{ + int bonus, prio; + + if (rt_task(p)) + return p->prio; + + bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; + + prio = p->static_prio - bonus; + if (prio < MAX_RT_PRIO) + prio = MAX_RT_PRIO; + if (prio > INGO_MAX_PRIO-1) + prio = INGO_MAX_PRIO-1; + return prio; +} + +/* + * __activate_task - move a task to the runqueue. + */ +static inline void __activate_task(task_t *p, runqueue_t *rq) +{ + enqueue_task(p, rq->qu.ingosched.active); + inc_nr_running(p, rq); +} + +static void recalc_task_prio(task_t *p, unsigned long long now) +{ + /* Caller must always ensure 'now >= p->timestamp' */ + unsigned long long __sleep_time = now - p->timestamp; + unsigned long sleep_time; + + if (__sleep_time > NS_MAX_SLEEP_AVG) + sleep_time = NS_MAX_SLEEP_AVG; + else + sleep_time = (unsigned long)__sleep_time; + + if (likely(sleep_time > 0)) { + /* + * User tasks that sleep a long time are categorised as + * idle and will get just interactive status to stay active & + * prevent them suddenly becoming cpu hogs and starving + * other processes. 
+ */ + if (p->mm && p->sdu.ingosched.activated != -1 && + sleep_time > INTERACTIVE_SLEEP(p)) { + p->sdu.ingosched.sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - + DEF_TIMESLICE); + } else { + /* + * The lower the sleep avg a task has the more + * rapidly it will rise with sleep time. + */ + sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; + + /* + * Tasks waking from uninterruptible sleep are + * limited in their sleep_avg rise as they + * are likely to be waiting on I/O + */ + if (p->sdu.ingosched.activated == -1 && p->mm) { + if (p->sdu.ingosched.sleep_avg >= INTERACTIVE_SLEEP(p)) + sleep_time = 0; + else if (p->sdu.ingosched.sleep_avg + sleep_time >= + INTERACTIVE_SLEEP(p)) { + p->sdu.ingosched.sleep_avg = INTERACTIVE_SLEEP(p); + sleep_time = 0; + } + } + + /* + * This code gives a bonus to interactive tasks. + * + * The boost works by updating the 'average sleep time' + * value here, based on ->timestamp. The more time a + * task spends sleeping, the higher the average gets - + * and the higher the priority boost gets as well. + */ + p->sdu.ingosched.sleep_avg += sleep_time; + + if (p->sdu.ingosched.sleep_avg > NS_MAX_SLEEP_AVG) + p->sdu.ingosched.sleep_avg = NS_MAX_SLEEP_AVG; + } + } + + p->prio = effective_prio(p); +} + +/* + * activate_task - move a task to the runqueue and do priority recalculation + * + * Update all the scheduling statistics stuff. (sleep average + * calculation, priority modifiers, etc.) + */ +static void activate_task(task_t *p, runqueue_t *rq, int local) +{ + unsigned long long now; + + now = sched_clock(); +#ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ + runqueue_t *this_rq = this_rq(); + now = (now - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; + } +#endif + + recalc_task_prio(p, now); + + /* + * This checks to make sure it's not an uninterruptible task + * that is now waking up. + */ + if (!p->sdu.ingosched.activated) { + /* + * Tasks which were woken up by interrupts (ie. 
hw events) + * are most likely of interactive nature. So we give them + * the credit of extending their sleep time to the period + * of time they spend on the runqueue, waiting for execution + * on a CPU, first time around: + */ + if (in_interrupt()) + p->sdu.ingosched.activated = 2; + else { + /* + * Normal first-time wakeups get a credit too for + * on-runqueue time, but it will be weighted down: + */ + p->sdu.ingosched.activated = 1; + } + } + p->timestamp = now; + + __activate_task(p, rq); +} + +/* + * __activate_idle_task - move idle task to the _front_ of runqueue. + */ +static inline void __activate_idle_task(task_t *p, runqueue_t *rq) +{ + enqueue_task_head(p, rq->qu.ingosched.active); + inc_nr_running(p, rq); +} + +/* + * deactivate_task - remove a task from the runqueue. + */ +static void deactivate_task(struct task_struct *p, runqueue_t *rq) +{ + dec_nr_running(p, rq); + dequeue_task(p, p->sdu.ingosched.array); + p->sdu.ingosched.array = NULL; +} + +/*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread + * @old_state: the task's state before being woken + * @sync: do a synchronous wakeup? + * @rq: The run queue on which the task is to be placed (already locked) + */ +static void ingo_wake_up_task(struct task_struct *p, struct runqueue *rq, unsigned int old_state, int sync) +{ + int same_cpu = (rq == this_rq()); + + if (old_state == TASK_UNINTERRUPTIBLE) { + rq->nr_uninterruptible--; + /* + * Tasks on involuntary sleep don't earn + * sleep_avg beyond just interactive state. + */ + p->sdu.ingosched.activated = -1; + } + + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * don't trigger a preemption, if the woken up task will run on + * this cpu. (in this case the 'I will reschedule' promise of + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) 
+ */ + activate_task(p, rq, same_cpu); + if (!sync || !same_cpu) { + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } +} + +/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. + */ +static void ingo_fork(task_t *p) +{ + p->sdu.ingosched.array = NULL; + /* + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. + */ + local_irq_disable(); + p->sdu.ingosched.time_slice = (current->sdu.ingosched.time_slice + 1) >> 1; + /* + * The remainder of the first timeslice might be recovered by + * the parent if the child exits early enough. + */ + p->sdu.ingosched.first_time_slice = 1; + current->sdu.ingosched.time_slice >>= 1; + p->timestamp = sched_clock(); + if (unlikely(!current->sdu.ingosched.time_slice)) { + /* + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the + * runqueue lock is not a problem. + */ + current->sdu.ingosched.time_slice = 1; + preempt_disable(); + scheduler_tick(); + local_irq_enable(); + preempt_enable(); + } else + local_irq_enable(); +} + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. + */ +static void ingo_wake_up_new_task(task_t * p, unsigned long clone_flags) +{ + unsigned long flags; + int this_cpu, cpu; + runqueue_t *rq, *this_rq; + + rq = task_rq_lock(p, &flags); + cpu = task_cpu(p); + this_cpu = smp_processor_id(); + + BUG_ON(p->state != TASK_RUNNING); + + /* + * We decrease the sleep average of forking parents + * and children as well, to keep max-interactive tasks + * from forking tasks that are max-interactive. The parent + * (current) is done further down, under its lock. 
+ */ + p->sdu.ingosched.sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * + CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + + p->prio = effective_prio(p); + + if (likely(cpu == this_cpu)) { + if (!(clone_flags & CLONE_VM)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. + */ + if (unlikely(!current->sdu.ingosched.array)) + __activate_task(p, rq); + else { + p->prio = current->prio; + list_add_tail(&p->run_list, &current->run_list); + p->sdu.ingosched.array = current->sdu.ingosched.array; + p->sdu.ingosched.array->nr_active++; + inc_nr_running(p, rq); + } + set_need_resched(); + } else + /* Run child last */ + __activate_task(p, rq); + /* + * We skip the following code due to cpu == this_cpu + * + * task_rq_unlock(rq, &flags); + * this_rq = task_rq_lock(current, &flags); + */ + this_rq = rq; + } else { + this_rq = cpu_rq(this_cpu); + + /* + * Not the local CPU - must adjust timestamp. This should + * get optimised away in the !CONFIG_SMP case. + */ + p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; + __activate_task(p, rq); + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + + /* + * Parent and child are on different CPUs, now get the + * parent runqueue to update the parent's ->sdu.ingosched.sleep_avg: + */ + task_rq_unlock(rq, &flags); + this_rq = task_rq_lock(current, &flags); + } + current->sdu.ingosched.sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * + PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + task_rq_unlock(this_rq, &flags); +} + +/* + * Potentially available exiting-child timeslices are + * retrieved here - this way the parent does not get + * penalized for creating too many threads. + * + * (this cannot be used to 'generate' timeslices + * artificially, because any timeslice recovered here + * was given away by the parent in the first place.) 
+ */ +static void ingo_exit(task_t * p) +{ + unsigned long flags; + runqueue_t *rq; + + /* + * If the child was a (relative-) CPU hog then decrease + * the sleep_avg of the parent as well. + */ + rq = task_rq_lock(p->parent, &flags); + if (p->sdu.ingosched.first_time_slice) { + p->parent->sdu.ingosched.time_slice += p->sdu.ingosched.time_slice; + if (unlikely(p->parent->sdu.ingosched.time_slice > task_timeslice(p))) + p->parent->sdu.ingosched.time_slice = task_timeslice(p); + } + if (p->sdu.ingosched.sleep_avg < p->parent->sdu.ingosched.sleep_avg) + p->parent->sdu.ingosched.sleep_avg = p->parent->sdu.ingosched.sleep_avg / + (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sdu.ingosched.sleep_avg / + (EXIT_WEIGHT + 1); + task_rq_unlock(rq, &flags); +} + +#ifdef CONFIG_SMP +/* + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +static inline +void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, + runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) +{ + dequeue_task(p, src_array); + dec_nr_running(p, src_rq); + set_task_cpu(p, this_cpu); + inc_nr_running(p, this_rq); + enqueue_task(p, this_array); + p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + + this_rq->timestamp_last_tick; + /* + * Note that idle threads have a prio of INGO_MAX_PRIO, for this test + * to be always true for them. + */ + if (TASK_PREEMPTS_CURR(p, this_rq)) + resched_task(this_rq->curr); +} + +/* + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. + * + * Called with both runqueues locked. 
+ */ +static int ingo_move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) +{ + prio_array_t *array, *dst_array; + struct list_head *head, *curr; + int idx, pulled = 0; + task_t *tmp; + + if (max_nr_move <= 0 || busiest->nr_running <= 1) + goto out; + + /* + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. + */ + if (busiest->qu.ingosched.expired->nr_active) { + array = busiest->qu.ingosched.expired; + dst_array = this_rq->qu.ingosched.expired; + } else { + array = busiest->qu.ingosched.active; + dst_array = this_rq->qu.ingosched.active; + } + +new_array: + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, INGO_MAX_PRIO, idx); + if (idx >= INGO_MAX_PRIO) { + if (array == busiest->qu.ingosched.expired && busiest->qu.ingosched.active->nr_active) { + array = busiest->qu.ingosched.active; + dst_array = this_rq->qu.ingosched.active; + goto new_array; + } + goto out; + } + + head = array->queue + idx; + curr = head->prev; +skip_queue: + tmp = list_entry(curr, task_t, run_list); + + curr = curr->prev; + + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + +#ifdef CONFIG_SCHEDSTATS + if (task_hot(tmp, busiest->timestamp_last_tick, sd)) + schedstat_inc(sd, lb_hot_gained[idle]); +#endif + + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pulled++; + + /* We only want to steal up to the prescribed number of tasks. */ + if (pulled < max_nr_move) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } +out: + return pulled; +} +#endif + +/* + * We place interactive tasks back into the active array, if possible. 
+ * + * To guarantee that this does not starve expired tasks we ignore the + * interactivity of a task if the first expired task had to wait more + * than a 'reasonable' amount of time. This deadline timeout is + * load-dependent, as the frequency of array switches decreases with + * increasing number of running tasks. We also ignore the interactivity + * if a better static_prio task has expired: + */ +#define EXPIRED_STARVING(rq) \ + ((STARVATION_LIMIT && ((rq)->qu.ingosched.expired_timestamp && \ + (jiffies - (rq)->qu.ingosched.expired_timestamp >= \ + STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ + ((rq)->curr->static_prio > (rq)->qu.ingosched.best_expired_prio)) + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + * + * It also gets called by the fork code, when changing the parent's + * timeslices. + */ +static void ingo_tick(struct task_struct *p, struct runqueue *rq, unsigned long long now) +{ + int cpu = smp_processor_id(); + + if (p == rq->idle) { + if (wake_priority_sleeper(rq)) + goto out; + rebalance_tick(cpu, rq, SCHED_IDLE); + return; + } + + /* Task might have expired already, but not scheduled off yet */ + if (p->sdu.ingosched.array != rq->qu.ingosched.active) { + set_tsk_need_resched(p); + goto out; + } + spin_lock(&rq->lock); + /* + * The task was running during this tick - update the + * time slice counter. Note: we do not update a thread's + * priority until it either goes to sleep or uses up its + * timeslice. This makes it possible for interactive tasks + * to use up their timeslices at their highest priority levels. + */ + if (rt_task(p)) { + /* + * RR tasks need a special form of timeslice management. + * FIFO tasks have no timeslices. 
+ */ + if ((p->policy == SCHED_RR) && !--p->sdu.ingosched.time_slice) { + p->sdu.ingosched.time_slice = task_timeslice(p); + p->sdu.ingosched.first_time_slice = 0; + set_tsk_need_resched(p); + + /* put it at the end of the queue: */ + requeue_task(p, rq->qu.ingosched.active); + } + goto out_unlock; + } + if (!--p->sdu.ingosched.time_slice) { + dequeue_task(p, rq->qu.ingosched.active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + p->sdu.ingosched.time_slice = task_timeslice(p); + p->sdu.ingosched.first_time_slice = 0; + + if (!rq->qu.ingosched.expired_timestamp) + rq->qu.ingosched.expired_timestamp = jiffies; + if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { + enqueue_task(p, rq->qu.ingosched.expired); + if (p->static_prio < rq->qu.ingosched.best_expired_prio) + rq->qu.ingosched.best_expired_prio = p->static_prio; + } else + enqueue_task(p, rq->qu.ingosched.active); + } else { + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority. (one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. + * + * This only applies to tasks in the interactive + * delta range with at least TIMESLICE_GRANULARITY to requeue. 
+ */ + if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - + p->sdu.ingosched.time_slice) % TIMESLICE_GRANULARITY(p)) && + (p->sdu.ingosched.time_slice >= TIMESLICE_GRANULARITY(p)) && + (p->sdu.ingosched.array == rq->qu.ingosched.active)) { + + requeue_task(p, rq->qu.ingosched.active); + set_tsk_need_resched(p); + } + } +out_unlock: + spin_unlock(&rq->lock); +out: + rebalance_tick(cpu, rq, NOT_IDLE); +} + +#ifdef CONFIG_SCHED_SMT +static struct task_struct *ingo_head_of_queue(union runqueue_queue *rqq) +{ + prio_array_t *array = rqq->ingosched.active; + + if (!array->nr_active) + array = rqq->ingosched.expired; + BUG_ON(!array->nr_active); + + return list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, + task_t, run_list); +} + +static int ingo_dependent_sleeper_trumps(const struct task_struct *p1, + const struct task_struct * p2, struct sched_domain *sd) +{ + return ((p1->sdu.ingosched.time_slice * (100 - sd->per_cpu_gain) / 100) > + task_timeslice(p2) || rt_task(p1)) && + p2->mm && p1->mm && !rt_task(p2); +} +#endif + +/* + * schedule() is the main scheduler function. + */ +static void ingo_schedule(void) +{ + long *switch_count; + prio_array_t *array; + unsigned long run_time; + int cpu, idx; + struct task_struct *prev = current, *next; + struct list_head *queue; + struct runqueue *rq = this_rq(); + unsigned long long now = sched_clock(); + + if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { + run_time = now - prev->timestamp; + if (unlikely((long long)(now - prev->timestamp) < 0)) + run_time = 0; + } else + run_time = NS_MAX_SLEEP_AVG; + + /* + * Tasks charged proportionately less run_time at high sleep_avg to + * delay them losing their interactive status + */ + run_time /= (CURRENT_BONUS(prev) ? 
: 1); + + spin_lock_irq(&rq->lock); + + if (unlikely(prev->flags & PF_DEAD)) + prev->state = EXIT_DEAD; + + switch_count = &prev->nivcsw; + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + switch_count = &prev->nvcsw; + if (unlikely((prev->state & TASK_INTERRUPTIBLE) && + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else { + if (prev->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + deactivate_task(prev, rq); + } + } + + cpu = smp_processor_id(); + if (unlikely(!rq->nr_running)) { +go_idle: + idle_balance(cpu, rq); + if (!rq->nr_running) { + next = rq->idle; + rq->qu.ingosched.expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + /* + * wake_sleeping_dependent() might have released + * the runqueue, so break out if we got new + * tasks meanwhile: + */ + if (!rq->nr_running) + goto switch_tasks; + } + } else { + if (dependent_sleeper(cpu, rq)) { + next = rq->idle; + goto switch_tasks; + } + /* + * dependent_sleeper() releases and reacquires the runqueue + * lock, hence go into the idle loop if the rq went + * empty meanwhile: + */ + if (unlikely(!rq->nr_running)) + goto go_idle; + } + + array = rq->qu.ingosched.active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. 
+ */ + schedstat_inc(rq, sched_switch); + rq->qu.ingosched.active = rq->qu.ingosched.expired; + rq->qu.ingosched.expired = array; + array = rq->qu.ingosched.active; + rq->qu.ingosched.expired_timestamp = 0; + rq->qu.ingosched.best_expired_prio = INGO_MAX_PRIO; + } + + idx = sched_find_first_bit(array->bitmap); + queue = array->queue + idx; + next = list_entry(queue->next, task_t, run_list); + + if (!rt_task(next) && next->sdu.ingosched.activated > 0) { + unsigned long long delta = now - next->timestamp; + if (unlikely((long long)(now - next->timestamp) < 0)) + delta = 0; + + if (next->sdu.ingosched.activated == 1) + delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; + + array = next->sdu.ingosched.array; + dequeue_task(next, array); + recalc_task_prio(next, next->timestamp + delta); + enqueue_task(next, array); + } + next->sdu.ingosched.activated = 0; +switch_tasks: + if (next == rq->idle) + schedstat_inc(rq, sched_goidle); + prefetch(next); + clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); + + update_cpu_clock(prev, rq, now); + + prev->sdu.ingosched.sleep_avg -= run_time; + if ((long)prev->sdu.ingosched.sleep_avg <= 0) + prev->sdu.ingosched.sleep_avg = 0; + prev->timestamp = prev->last_ran = now; + + sched_info_switch(prev, next); + if (likely(prev != next)) { + next->timestamp = now; + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + + prepare_arch_switch(rq, next); + prev = context_switch(rq, prev, next); + barrier(); + + finish_task_switch(prev); + } else + spin_unlock_irq(&rq->lock); +} + +static void ingo_set_normal_task_nice(task_t *p, long nice) +{ + prio_array_t *array; + int old_prio, new_prio, delta; + + array = p->sdu.ingosched.array; + if (array) { + dequeue_task(p, array); + dec_prio_bias(task_rq(p), p->static_prio); + } + + old_prio = p->prio; + new_prio = NICE_TO_PRIO(nice); + delta = new_prio - old_prio; + p->static_prio = NICE_TO_PRIO(nice); + p->prio += delta; + + if (array) { + struct runqueue *rq = task_rq(p); + + 
inc_prio_bias(task_rq(p), p->static_prio); + enqueue_task(p, array); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + if (delta < 0 || (delta > 0 && task_running(rq, p))) + resched_task(rq->curr); + } +} + +/* + * setscheduler - change the scheduling policy and/or RT priority of a thread. + */ +static void ingo_setscheduler(task_t *p, int policy, int prio) +{ + int oldprio; + prio_array_t *array; + runqueue_t *rq = task_rq(p); + + array = p->sdu.ingosched.array; + if (array) + deactivate_task(p, rq); + oldprio = p->prio; + __setscheduler(p, policy, prio); + if (array) { + __activate_task(p, rq); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * this function yields the current CPU by moving the calling thread + * to the expired array. If there are no other threads running on this + * CPU then this function will return. + */ + +static long ingo_sys_yield(void) +{ + runqueue_t *rq = this_rq_lock(); + prio_array_t *array = current->sdu.ingosched.array; + prio_array_t *target = rq->qu.ingosched.expired; + + schedstat_inc(rq, yld_cnt); + /* + * We implement yielding by moving the task into the expired + * queue. + * + * (special rule: RT tasks will just roundrobin in the active + * array.) 
+ */ + if (rt_task(current)) + target = rq->qu.ingosched.active; + + if (current->sdu.ingosched.array->nr_active == 1) { + schedstat_inc(rq, yld_act_empty); + if (!rq->qu.ingosched.expired->nr_active) + schedstat_inc(rq, yld_both_empty); + } else if (!rq->qu.ingosched.expired->nr_active) + schedstat_inc(rq, yld_exp_empty); + + if (array != target) { + dequeue_task(current, array); + enqueue_task(current, target); + } else + /* + * requeue_task is cheaper so perform that if possible. + */ + requeue_task(current, array); + + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: + */ + __release(rq->lock); + _raw_spin_unlock(&rq->lock); + preempt_enable_no_resched(); + + schedule(); + + return 0; +} + +static void ingo_yield(void) +{ + set_current_state(TASK_RUNNING); + ingo_sys_yield(); +} + +static void ingo_init_idle(task_t *idle, int cpu) +{ + idle->sdu.ingosched.sleep_avg = 0; + idle->sdu.ingosched.array = NULL; + idle->prio = INGO_MAX_PRIO; +} + +#ifdef CONFIG_SMP +/* source and destination queues will be already locked */ +static void ingo_migrate_queued_task(struct task_struct *p, int dest_cpu) +{ + struct runqueue *rq_src = task_rq(p); + struct runqueue *rq_dest = cpu_rq(dest_cpu); + + /* + * Sync timestamp with rq_dest's before activating. + * The same thing could be achieved by doing this step + * afterwards, and pretending it was a local activate. + * This way is cleaner and logically correct. 
+ */ + p->timestamp = p->timestamp - rq_src->timestamp_last_tick + + rq_dest->timestamp_last_tick; + deactivate_task(p, rq_src); + set_task_cpu(p, dest_cpu); + activate_task(p, rq_dest, 0); + if (TASK_PREEMPTS_CURR(p, rq_dest)) + resched_task(rq_dest->curr); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void ingo_set_select_idle_first(struct runqueue *rq) +{ + __setscheduler(rq->idle, SCHED_FIFO, MAX_RT_PRIO-1); + /* Add idle task to _front_ of it's priority queue */ + __activate_idle_task(rq->idle, rq); +} + +static void ingo_set_select_idle_last(struct runqueue *rq) +{ + deactivate_task(rq->idle, rq); + rq->idle->static_prio = INGO_MAX_PRIO; + __setscheduler(rq->idle, SCHED_NORMAL, 0); +} + +static void ingo_migrate_dead_tasks(unsigned int dead_cpu) +{ + unsigned arr, i; + struct runqueue *rq = cpu_rq(dead_cpu); + + for (arr = 0; arr < 2; arr++) { + for (i = 0; i < INGO_MAX_PRIO; i++) { + struct list_head *list = &rq->qu.ingosched.arrays[arr].queue[i]; + while (!list_empty(list)) + migrate_dead(dead_cpu, + list_entry(list->next, task_t, + run_list)); + } + } +} +#endif +#endif + +static void ingo_sched_init(void) +{ + init_task.sdu.ingosched.time_slice = HZ; + init_task.sdu.ingosched.array = NULL; +} + +#ifdef CONFIG_MAGIC_SYSRQ +static void ingo_normalize_rt_task(struct task_struct *p) +{ + prio_array_t *array; + unsigned long flags; + runqueue_t *rq; + + rq = task_rq_lock(p, &flags); + + array = p->sdu.ingosched.array; + if (array) + deactivate_task(p, rq); + __setscheduler(p, SCHED_NORMAL, 0); + if (array) { + __activate_task(p, rq); + resched_task(rq->curr); + } + + task_rq_unlock(rq, &flags); +} +#endif + +const struct sched_drv ingo_sched_drv = { + .name = "ingosched", + .init_runqueue_queue = ingo_init_runqueue_queue, + .set_oom_time_slice = ingo_set_oom_time_slice, + .task_timeslice = task_timeslice, + .wake_up_task = ingo_wake_up_task, + .fork = ingo_fork, + .wake_up_new_task = ingo_wake_up_new_task, + .exit = ingo_exit, +#ifdef CONFIG_SMP + .move_tasks = 
ingo_move_tasks, +#endif + .tick = ingo_tick, +#ifdef CONFIG_SCHED_SMT + .head_of_queue = ingo_head_of_queue, + .dependent_sleeper_trumps = ingo_dependent_sleeper_trumps, +#endif + .schedule = ingo_schedule, + .set_normal_task_nice = ingo_set_normal_task_nice, + .setscheduler = ingo_setscheduler, + .sys_yield = ingo_sys_yield, + .yield = ingo_yield, + .init_idle = ingo_init_idle, + .sched_init = ingo_sched_init, +#ifdef CONFIG_SMP + .migrate_queued_task = ingo_migrate_queued_task, +#ifdef CONFIG_HOTPLUG_CPU + .set_select_idle_first = ingo_set_select_idle_first, + .set_select_idle_last = ingo_set_select_idle_last, + .migrate_dead_tasks = ingo_migrate_dead_tasks, +#endif +#endif +#ifdef CONFIG_MAGIC_SYSRQ + .normalize_rt_task = ingo_normalize_rt_task, +#endif + .attrs = NULL, +}; diff -Naur linux-2.6.12-rc5-mm1/kernel/nicksched.c linux-2.6.12-rc5-mm1-plug/kernel/nicksched.c --- linux-2.6.12-rc5-mm1/kernel/nicksched.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.12-rc5-mm1-plug/kernel/nicksched.c 2005-05-25 17:05:49.622979312 -0700 @@ -0,0 +1,992 @@ +/* + * kernel/nicksched.c + * Copyright (C) 1991-2005 Linus Torvalds + * + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. 
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * nick_init_runqueue_queue - reset the nicksched part of a runqueue.
+ *
+ * arrays[0] becomes the active array and arrays[1] the expired one. Both
+ * are emptied: every priority list is initialised, every bitmap bit below
+ * NICK_MAX_PRIO is cleared, min_prio is set to NICK_MAX_PRIO and nr_active
+ * to 0. The array sequence counter restarts at 0.
+ */
+static void nick_init_runqueue_queue(union runqueue_queue *rqq)
+{
+	int j;
+
+	rqq->nicksched.active = rqq->nicksched.arrays;
+	rqq->nicksched.expired = rqq->nicksched.arrays + 1;
+
+	for (j = 0; j < 2; j++) {
+		int k;
+		struct nick_prio_array *array = rqq->nicksched.arrays + j;
+
+		array->min_prio = NICK_MAX_PRIO;
+		for (k = 0; k < NICK_MAX_PRIO; k++) {
+			INIT_LIST_HEAD(array->queue + k);
+			__clear_bit(k, array->bitmap);
+		}
+		/* delimiter for bitsearch */
+		__set_bit(NICK_MAX_PRIO, array->bitmap);
+		array->nr_active = 0;
+	}
+
+	rqq->nicksched.array_sequence = 0;
+}
+
+/* The set_oom_time_slice hook is a no-op in nicksched: @p and @t are ignored. */
+static void nick_set_oom_time_slice(struct task_struct *p, unsigned long t)
+{
+}
+
+/*
+ * 'User priority' is the nice value converted to something we
+ * can work with better when scaling various scheduler parameters,
+ * it's a [ 0 ... 39 ] range.
+ */
+#define USER_PRIO(p)		((p) - MAX_RT_PRIO)
+#define MAX_USER_PRIO		(USER_PRIO(NICK_MAX_PRIO))
+/*
+ * Correct for fact that p->static_prio has normal mapping
+ */
+#define STATIC_USER_PRIO(p)	((p)->static_prio - MAX_RT_PRIO + 10)
+
+/*
+ * Some helpers for converting microsecond timing to jiffy resolution
+ */
+#define US_TO_JIFFIES(x)	((x) * HZ / 1000000)
+#define JIFFIES_TO_US(x)	((x) * 1000000 / HZ)
+
+static int base_timeslice = 256;
+#define min_base_timeslice	1
+#define max_base_timeslice	10000
+
+/*
+ * RT_TIMESLICE is what task_timeslice() returns for real-time tasks, and
+ * nick_tick() compares that value against used_slice, which advances by one
+ * per timer tick. It therefore has to be expressed in ticks: 50ms is
+ * 50 * HZ / 1000 (50 * 1000 / HZ is only 50ms when HZ == 1000).
+ */
+#define RT_TIMESLICE		(50 * HZ / 1000)	/* 50ms */
+#define BASE_TIMESLICE		(base_timeslice)
+#define MIN_TIMESLICE		(base_timeslice / 16 ?: 1)
+
+/* Maximum amount of history that will be used to calculate priority */
+#define MAX_SLEEP_SHIFT		19
+#define MAX_SLEEP		(1UL << MAX_SLEEP_SHIFT)	/* ~0.52s */
+
+/*
+ * Maximum effect that 1 block of activity (run/sleep/etc) can have. This
+ * moderates (or discards) the effect of freak events (e.g. SIGSTOP).
+ */
+#define MAX_SLEEP_AFFECT	(MAX_SLEEP/4)
+
+/*
+ * The amount of history can be decreased (on fork for example).
This puts a
+ * lower bound on it.
+ */
+#define MIN_HISTORY		(MAX_SLEEP/8)
+#define FORKED_TS_MAX		(US_TO_JIFFIES(MIN_HISTORY) ?: 1)
+
+/*
+ * SLEEP_FACTOR is a fixed point factor used to scale history tracking things.
+ * In particular: total_time, sleep_time, sleep_avg.
+ */
+#define SLEEP_FACTOR		1024
+
+/*
+ * The scheduler classifies a process as performing one of the following
+ * activities
+ */
+#define STIME_SLEEP		1	/* Sleeping */
+#define STIME_RUN		2	/* Using CPU */
+
+#define TASK_PREEMPTS_CURR(p, rq) \
+	((p)->prio < (rq)->curr->prio)
+
+/*
+ * Priority array bookkeeping: each helper keeps the per-priority list, the
+ * bitmap bit for that priority and the nr_active count in step.
+ */
+
+/* Unlink @p from @array; clear its priority bit if that list is now empty. */
+static void dequeue_task(struct task_struct *p, struct nick_prio_array *array)
+{
+	struct list_head *level = &array->queue[p->prio];
+
+	list_del_init(&p->run_list);
+	array->nr_active--;
+	if (list_empty(level))
+		__clear_bit(p->prio, array->bitmap);
+}
+
+/* Link @p into @array at its current priority and record the array in @p. */
+static void enqueue_task(struct task_struct *p, struct nick_prio_array *array)
+{
+	struct list_head *head = &array->queue[p->prio];
+	struct list_head *where = head;
+
+	sched_info_queued(p);
+
+	/*
+	 * Non-RT tasks are cycled within their priority level, which reduces
+	 * timeslice fluctuations caused by higher priority tasks expiring:
+	 * on a non-empty list the insertion point is the first entry
+	 * rather than the list head.
+	 */
+	if (!rt_task(p) && !list_empty(head))
+		where = head->next;
+
+	list_add_tail(&p->run_list, where);
+	p->sdu.nicksched.array = array;
+	array->nr_active++;
+	__set_bit(p->prio, array->bitmap);
+}
+
+/* As enqueue_task(), but always via list_add() on the priority list head. */
+static inline void enqueue_task_head(struct task_struct *p, struct nick_prio_array *array)
+{
+	struct list_head *head = &array->queue[p->prio];
+
+	list_add(&p->run_list, head);
+	p->sdu.nicksched.array = array;
+	array->nr_active++;
+	__set_bit(p->prio, array->bitmap);
+}
+
+#define NS_TO_APPROX_US(t) ((t) >> 10)
+
+/*
+ * add_task_time updates a task @p after @time of doing the specified @type
+ * of activity. See STIME_*. This is used for priority calculation.
+ */
+static inline void add_task_time(task_t *p, unsigned long long time, unsigned long type)
+{
+	unsigned long ratio;
+	unsigned long long tmp;
+	unsigned long t;
+
+	/*
+	 * First turn @time into a weight t, bounded by MAX_SLEEP_AFFECT:
+	 *  - sleeping counts for a quarter of the time slept (rounded up),
+	 *    with the time capped at 4 * MAX_SLEEP_AFFECT;
+	 *  - running is scaled by (30 / div)^2, div = 60 - STATIC_USER_PRIO.
+	 */
+	if (type == STIME_SLEEP) {
+		if (time > MAX_SLEEP_AFFECT*4)
+			time = MAX_SLEEP_AFFECT*4;
+		t = ((unsigned long)time + 3) / 4;
+	} else {
+		unsigned long div = 60 - STATIC_USER_PRIO(p);
+		t = (unsigned long)time * 30;
+		t = t / div;
+		t = t * 30;
+		t = t / div;
+		/*
+		 * A long run without passing through schedule() can push t
+		 * past MAX_SLEEP. "MAX_SLEEP - t" below is unsigned and would
+		 * wrap, wrecking total_time and sleep_time, so bound one block
+		 * of running the same way one block of sleeping is bounded.
+		 */
+		if (t > MAX_SLEEP_AFFECT)
+			t = MAX_SLEEP_AFFECT;
+	}
+
+	/*
+	 * Decay the existing history by (MAX_SLEEP - t) / MAX_SLEEP, rounding
+	 * to nearest, then account the new block of activity.
+	 */
+	ratio = MAX_SLEEP - t;
+	tmp = (unsigned long long)ratio * p->sdu.nicksched.total_time + MAX_SLEEP/2;
+	tmp >>= MAX_SLEEP_SHIFT;
+	p->sdu.nicksched.total_time = (unsigned long)tmp;
+
+	tmp = (unsigned long long)ratio * p->sdu.nicksched.sleep_time + MAX_SLEEP/2;
+	tmp >>= MAX_SLEEP_SHIFT;
+	p->sdu.nicksched.sleep_time = (unsigned long)tmp;
+
+	p->sdu.nicksched.total_time += t;
+	if (type == STIME_SLEEP)
+		p->sdu.nicksched.sleep_time += t;
+}
+
+/*
+ * task_sleep_avg - sleep_time as a fraction of total_time, scaled by
+ * SLEEP_FACTOR. The "+ 1" keeps the divisor non-zero.
+ */
+static unsigned long task_sleep_avg(task_t *p)
+{
+	return (SLEEP_FACTOR * p->sdu.nicksched.sleep_time) / (p->sdu.nicksched.total_time + 1);
+}
+
+/*
+ * The higher a thread's priority, the bigger timeslices
+ * it gets during one round of execution. But even the lowest
+ * priority thread gets MIN_TIMESLICE worth of execution time.
+ *
+ * Timeslices are scaled, so if only low priority processes are running,
+ * they will all get long timeslices.
+ */
+
+static int task_timeslice(const task_t *p, runqueue_t *rq)
+{
+	int ref_prio, gap, divisor, slice, ticks, pass;
+
+	/* Real-time tasks always get the fixed RT slice. */
+	if (rt_task(p))
+		return RT_TIMESLICE;
+
+	/* Reference: the numerically smaller of p's prio and expired min_prio. */
+	ref_prio = min(p->prio, rq->qu.nicksched.expired->min_prio);
+	gap = p->prio - ref_prio;
+
+	/* Scale twice by how far p sits behind the reference priority... */
+	slice = BASE_TIMESLICE;
+	for (pass = 0; pass < 2; pass++)
+		slice = slice * (MAX_USER_PRIO + 1) / (gap + 2);
+
+	/* ...and twice by the reference priority itself. */
+	divisor = 70 - USER_PRIO(ref_prio);
+	for (pass = 0; pass < 2; pass++)
+		slice = slice * 40 / divisor;
+
+	/* Divide by 1024, scale by HZ / 1000, and never go below the minimum. */
+	ticks = (slice >> 10) * HZ / 1000;
+
+	return ticks < MIN_TIMESLICE ? MIN_TIMESLICE : ticks;
+}
+
+/*
+ * task_priority: calculates a task's priority based on previous running
+ * history (see add_task_time). The priority is a linear function of
+ * sleep_avg and static_prio, clamped to [MAX_RT_PRIO, NICK_MAX_PRIO-1].
+ * Real-time tasks keep the priority they already have.
+ */
+static int task_priority(task_t *p)
+{
+	unsigned long avg;
+	int bonus, prio;
+
+	if (rt_task(p))
+		return p->prio;
+
+	avg = task_sleep_avg(p);
+	bonus = (((MAX_USER_PRIO + 1) / 3) * avg + (SLEEP_FACTOR / 2))
+					/ SLEEP_FACTOR;
+	prio = MAX_RT_PRIO + (STATIC_USER_PRIO(p) + 10) - bonus;
+
+	if (prio < MAX_RT_PRIO)
+		prio = MAX_RT_PRIO;
+	else if (prio > NICK_MAX_PRIO-1)
+		prio = NICK_MAX_PRIO-1;
+
+	return prio;
+}
+
+/*
+ * __activate_task - queue a task on @array and count it as running.
+ * For non-RT tasks the array's min_prio is lowered to p->prio if needed.
+ */
+static inline void __activate_task(task_t *p, runqueue_t *rq, struct nick_prio_array *array)
+{
+	enqueue_task(p, array);
+	inc_nr_running(p, rq);
+
+	if (rt_task(p))
+		return;
+	if (array->min_prio > p->prio)
+		array->min_prio = p->prio;
+}
+
+/*
+ * activate_task - move a task to the runqueue and do priority recalculation
+ *
+ * Update all the scheduling statistics stuff. (sleep average
+ * calculation, priority modifiers, etc.)
+ */
+static void activate_task(task_t *p, runqueue_t *rq, int local)
+{
+	unsigned long long now, sleep;
+	struct nick_prio_array *array;
+
+	now = sched_clock();
+#ifdef CONFIG_SMP
+	if (!local) {
+		/* Compensate for drifting sched_clock */
+		runqueue_t *this_rq = this_rq();
+		now = (now - this_rq->timestamp_last_tick)
+			+ rq->timestamp_last_tick;
+	}
+#endif
+
+	/*
+	 * Account the time since p->timestamp (in approximate microseconds)
+	 * as sleep, then derive the new priority from the updated history.
+	 */
+	sleep = NS_TO_APPROX_US(now - p->timestamp);
+	p->timestamp = now;
+	add_task_time(p, sleep, STIME_SLEEP);
+	p->prio = task_priority(p);
+
+	/*
+	 * If we have slept through an active/expired array switch, restart
+	 * our timeslice too. Otherwise a task whose slice was already marked
+	 * expired (used_slice == -1) goes to the expired array.
+	 */
+	array = rq->qu.nicksched.active;
+	if (rq->qu.nicksched.array_sequence != p->sdu.nicksched.array_sequence) {
+		p->sdu.nicksched.used_slice = 0;
+	} else if (unlikely(p->sdu.nicksched.used_slice == -1)) {
+		p->sdu.nicksched.used_slice = 0;
+		array = rq->qu.nicksched.expired;
+	}
+
+	__activate_task(p, rq, array);
+}
+
+/*
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
+ *
+ * The task is queued with enqueue_task_head() on the active array.
+ */
+static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
+{
+	enqueue_task_head(p, rq->qu.nicksched.active);
+	inc_nr_running(p, rq);
+}
+
+/*
+ * deactivate_task - remove a task from the runqueue.
+ *
+ * The runqueue's current array_sequence is saved in the task so that
+ * activate_task() can tell whether an array switch happened meanwhile.
+ */
+static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
+{
+	p->sdu.nicksched.array_sequence = rq->qu.nicksched.array_sequence;
+	dec_nr_running(p, rq);
+	dequeue_task(p, p->sdu.nicksched.array);
+	p->sdu.nicksched.array = NULL;
+}
+
+/***
+ * nick_wake_up_task - nicksched part of waking up a thread
+ * @p: the to-be-woken-up thread
+ * @rq: The run queue on which the task is to be placed (already locked)
+ * @old_state: the task's state before being woken
+ * @sync: do a synchronous wakeup?
+ */
+static void nick_wake_up_task(struct task_struct *p, struct runqueue *rq, unsigned int old_state, int sync)
+{
+	int same_cpu = (rq == this_rq());
+
+	if (old_state == TASK_UNINTERRUPTIBLE)
+		rq->nr_uninterruptible--;
+
+	/*
+	 * Sync wakeups (i.e. those types of wakeups where the waker
+	 * has indicated that it will leave the CPU in short order)
+	 * don't trigger a preemption, if the woken up task will run on
+	 * this cpu. (in this case the 'I will reschedule' promise of
+	 * the waker guarantees that the freshly woken up task is going
+	 * to be considered on this CPU.)
+	 */
+	activate_task(p, rq, same_cpu);
+	if (!sync || !same_cpu) {
+		if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
+}
+
+/*
+ * Perform scheduler related setup for a newly forked process p.
+ * p is forked by current.
+ *
+ * RT children only get their array, timestamp and used_slice reset. For
+ * other children the sleep history is seeded from the parent, and the
+ * parent is charged a quarter of its timeslice for the fork.
+ */
+static void nick_fork(task_t *p)
+{
+	unsigned long sleep_avg;
+	runqueue_t *rq;
+
+	p->sdu.nicksched.array = NULL;
+
+	p->timestamp = sched_clock();
+	p->sdu.nicksched.used_slice = 0;
+	if (rt_task(p)) {
+		BUG_ON(!rt_task(current));
+		return;
+	}
+
+	preempt_disable();
+	rq = this_rq();
+	/* Get MIN_HISTORY of history with the same sleep_avg as parent. */
+	sleep_avg = task_sleep_avg(current);
+	p->sdu.nicksched.total_time = MIN_HISTORY;
+	p->sdu.nicksched.sleep_time = p->sdu.nicksched.total_time * sleep_avg / SLEEP_FACTOR;
+
+	/* Parent loses 1/4 of sleep time for forking */
+	current->sdu.nicksched.sleep_time = 3 * current->sdu.nicksched.sleep_time / 4;
+
+	local_irq_disable();
+	/*
+	 * A parent whose slice is already marked expired (or the idle task)
+	 * hands the expired marker to the child. Otherwise the parent uses
+	 * up a quarter of its slice (rounded up) and is marked expired and
+	 * rescheduled if that exhausts it.
+	 */
+	if (unlikely(current->sdu.nicksched.used_slice == -1 || current == rq->idle))
+		p->sdu.nicksched.used_slice = -1;
+	else {
+		int ts = task_timeslice(current, rq);
+		current->sdu.nicksched.used_slice += (ts + 3) / 4;
+		if (current->sdu.nicksched.used_slice >= ts) {
+			current->sdu.nicksched.used_slice = -1;
+			set_need_resched();
+		}
+	}
+	local_irq_enable();
+	preempt_enable();
+}
+
+/*
+ * wake_up_new_task - wake up a newly created task for the first time.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created context, then puts the task
+ * on the runqueue and wakes it.
+ */
+static void nick_wake_up_new_task(task_t * p, unsigned long clone_flags)
+{
+	unsigned long flags;
+	int this_cpu, cpu;
+	runqueue_t *rq;
+	struct nick_prio_array *array;
+
+	rq = task_rq_lock(p, &flags);
+
+	BUG_ON(p->state != TASK_RUNNING);
+
+	cpu = task_cpu(p);
+	this_cpu = smp_processor_id();
+
+	/*
+	 * Pick the target array and the child's first slice. A child that
+	 * inherited the "expired" marker (used_slice == -1) starts on the
+	 * expired array with a clean slice. Any other non-RT child gets about
+	 * a quarter of a full slice, at least MIN_TIMESLICE and at most
+	 * FORKED_TS_MAX, by starting with the rest already marked as used.
+	 */
+	array = rq->qu.nicksched.active;
+	if (!rt_task(p)) {
+		if (unlikely(p->sdu.nicksched.used_slice == -1)) {
+			p->sdu.nicksched.used_slice = 0;
+			array = rq->qu.nicksched.expired;
+		} else {
+			int total = task_timeslice(p, rq);
+			int ts = max((total + 3) / 4, MIN_TIMESLICE);
+			ts = min(ts, (int)FORKED_TS_MAX);
+			p->sdu.nicksched.used_slice = total - ts;
+		}
+	}
+
+	if (likely(cpu == this_cpu)) {
+		if (!(clone_flags & CLONE_VM) && likely(array == rq->qu.nicksched.active)) {
+			/*
+			 * The VM isn't cloned, so we're in a good position to
+			 * do child-runs-first in anticipation of an exec. This
+			 * usually avoids a lot of COW overhead.
+			 */
+			if (p->prio >= current->prio) {
+				/*
+				 * Take the parent's priority and link the
+				 * child directly in front of the parent on
+				 * the parent's own list. The bitmap is not
+				 * touched here (presumably because the
+				 * parent's priority bit is already set).
+				 */
+				p->prio = current->prio;
+				list_add_tail(&p->run_list, &current->run_list);
+				p->sdu.nicksched.array = current->sdu.nicksched.array;
+				p->sdu.nicksched.array->nr_active++;
+				inc_nr_running(p, rq);
+			} else {
+				p->prio = task_priority(p);
+				__activate_task(p, rq, array);
+			}
+			set_need_resched();
+		} else {
+			/* Run child last */
+			p->prio = task_priority(p);
+			__activate_task(p, rq, array);
+		}
+#ifdef CONFIG_SMP
+	} else {
+		runqueue_t *this_rq = cpu_rq(this_cpu);
+
+		/*
+		 * Not the local CPU - must adjust timestamp. This should
+		 * get optimised away in the !CONFIG_SMP case.
+		 */
+		p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
+					+ rq->timestamp_last_tick;
+		p->prio = task_priority(p);
+		__activate_task(p, rq, array);
+		if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+#endif
+	}
+
+	task_rq_unlock(rq, &flags);
+}
+
+/*
+ * Potentially available exiting-child timeslices are
+ * retrieved here - this way the parent does not get
+ * penalized for creating too many threads.
+ *
+ * (this cannot be used to 'generate' timeslices
+ * artificially, because any timeslice recovered here
+ * was given away by the parent in the first place.)
+ *
+ * NOTE: in nicksched this hook is empty - nothing is handed back to the
+ * parent when @p exits.
+ */
+static void nick_exit(task_t * p)
+{
+}
+
+#ifdef CONFIG_SMP
+/*
+ * pull_task - move a task from a remote runqueue to the local runqueue.
+ * Both runqueues must be locked.
+ *
+ * @p is taken off @src_array of @src_rq, assigned to @this_cpu and queued
+ * on @this_array of @this_rq; its timestamp is rebased from the source
+ * runqueue's last tick to the local one.
+ *
+ * NOTE(review): the task is queued with enqueue_task() directly, so unlike
+ * __activate_task() this does not update this_array->min_prio - confirm
+ * that is intended.
+ */
+static inline
+void pull_task(runqueue_t *src_rq, struct nick_prio_array *src_array, task_t *p,
+	       runqueue_t *this_rq, struct nick_prio_array *this_array, int this_cpu)
+{
+	dequeue_task(p, src_array);
+	dec_nr_running(p, src_rq);
+	set_task_cpu(p, this_cpu);
+	inc_nr_running(p, this_rq);
+	enqueue_task(p, this_array);
+	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
+				+ this_rq->timestamp_last_tick;
+	/*
+	 * Note that idle threads have a prio of NICK_MAX_PRIO, for this test
+	 * to be always true for them.
+	 */
+	if (TASK_PREEMPTS_CURR(p, this_rq))
+		resched_task(this_rq->curr);
+}
+
+/*
+ * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
+ * as part of a balancing operation within "domain". Returns the number of
+ * tasks moved.
+ *
+ * Tasks keep their array kind: those taken from busiest's expired array go
+ * to this_rq's expired array, those from the active array to the active one.
+ *
+ * Called with both runqueues locked.
+ */
+static int nick_move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
+		      unsigned long max_nr_move, struct sched_domain *sd,
+		      enum idle_type idle)
+{
+	struct nick_prio_array *array, *dst_array;
+	struct list_head *head, *curr;
+	int idx, pulled = 0;
+	task_t *tmp;
+
+	/* Nothing to do if no moves are wanted or busiest has at most one task. */
+	if (max_nr_move <= 0 || busiest->nr_running <= 1)
+		goto out;
+
+	/*
+	 * We first consider expired tasks. Those will likely not be
+	 * executed in the near future, and they are most likely to
+	 * be cache-cold, thus switching CPUs has the least effect
+	 * on them.
+	 */
+	if (busiest->qu.nicksched.expired->nr_active) {
+		array = busiest->qu.nicksched.expired;
+		dst_array = this_rq->qu.nicksched.expired;
+	} else {
+		array = busiest->qu.nicksched.active;
+		dst_array = this_rq->qu.nicksched.active;
+	}
+
+new_array:
+	/* Start searching at priority 0: */
+	idx = 0;
+skip_bitmap:
+	/* Advance idx to the next priority level that has queued tasks. */
+	if (!idx)
+		idx = sched_find_first_bit(array->bitmap);
+	else
+		idx = find_next_bit(array->bitmap, NICK_MAX_PRIO, idx);
+	if (idx >= NICK_MAX_PRIO) {
+		/* Expired array exhausted: carry on with the active array, if any. */
+		if (array == busiest->qu.nicksched.expired && busiest->qu.nicksched.active->nr_active) {
+			array = busiest->qu.nicksched.active;
+			dst_array = this_rq->qu.nicksched.active;
+			goto new_array;
+		}
+		goto out;
+	}
+
+	/* Walk this priority list backwards, starting from its last entry. */
+	head = array->queue + idx;
+	curr = head->prev;
+skip_queue:
+	tmp = list_entry(curr, task_t, run_list);
+
+	/* Step to the previous entry before tmp can be unlinked by pull_task(). */
+	curr = curr->prev;
+
+	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+		if (curr != head)
+			goto skip_queue;
+		idx++;
+		goto skip_bitmap;
+	}
+
+#ifdef CONFIG_SCHEDSTATS
+	if (task_hot(tmp, busiest->timestamp_last_tick, sd))
+		schedstat_inc(sd, lb_hot_gained[idle]);
+#endif
+
+	pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
+	pulled++;
+
+	/* We only want to steal up to the prescribed number of tasks. */
+	if (pulled < max_nr_move) {
+		if (curr != head)
+			goto skip_queue;
+		idx++;
+		goto skip_bitmap;
+	}
+out:
+	return pulled;
+}
+#endif
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
+ *
+ * It also gets called by the fork code, when changing the parent's
+ * timeslices.
+ */
+static void nick_tick(struct task_struct *p, struct runqueue *rq, unsigned long long now)
+{
+	enum idle_type cpu_status = NOT_IDLE;
+
+	if (p == rq->idle) {
+		cpu_status = SCHED_IDLE;
+	} else if (likely(p->sdu.nicksched.used_slice != -1) &&
+		   likely(p->policy != SCHED_FIFO)) {
+		/*
+		 * The slice is still live (an expired one stays marked -1
+		 * until the task is scheduled off) and the task is not
+		 * SCHED_FIFO, which is never charged: count this tick and
+		 * expire the slice once it has been used up.
+		 */
+		int ts;
+
+		p->sdu.nicksched.used_slice++;
+		ts = task_timeslice(p, rq);
+		if (unlikely(p->sdu.nicksched.used_slice >= ts)) {
+			p->sdu.nicksched.used_slice = -1;
+			set_tsk_need_resched(p);
+		}
+	}
+
+	/* Every path ends with the periodic balancing call. */
+	rebalance_tick(smp_processor_id(), rq, cpu_status);
+}
+
+#ifdef CONFIG_SCHED_SMT
+/* Neither of the two SMT hooks below is expected to be called. */
+
+/*
+ * Return the first task on the lowest-numbered non-empty priority list of
+ * the active array, or of the expired array when the active one is empty.
+ * BUGs if both arrays are empty.
+ */
+static struct task_struct *nick_head_of_queue(union runqueue_queue *rqq)
+{
+	struct nick_prio_array *array = rqq->nicksched.active;
+	struct list_head *first;
+	int idx;
+
+	if (!array->nr_active)
+		array = rqq->nicksched.expired;
+	BUG_ON(!array->nr_active);
+
+	idx = sched_find_first_bit(array->bitmap);
+	first = array->queue[idx].next;
+
+	return list_entry(first, task_t, run_list);
+}
+
+/* Always answers 0, whatever @p1, @p2 and @sd are. */
+static int nick_dependent_sleeper_trumps(const struct task_struct *p1,
+	const struct task_struct * p2, struct sched_domain *sd)
+{
+	return 0;
+}
+#endif
+
+/*
+ * schedule() is the main scheduler function.
+ */
+static void nick_schedule(void)
+{
+	long *switch_count;
+	struct nick_prio_array *array;
+	unsigned long run_time;
+	int cpu, idx;
+	struct task_struct *prev = current, *next;
+	struct list_head *queue;
+	struct runqueue *rq = this_rq();
+	unsigned long long now = sched_clock();
+
+	/*
+	 * Charge prev for the time since its timestamp (in approximate
+	 * microseconds) as run time. This is done before the runqueue
+	 * lock is taken.
+	 */
+	run_time = NS_TO_APPROX_US(now - prev->timestamp);
+	update_cpu_clock(prev, rq, now);
+	prev->timestamp = prev->last_ran = now;
+	add_task_time(prev, run_time, STIME_RUN);
+
+	spin_lock_irq(&rq->lock);
+
+	if (unlikely(prev->flags & PF_DEAD))
+		prev->state = EXIT_DEAD;
+
+	/*
+	 * Counted as an involuntary switch unless prev is leaving the
+	 * runqueue (non-zero state and not a kernel preemption).
+	 */
+	switch_count = &prev->nivcsw;
+	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+		switch_count = &prev->nvcsw;
+		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
+				unlikely(signal_pending(prev))))
+			prev->state = TASK_RUNNING;
+		else {
+			if (prev->state == TASK_UNINTERRUPTIBLE)
+				rq->nr_uninterruptible++;
+			deactivate_task(prev, rq);
+			/* prev is off the runqueue: nothing to requeue below. */
+			goto no_check_expired;
+		}
+	}
+
+	/*
+	 * prev stays runnable but its slice is marked expired: requeue it
+	 * with a fresh slice. RT tasks go back onto the active array; other
+	 * tasks get a recalculated priority and go to the expired array,
+	 * whose min_prio is lowered if needed.
+	 */
+	if (unlikely(prev->sdu.nicksched.used_slice == -1)) {
+		dequeue_task(prev, prev->sdu.nicksched.array);
+		if (rt_task(prev)) {
+			/* SCHED_FIFO can come in here too, from sched_yield */
+			array = rq->qu.nicksched.active;
+		} else {
+			array = rq->qu.nicksched.expired;
+			prev->prio = task_priority(prev);
+			if (prev->prio < rq->qu.nicksched.expired->min_prio)
+				rq->qu.nicksched.expired->min_prio = prev->prio;
+		}
+		enqueue_task(prev, array);
+		prev->sdu.nicksched.used_slice = 0;
+	}
+no_check_expired:
+
+	cpu = smp_processor_id();
+	if (unlikely(!rq->nr_running)) {
+		/*
+		 * Nothing runnable: advance the array sequence (tasks that
+		 * were deactivated before this point get a fresh slice in
+		 * activate_task()) and try to pull work from elsewhere. If
+		 * that finds nothing, reset both min_prio values and go idle.
+		 */
+		rq->qu.nicksched.array_sequence++;
+		idle_balance(cpu, rq);
+		if (!rq->nr_running) {
+			next = rq->idle;
+			rq->qu.nicksched.arrays[0].min_prio = NICK_MAX_PRIO;
+			rq->qu.nicksched.arrays[1].min_prio = NICK_MAX_PRIO;
+			goto switch_tasks;
+		}
+	}
+
+	array = rq->qu.nicksched.active;
+	if (unlikely(!array->nr_active)) {
+		/*
+		 * Switch the active and expired arrays. The array sequence
+		 * is advanced and the new expired array's min_prio reset.
+		 */
+		schedstat_inc(rq, sched_switch);
+		rq->qu.nicksched.array_sequence++;
+		rq->qu.nicksched.active = rq->qu.nicksched.expired;
+		rq->qu.nicksched.expired = array;
+		rq->qu.nicksched.expired->min_prio = NICK_MAX_PRIO;
+		array = rq->qu.nicksched.active;
+	}
+
+	/* Next is the first task on the lowest-numbered non-empty list. */
+	idx = sched_find_first_bit(array->bitmap);
+	queue = array->queue + idx;
+	next = list_entry(queue->next, task_t, run_list);
+
+switch_tasks:
+	if (next == rq->idle)
+		schedstat_inc(rq, sched_goidle);
+	clear_tsk_need_resched(prev);
+	rcu_qsctr_inc(cpu);
+
+	sched_info_switch(prev, next);
+	if (likely(prev != next)) {
+		next->timestamp = now;
+		rq->nr_switches++;
+		rq->curr = next;
+		++*switch_count;
+
+		prepare_arch_switch(rq, next);
+		prev = context_switch(rq, prev, next);
+		barrier();
+
+		finish_task_switch(prev);
+	} else
+		/* Same task keeps running: just drop the runqueue lock. */
+		spin_unlock_irq(&rq->lock);
+}
+
+/*
+ * nick_set_normal_task_nice - apply a new nice value to a non-RT task.
+ *
+ * A queued task is taken off its array while static_prio and prio are
+ * updated, then put back on the same array, with the runqueue's prio bias
+ * adjusted to the new static_prio.
+ *
+ * NOTE(review): delta compares the new static priority with the old dynamic
+ * p->prio, while the new p->prio comes from task_priority(), so delta is
+ * not necessarily the actual change in priority - confirm that the
+ * reschedule test below behaves as intended.
+ */
+static void nick_set_normal_task_nice(task_t *p, long nice)
+{
+	struct nick_prio_array *array;
+	int old_prio, new_prio, delta;
+
+	array = p->sdu.nicksched.array;
+	if (array) {
+		dequeue_task(p, array);
+		dec_prio_bias(task_rq(p), p->static_prio);
+	}
+
+	old_prio = p->prio;
+	new_prio = NICE_TO_PRIO(nice);
+	delta = new_prio - old_prio;
+	p->static_prio = NICE_TO_PRIO(nice);
+	p->prio = task_priority(p);
+
+	if (array) {
+		struct runqueue *rq = task_rq(p);
+
+		inc_prio_bias(task_rq(p), p->static_prio);
+		enqueue_task(p, array);
+		/*
+		 * If the task increased its priority or is running and
+		 * lowered its priority, then reschedule its CPU:
+		 */
+		if (delta < 0 || (delta > 0 && task_running(rq, p)))
+			resched_task(rq->curr);
+	}
+}
+
+/*
+ * setscheduler - change the scheduling policy and/or RT priority of a thread.
+ */ +static void nick_setscheduler(task_t *p, int policy, int prio) +{ + int oldprio; + struct nick_prio_array *array; + runqueue_t *rq = task_rq(p); + + array = p->sdu.nicksched.array; + if (array) + deactivate_task(p, rq); + oldprio = p->prio; + __setscheduler(p, policy, prio); + if (policy == SCHED_FIFO || policy == SCHED_RR) + p->sdu.nicksched.used_slice = 0; + + if (array) { + __activate_task(p, rq, rq->qu.nicksched.active); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * this function yields the current CPU by moving the calling thread + * to the expired array. If there are no other threads running on this + * CPU then this function will return. + */ + +static long nick_sys_yield(void) +{ + local_irq_disable(); +#ifdef CONFIG_SCHEDSTATS + schedstat_inc(this_rq(), yld_cnt); +#endif + current->sdu.nicksched.used_slice = -1; + set_need_resched(); + local_irq_enable(); + + return 0; +} + +static void nick_yield(void) +{ + set_current_state(TASK_RUNNING); + nick_sys_yield(); +#ifndef CONFIG_PREEMPT + /* + * Kernel-space yield won't follow the schedule upon + * return from syscall path. Must call schedule() here. 
+ */ + schedule(); +#endif +} + +static void nick_init_idle(task_t *idle, int cpu) +{ + idle->sdu.nicksched.used_slice = 0; + idle->sdu.nicksched.array = NULL; + idle->prio = NICK_MAX_PRIO; +} + +#ifdef CONFIG_SMP +/* source and destination queues will be already locked */ +static void nick_migrate_queued_task(struct task_struct *p, int dest_cpu) +{ + struct runqueue *rq_src = task_rq(p); + struct runqueue *rq_dest = cpu_rq(dest_cpu); + + /* + * Sync timestamp with rq_dest's before activating. + * The same thing could be achieved by doing this step + * afterwards, and pretending it was a local activate. + * This way is cleaner and logically correct. + */ + p->timestamp = p->timestamp - rq_src->timestamp_last_tick + + rq_dest->timestamp_last_tick; + deactivate_task(p, rq_src); + set_task_cpu(p, dest_cpu); + activate_task(p, rq_dest, 0); + if (TASK_PREEMPTS_CURR(p, rq_dest)) + resched_task(rq_dest->curr); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void nick_set_select_idle_first(struct runqueue *rq) +{ + __setscheduler(rq->idle, SCHED_FIFO, MAX_RT_PRIO-1); + /* Add idle task to _front_ of it's priority queue */ + __activate_idle_task(rq->idle, rq); +} + +static void nick_set_select_idle_last(struct runqueue *rq) +{ + deactivate_task(rq->idle, rq); + rq->idle->static_prio = NICK_MAX_PRIO; + __setscheduler(rq->idle, SCHED_NORMAL, 0); +} + +static void nick_migrate_dead_tasks(unsigned int dead_cpu) +{ + unsigned arr, i; + struct runqueue *rq = cpu_rq(dead_cpu); + + for (arr = 0; arr < 2; arr++) { + for (i = 0; i < NICK_MAX_PRIO; i++) { + struct list_head *list = &rq->qu.nicksched.arrays[arr].queue[i]; + while (!list_empty(list)) + migrate_dead(dead_cpu, + list_entry(list->next, task_t, + run_list)); + } + } +} +#endif +#endif + +static void nick_sched_init(void) +{ + init_task.sdu.nicksched.used_slice = 0; + init_task.sdu.nicksched.array = NULL; +} + +#ifdef CONFIG_MAGIC_SYSRQ +static void nick_normalize_rt_task(struct task_struct *p) +{ + struct nick_prio_array *array; + 
unsigned long flags; + runqueue_t *rq; + + rq = task_rq_lock(p, &flags); + + array = p->sdu.nicksched.array; + if (array) + deactivate_task(p, rq); + __setscheduler(p, SCHED_NORMAL, 0); + if (array) { + __activate_task(p, rq, array); + resched_task(rq->curr); + } + + task_rq_unlock(rq, &flags); +} +#endif + +static unsigned int nick_task_timeslice(const struct task_struct *p) +{ + return task_timeslice(p, task_rq(p)); +} + +#ifdef CONFIG_SYSFS +#define no_change(a) (a) +SCHED_DRV_SYSFS_UINT_RW(base_timeslice, no_change, no_change, min_base_timeslice, max_base_timeslice); + +static struct attribute *nick_attrs[] = { + &SCHED_DRV_SYSFS_ATTR(base_timeslice), + NULL, +}; +#endif + +const struct sched_drv nick_sched_drv = { + .name = "nicksched", + .init_runqueue_queue = nick_init_runqueue_queue, + .set_oom_time_slice = nick_set_oom_time_slice, + .task_timeslice = nick_task_timeslice, + .wake_up_task = nick_wake_up_task, + .fork = nick_fork, + .wake_up_new_task = nick_wake_up_new_task, + .exit = nick_exit, +#ifdef CONFIG_SMP + .move_tasks = nick_move_tasks, +#endif + .tick = nick_tick, +#ifdef CONFIG_SCHED_SMT + .head_of_queue = nick_head_of_queue, + .dependent_sleeper_trumps = nick_dependent_sleeper_trumps, +#endif + .schedule = nick_schedule, + .set_normal_task_nice = nick_set_normal_task_nice, + .setscheduler = nick_setscheduler, + .sys_yield = nick_sys_yield, + .yield = nick_yield, + .init_idle = nick_init_idle, + .sched_init = nick_sched_init, +#ifdef CONFIG_SMP + .migrate_queued_task = nick_migrate_queued_task, +#ifdef CONFIG_HOTPLUG_CPU + .set_select_idle_first = nick_set_select_idle_first, + .set_select_idle_last = nick_set_select_idle_last, + .migrate_dead_tasks = nick_migrate_dead_tasks, +#endif +#endif +#ifdef CONFIG_MAGIC_SYSRQ + .normalize_rt_task = nick_normalize_rt_task, +#endif + .attrs = nick_attrs, +}; diff -Naur linux-2.6.12-rc5-mm1/kernel/sched.c linux-2.6.12-rc5-mm1-plug/kernel/sched.c --- linux-2.6.12-rc5-mm1/kernel/sched.c 2005-05-25 
16:23:47.394416336 -0700 +++ linux-2.6.12-rc5-mm1-plug/kernel/sched.c 2005-05-25 17:05:49.630978096 -0700 @@ -42,10 +42,8 @@ #include #include #include -#include #include #include -#include #include #include #include @@ -53,330 +51,30 @@ #include -/* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], - * and back. - */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) -#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) - -/* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. - */ -#define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) - -/* - * Some helpers for converting nanosecond timing to jiffy resolution - */ -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) - -/* - * These are the 'tuning knobs' of the scheduler: - * - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), - * default timeslice is 100 msecs, maximum timeslice is 800 msecs. - * Timeslices get refilled after they expire. - */ -#define MIN_TIMESLICE max(5 * HZ / 1000, 1) -#define DEF_TIMESLICE (100 * HZ / 1000) -#define ON_RUNQUEUE_WEIGHT 30 -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) -#define STARVATION_LIMIT (MAX_SLEEP_AVG) -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) - -/* - * If a task is 'interactive' then we reinsert it in the active - * array after it has expired its current timeslice. (it will not - * continue to run immediately, it will still roundrobin with - * other interactive tasks.) 
- * - * This part scales the interactivity limit depending on niceness. - * - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. - * Here are a few examples of different nice levels: - * - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] - * - * (the X axis represents the possible -5 ... 0 ... +5 dynamic - * priority range a task can explore, a value of '1' means the - * task is rated interactive.) - * - * Ie. nice +19 tasks can never get 'interactive' enough to be - * reinserted into the active array. And only heavily CPU-hog nice -20 - * tasks will be expired. Default nice 0 tasks are somewhere between, - * it takes some effort for them to get interactive, but it's not - * too hard. - */ - -#define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ - MAX_SLEEP_AVG) - -#define GRANULARITY (10 * HZ / 1000 ? : 1) - -#ifdef CONFIG_SMP -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ - num_online_cpus()) -#else -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) -#endif - -#define SCALE(v1,v1_max,v2_max) \ - (v1) * (v2_max) / (v1_max) - -#define DELTA(p) \ - (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) +#include +#include +#include -#define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) - -#define INTERACTIVE_SLEEP(p) \ - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) - -#define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) - -/* - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] - * to time slice values: [800ms ... 100ms ... 
5ms] - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. - */ - -#define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) - -static unsigned int task_timeslice(task_t *p) +static inline unsigned int task_timeslice(const task_t *p) { - if (p->static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); - else - return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); + return sched_drvp->task_timeslice(p); } -#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ - < (long long) (sd)->cache_hot_time) /* * These are the runqueue data structures: */ +DEFINE_PER_CPU(struct runqueue, runqueues); -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) - -typedef struct runqueue runqueue_t; - -struct prio_array { - unsigned int nr_active; - unsigned long bitmap[BITMAP_SIZE]; - struct list_head queue[MAX_PRIO]; -}; - -/* - * This is the main, per-CPU runqueue data structure. - * - * Locking rule: those places that want to lock multiple runqueues - * (such as the load balancing or the thread migration code), lock - * acquire operations must be ordered by ascending &runqueue. - */ -struct runqueue { - spinlock_t lock; - - /* - * nr_running and cpu_load should be in the same cacheline because - * remote CPUs use both these fields when doing load calculation. - */ - unsigned long nr_running; -#ifdef CONFIG_SMP - unsigned long prio_bias; - unsigned long cpu_load[3]; -#endif - unsigned long long nr_switches; - - /* - * This is part of a global counter where only the total sum - * over all CPUs matters. A task can increase this counter on - * one CPU and if it got migrated afterwards it may decrease - * it on another CPU. 
Always updated under the runqueue lock: - */ - unsigned long nr_uninterruptible; - - unsigned long expired_timestamp; - unsigned long long timestamp_last_tick; - task_t *curr, *idle; - struct mm_struct *prev_mm; - prio_array_t *active, *expired, arrays[2]; - int best_expired_prio; - atomic_t nr_iowait; - -#ifdef CONFIG_SMP - struct sched_domain *sd; - - /* For active balancing */ - int active_balance; - int push_cpu; - - task_t *migration_thread; - struct list_head migration_queue; -#endif - -#ifdef CONFIG_SCHEDSTATS - /* latency stats */ - struct sched_info rq_sched_info; - - /* sys_sched_yield() stats */ - unsigned long yld_exp_empty; - unsigned long yld_act_empty; - unsigned long yld_both_empty; - unsigned long yld_cnt; - - /* schedule() stats */ - unsigned long sched_switch; - unsigned long sched_cnt; - unsigned long sched_goidle; - - /* try_to_wake_up() stats */ - unsigned long ttwu_cnt; - unsigned long ttwu_local; -#endif -}; - -static DEFINE_PER_CPU(struct runqueue, runqueues); - -/* - * The domain tree (rq->sd) is protected by RCU's quiescent state transition. - * See detach_destroy_domains: synchronize_sched for details. - * - * The domain tree of any CPU may only be accessed from within - * preempt-disabled sections. 
- */ #define for_each_domain(cpu, domain) \ -for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) - -#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() (&__get_cpu_var(runqueues)) -#define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) - -#ifndef prepare_arch_switch -# define prepare_arch_switch(next) do { } while (0) -#endif -#ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) -#endif - -#ifndef __ARCH_WANT_UNLOCKED_CTXSW -static inline int task_running(runqueue_t *rq, task_t *p) -{ - return rq->curr == p; -} - -static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) -{ -} - -static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) -{ - spin_unlock_irq(&rq->lock); -} - -#else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline int task_running(runqueue_t *rq, task_t *p) -{ -#ifdef CONFIG_SMP - return p->oncpu; -#else - return rq->curr == p; -#endif -} - -static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. - */ - next->oncpu = 1; -#endif -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - spin_unlock_irq(&rq->lock); -#else - spin_unlock(&rq->lock); -#endif -} - -static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->oncpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. - */ - smp_wmb(); - prev->oncpu = 0; -#endif -#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_enable(); -#endif -} -#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ - -/* - * task_rq_lock - lock the runqueue a given task resides on and disable - * interrupts. Note the ordering: we can safely lookup the task_rq without - * explicitly disabling preemption. 
- */ -static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) - __acquires(rq->lock) -{ - struct runqueue *rq; - -repeat_lock_task: - local_irq_save(*flags); - rq = task_rq(p); - spin_lock(&rq->lock); - if (unlikely(rq != task_rq(p))) { - spin_unlock_irqrestore(&rq->lock, *flags); - goto repeat_lock_task; - } - return rq; -} - -static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) - __releases(rq->lock) -{ - spin_unlock_irqrestore(&rq->lock, *flags); -} + for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) +#define task_is_queued(p) (!list_empty(&(p)->run_list)) #ifdef CONFIG_SCHEDSTATS /* * bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 12 +#define SCHEDSTAT_VERSION 11 static int show_schedstat(struct seq_file *seq, void *v) { @@ -405,7 +103,6 @@ #ifdef CONFIG_SMP /* domain-specific stats */ - preempt_disable(); for_each_domain(cpu, sd) { enum idle_type itype; char mask_str[NR_CPUS]; @@ -424,13 +121,11 @@ sd->lb_nobusyq[itype], sd->lb_nobusyg[itype]); } - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n", sd->alb_cnt, sd->alb_failed, sd->alb_pushed, - sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, - sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, + sd->sbe_pushed, sd->sbe_attempts, sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); } - preempt_enable(); #endif } return 0; @@ -462,379 +157,25 @@ .release = single_release, }; -# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) #else /* !CONFIG_SCHEDSTATS */ -# define schedstat_inc(rq, field) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0) #endif -/* - * rq_lock - lock a given runqueue and disable interrupts. 
- */ -static inline runqueue_t *this_rq_lock(void) - __acquires(rq->lock) -{ - runqueue_t *rq; - - local_irq_disable(); - rq = this_rq(); - spin_lock(&rq->lock); - - return rq; -} - -#ifdef CONFIG_SCHEDSTATS -/* - * Called when a process is dequeued from the active array and given - * the cpu. We should note that with the exception of interactive - * tasks, the expired queue will become the active queue after the active - * queue is empty, without explicitly dequeuing and requeuing tasks in the - * expired queue. (Interactive tasks may be requeued directly to the - * active queue, thus delaying tasks in the expired queue from running; - * see scheduler_tick()). - * - * This function is only called from sched_info_arrive(), rather than - * dequeue_task(). Even though a task may be queued and dequeued multiple - * times as it is shuffled about, we're really interested in knowing how - * long it was from the *first* time it was queued to the time that it - * finally hit a cpu. - */ -static inline void sched_info_dequeued(task_t *t) -{ - t->sched_info.last_queued = 0; -} - -/* - * Called when a task finally hits the cpu. We can now calculate how - * long it was waiting to run. We also note when it began so that we - * can keep stats on how long its timeslice is. - */ -static inline void sched_info_arrive(task_t *t) -{ - unsigned long now = jiffies, diff = 0; - struct runqueue *rq = task_rq(t); - - if (t->sched_info.last_queued) - diff = now - t->sched_info.last_queued; - sched_info_dequeued(t); - t->sched_info.run_delay += diff; - t->sched_info.last_arrival = now; - t->sched_info.pcnt++; - - if (!rq) - return; - - rq->rq_sched_info.run_delay += diff; - rq->rq_sched_info.pcnt++; -} - -/* - * Called when a process is queued into either the active or expired - * array. The time is noted and later used to determine how long we - * had to wait for us to reach the cpu. 
Since the expired queue will - * become the active queue after active queue is empty, without dequeuing - * and requeuing any tasks, we are interested in queuing to either. It - * is unusual but not impossible for tasks to be dequeued and immediately - * requeued in the same or another array: this can happen in sched_yield(), - * set_user_nice(), and even load_balance() as it moves tasks from runqueue - * to runqueue. - * - * This function is only called from enqueue_task(), but also only updates - * the timestamp if it is already not set. It's assumed that - * sched_info_dequeued() will clear that stamp when appropriate. - */ -static inline void sched_info_queued(task_t *t) -{ - if (!t->sched_info.last_queued) - t->sched_info.last_queued = jiffies; -} - -/* - * Called when a process ceases being the active-running process, either - * voluntarily or involuntarily. Now we can calculate how long we ran. - */ -static inline void sched_info_depart(task_t *t) -{ - struct runqueue *rq = task_rq(t); - unsigned long diff = jiffies - t->sched_info.last_arrival; - - t->sched_info.cpu_time += diff; - - if (rq) - rq->rq_sched_info.cpu_time += diff; -} - -/* - * Called when tasks are switched involuntarily due, typically, to expiring - * their time slice. (This may also be called when switching to or from - * the idle task.) We are only called when prev != next. - */ -static inline void sched_info_switch(task_t *prev, task_t *next) -{ - struct runqueue *rq = task_rq(prev); - - /* - * prev now departs the cpu. It's not interesting to record - * stats about how efficient we were at scheduling the idle - * process, however. 
- */ - if (prev != rq->idle) - sched_info_depart(prev); - - if (next != rq->idle) - sched_info_arrive(next); -} -#else -#define sched_info_queued(t) do { } while (0) -#define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS */ - -/* - * Adding/removing a task to/from a priority array: - */ -static void dequeue_task(struct task_struct *p, prio_array_t *array) -{ - array->nr_active--; - list_del(&p->run_list); - if (list_empty(array->queue + p->prio)) - __clear_bit(p->prio, array->bitmap); -} - -static void enqueue_task(struct task_struct *p, prio_array_t *array) -{ - sched_info_queued(p); - list_add_tail(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); - array->nr_active++; - p->array = array; -} - -/* - * Put task to the end of the run list without the overhead of dequeue - * followed by enqueue. - */ -static void requeue_task(struct task_struct *p, prio_array_t *array) -{ - list_move_tail(&p->run_list, array->queue + p->prio); -} - -static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) -{ - list_add(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); - array->nr_active++; - p->array = array; -} - -/* - * effective_prio - return the priority that is based on the static - * priority but is modified by bonuses/penalties. - * - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] - * into the -5 ... 0 ... +5 bonus/penalty range. - * - * We use 25% of the full 0...39 priority range so that: - * - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. - * - * Both properties are important to certain workloads. 
- */ -static int effective_prio(task_t *p) -{ - int bonus, prio; - - if (rt_task(p)) - return p->prio; - - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; - - prio = p->static_prio - bonus; - if (prio < MAX_RT_PRIO) - prio = MAX_RT_PRIO; - if (prio > MAX_PRIO-1) - prio = MAX_PRIO-1; - return prio; -} - -#ifdef CONFIG_SMP -static inline void inc_prio_bias(runqueue_t *rq, int prio) -{ - rq->prio_bias += MAX_PRIO - prio; -} - -static inline void dec_prio_bias(runqueue_t *rq, int prio) -{ - rq->prio_bias -= MAX_PRIO - prio; -} -#else -static inline void inc_prio_bias(runqueue_t *rq, int prio) -{ -} - -static inline void dec_prio_bias(runqueue_t *rq, int prio) -{ -} -#endif - -static inline void inc_nr_running(task_t *p, runqueue_t *rq) -{ - rq->nr_running++; - if (rt_task(p)) - inc_prio_bias(rq, p->prio); - else - inc_prio_bias(rq, p->static_prio); -} - -static inline void dec_nr_running(task_t *p, runqueue_t *rq) -{ - rq->nr_running--; - if (rt_task(p)) - dec_prio_bias(rq, p->prio); - else - dec_prio_bias(rq, p->static_prio); -} - -/* - * __activate_task - move a task to the runqueue. - */ -static inline void __activate_task(task_t *p, runqueue_t *rq) -{ - enqueue_task(p, rq->active); - inc_nr_running(p, rq); -} - -/* - * __activate_idle_task - move idle task to the _front_ of runqueue. 
- */ -static inline void __activate_idle_task(task_t *p, runqueue_t *rq) -{ - enqueue_task_head(p, rq->active); - inc_nr_running(p, rq); -} - -static int recalc_task_prio(task_t *p, unsigned long long now) +#ifdef CONFIG_SCHED_SMT +int cpu_and_siblings_are_idle(int cpu) { - /* Caller must always ensure 'now >= p->timestamp' */ - unsigned long long __sleep_time = now - p->timestamp; - unsigned long sleep_time; - - if (__sleep_time > NS_MAX_SLEEP_AVG) - sleep_time = NS_MAX_SLEEP_AVG; - else - sleep_time = (unsigned long)__sleep_time; - - if (likely(sleep_time > 0)) { - /* - * User tasks that sleep a long time are categorised as - * idle and will get just interactive status to stay active & - * prevent them suddenly becoming cpu hogs and starving - * other processes. - */ - if (p->mm && p->activated != -1 && - sleep_time > INTERACTIVE_SLEEP(p)) { - p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - - DEF_TIMESLICE); - } else { - /* - * The lower the sleep avg a task has the more - * rapidly it will rise with sleep time. - */ - sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; - - /* - * Tasks waking from uninterruptible sleep are - * limited in their sleep_avg rise as they - * are likely to be waiting on I/O - */ - if (p->activated == -1 && p->mm) { - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) - sleep_time = 0; - else if (p->sleep_avg + sleep_time >= - INTERACTIVE_SLEEP(p)) { - p->sleep_avg = INTERACTIVE_SLEEP(p); - sleep_time = 0; - } - } - - /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->timestamp. The more time a - * task spends sleeping, the higher the average gets - - * and the higher the priority boost gets as well. 
- */ - p->sleep_avg += sleep_time; - - if (p->sleep_avg > NS_MAX_SLEEP_AVG) - p->sleep_avg = NS_MAX_SLEEP_AVG; - } + int sib; + for_each_cpu_mask(sib, cpu_sibling_map[cpu]) { + if (idle_cpu(sib)) + continue; + return 0; } - return effective_prio(p); + return 1; } - -/* - * activate_task - move a task to the runqueue and do priority recalculation - * - * Update all the scheduling statistics stuff. (sleep average - * calculation, priority modifiers, etc.) - */ -static void activate_task(task_t *p, runqueue_t *rq, int local) -{ - unsigned long long now; - - now = sched_clock(); -#ifdef CONFIG_SMP - if (!local) { - /* Compensate for drifting sched_clock */ - runqueue_t *this_rq = this_rq(); - now = (now - this_rq->timestamp_last_tick) - + rq->timestamp_last_tick; - } #endif - p->prio = recalc_task_prio(p, now); - - /* - * This checks to make sure it's not an uninterruptible task - * that is now waking up. - */ - if (!p->activated) { - /* - * Tasks which were woken up by interrupts (ie. hw events) - * are most likely of interactive nature. So we give them - * the credit of extending their sleep time to the period - * of time they spend on the runqueue, waiting for execution - * on a CPU, first time around: - */ - if (in_interrupt()) - p->activated = 2; - else { - /* - * Normal first-time wakeups get a credit too for - * on-runqueue time, but it will be weighted down: - */ - p->activated = 1; - } - } - p->timestamp = now; - - __activate_task(p, rq); -} - -/* - * deactivate_task - remove a task from the runqueue. - */ -static void deactivate_task(struct task_struct *p, runqueue_t *rq) -{ - dec_nr_running(p, rq); - dequeue_task(p, p->array); - p->array = NULL; -} - /* * resched_task - mark a task 'to be rescheduled now'. * @@ -843,7 +184,7 @@ * the target CPU. 
*/ #ifdef CONFIG_SMP -static void resched_task(task_t *p) +void resched_task(task_t *p) { int need_resched, nrpolling; @@ -857,11 +198,6 @@ if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) smp_send_reschedule(task_cpu(p)); } -#else -static inline void resched_task(task_t *p) -{ - set_tsk_need_resched(p); -} #endif /** @@ -874,12 +210,22 @@ } #ifdef CONFIG_SMP +enum request_type { + REQ_MOVE_TASK, + REQ_SET_DOMAIN, +}; + typedef struct { struct list_head list; + enum request_type type; + /* For REQ_MOVE_TASK */ task_t *task; int dest_cpu; + /* For REQ_SET_DOMAIN */ + struct sched_domain *sd; + struct completion done; } migration_req_t; @@ -895,12 +241,13 @@ * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ - if (!p->array && !task_running(rq, p)) { + if (!task_is_queued(p) && !task_running(rq, p)) { set_task_cpu(p, dest_cpu); return 0; } init_completion(&req->done); + req->type = REQ_MOVE_TASK; req->task = p; req->dest_cpu = dest_cpu; list_add(&req->list, &rq->migration_queue); @@ -925,7 +272,7 @@ repeat: rq = task_rq_lock(p, &flags); /* Must be off runqueue entirely, not preempted. */ - if (unlikely(p->array || task_running(rq, p))) { + if (unlikely(task_is_queued(p) || task_running(rq, p))) { /* If it's preempted, we yield. It could be a while. */ preempted = !task_running(rq, p); task_rq_unlock(rq, &flags); @@ -967,179 +314,42 @@ * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static inline unsigned long __source_load(int cpu, int type, enum idle_type idle) +static inline unsigned long source_load(int cpu, enum idle_type idle) { runqueue_t *rq = cpu_rq(cpu); - unsigned long cpu_load = rq->cpu_load[type-1], - load_now = rq->nr_running * SCHED_LOAD_SCALE; - - if (idle == NOT_IDLE) { - /* - * If we are balancing busy runqueues the load is biased by - * priority to create 'nice' support across cpus. 
- */ - cpu_load *= rq->prio_bias; - load_now *= rq->prio_bias; - } - - if (type == 0) - return load_now; + unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + unsigned long load = min(rq->cpu_load, load_now); - return min(cpu_load, load_now); -} + /* + * If we are balancing busy runqueues the load is biased by + * priority to create 'nice' support across cpus. + */ + if (idle == NOT_IDLE) + load *= rq->prio_bias; -static inline unsigned long source_load(int cpu, int type) -{ - return __source_load(cpu, type, NOT_IDLE); + return load; } /* * Return a high guess at the load of a migration-target cpu */ -static inline unsigned long __target_load(int cpu, int type, enum idle_type idle) +static inline unsigned long target_load(int cpu, enum idle_type idle) { runqueue_t *rq = cpu_rq(cpu); - unsigned long cpu_load = rq->cpu_load[type-1], - load_now = rq->nr_running * SCHED_LOAD_SCALE; - - if (type == 0) - return load_now; - - if (idle == NOT_IDLE) { - cpu_load *= rq->prio_bias; - load_now *= rq->prio_bias; - } - return max(cpu_load, load_now); -} + unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + unsigned long load = max(rq->cpu_load, load_now); -static inline unsigned long target_load(int cpu, int type) -{ - return __target_load(cpu, type, NOT_IDLE); -} - -/* - * find_idlest_group finds and returns the least busy CPU group within the - * domain. 
- */ -static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) -{ - struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; - unsigned long min_load = ULONG_MAX, this_load = 0; - int load_idx = sd->forkexec_idx; - int imbalance = 100 + (sd->imbalance_pct-100)/2; - - do { - unsigned long load, avg_load; - int local_group; - int i; - - local_group = cpu_isset(this_cpu, group->cpumask); - /* XXX: put a cpus allowed check */ - - /* Tally up the load of all CPUs in the group */ - avg_load = 0; - - for_each_cpu_mask(i, group->cpumask) { - /* Bias balancing toward cpus of our domain */ - if (local_group) - load = source_load(i, load_idx); - else - load = target_load(i, load_idx); - - avg_load += load; - } - - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; - - if (local_group) { - this_load = avg_load; - this = group; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; - } - group = group->next; - } while (group != sd->groups); - - if (!idlest || 100*this_load < imbalance*min_load) - return NULL; - return idlest; -} - -/* - * find_idlest_queue - find the idlest runqueue among the cpus in group. - */ -static int find_idlest_cpu(struct sched_group *group, int this_cpu) -{ - unsigned long load, min_load = ULONG_MAX; - int idlest = -1; - int i; - - for_each_cpu_mask(i, group->cpumask) { - load = source_load(i, 0); - - if (load < min_load || (load == min_load && i == this_cpu)) { - min_load = load; - idlest = i; - } - } - - return idlest; -} - -/* - * sched_balance_self: balance the current task (running on cpu) in domains - * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and - * SD_BALANCE_EXEC. - * - * Balance, ie. select the least loaded group. - * - * Returns the target CPU number, or the same CPU if no balancing is needed. - * - * preempt must be disabled. 
- */ -static int sched_balance_self(int cpu, int flag) -{ - struct task_struct *t = current; - struct sched_domain *tmp, *sd = NULL; - - for_each_domain(cpu, tmp) - if (tmp->flags & flag) - sd = tmp; - - while (sd) { - cpumask_t span; - struct sched_group *group; - int new_cpu; - - span = sd->span; - group = find_idlest_group(sd, t, cpu); - if (!group) - goto nextlevel; - - new_cpu = find_idlest_cpu(group, cpu); - if (new_cpu == -1 || new_cpu == cpu) - goto nextlevel; - - /* Now try balancing at a lower domain level */ - cpu = new_cpu; -nextlevel: - sd = NULL; - for_each_domain(cpu, tmp) { - if (cpus_subset(span, tmp->span)) - break; - if (tmp->flags & flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ - } + /* + * If we are balancing busy runqueues the load is biased by + * priority to create 'nice' support across cpus. + */ + if (idle == NOT_IDLE) + load *= rq->prio_bias; - return cpu; + return load; } -#endif /* CONFIG_SMP */ +#endif /* * wake_idle() will wake a task on an idle cpu if task->cpu is @@ -1161,14 +371,14 @@ for_each_domain(cpu, sd) { if (sd->flags & SD_WAKE_IDLE) { - cpus_and(tmp, sd->span, p->cpus_allowed); + cpus_and(tmp, sd->span, cpu_online_map); + cpus_and(tmp, tmp, p->cpus_allowed); for_each_cpu_mask(i, tmp) { if (idle_cpu(i)) return i; } } - else - break; + else break; } return cpu; } @@ -1201,7 +411,7 @@ runqueue_t *rq; #ifdef CONFIG_SMP unsigned long load, this_load; - struct sched_domain *sd, *this_sd = NULL; + struct sched_domain *sd; int new_cpu; #endif @@ -1210,7 +420,7 @@ if (!(old_state & state)) goto out; - if (p->array) + if (task_is_queued(p)) goto out_running; cpu = task_cpu(p); @@ -1220,69 +430,70 @@ if (unlikely(task_running(rq, p))) goto out_activate; - new_cpu = cpu; - +#ifdef CONFIG_SCHEDSTATS schedstat_inc(rq, ttwu_cnt); if (cpu == this_cpu) { schedstat_inc(rq, ttwu_local); - goto out_set_cpu; - } - - for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_wake_remote); 
- this_sd = sd; - break; + } else { + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + schedstat_inc(sd, ttwu_wake_remote); + break; + } } } +#endif - if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + new_cpu = cpu; + if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) goto out_set_cpu; + load = source_load(cpu, SCHED_IDLE); + this_load = target_load(this_cpu, SCHED_IDLE); + /* - * Check for affine wakeup and passive balancing possibilities. + * If sync wakeup then subtract the (maximum possible) effect of + * the currently running task from the load of the current CPU: */ - if (this_sd) { - int idx = this_sd->wake_idx; - unsigned int imbalance; + if (sync) + this_load -= SCHED_LOAD_SCALE; - imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; - - load = source_load(cpu, idx); - this_load = target_load(this_cpu, idx); - - new_cpu = this_cpu; /* Wake to this CPU if we can */ - - if (this_sd->flags & SD_WAKE_AFFINE) { - unsigned long tl = this_load; - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) - tl -= SCHED_LOAD_SCALE; + /* Don't pull the task off an idle CPU to a busy one */ + if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) + goto out_set_cpu; - if ((tl <= load && - tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || - 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { - /* - * This domain has SD_WAKE_AFFINE and - * p is cache cold in this domain, and - * there is no bad imbalance. - */ - schedstat_inc(this_sd, ttwu_move_affine); - goto out_set_cpu; - } - } + new_cpu = this_cpu; /* Wake to this CPU if we can */ + /* + * Scan domains for affine wakeup and passive balancing + * possibilities. + */ + for_each_domain(this_cpu, sd) { + unsigned int imbalance; /* * Start passive balancing when half the imbalance_pct * limit is reached. 
*/ - if (this_sd->flags & SD_WAKE_BALANCE) { - if (imbalance*this_load <= 100*load) { - schedstat_inc(this_sd, ttwu_move_balance); + imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; + + if ((sd->flags & SD_WAKE_AFFINE) && + !task_hot(p, rq->timestamp_last_tick, sd)) { + /* + * This domain has SD_WAKE_AFFINE and p is cache cold + * in this domain. + */ + if (cpu_isset(cpu, sd->span)) { + schedstat_inc(sd, ttwu_move_affine); + goto out_set_cpu; + } + } else if ((sd->flags & SD_WAKE_BALANCE) && + imbalance*this_load <= 100*load) { + /* + * This domain has SD_WAKE_BALANCE and there is + * an imbalance. + */ + if (cpu_isset(cpu, sd->span)) { + schedstat_inc(sd, ttwu_move_balance); goto out_set_cpu; } } @@ -1299,7 +510,7 @@ old_state = p->state; if (!(old_state & state)) goto out; - if (p->array) + if (task_is_queued(p)) goto out_running; this_cpu = smp_processor_id(); @@ -1308,28 +519,7 @@ out_activate: #endif /* CONFIG_SMP */ - if (old_state == TASK_UNINTERRUPTIBLE) { - rq->nr_uninterruptible--; - /* - * Tasks on involuntary sleep don't earn - * sleep_avg beyond just interactive state. - */ - p->activated = -1; - } - - /* - * Sync wakeups (i.e. those types of wakeups where the waker - * has indicated that it will leave the CPU in short order) - * don't trigger a preemption, if the woken up task will run on - * this cpu. (in this case the 'I will reschedule' promise of - * the waker guarantees that the freshly woken up task is going - * to be considered on this CPU.) - */ - activate_task(p, rq, cpu == this_cpu); - if (!sync || cpu != this_cpu) { - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); - } + sched_drvp->wake_up_task(p, rq, old_state, sync); success = 1; out_running: @@ -1353,19 +543,17 @@ return try_to_wake_up(p, state, 0); } +#ifdef CONFIG_SMP +static int find_idlest_cpu(struct task_struct *p, int this_cpu, + struct sched_domain *sd); +#endif + /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. 
*/ -void fastcall sched_fork(task_t *p, int clone_flags) +void fastcall sched_fork(task_t *p) { - int cpu = get_cpu(); - -#ifdef CONFIG_SMP - cpu = sched_balance_self(cpu, SD_BALANCE_FORK); -#endif - set_task_cpu(p, cpu); - /* * We mark the process as running here, but have not actually * inserted it onto the runqueue yet. This guarantees that @@ -1374,42 +562,20 @@ */ p->state = TASK_RUNNING; INIT_LIST_HEAD(&p->run_list); - p->array = NULL; + spin_lock_init(&p->switch_lock); #ifdef CONFIG_SCHEDSTATS memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - p->oncpu = 0; -#endif #ifdef CONFIG_PREEMPT - /* Want to start with kernel preemption disabled. */ - p->thread_info->preempt_count = 1; -#endif - /* - * Share the timeslice between parent and child, thus the - * total amount of pending timeslices in the system doesn't change, - * resulting in more scheduling fairness. - */ - local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; /* - * The remainder of the first timeslice might be recovered by - * the parent if the child exits early enough. + * During context-switch we hold precisely one spinlock, which + * schedule_tail drops. (in the common case it's this_rq()->lock, + * but it also can be p->switch_lock.) So we compensate with a count + * of 1. Also, we want to start with kernel preemption disabled. */ - p->first_time_slice = 1; - current->time_slice >>= 1; - p->timestamp = sched_clock(); - if (unlikely(!current->time_slice)) { - /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. Taking the - * runqueue lock is not a problem. 
- */ - current->time_slice = 1; - scheduler_tick(); - } - local_irq_enable(); - put_cpu(); + p->thread_info->preempt_count = 1; +#endif + sched_drvp->fork(p); } /* @@ -1421,167 +587,12 @@ */ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) { - unsigned long flags; - int this_cpu, cpu; - runqueue_t *rq, *this_rq; - - rq = task_rq_lock(p, &flags); - BUG_ON(p->state != TASK_RUNNING); - this_cpu = smp_processor_id(); - cpu = task_cpu(p); - - /* - * We decrease the sleep average of forking parents - * and children as well, to keep max-interactive tasks - * from forking tasks that are max-interactive. The parent - * (current) is done further down, under its lock. - */ - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - - p->prio = effective_prio(p); - - if (likely(cpu == this_cpu)) { - if (!(clone_flags & CLONE_VM)) { - /* - * The VM isn't cloned, so we're in a good position to - * do child-runs-first in anticipation of an exec. This - * usually avoids a lot of COW overhead. - */ - if (unlikely(!current->array)) - __activate_task(p, rq); - else { - p->prio = current->prio; - list_add_tail(&p->run_list, ¤t->run_list); - p->array = current->array; - p->array->nr_active++; - inc_nr_running(p, rq); - } - set_need_resched(); - } else - /* Run child last */ - __activate_task(p, rq); - /* - * We skip the following code due to cpu == this_cpu - * - * task_rq_unlock(rq, &flags); - * this_rq = task_rq_lock(current, &flags); - */ - this_rq = rq; - } else { - this_rq = cpu_rq(this_cpu); - - /* - * Not the local CPU - must adjust timestamp. This should - * get optimised away in the !CONFIG_SMP case. 
- */ - p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) - + rq->timestamp_last_tick; - __activate_task(p, rq); - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); - - /* - * Parent and child are on different CPUs, now get the - * parent runqueue to update the parent's ->sleep_avg: - */ - task_rq_unlock(rq, &flags); - this_rq = task_rq_lock(current, &flags); - } - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - task_rq_unlock(this_rq, &flags); + sched_drvp->wake_up_new_task(p, clone_flags); } -/* - * Potentially available exiting-child timeslices are - * retrieved here - this way the parent does not get - * penalized for creating too many threads. - * - * (this cannot be used to 'generate' timeslices - * artificially, because any timeslice recovered here - * was given away by the parent in the first place.) - */ void fastcall sched_exit(task_t * p) { - unsigned long flags; - runqueue_t *rq; - - /* - * If the child was a (relative-) CPU hog then decrease - * the sleep_avg of the parent as well. - */ - rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > task_timeslice(p))) - p->parent->time_slice = task_timeslice(p); - } - if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / - (EXIT_WEIGHT + 1); - task_rq_unlock(rq, &flags); -} - -/** - * prepare_task_switch - prepare to switch tasks - * @rq: the runqueue preparing to switch - * @next: the task we are going to switch to. - * - * This is called with the rq lock held and interrupts off. It must - * be paired with a subsequent finish_task_switch after the context - * switch. - * - * prepare_task_switch sets up locking and calls architecture specific - * hooks. 
- */ -static inline void prepare_task_switch(runqueue_t *rq, task_t *next) -{ - prepare_lock_switch(rq, next); - prepare_arch_switch(next); -} - -/** - * finish_task_switch - clean up after a task-switch - * @prev: the thread we just switched away from. - * - * finish_task_switch must be called after the context switch, paired - * with a prepare_task_switch call before the context switch. - * finish_task_switch will reconcile locking set up by prepare_task_switch, - * and do any other architecture-specific cleanup actions. - * - * Note that we may have delayed dropping an mm in context_switch(). If - * so, we finish that here outside of the runqueue lock. (Doing it - * with the lock held can cause deadlocks; see schedule() for - * details.) - */ -static inline void finish_task_switch(runqueue_t *rq, task_t *prev) - __releases(rq->lock) -{ - struct mm_struct *mm = rq->prev_mm; - unsigned long prev_task_flags; - - rq->prev_mm = NULL; - - /* - * A task struct has one reference for the use as "current". - * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and - * calls schedule one last time. The schedule call will never return, - * and the scheduled task must drop that reference. - * The test for EXIT_ZOMBIE must occur while the runqueue locks are - * still held, otherwise prev could be scheduled on another cpu, die - * there before we look at prev->state, and then the reference would - * be dropped twice. 
- * Manfred Spraul - */ - prev_task_flags = prev->flags; - finish_arch_switch(prev); - finish_lock_switch(rq, prev); - if (mm) - mmdrop(mm); - if (unlikely(prev_task_flags & PF_DEAD)) - put_task_struct(prev); + sched_drvp->exit(p); } /** @@ -1591,46 +602,13 @@ asmlinkage void schedule_tail(task_t *prev) __releases(rq->lock) { - runqueue_t *rq = this_rq(); - finish_task_switch(rq, prev); -#ifdef __ARCH_WANT_UNLOCKED_CTXSW - /* In this case, finish_task_switch does not reenable preemption */ - preempt_enable(); -#endif + finish_task_switch(prev); + if (current->set_child_tid) put_user(current->pid, current->set_child_tid); } /* - * context_switch - switch to the new MM and the new - * thread's register state. - */ -static inline -task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) -{ - struct mm_struct *mm = next->mm; - struct mm_struct *oldmm = prev->active_mm; - - if (unlikely(!mm)) { - next->active_mm = oldmm; - atomic_inc(&oldmm->mm_count); - enter_lazy_tlb(oldmm, next); - } else - switch_mm(oldmm, mm, next); - - if (unlikely(!prev->mm)) { - prev->active_mm = NULL; - WARN_ON(rq->prev_mm); - rq->prev_mm = oldmm; - } - - /* Here we just switch the register state and the stack. */ - switch_to(prev, next, prev); - - return prev; -} - -/* * nr_running, nr_uninterruptible and nr_context_switches: * * externally visible scheduler statistics: current number of runnable @@ -1746,6 +724,51 @@ } /* + * find_idlest_cpu - find the least busy runqueue. 
+ */ +static int find_idlest_cpu(struct task_struct *p, int this_cpu, + struct sched_domain *sd) +{ + unsigned long load, min_load, this_load; + int i, min_cpu; + cpumask_t mask; + + min_cpu = UINT_MAX; + min_load = ULONG_MAX; + + cpus_and(mask, sd->span, p->cpus_allowed); + + for_each_cpu_mask(i, mask) { + load = target_load(i, SCHED_IDLE); + + if (load < min_load) { + min_cpu = i; + min_load = load; + + /* break out early on an idle CPU: */ + if (!min_load) + break; + } + } + + /* add +1 to account for the new task */ + this_load = source_load(this_cpu, SCHED_IDLE) + SCHED_LOAD_SCALE; + + /* + * Would with the addition of the new task to the + * current CPU there be an imbalance between this + * CPU and the idlest CPU? + * + * Use half of the balancing threshold - new-context is + * a good opportunity to balance. + */ + if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100) + return min_cpu; + + return this_cpu; +} + +/* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only * allow dest_cpu, which will force the cpu onto dest_cpu. Then @@ -1778,73 +801,37 @@ } /* - * sched_exec - execve() is a valuable balancing opportunity, because at - * this point the task has the smallest effective memory and cache footprint. + * sched_exec(): find the highest-level, exec-balance-capable + * domain and try to migrate the task to the least loaded CPU. + * + * execve() is a valuable balancing opportunity, because at this point + * the task has the smallest effective memory and cache footprint. */ void sched_exec(void) { + struct sched_domain *tmp, *sd = NULL; int new_cpu, this_cpu = get_cpu(); - new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); - put_cpu(); - if (new_cpu != this_cpu) - sched_migrate_task(current, new_cpu); -} - -/* - * pull_task - move a task from a remote runqueue to the local runqueue. - * Both runqueues must be locked. 
- */ -static inline -void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, - runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) -{ - dequeue_task(p, src_array); - dec_nr_running(p, src_rq); - set_task_cpu(p, this_cpu); - inc_nr_running(p, this_rq); - enqueue_task(p, this_array); - p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) - + this_rq->timestamp_last_tick; - /* - * Note that idle threads have a prio of MAX_PRIO, for this test - * to be always true for them. - */ - if (TASK_PREEMPTS_CURR(p, this_rq)) - resched_task(this_rq->curr); -} - -/* - * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? - */ -static inline -int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, - struct sched_domain *sd, enum idle_type idle, int *all_pinned) -{ - /* - * We do not migrate tasks that are: - * 1) running (obviously), or - * 2) cannot be migrated to this CPU due to cpus_allowed, or - * 3) are cache-hot on their current CPU. - */ - if (!cpu_isset(this_cpu, p->cpus_allowed)) - return 0; - *all_pinned = 0; - if (task_running(rq, p)) - return 0; - - /* - * Aggressive migration if: - * 1) task is cache cold, or - * 2) too many balance attempts have failed. - */ + /* Prefer the current CPU if there's only this task running */ + if (this_rq()->nr_running <= 1) + goto out; - if (sd->nr_balance_failed > sd->cache_nice_tries) - return 1; + for_each_domain(this_cpu, tmp) + if (tmp->flags & SD_BALANCE_EXEC) + sd = tmp; - if (task_hot(p, rq->timestamp_last_tick, sd)) - return 0; - return 1; + if (sd) { + schedstat_inc(sd, sbe_attempts); + new_cpu = find_idlest_cpu(current, this_cpu, sd); + if (new_cpu != this_cpu) { + schedstat_inc(sd, sbe_pushed); + put_cpu(); + sched_migrate_task(current, new_cpu); + return; + } + } +out: + put_cpu(); } /* @@ -1854,81 +841,12 @@ * * Called with both runqueues locked. 
*/ -static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, +static inline int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle, int *all_pinned) + enum idle_type idle) { - prio_array_t *array, *dst_array; - struct list_head *head, *curr; - int idx, pulled = 0, pinned = 0; - task_t *tmp; - - if (max_nr_move == 0) - goto out; - - pinned = 1; - - /* - * We first consider expired tasks. Those will likely not be - * executed in the near future, and they are most likely to - * be cache-cold, thus switching CPUs has the least effect - * on them. - */ - if (busiest->expired->nr_active) { - array = busiest->expired; - dst_array = this_rq->expired; - } else { - array = busiest->active; - dst_array = this_rq->active; - } - -new_array: - /* Start searching at priority 0: */ - idx = 0; -skip_bitmap: - if (!idx) - idx = sched_find_first_bit(array->bitmap); - else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == busiest->expired && busiest->active->nr_active) { - array = busiest->active; - dst_array = this_rq->active; - goto new_array; - } - goto out; - } - - head = array->queue + idx; - curr = head->prev; -skip_queue: - tmp = list_entry(curr, task_t, run_list); - - curr = curr->prev; - - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - } - -#ifdef CONFIG_SCHEDSTATS - if (task_hot(tmp, busiest->timestamp_last_tick, sd)) - schedstat_inc(sd, lb_hot_gained[idle]); -#endif - - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); - pulled++; + int pulled = sched_drvp->move_tasks(this_rq, this_cpu, busiest, max_nr_move, sd, idle); - /* We only want to steal up to the prescribed number of tasks. 
*/ - if (pulled < max_nr_move) { - if (curr != head) - goto skip_queue; - idx++; - goto skip_bitmap; - } -out: /* * Right now, this is the only place pull_task() is called, * so we can safely collect pull_task() stats here rather than @@ -1936,8 +854,6 @@ */ schedstat_add(sd, lb_gained[idle], pulled); - if (all_pinned) - *all_pinned = pinned; return pulled; } @@ -1952,15 +868,8 @@ { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; - int load_idx; max_load = this_load = total_load = total_pwr = 0; - if (idle == NOT_IDLE) - load_idx = sd->busy_idx; - else if (idle == NEWLY_IDLE) - load_idx = sd->newidle_idx; - else - load_idx = sd->idle_idx; do { unsigned long load; @@ -1975,9 +884,9 @@ for_each_cpu_mask(i, group->cpumask) { /* Bias balancing toward cpus of our domain */ if (local_group) - load = __target_load(i, load_idx, idle); + load = target_load(i, idle); else - load = __source_load(i, load_idx, idle); + load = source_load(i, idle); avg_load += load; } @@ -1991,10 +900,12 @@ if (local_group) { this_load = avg_load; this = group; + goto nextgroup; } else if (avg_load > max_load) { max_load = avg_load; busiest = group; } +nextgroup: group = group->next; } while (group != sd->groups); @@ -2067,9 +978,15 @@ /* Get rid of the scaling factor, rounding down as we divide */ *imbalance = *imbalance / SCHED_LOAD_SCALE; + return busiest; out_balanced: + if (busiest && (idle == NEWLY_IDLE || + (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) { + *imbalance = 1; + return busiest; + } *imbalance = 0; return NULL; @@ -2078,15 +995,14 @@ /* * find_busiest_queue - find the busiest runqueue among the cpus in group. 
*/ -static runqueue_t *find_busiest_queue(struct sched_group *group, - enum idle_type idle) +static runqueue_t *find_busiest_queue(struct sched_group *group, enum idle_type idle) { unsigned long load, max_load = 0; runqueue_t *busiest = NULL; int i; for_each_cpu_mask(i, group->cpumask) { - load = __source_load(i, 0, idle); + load = source_load(i, idle); if (load > max_load) { max_load = load; @@ -2098,12 +1014,6 @@ } /* - * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but - * so long as it is large enough. - */ -#define MAX_PINNED_INTERVAL 512 - -/* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. * @@ -2115,8 +1025,7 @@ struct sched_group *group; runqueue_t *busiest; unsigned long imbalance; - int nr_moved, all_pinned = 0; - int active_balance = 0; + int nr_moved; spin_lock(&this_rq->lock); schedstat_inc(sd, lb_cnt[idle]); @@ -2133,7 +1042,15 @@ goto out_balanced; } - BUG_ON(busiest == this_rq); + /* + * This should be "impossible", but since load + * balancing is inherently racy and statistical, + * it could happen in theory. 
+ */ + if (unlikely(busiest == this_rq)) { + WARN_ON(1); + goto out_balanced; + } schedstat_add(sd, lb_imbalance[idle], imbalance); @@ -2147,15 +1064,9 @@ */ double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle, - &all_pinned); + imbalance, sd, idle); spin_unlock(&busiest->lock); - - /* All tasks on this runqueue were pinned by CPU affinity */ - if (unlikely(all_pinned)) - goto out_balanced; } - spin_unlock(&this_rq->lock); if (!nr_moved) { @@ -2163,38 +1074,36 @@ sd->nr_balance_failed++; if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { + int wake = 0; spin_lock(&busiest->lock); if (!busiest->active_balance) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; - active_balance = 1; + wake = 1; } spin_unlock(&busiest->lock); - if (active_balance) + if (wake) wake_up_process(busiest->migration_thread); /* * We've kicked active balancing, reset the failure * counter. */ - sd->nr_balance_failed = sd->cache_nice_tries+1; + sd->nr_balance_failed = sd->cache_nice_tries; } - } else - sd->nr_balance_failed = 0; - if (likely(!active_balance)) { - /* We were unbalanced, so reset the balancing interval */ - sd->balance_interval = sd->min_interval; - } else { /* - * If we've begun active balancing, start to back off. This - * case may not be covered by the all_pinned logic if there - * is only 1 task on the busy runqueue (because we don't call - * move_tasks). + * We were unbalanced, but unsuccessful in move_tasks(), + * so bump the balance_interval to lessen the lock contention. 
*/ if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; + sd->balance_interval++; + } else { + sd->nr_balance_failed = 0; + + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; } return nr_moved; @@ -2204,10 +1113,8 @@ schedstat_inc(sd, lb_balanced[idle]); - sd->nr_balance_failed = 0; /* tune up the balancing interval */ - if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || - (sd->balance_interval < sd->max_interval)) + if (sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; return 0; @@ -2231,43 +1138,38 @@ schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); if (!group) { + schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); - goto out_balanced; + goto out; } busiest = find_busiest_queue(group, NEWLY_IDLE); - if (!busiest) { + if (!busiest || busiest == this_rq) { + schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); - goto out_balanced; + goto out; } - BUG_ON(busiest == this_rq); - /* Attempt to move tasks */ double_lock_balance(this_rq, busiest); schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); nr_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, NEWLY_IDLE, NULL); + imbalance, sd, NEWLY_IDLE); if (!nr_moved) schedstat_inc(sd, lb_failed[NEWLY_IDLE]); - else - sd->nr_balance_failed = 0; spin_unlock(&busiest->lock); - return nr_moved; -out_balanced: - schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); - sd->nr_balance_failed = 0; - return 0; +out: + return nr_moved; } /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. 
*/ -static inline void idle_balance(int this_cpu, runqueue_t *this_rq) +void idle_balance(int this_cpu, runqueue_t *this_rq) { struct sched_domain *sd; @@ -2292,42 +1194,56 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) { struct sched_domain *sd; + struct sched_group *cpu_group; runqueue_t *target_rq; - int target_cpu = busiest_rq->push_cpu; - - if (busiest_rq->nr_running <= 1) - /* no task to move */ - return; - - target_rq = cpu_rq(target_cpu); + cpumask_t visited_cpus; + int cpu; /* - * This condition is "impossible", if it occurs - * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. + * Search for suitable CPUs to push tasks to in successively higher + * domains with SD_LOAD_BALANCE set. */ - BUG_ON(busiest_rq == target_rq); - - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); + visited_cpus = CPU_MASK_NONE; + for_each_domain(busiest_cpu, sd) { + if (!(sd->flags & SD_LOAD_BALANCE)) + /* no more domains to search */ + break; - /* Search for an sd spanning us and the target CPU. */ - for_each_domain(target_cpu, sd) - if ((sd->flags & SD_LOAD_BALANCE) && - cpu_isset(busiest_cpu, sd->span)) - break; + schedstat_inc(sd, alb_cnt); - if (unlikely(sd == NULL)) - goto out; + cpu_group = sd->groups; + do { + for_each_cpu_mask(cpu, cpu_group->cpumask) { + if (busiest_rq->nr_running <= 1) + /* no more tasks left to move */ + return; + if (cpu_isset(cpu, visited_cpus)) + continue; + cpu_set(cpu, visited_cpus); + if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) + continue; - schedstat_inc(sd, alb_cnt); + target_rq = cpu_rq(cpu); + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by + * Bjorn Helgaas on a 128-cpu setup. 
+ */ + BUG_ON(busiest_rq == target_rq); - if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) - schedstat_inc(sd, alb_pushed); - else - schedstat_inc(sd, alb_failed); -out: - spin_unlock(&target_rq->lock); + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); + if (move_tasks(target_rq, cpu, busiest_rq, + 1, sd, SCHED_IDLE)) { + schedstat_inc(sd, alb_pushed); + } else { + schedstat_inc(sd, alb_failed); + } + spin_unlock(&target_rq->lock); + } + cpu_group = cpu_group->next; + } while (cpu_group != sd->groups); + } } /* @@ -2342,29 +1258,23 @@ /* Don't have all balancing operations going off at once */ #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) -static void rebalance_tick(int this_cpu, runqueue_t *this_rq, - enum idle_type idle) +void rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle) { unsigned long old_load, this_load; unsigned long j = jiffies + CPU_OFFSET(this_cpu); struct sched_domain *sd; - int i; - this_load = this_rq->nr_running * SCHED_LOAD_SCALE; /* Update our load */ - for (i = 0; i < 3; i++) { - unsigned long new_load = this_load; - int scale = 1 << i; - old_load = this_rq->cpu_load[i]; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (new_load > old_load) - new_load += scale-1; - this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; - } + old_load = this_rq->cpu_load; + this_load = this_rq->nr_running * SCHED_LOAD_SCALE; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. 
+ */ + if (this_load > old_load) + old_load++; + this_rq->cpu_load = (old_load + this_load) / 2; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -2383,29 +1293,20 @@ if (j - sd->last_balance >= interval) { if (load_balance(this_cpu, this_rq, sd, idle)) { - /* We've pulled tasks over so no longer idle */ - idle = NOT_IDLE; - } - sd->last_balance += interval; - } - } -} -#else -/* - * on UP we do not need to balance between CPUs: - */ -static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) -{ -} -static inline void idle_balance(int cpu, runqueue_t *rq) -{ + /* We've pulled tasks over so no longer idle */ + idle = NOT_IDLE; + } + sd->last_balance += interval; + } + } } #endif -static inline int wake_priority_sleeper(runqueue_t *rq) +#ifdef CONFIG_SCHED_SMT +int wake_priority_sleeper(runqueue_t *rq) { int ret = 0; -#ifdef CONFIG_SCHED_SMT + spin_lock(&rq->lock); /* * If an SMT sibling task has been put to sleep for priority @@ -2416,26 +1317,16 @@ ret = 1; } spin_unlock(&rq->lock); -#endif + return ret; } +#endif DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); /* - * This is called on clock ticks and on context switches. - * Bank in p->sched_time the ns elapsed since the last tick or switch. - */ -static inline void update_cpu_clock(task_t *p, runqueue_t *rq, - unsigned long long now) -{ - unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); - p->sched_time += now - last; -} - -/* * Return current->sched_time plus any more ns on the sched_clock * that have not yet been banked. */ @@ -2451,22 +1342,6 @@ } /* - * We place interactive tasks back into the active array, if possible. - * - * To guarantee that this does not starve expired tasks we ignore the - * interactivity of a task if the first expired task had to wait more - * than a 'reasonable' amount of time. This deadline timeout is - * load-dependent, as the frequency of array switched decreases with - * increasing number of running tasks. 
We also ignore the interactivity - * if a better static_prio task has expired: - */ -#define EXPIRED_STARVING(rq) \ - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ - ((rq)->curr->static_prio > (rq)->best_expired_prio)) - -/* * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() @@ -2550,7 +1425,6 @@ */ void scheduler_tick(void) { - int cpu = smp_processor_id(); runqueue_t *rq = this_rq(); task_t *p = current; unsigned long long now = sched_clock(); @@ -2559,100 +1433,17 @@ rq->timestamp_last_tick = now; - if (p == rq->idle) { - if (wake_priority_sleeper(rq)) - goto out; - rebalance_tick(cpu, rq, SCHED_IDLE); - return; - } - - /* Task might have expired already, but not scheduled off yet */ - if (p->array != rq->active) { - set_tsk_need_resched(p); - goto out; - } - spin_lock(&rq->lock); - /* - * The task was running during this tick - update the - * time slice counter. Note: we do not update a thread's - * priority until it either goes to sleep or uses up its - * timeslice. This makes it possible for interactive tasks - * to use up their timeslices at their highest priority levels. - */ - if (rt_task(p)) { - /* - * RR tasks need a special form of timeslice management. - * FIFO tasks have no timeslices. 
- */ - if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - set_tsk_need_resched(p); - - /* put it at the end of the queue: */ - requeue_task(p, rq->active); - } - goto out_unlock; - } - if (!--p->time_slice) { - dequeue_task(p, rq->active); - set_tsk_need_resched(p); - p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { - enqueue_task(p, rq->expired); - if (p->static_prio < rq->best_expired_prio) - rq->best_expired_prio = p->static_prio; - } else - enqueue_task(p, rq->active); - } else { - /* - * Prevent a too long timeslice allowing a task to monopolize - * the CPU. We do this by splitting up the timeslice into - * smaller pieces. - * - * Note: this does not mean the task's timeslices expire or - * get lost in any way, they just might be preempted by - * another task of equal priority. (one with higher - * priority would have preempted this task already.) We - * requeue this task to the end of the list on this priority - * level, which is in essence a round-robin of tasks with - * equal priority. - * - * This only applies to tasks in the interactive - * delta range with at least TIMESLICE_GRANULARITY to requeue. 
- */ - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - - p->time_slice) % TIMESLICE_GRANULARITY(p)) && - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { - - requeue_task(p, rq->active); - set_tsk_need_resched(p); - } - } -out_unlock: - spin_unlock(&rq->lock); -out: - rebalance_tick(cpu, rq, NOT_IDLE); + sched_drvp->tick(p, rq, now); } #ifdef CONFIG_SCHED_SMT -static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) +void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) { - struct sched_domain *tmp, *sd = NULL; + struct sched_domain *sd = this_rq->sd; cpumask_t sibling_map; int i; - for_each_domain(this_cpu, tmp) - if (tmp->flags & SD_SHARE_CPUPOWER) - sd = tmp; - - if (!sd) + if (!(sd->flags & SD_SHARE_CPUPOWER)) return; /* @@ -2691,19 +1482,14 @@ */ } -static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) +int dependent_sleeper(int this_cpu, runqueue_t *this_rq) { - struct sched_domain *tmp, *sd = NULL; + struct sched_domain *sd = this_rq->sd; cpumask_t sibling_map; - prio_array_t *array; int ret = 0, i; task_t *p; - for_each_domain(this_cpu, tmp) - if (tmp->flags & SD_SHARE_CPUPOWER) - sd = tmp; - - if (!sd) + if (!(sd->flags & SD_SHARE_CPUPOWER)) return 0; /* @@ -2722,13 +1508,8 @@ */ if (!this_rq->nr_running) goto out_unlock; - array = this_rq->active; - if (!array->nr_active) - array = this_rq->expired; - BUG_ON(!array->nr_active); - p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, - task_t, run_list); + p = sched_drvp->head_of_queue(&this_rq->qu); for_each_cpu_mask(i, sibling_map) { runqueue_t *smt_rq = cpu_rq(i); @@ -2742,9 +1523,7 @@ * task from using an unfair proportion of the * physical cpu's resources. 
-ck */ - if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(p) || rt_task(smt_curr)) && - p->mm && smt_curr->mm && !rt_task(p)) + if (sched_drvp->dependent_sleeper_trumps(smt_curr, p, sd)) ret = 1; /* @@ -2752,9 +1531,7 @@ * or wake it up if it has been put to sleep for priority * reasons. */ - if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(smt_curr) || rt_task(p)) && - smt_curr->mm && p->mm && !rt_task(smt_curr)) || + if (sched_drvp->dependent_sleeper_trumps(p, smt_curr, sd) || (smt_curr == smt_rq->idle && smt_rq->nr_running)) resched_task(smt_curr); } @@ -2763,15 +1540,6 @@ spin_unlock(&cpu_rq(i)->lock); return ret; } -#else -static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) -{ -} - -static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) -{ - return 0; -} #endif #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) @@ -2811,14 +1579,8 @@ */ asmlinkage void __sched schedule(void) { - long *switch_count; - task_t *prev, *next; + task_t *prev; runqueue_t *rq; - prio_array_t *array; - struct list_head *queue; - unsigned long long now; - unsigned long run_time; - int cpu, idx, new_prio; /* * Test if we are atomic. Since do_exit() needs to call into @@ -2852,136 +1614,8 @@ } schedstat_inc(rq, sched_cnt); - now = sched_clock(); - if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { - run_time = now - prev->timestamp; - if (unlikely((long long)(now - prev->timestamp) < 0)) - run_time = 0; - } else - run_time = NS_MAX_SLEEP_AVG; - /* - * Tasks charged proportionately less run_time at high sleep_avg to - * delay them losing their interactive status - */ - run_time /= (CURRENT_BONUS(prev) ? 
: 1); - - spin_lock_irq(&rq->lock); - - if (unlikely(prev->flags & PF_DEAD)) - prev->state = EXIT_DEAD; - - switch_count = &prev->nivcsw; - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - switch_count = &prev->nvcsw; - if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - unlikely(signal_pending(prev)))) - prev->state = TASK_RUNNING; - else { - if (prev->state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible++; - deactivate_task(prev, rq); - } - } - - cpu = smp_processor_id(); - if (unlikely(!rq->nr_running)) { -go_idle: - idle_balance(cpu, rq); - if (!rq->nr_running) { - next = rq->idle; - rq->expired_timestamp = 0; - wake_sleeping_dependent(cpu, rq); - /* - * wake_sleeping_dependent() might have released - * the runqueue, so break out if we got new - * tasks meanwhile: - */ - if (!rq->nr_running) - goto switch_tasks; - } - } else { - if (dependent_sleeper(cpu, rq)) { - next = rq->idle; - goto switch_tasks; - } - /* - * dependent_sleeper() releases and reacquires the runqueue - * lock, hence go into the idle loop if the rq went - * empty meanwhile: - */ - if (unlikely(!rq->nr_running)) - goto go_idle; - } - - array = rq->active; - if (unlikely(!array->nr_active)) { - /* - * Switch the active and expired arrays. 
- */ - schedstat_inc(rq, sched_switch); - rq->active = rq->expired; - rq->expired = array; - array = rq->active; - rq->expired_timestamp = 0; - rq->best_expired_prio = MAX_PRIO; - } - - idx = sched_find_first_bit(array->bitmap); - queue = array->queue + idx; - next = list_entry(queue->next, task_t, run_list); - - if (!rt_task(next) && next->activated > 0) { - unsigned long long delta = now - next->timestamp; - if (unlikely((long long)(now - next->timestamp) < 0)) - delta = 0; - - if (next->activated == 1) - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - - array = next->array; - new_prio = recalc_task_prio(next, next->timestamp + delta); - - if (unlikely(next->prio != new_prio)) { - dequeue_task(next, array); - next->prio = new_prio; - enqueue_task(next, array); - } else - requeue_task(next, array); - } - next->activated = 0; -switch_tasks: - if (next == rq->idle) - schedstat_inc(rq, sched_goidle); - prefetch(next); - clear_tsk_need_resched(prev); - rcu_qsctr_inc(task_cpu(prev)); - - update_cpu_clock(prev, rq, now); - - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) - prev->sleep_avg = 0; - prev->timestamp = prev->last_ran = now; - - sched_info_switch(prev, next); - if (likely(prev != next)) { - next->timestamp = now; - rq->nr_switches++; - rq->curr = next; - ++*switch_count; - - prepare_task_switch(rq, next); - prev = context_switch(rq, prev, next); - barrier(); - /* - * this_rq must be evaluated again because prev may have moved - * CPUs since it called schedule(), thus the 'rq' on its stack - * frame will be invalid. 
- */ - finish_task_switch(this_rq(), prev); - } else - spin_unlock_irq(&rq->lock); + sched_drvp->schedule(); prev = current; if (unlikely(reacquire_kernel_lock(prev) < 0)) @@ -3392,9 +2026,7 @@ void set_user_nice(task_t *p, long nice) { unsigned long flags; - prio_array_t *array; runqueue_t *rq; - int old_prio, new_prio, delta; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -3410,31 +2042,17 @@ * not SCHED_NORMAL: */ if (rt_task(p)) { - p->static_prio = NICE_TO_PRIO(nice); + int new_static_prio = NICE_TO_PRIO(nice); + + if (task_is_queued(p)) { + dec_prio_bias(rq, p->static_prio); + inc_prio_bias(rq, new_static_prio); + } + p->static_prio = new_static_prio; goto out_unlock; } - array = p->array; - if (array) { - dequeue_task(p, array); - dec_prio_bias(rq, p->static_prio); - } - old_prio = p->prio; - new_prio = NICE_TO_PRIO(nice); - delta = new_prio - old_prio; - p->static_prio = NICE_TO_PRIO(nice); - p->prio += delta; - - if (array) { - enqueue_task(p, array); - inc_prio_bias(rq, p->static_prio); - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - if (delta < 0 || (delta > 0 && task_running(rq, p))) - resched_task(rq->curr); - } + sched_drvp->set_normal_task_nice(p, nice); out_unlock: task_rq_unlock(rq, &flags); } @@ -3555,9 +2173,9 @@ } /* Actually do priority change: must hold rq lock. 
*/ -static void __setscheduler(struct task_struct *p, int policy, int prio) +void __setscheduler(struct task_struct *p, int policy, int prio) { - BUG_ON(p->array); + BUG_ON(task_is_queued(p)); p->policy = policy; p->rt_priority = prio; if (policy != SCHED_NORMAL) @@ -3576,8 +2194,7 @@ int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { int retval; - int oldprio, oldpolicy = -1; - prio_array_t *array; + int oldpolicy = -1; unsigned long flags; runqueue_t *rq; @@ -3598,24 +2215,13 @@ if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) return -EINVAL; - /* - * Allow unprivileged RT tasks to decrease priority: - */ - if (!capable(CAP_SYS_NICE)) { - /* can't change policy */ - if (policy != p->policy) - return -EPERM; - /* can't increase priority */ - if (policy != SCHED_NORMAL && - param->sched_priority > p->rt_priority && - param->sched_priority > - p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) - return -EPERM; - /* can't change other user's priorities */ - if ((current->euid != p->euid) && - (current->euid != p->uid)) - return -EPERM; - } + if ((policy == SCHED_FIFO || policy == SCHED_RR) && + param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur && + !capable(CAP_SYS_NICE)) + return -EPERM; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + return -EPERM; retval = security_task_setscheduler(p, policy, param); if (retval) @@ -3631,24 +2237,9 @@ task_rq_unlock(rq, &flags); goto recheck; } - array = p->array; - if (array) - deactivate_task(p, rq); - oldprio = p->prio; - __setscheduler(p, policy, param->sched_priority); - if (array) { - __activate_task(p, rq); - /* - * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's - */ - if (task_running(rq, p)) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else if (TASK_PREEMPTS_CURR(p, rq)) - 
resched_task(rq->curr); - } + + sched_drvp->setscheduler(p, policy, param->sched_priority); + task_rq_unlock(rq, &flags); return 0; } @@ -3906,48 +2497,7 @@ */ asmlinkage long sys_sched_yield(void) { - runqueue_t *rq = this_rq_lock(); - prio_array_t *array = current->array; - prio_array_t *target = rq->expired; - - schedstat_inc(rq, yld_cnt); - /* - * We implement yielding by moving the task into the expired - * queue. - * - * (special rule: RT tasks will just roundrobin in the active - * array.) - */ - if (rt_task(current)) - target = rq->active; - - if (current->array->nr_active == 1) { - schedstat_inc(rq, yld_act_empty); - if (!rq->expired->nr_active) - schedstat_inc(rq, yld_both_empty); - } else if (!rq->expired->nr_active) - schedstat_inc(rq, yld_exp_empty); - - if (array != target) { - dequeue_task(current, array); - enqueue_task(current, target); - } else - /* - * requeue_task is cheaper so perform that if possible. - */ - requeue_task(current, array); - - /* - * Since we are going to call schedule() anyway, there's - * no need to preempt or enable interrupts: - */ - __release(rq->lock); - _raw_spin_unlock(&rq->lock); - preempt_enable_no_resched(); - - schedule(); - - return 0; + return sched_drvp->sys_yield(); } static inline void __cond_resched(void) @@ -4021,8 +2571,7 @@ */ void __sched yield(void) { - set_current_state(TASK_RUNNING); - sys_sched_yield(); + sched_drvp->yield(); } EXPORT_SYMBOL(yield); @@ -4238,32 +2787,19 @@ read_unlock(&tasklist_lock); } -/** - * init_idle - set up an idle thread for a given CPU - * @idle: task in question - * @cpu: cpu the idle task belongs to - * - * NOTE: this function does not set the idle thread's NEED_RESCHED - * flag, to make booting more robust. Architecture-level cpu_idle() - * functions should set it explicitly, before entering their idle-loop. 
- */ void __devinit init_idle(task_t *idle, int cpu) { runqueue_t *rq = cpu_rq(cpu); unsigned long flags; - idle->sleep_avg = 0; - idle->array = NULL; - idle->prio = MAX_PRIO; + sched_drvp->init_idle(idle, cpu); idle->state = TASK_RUNNING; idle->cpus_allowed = cpumask_of_cpu(cpu); set_task_cpu(idle, cpu); spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - idle->oncpu = 1; -#endif + set_tsk_need_resched(idle); spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ @@ -4316,8 +2852,6 @@ migration_req_t req; runqueue_t *rq; - perfctr_set_cpus_allowed(p, new_mask); - rq = task_rq_lock(p, &flags); if (!cpus_intersects(new_mask, cpu_online_map)) { ret = -EINVAL; @@ -4371,21 +2905,10 @@ if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; - set_task_cpu(p, dest_cpu); - if (p->array) { - /* - * Sync timestamp with rq_dest's before activating. - * The same thing could be achieved by doing this step - * afterwards, and pretending it was a local activate. - * This way is cleaner and logically correct. 
- */ - p->timestamp = p->timestamp - rq_src->timestamp_last_tick - + rq_dest->timestamp_last_tick; - deactivate_task(p, rq_src); - activate_task(p, rq_dest, 0); - if (TASK_PREEMPTS_CURR(p, rq_dest)) - resched_task(rq_dest->curr); - } + if (task_is_queued(p)) + sched_drvp->migrate_queued_task(p, dest_cpu); + else + set_task_cpu(p, dest_cpu); out: double_rq_unlock(rq_src, rq_dest); @@ -4435,9 +2958,17 @@ req = list_entry(head->next, migration_req_t, list); list_del_init(head->next); - spin_unlock(&rq->lock); - __migrate_task(req->task, cpu, req->dest_cpu); - local_irq_enable(); + if (req->type == REQ_MOVE_TASK) { + spin_unlock(&rq->lock); + __migrate_task(req->task, cpu, req->dest_cpu); + local_irq_enable(); + } else if (req->type == REQ_SET_DOMAIN) { + rq->sd = req->sd; + spin_unlock_irq(&rq->lock); + } else { + spin_unlock_irq(&rq->lock); + WARN_ON(1); + } complete(&req->done); } @@ -4535,7 +3066,6 @@ { int cpu = smp_processor_id(); runqueue_t *rq = this_rq(); - struct task_struct *p = rq->idle; unsigned long flags; /* cpu has to be offline */ @@ -4546,9 +3076,7 @@ */ spin_lock_irqsave(&rq->lock, flags); - __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); - /* Add idle task to _front_ of it's priority queue */ - __activate_idle_task(p, rq); + sched_drvp->set_select_idle_first(rq); spin_unlock_irqrestore(&rq->lock, flags); } @@ -4567,7 +3095,7 @@ mmdrop(mm); } -static void migrate_dead(unsigned int dead_cpu, task_t *tsk) +void migrate_dead(unsigned int dead_cpu, task_t *tsk) { struct runqueue *rq = cpu_rq(dead_cpu); @@ -4592,20 +3120,9 @@ } /* release_task() removes task from tasklist, so we won't find dead tasks. 
*/ -static void migrate_dead_tasks(unsigned int dead_cpu) +static inline void migrate_dead_tasks(unsigned int dead_cpu) { - unsigned arr, i; - struct runqueue *rq = cpu_rq(dead_cpu); - - for (arr = 0; arr < 2; arr++) { - for (i = 0; i < MAX_PRIO; i++) { - struct list_head *list = &rq->arrays[arr].queue[i]; - while (!list_empty(list)) - migrate_dead(dead_cpu, - list_entry(list->next, task_t, - run_list)); - } - } + sched_drvp->migrate_dead_tasks(dead_cpu); } #endif /* CONFIG_HOTPLUG_CPU */ @@ -4652,9 +3169,7 @@ rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ rq = task_rq_lock(rq->idle, &flags); - deactivate_task(rq->idle, rq); - rq->idle->static_prio = MAX_PRIO; - __setscheduler(rq->idle, SCHED_NORMAL, 0); + sched_drvp->set_select_idle_last(rq); migrate_dead_tasks(cpu); task_rq_unlock(rq, &flags); migrate_nr_uninterruptible(rq); @@ -4668,6 +3183,7 @@ migration_req_t *req; req = list_entry(rq->migration_queue.next, migration_req_t, list); + BUG_ON(req->type != REQ_MOVE_TASK); list_del_init(&req->list); complete(&req->done); } @@ -4698,17 +3214,12 @@ #endif #ifdef CONFIG_SMP -#undef SCHED_DOMAIN_DEBUG +#define SCHED_DOMAIN_DEBUG #ifdef SCHED_DOMAIN_DEBUG static void sched_domain_debug(struct sched_domain *sd, int cpu) { int level = 0; - if (!sd) { - printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); - return; - } - printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); do { @@ -4791,204 +3302,37 @@ #define sched_domain_debug(sd, cpu) {} #endif -#if defined(CONFIG_DEBUG_KERNEL) && defined(CONFIG_SYSCTL) -static struct ctl_table sd_ctl_dir[] = { - {1, "sched_domain", NULL, 0, 0755, NULL, }, - {0,}, -}; - -static struct ctl_table sd_ctl_root[] = { - {1, "kernel", NULL, 0, 0755, sd_ctl_dir, }, - {0,}, -}; - -static char *sched_strdup(char *str) -{ - int n = strlen(str)+1; - char *s = kmalloc(n, GFP_KERNEL); - if (!s) - return NULL; - return strcpy(s, str); -} - -static struct ctl_table *sd_alloc_ctl_entry(int n) -{ - 
struct ctl_table *entry = - kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); - BUG_ON(!entry); - memset(entry, 0, n * sizeof(struct ctl_table)); - return entry; -} - -static void set_table_entry(struct ctl_table *entry, int ctl_name, - const char *procname, void *data, int maxlen, - mode_t mode, proc_handler *proc_handler) -{ - entry->ctl_name = ctl_name; - entry->procname = procname; - entry->data = data; - entry->maxlen = maxlen; - entry->mode = mode; - entry->proc_handler = proc_handler; -} - -static struct ctl_table * -sd_alloc_ctl_domain_table(struct sched_domain *sd) -{ - struct ctl_table *table; - table = sd_alloc_ctl_entry(14); - - set_table_entry(&table[0], 1, "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[1], 2, "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[2], 3, "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[3], 4, "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[4], 5, "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], 6, "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[6], 7, "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[7], 8, "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[8], 9, "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[9], 10, "cache_hot_time", &sd->cache_hot_time, - sizeof(long long), 0644, proc_doulongvec_minmax); - set_table_entry(&table[10], 11, "cache_nice_tries", &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[11], 12, "per_cpu_gain", &sd->per_cpu_gain, - sizeof(int), 0644, proc_dointvec_minmax); - 
set_table_entry(&table[12], 13, "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax); - return table; -} - -static ctl_table *sd_alloc_ctl_cpu_table(int cpu) -{ - struct sched_domain *sd; - int domain_num = 0, i; - struct ctl_table *entry, *table; - char buf[32]; - for_each_domain(cpu, sd) - domain_num++; - entry = table = sd_alloc_ctl_entry(domain_num + 1); - - i = 0; - for_each_domain(cpu, sd) { - snprintf(buf, 32, "domain%d", i); - entry->ctl_name = i + 1; - entry->procname = sched_strdup(buf); - entry->mode = 0755; - entry->child = sd_alloc_ctl_domain_table(sd); - entry++; - i++; - } - return table; -} - -static struct ctl_table_header *sd_sysctl_header; -static void init_sched_domain_sysctl(void) -{ - int i, cpu_num = num_online_cpus(); - char buf[32]; - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); - - sd_ctl_dir[0].child = entry; - - for (i = 0; i < cpu_num; i++, entry++) { - snprintf(buf, 32, "cpu%d", i); - entry->ctl_name = i + 1; - entry->procname = sched_strdup(buf); - entry->mode = 0755; - entry->child = sd_alloc_ctl_cpu_table(i); - } - sd_sysctl_header = register_sysctl_table(sd_ctl_root, 0); -} -#else -static void init_sched_domain_sysctl(void) -{ -} -#endif - -static int sd_degenerate(struct sched_domain *sd) -{ - if (cpus_weight(sd->span) == 1) - return 1; - - /* Following flags need at least 2 groups */ - if (sd->flags & (SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC)) { - if (sd->groups != sd->groups->next) - return 0; - } - - /* Following flags don't use groups */ - if (sd->flags & (SD_WAKE_IDLE | - SD_WAKE_AFFINE | - SD_WAKE_BALANCE)) - return 0; - - return 1; -} - -static int sd_parent_degenerate(struct sched_domain *sd, - struct sched_domain *parent) -{ - unsigned long cflags = sd->flags, pflags = parent->flags; - - if (sd_degenerate(parent)) - return 1; - - if (!cpus_equal(sd->span, parent->span)) - return 0; - - /* Does parent contain flags not in child? 
*/ - /* WAKE_BALANCE is a subset of WAKE_AFFINE */ - if (cflags & SD_WAKE_AFFINE) - pflags &= ~SD_WAKE_BALANCE; - /* Flags needing groups don't count if only 1 group in parent */ - if (parent->groups == parent->groups->next) { - pflags &= ~(SD_LOAD_BALANCE | - SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC); - } - if (~cflags & pflags) - return 0; - - return 1; -} - /* * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. */ -void cpu_attach_domain(struct sched_domain *sd, int cpu) +void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) { + migration_req_t req; + unsigned long flags; runqueue_t *rq = cpu_rq(cpu); - struct sched_domain *tmp; + int local = 1; - /* Remove the sched domains which do not contribute to scheduling. */ - for (tmp = sd; tmp; tmp = tmp->parent) { - struct sched_domain *parent = tmp->parent; - if (!parent) - break; - if (sd_parent_degenerate(tmp, parent)) - tmp->parent = parent->parent; - } + sched_domain_debug(sd, cpu); - if (sd && sd_degenerate(sd)) - sd = sd->parent; + spin_lock_irqsave(&rq->lock, flags); - sched_domain_debug(sd, cpu); + if (cpu == smp_processor_id() || !cpu_online(cpu)) { + rq->sd = sd; + } else { + init_completion(&req.done); + req.type = REQ_SET_DOMAIN; + req.sd = sd; + list_add(&req.list, &rq->migration_queue); + local = 0; + } + + spin_unlock_irqrestore(&rq->lock, flags); - rcu_assign_pointer(rq->sd, sd); + if (!local) { + wake_up_process(rq->migration_thread); + wait_for_completion(&req.done); + } } /* cpus with isolated domains */ @@ -5020,7 +3364,7 @@ * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. 
*/ -void init_sched_build_groups(struct sched_group groups[], +void __devinit init_sched_build_groups(struct sched_group groups[], cpumask_t span, int (*group_fn)(int cpu)) { struct sched_group *first = NULL, *last = NULL; @@ -5056,14 +3400,13 @@ #ifdef ARCH_HAS_SCHED_DOMAIN -extern void build_sched_domains(const cpumask_t *cpu_map); -extern void arch_init_sched_domains(const cpumask_t *cpu_map); -extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); +extern void __devinit arch_init_sched_domains(void); +extern void __devinit arch_destroy_sched_domains(void); #else #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static struct sched_group sched_group_cpus[NR_CPUS]; -static int cpu_to_cpu_group(int cpu) +static int __devinit cpu_to_cpu_group(int cpu) { return cpu; } @@ -5071,7 +3414,7 @@ static DEFINE_PER_CPU(struct sched_domain, phys_domains); static struct sched_group sched_group_phys[NR_CPUS]; -static int cpu_to_phys_group(int cpu) +static int __devinit cpu_to_phys_group(int cpu) { #ifdef CONFIG_SCHED_SMT return first_cpu(cpu_sibling_map[cpu]); @@ -5084,7 +3427,7 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains); static struct sched_group sched_group_nodes[MAX_NUMNODES]; -static int cpu_to_node_group(int cpu) +static int __devinit cpu_to_node_group(int cpu) { return cpu_to_node(cpu); } @@ -5115,28 +3458,39 @@ #endif /* - * Build sched domains for a given set of cpus and attach the sched domains - * to the individual cpus + * Set up scheduler domains and groups. Callers must hold the hotplug lock. */ -static void build_sched_domains(const cpumask_t *cpu_map) +static void __devinit arch_init_sched_domains(void) { int i; + cpumask_t cpu_default_map; + +#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) + check_sibling_maps(); +#endif + /* + * Setup mask for cpus without special case scheduling requirements. 
+ * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. + */ + cpus_complement(cpu_default_map, cpu_isolated_map); + cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); /* - * Set up domains for cpus specified by the cpu_map. + * Set up domains. Isolated domains just stay on the dummy domain. */ - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask(i, cpu_default_map) { int group; struct sched_domain *sd = NULL, *p; cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); - cpus_and(nodemask, nodemask, *cpu_map); + cpus_and(nodemask, nodemask, cpu_default_map); #ifdef CONFIG_NUMA sd = &per_cpu(node_domains, i); group = cpu_to_node_group(i); *sd = SD_NODE_INIT; - sd->span = *cpu_map; + sd->span = cpu_default_map; sd->groups = &sched_group_nodes[group]; #endif @@ -5154,7 +3508,7 @@ group = cpu_to_cpu_group(i); *sd = SD_SIBLING_INIT; sd->span = cpu_sibling_map[i]; - cpus_and(sd->span, sd->span, *cpu_map); + cpus_and(sd->span, sd->span, cpu_default_map); sd->parent = p; sd->groups = &sched_group_cpus[group]; #endif @@ -5164,7 +3518,7 @@ /* Set up CPU (sibling) groups */ for_each_online_cpu(i) { cpumask_t this_sibling_map = cpu_sibling_map[i]; - cpus_and(this_sibling_map, this_sibling_map, *cpu_map); + cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); if (i != first_cpu(this_sibling_map)) continue; @@ -5177,7 +3531,7 @@ for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); - cpus_and(nodemask, nodemask, *cpu_map); + cpus_and(nodemask, nodemask, cpu_default_map); if (cpus_empty(nodemask)) continue; @@ -5187,12 +3541,12 @@ #ifdef CONFIG_NUMA /* Set up node groups */ - init_sched_build_groups(sched_group_nodes, *cpu_map, + init_sched_build_groups(sched_group_nodes, cpu_default_map, &cpu_to_node_group); #endif /* Calculate CPU power for physical packages and nodes */ - for_each_cpu_mask(i, *cpu_map) { + for_each_cpu_mask(i, cpu_default_map) { int power; struct sched_domain 
*sd; #ifdef CONFIG_SCHED_SMT @@ -5216,7 +3570,7 @@ } /* Attach the domains */ - for_each_cpu_mask(i, *cpu_map) { + for_each_online_cpu(i) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); @@ -5226,85 +3580,41 @@ cpu_attach_domain(sd, i); } } -/* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. - */ -static void arch_init_sched_domains(cpumask_t *cpu_map) -{ - cpumask_t cpu_default_map; - -#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) - check_sibling_maps(); -#endif - /* - * Setup mask for cpus without special case scheduling requirements. - * For now this just excludes isolated cpus, but could be used to - * exclude other special cases in the future. - */ - cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); - - build_sched_domains(&cpu_default_map); -} -static void arch_destroy_sched_domains(const cpumask_t *cpu_map) +#ifdef CONFIG_HOTPLUG_CPU +static void __devinit arch_destroy_sched_domains(void) { /* Do nothing: everything is statically allocated. */ } +#endif #endif /* ARCH_HAS_SCHED_DOMAIN */ /* - * Detach sched domains from a group of cpus specified in cpu_map - * These cpus will now be attached to the NULL domain + * Initial dummy domain for early boot and for hotplug cpu. Being static, + * it is initialized to zero, so all balancing flags are cleared which is + * what we want. */ -static inline void detach_destroy_domains(const cpumask_t *cpu_map) -{ - int i; - - for_each_cpu_mask(i, *cpu_map) - cpu_attach_domain(NULL, i); - synchronize_sched(); - arch_destroy_sched_domains(cpu_map); -} - -/* - * Partition sched domains as specified by the cpumasks below. 
- * This attaches all cpus from the cpumasks to the NULL domain, - * waits for a RCU quiescent period, recalculates sched - * domain information and then attaches them back to the - * correct sched domains - * Call with hotplug lock held - */ -void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) -{ - cpumask_t change_map; - - cpus_and(*partition1, *partition1, cpu_online_map); - cpus_and(*partition2, *partition2, cpu_online_map); - cpus_or(change_map, *partition1, *partition2); - - /* Detach sched domains from all of the affected cpus */ - detach_destroy_domains(&change_map); - if (!cpus_empty(*partition1)) - build_sched_domains(partition1); - if (!cpus_empty(*partition2)) - build_sched_domains(partition2); -} +static struct sched_domain sched_domain_dummy; #ifdef CONFIG_HOTPLUG_CPU /* * Force a reinitialization of the sched domains hierarchy. The domains * and groups cannot be updated in place without racing with the balancing - * code, so we temporarily attach all running cpus to the NULL domain + * code, so we temporarily attach all running cpus to a "dummy" domain * which will prevent rebalancing while the sched domains are recalculated. 
*/ static int update_sched_domains(struct notifier_block *nfb, unsigned long action, void *hcpu) { + int i; + switch (action) { case CPU_UP_PREPARE: case CPU_DOWN_PREPARE: - detach_destroy_domains(&cpu_online_map); + for_each_online_cpu(i) + cpu_attach_domain(&sched_domain_dummy, i); + arch_destroy_sched_domains(); return NOTIFY_OK; case CPU_UP_CANCELED: @@ -5320,7 +3630,7 @@ } /* The hotplug lock is already held by cpu_up/cpu_down */ - arch_init_sched_domains(&cpu_online_map); + arch_init_sched_domains(); return NOTIFY_OK; } @@ -5329,11 +3639,10 @@ void __init sched_init_smp(void) { lock_cpu_hotplug(); - arch_init_sched_domains(&cpu_online_map); + arch_init_sched_domains(); unlock_cpu_hotplug(); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); - init_sched_domain_sysctl(); } #else void __init sched_init_smp(void) @@ -5350,25 +3659,25 @@ && addr < (unsigned long)__sched_text_end); } +void set_oom_time_slice(struct task_struct *p, unsigned long t) +{ + sched_drvp->set_oom_time_slice(p, t); +} + void __init sched_init(void) { runqueue_t *rq; - int i, j, k; + int i; - for (i = 0; i < NR_CPUS; i++) { - prio_array_t *array; + sched_drvp->sched_init(); + for (i = 0; i < NR_CPUS; i++) { rq = cpu_rq(i); spin_lock_init(&rq->lock); - rq->nr_running = 0; - rq->active = rq->arrays; - rq->expired = rq->arrays + 1; - rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP - rq->sd = NULL; - for (j = 1; j < 3; j++) - rq->cpu_load[j] = 0; + rq->sd = &sched_domain_dummy; + rq->cpu_load = 0; rq->active_balance = 0; rq->push_cpu = 0; rq->migration_thread = NULL; @@ -5376,15 +3685,7 @@ #endif atomic_set(&rq->nr_iowait, 0); - for (j = 0; j < 2; j++) { - array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { - INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); - } + sched_drvp->init_runqueue_queue(&rq->qu); } /* @@ -5428,27 +3729,11 @@ void 
normalize_rt_tasks(void) { struct task_struct *p; - prio_array_t *array; - unsigned long flags; - runqueue_t *rq; read_lock_irq(&tasklist_lock); for_each_process (p) { - if (!rt_task(p)) - continue; - - rq = task_rq_lock(p, &flags); - - array = p->array; - if (array) - deactivate_task(p, task_rq(p)); - __setscheduler(p, SCHED_NORMAL, 0); - if (array) { - __activate_task(p, task_rq(p)); - resched_task(rq->curr); - } - - task_rq_unlock(rq, &flags); + if (rt_task(p)) + sched_drvp->normalize_rt_task(p); } read_unlock_irq(&tasklist_lock); } diff -Naur linux-2.6.12-rc5-mm1/kernel/sched_cpustats.c linux-2.6.12-rc5-mm1-plug/kernel/sched_cpustats.c --- linux-2.6.12-rc5-mm1/kernel/sched_cpustats.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.12-rc5-mm1-plug/kernel/sched_cpustats.c 2005-05-25 17:05:49.632977792 -0700 @@ -0,0 +1,399 @@ +/* + * kernel/sched_cpustats.c + * + * Kernel high resolution cpu statistics for use by schedulers + * + * Copyright (C) 2004 Aurema Pty Ltd + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include + +#ifndef task_is_sinbinned +#define task_is_sinbinned(p) (0) +#endif + +DEFINE_PER_CPU(struct runq_cpustats, cpustats_runqs); + +void init_runq_cpustats(unsigned int cpu) +{ + struct runq_cpustats *csrq = &per_cpu(cpustats_runqs, cpu); + + csrq->total_delay = 0; + csrq->total_sinbin = 0; + csrq->total_rt_delay = 0; + csrq->total_intr_delay = 0; + csrq->total_rt_intr_delay = 0; + csrq->total_fork_delay = 0; + cpu_rq(cpu)->timestamp_last_tick = INITIAL_CPUSTATS_TIMESTAMP; +} + +#ifdef CONFIG_SMP +unsigned long long adjusted_sched_clock(const task_t *p) +{ + return sched_clock() + (task_rq(p)->timestamp_last_tick - this_rq()->timestamp_last_tick); +} +#endif + +void initialize_cpustats(struct task_struct *p, unsigned long long now) +{ + TASK_CPUSTATS(p).avg_sleep_per_cycle = 0; + TASK_CPUSTATS(p).avg_delay_per_cycle = 0; + TASK_CPUSTATS(p).avg_cpu_per_cycle = 0; + TASK_CPUSTATS(p).total_sleep = 0; + TASK_CPUSTATS(p).total_delay = 0; + TASK_CPUSTATS(p).total_sinbin = 0; + TASK_CPUSTATS(p).total_cpu = 0; + TASK_CPUSTATS(p).total_wake_ups = 0; + TASK_CPUSTATS(p).intr_wake_ups = 0; + TASK_CPUSTATS(p).avg_cycle_length = 0; + p->timestamp = now; + TASK_CPUSTATS(p).flags = CPUSTATS_JUST_FORKED_FL; +} + +void delta_sleep_cpustats(struct task_struct *p, unsigned long long now) +{ + unsigned long long delta; + + /* sched_clock() is not guaranteed monotonic */ + if (now <= p->timestamp) { + p->timestamp = now; + return; + } + + delta = now - p->timestamp; + p->timestamp = now; + TASK_CPUSTATS(p).avg_sleep_per_cycle += delta; + TASK_CPUSTATS(p).total_sleep += delta; +} + +void delta_cpu_cpustats(struct task_struct *p, unsigned long long now) +{ + unsigned long long delta; + + /* sched_clock() is not guaranteed monotonic */ + if (now <= 
p->timestamp) { + p->timestamp = now; + return; + } + + delta = now - p->timestamp; + p->timestamp = now; + TASK_CPUSTATS(p).avg_cpu_per_cycle += delta; + TASK_CPUSTATS(p).total_cpu += delta; +} + +void delta_delay_cpustats(struct task_struct *p, unsigned long long now) +{ + unsigned long long delta; + struct runq_cpustats *rq_stats; + + /* sched_clock() is not guaranteed monotonic */ + if (now <= p->timestamp) { + p->timestamp = now; + return; + } + + rq_stats = &per_cpu(cpustats_runqs, task_cpu(p)); + delta = now - p->timestamp; + p->timestamp = now; + TASK_CPUSTATS(p).avg_delay_per_cycle += delta; + TASK_CPUSTATS(p).total_delay += delta; + rq_stats->total_delay += delta; + if (task_is_sinbinned(p)) { + TASK_CPUSTATS(p).total_sinbin += delta; + rq_stats->total_sinbin += delta; + } else if (rt_task(p)) { /* rt tasks are never sinbinned */ + rq_stats->total_rt_delay += delta; + if (TASK_CPUSTATS(p).flags & CPUSTATS_WOKEN_FOR_INTR_FL) + rq_stats->total_rt_intr_delay += delta; + } + if (unlikely(TASK_CPUSTATS(p).flags & CPUSTATS_JUST_FORKED_FL)) { + rq_stats->total_fork_delay += delta; + TASK_CPUSTATS(p).flags &= ~CPUSTATS_JUST_FORKED_FL; + } + if (TASK_CPUSTATS(p).flags & CPUSTATS_WOKEN_FOR_INTR_FL) { + rq_stats->total_intr_delay += delta; + TASK_CPUSTATS(p).flags &= ~CPUSTATS_WOKEN_FOR_INTR_FL; + } +} + +#define SCHED_AVG_ALPHA ((1 << SCHED_AVG_OFFSET) - 1) +static inline void apply_sched_avg_decay(unsigned long long *valp) +{ + *valp *= SCHED_AVG_ALPHA; + *valp >>= SCHED_AVG_OFFSET; +} + +static inline void decay_cpustats_for_cycle(struct task_struct *p) +{ + apply_sched_avg_decay(&TASK_CPUSTATS(p).avg_sleep_per_cycle); + apply_sched_avg_decay(&TASK_CPUSTATS(p).avg_delay_per_cycle); + apply_sched_avg_decay(&TASK_CPUSTATS(p).avg_cpu_per_cycle); + TASK_CPUSTATS(p).avg_cycle_length = TASK_CPUSTATS(p).avg_sleep_per_cycle + + TASK_CPUSTATS(p).avg_delay_per_cycle + + TASK_CPUSTATS(p).avg_cpu_per_cycle; + /* take short cut and avoid possible divide by zero below */ + if 
(TASK_CPUSTATS(p).avg_cpu_per_cycle == 0) + TASK_CPUSTATS(p).cpu_usage_rate = 0; + else + TASK_CPUSTATS(p).cpu_usage_rate = calc_proportion(TASK_CPUSTATS(p).avg_cpu_per_cycle, TASK_CPUSTATS(p).avg_cycle_length); +} + +void update_cpustats_at_wake_up(struct task_struct *p, unsigned long long now) +{ + delta_sleep_cpustats(p, now); + if (in_interrupt()) { + TASK_CPUSTATS(p).intr_wake_ups++; + TASK_CPUSTATS(p).flags |= CPUSTATS_WOKEN_FOR_INTR_FL; + } + TASK_CPUSTATS(p).total_wake_ups++; + decay_cpustats_for_cycle(p); +} + +void update_cpustats_at_end_of_ts(struct task_struct *p, unsigned long long now) +{ + delta_cpu_cpustats(p, now); + decay_cpustats_for_cycle(p); +} + +#ifndef CONFIG_CPUSCHED_SPA +int task_sched_cpustats(struct task_struct *p, char *buffer) +{ + struct task_cpustats stats; + unsigned long nvcsw, nivcsw; /* context switch counts */ + int result; + + read_lock(&tasklist_lock); + result = get_task_cpustats(p, &stats); + nvcsw = p->nvcsw; + nivcsw = p-> nivcsw; + read_unlock(&tasklist_lock); + if (result) + return sprintf(buffer, "Data unavailable\n"); + return sprintf(buffer, + "%llu %llu %llu %llu %llu %llu %lu %lu @ %llu\n", + stats.total_sleep, + stats.total_cpu, + stats.total_delay, + stats.total_sinbin, + stats.total_wake_ups, + stats.intr_wake_ups, + nvcsw, nivcsw, + stats.timestamp); +} + +int cpustats_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int i; + int len = 0; + int avail = 1; + struct cpu_cpustats total = {0, }; + + for_each_online_cpu(i) { + struct cpu_cpustats stats; + + if (get_cpu_cpustats(i, &stats) != 0) { + avail = 0; + break; + } + len += sprintf(page + len, + "cpu%02d %llu %llu %llu %llu %llu %llu %llu %llu @ %llu\n", i, + stats.total_idle, + stats.total_busy, + stats.total_delay, + stats.total_rt_delay, + stats.total_intr_delay, + stats.total_rt_intr_delay, + stats.total_sinbin, + stats.nr_switches, + stats.timestamp); + total.total_idle += stats.total_idle; + total.total_busy += 
stats.total_busy; + total.total_delay += stats.total_delay; + total.total_rt_delay += stats.total_rt_delay; + total.total_intr_delay += stats.total_intr_delay; + total.total_rt_intr_delay += stats.total_rt_intr_delay; + total.total_sinbin += stats.total_sinbin; + total.nr_switches += stats.nr_switches; + } + if (avail) + len += sprintf(page + len, "total %llu %llu %llu %llu %llu %llu %llu %llu\n", + total.total_idle, + total.total_busy, + total.total_delay, + total.total_rt_delay, /* keep the same column order as the cpuNN rows */ + total.total_intr_delay, + total.total_rt_intr_delay, + total.total_sinbin, + total.nr_switches); + else + len = sprintf(page, "Data unavailable\n"); + + if (len <= off+count) *eof = 1; + *start = page + off; + len -= off; + if (len > count) len = count; + if (len < 0) len = 0; + + return len; +} +#endif + +static inline unsigned long long sched_div_64(unsigned long long a, unsigned long long b) +{ +#if BITS_PER_LONG < 64 + /* + * Assume that there's no 64 bit divide available + */ + if (a < b) + return 0; + /* + * Scale down until b less than 32 bits so that we can do + * a divide using do_div() + */ + while (b > ULONG_MAX) { a >>= 1; b >>= 1; } + + (void)do_div(a, (unsigned long)b); + + return a; +#else + return a / b; +#endif +} + +unsigned long long cpustats_avg_in_jiffies(unsigned long long avg) +{ + return sched_div_64(SCHED_AVG_RND(avg) * HZ, 1000000000); +} + +/* + * CPU usage rate is estimated as a proportion of a CPU using fixed denominator + * rational numbers. 
The denominator must be less than 2^24 so that + * we can store the eb_yardstick in an atomic_t on sparc + */ +#if PROPORTION_OFFSET >= 24 +#error "PROPORTION_OFFSET must be less than 24" +#endif +#define PROPORTION_OVERFLOW ((1ULL << (64 - PROPORTION_OFFSET)) - 1) + +/* + * Convert a / b to a proportion in the range 0 to PROPORTION_ONE + * Requires a <= b or may get a divide by zero exception + */ +unsigned long calc_proportion(unsigned long long a, unsigned long long b) +{ + if (unlikely(a == b)) + return PROPORTION_ONE; + + while (a > PROPORTION_OVERFLOW) { a >>= 1; b >>= 1; } + + return sched_div_64(a << PROPORTION_OFFSET, b); +} + +/* + * Map the given proportion to an unsigned long in the specified range + * Requires range < PROPORTION_ONE to avoid overflow + */ +unsigned long map_proportion(unsigned long prop, unsigned long range) +{ + /* use 64 bits to help avoid overflow on 32 bit systems */ + return ((unsigned long long)prop * (unsigned long long)range) >> PROPORTION_OFFSET; +} + +/* WANT: proportion_to_ppt(ppt_to_proportion(x)) == x + */ +unsigned long proportion_to_ppt(unsigned long proportion) +{ + return ((unsigned long long)proportion * 2001ULL) >> (PROPORTION_OFFSET + 1); +} + +unsigned long ppt_to_proportion(unsigned long ppt) +{ + return sched_div_64((unsigned long long)ppt * PROPORTION_ONE, 1000); +} + +unsigned long avg_cpu_usage_rate(const struct task_struct *p) +{ + return TASK_CPUSTATS(p).cpu_usage_rate; +} + +unsigned long avg_sleep_rate(const struct task_struct *p) +{ + /* take short cut and avoid possible divide by zero below */ + if (TASK_CPUSTATS(p).avg_sleep_per_cycle == 0) + return 0; + + return calc_proportion(TASK_CPUSTATS(p).avg_sleep_per_cycle, TASK_CPUSTATS(p).avg_cycle_length); +} + +unsigned long avg_cpu_delay_rate(const struct task_struct *p) +{ + /* take short cut and avoid possible divide by zero below */ + if (TASK_CPUSTATS(p).avg_delay_per_cycle == 0) + return 0; + + return 
calc_proportion(TASK_CPUSTATS(p).avg_delay_per_cycle, TASK_CPUSTATS(p).avg_cycle_length); +} + +unsigned long delay_in_jiffies_for_usage(const struct task_struct *p, unsigned long rur) +{ + unsigned long long acpc_jiffies, aspc_jiffies, res; + + if (rur == 0) + return ULONG_MAX; + + acpc_jiffies = cpustats_avg_in_jiffies(TASK_CPUSTATS(p).avg_cpu_per_cycle); + aspc_jiffies = cpustats_avg_in_jiffies(TASK_CPUSTATS(p).avg_sleep_per_cycle); + + /* + * we have to be careful about overflow and/or underflow + */ + while (unlikely(acpc_jiffies > PROPORTION_OVERFLOW)) { + acpc_jiffies >>= 1; + if (unlikely((rur >>= 1) == 0)) + return ULONG_MAX; + } + + res = sched_div_64(acpc_jiffies << PROPORTION_OFFSET, rur); + res -= acpc_jiffies; + if (res > aspc_jiffies) + return res - aspc_jiffies; + else + return 0; +} + +#ifndef CONFIG_CPUSCHED_SPA +static int convert_proportion(unsigned long *val, void *data, int write) +{ + if (write) { + if (*val > 1000) + return -1; + *val = ppt_to_proportion(*val); + } else + *val = proportion_to_ppt(*val); + + return 0; +} + +int do_proc_proportion(ctl_table *ctp, int write, struct file *fp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_doulongvec_convf_minmax(ctp, write, fp, buffer, lenp, + ppos, convert_proportion, NULL); +} +#endif diff -Naur linux-2.6.12-rc5-mm1/kernel/sched_drv.c linux-2.6.12-rc5-mm1-plug/kernel/sched_drv.c --- linux-2.6.12-rc5-mm1/kernel/sched_drv.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.12-rc5-mm1-plug/kernel/sched_drv.c 2005-05-25 17:05:49.633977640 -0700 @@ -0,0 +1,137 @@ +/* + * kernel/sched_drv.c + * + * Kernel scheduler device implementation + */ +#include +#include +#include +#include +#include +#include +#include + +/* + * All private per scheduler entries in task_struct are defined as + * separate structs and placed into the cpusched union in task_struct. 
+ */ + +/* Ingosched */ +#ifdef CONFIG_CPUSCHED_INGO +extern const struct sched_drv ingo_sched_drv; +#endif + +/* Staircase */ +#ifdef CONFIG_CPUSCHED_STAIRCASE +extern const struct sched_drv staircase_sched_drv; +#endif + +/* Single priority array (SPA) schedulers */ +#ifdef CONFIG_CPUSCHED_SPA_NF +extern const struct sched_drv spa_nf_sched_drv; +#endif +#ifdef CONFIG_CPUSCHED_ZAPHOD +extern const struct sched_drv zaphod_sched_drv; +#endif + +/* Nicksched */ +#ifdef CONFIG_CPUSCHED_NICK +extern const struct sched_drv nick_sched_drv; +#endif + +const struct sched_drv *sched_drvp = +#if defined(CONFIG_CPUSCHED_DEFAULT_INGO) + &ingo_sched_drv; +#elif defined(CONFIG_CPUSCHED_DEFAULT_STAIRCASE) + &staircase_sched_drv; +#elif defined(CONFIG_CPUSCHED_DEFAULT_SPA_NF) + &spa_nf_sched_drv; +#elif defined(CONFIG_CPUSCHED_DEFAULT_ZAPHOD) + &zaphod_sched_drv; +#elif defined(CONFIG_CPUSCHED_DEFAULT_NICK) + &nick_sched_drv; +#else + NULL; +#error "You must have at least 1 cpu scheduler selected" +#endif + +extern struct task_struct base_init_task; + +#define CPUSCHED_CHECK_SELECT(drv) \ +do { \ + if (!strcmp(str, (drv).name)) { \ + sched_drvp = &(drv); \ + return 1; \ + } \ +} while (0) + +static int __init sched_drv_setup(char *str) +{ +#if defined(CONFIG_CPUSCHED_INGO) + CPUSCHED_CHECK_SELECT(ingo_sched_drv); +#endif +#if defined(CONFIG_CPUSCHED_STAIRCASE) + CPUSCHED_CHECK_SELECT(staircase_sched_drv); +#endif +#if defined(CONFIG_CPUSCHED_SPA_NF) + CPUSCHED_CHECK_SELECT(spa_nf_sched_drv); +#endif +#if defined(CONFIG_CPUSCHED_ZAPHOD) + CPUSCHED_CHECK_SELECT(zaphod_sched_drv); +#endif +#if defined(CONFIG_CPUSCHED_NICK) + CPUSCHED_CHECK_SELECT(nick_sched_drv); +#endif + return 1; +} + +__setup ("cpusched=", sched_drv_setup); + +static ssize_t show_attribute(struct kobject *kobj, struct attribute *attr, char *page) +{ + struct sched_drv_sysfs_entry *e = to_sched_drv_sysfs_entry(attr); + + if (!e->show) + return 0; + + return e->show(page); +} + +static ssize_t 
store_attribute(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) +{ + struct sched_drv_sysfs_entry *e = to_sched_drv_sysfs_entry(attr); + + if (!e->store) /* no write handler: refuse rather than call through NULL */ + return -EBADF; + + return e->store(page, length); +} + +struct sysfs_ops sched_drv_sysfs_ops = { + .show = show_attribute, + .store = store_attribute, +}; + +static struct kobj_type sched_drv_ktype = { + .sysfs_ops = &sched_drv_sysfs_ops, + .default_attrs = NULL, +}; + +static struct kobject sched_drv_kobj = { + .ktype = &sched_drv_ktype +}; + +decl_subsys(cpusched, NULL, NULL); + +void __init sched_drv_sysfs_init(void) +{ + if (subsystem_register(&cpusched_subsys) == 0) { + if (sched_drvp->attrs == NULL) + return; + + sched_drv_ktype.default_attrs = sched_drvp->attrs; + strncpy(sched_drv_kobj.name, sched_drvp->name, KOBJ_NAME_LEN - 1); /* static object is zero filled, so the final byte stays NUL */ + sched_drv_kobj.kset = &cpusched_subsys.kset; + (void)kobject_register(&sched_drv_kobj); + } +} diff -Naur linux-2.6.12-rc5-mm1/kernel/sched_spa.c linux-2.6.12-rc5-mm1-plug/kernel/sched_spa.c --- linux-2.6.12-rc5-mm1/kernel/sched_spa.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.12-rc5-mm1-plug/kernel/sched_spa.c 2005-05-25 17:05:49.637977032 -0700 @@ -0,0 +1,1541 @@ +/* + * kernel/sched_spa.c + * Copyright (C) 1991-2005 Linus Torvalds + * + * 2005-01-11 Single priority array scheduler (no frills and Zaphod) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline void adjust_timestamp(struct task_struct *tsk, struct runqueue *tsk_rq, struct runqueue *other_rq) +{ +#ifdef CONFIG_SMP + tsk->timestamp += (tsk_rq->timestamp_last_tick - other_rq->timestamp_last_tick); +#endif +} + +extern const struct sched_drv spa_nf_sched_drv; +extern const struct sched_drv zaphod_sched_drv; + +/* + * Some of our exported functions could be called when other schedulers are + * in charge with catastrophic results if not handled properly. 
+ * So we define some macros to enable detection of whether either of our + * schedulers is in charge + */ +#ifdef CONFIG_CPUSCHED_SPA_NF +#define spa_nf_in_charge() (&spa_nf_sched_drv == sched_drvp) +#else +#define spa_nf_in_charge() (0) +#endif + +#ifdef CONFIG_CPUSCHED_ZAPHOD +#define zaphod_in_charge() (&zaphod_sched_drv == sched_drvp) +#else +#define zaphod_in_charge() (0) +#endif + +#define spa_in_charge() (zaphod_in_charge() || spa_nf_in_charge()) + +#define SPA_BGND_PRIO (SPA_IDLE_PRIO - 1) +#define SPA_SOFT_CAP_PRIO (SPA_BGND_PRIO - 1) + +#define task_is_queued(p) (!list_empty(&(p)->run_list)) + +static void spa_init_runqueue_queue(union runqueue_queue *qup) +{ + int k; + + for (k = 0; k < SPA_IDLE_PRIO; k++) { + qup->spa.queue[k].prio = k; + INIT_LIST_HEAD(&qup->spa.queue[k].list); + } + bitmap_zero(qup->spa.bitmap, SPA_NUM_PRIO_SLOTS); + // delimiter for bitsearch + __set_bit(SPA_IDLE_PRIO, qup->spa.bitmap); + qup->spa.next_prom_due = ULONG_MAX; + qup->spa.pcount = 0; +} + +static void spa_set_oom_time_slice(struct task_struct *p, unsigned long t) +{ + p->sdu.spa.time_slice = t; +} + +/* + * These are the 'tuning knobs' of the scheduler: + * + * Default configurable timeslice is 100 msecs, maximum configurable + * timeslice is 1000 msecs and minumum configurable timeslice is 1 jiffy. + * Timeslices get renewed on task creation, on wake up and after they expire. + */ +#define MIN_TIMESLICE 1 +#define DEF_TIMESLICE (100 * HZ / 1000) +#define MAX_TIMESLICE (1000 * HZ / 1000) +#define DEF_DESKTOP_TIMESLICE ((DEF_TIMESLICE > 10) ? 
(DEF_TIMESLICE / 10) : 1) + +static unsigned long time_slice = DEF_TIMESLICE; +static unsigned long sched_rr_time_slice = DEF_TIMESLICE; + +/* + * Background tasks may have longer time slices as compensation + */ +#define task_is_bgnd(p) (unlikely((p)->sdu.spa.cpu_rate_cap == 0)) +static unsigned int bgnd_time_slice_multiplier = 1; + +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) + +static inline unsigned int normal_task_timeslice(const task_t *p) +{ + if (unlikely(p->prio == SPA_BGND_PRIO)) + return time_slice * bgnd_time_slice_multiplier; + + return time_slice; +} + +static inline unsigned int hard_cap_timeslice(const task_t *p) +{ + unsigned int cpu_avg = cpustats_avg_in_jiffies(p->sdu.spa.cpustats.avg_cpu_per_cycle); + + return (cpu_avg / 2) ? (cpu_avg / 2) : 1; +} + +/* + * spa_task_timeslice() is the interface that is used by the scheduler. + */ +static unsigned int spa_task_timeslice(const task_t *p) +{ + if (rt_task(p)) + return sched_rr_time_slice; + + return normal_task_timeslice(p); +} + +/* + * Adding/removing a task to/from a priority array: + */ +static void dequeue_task(struct task_struct *p, struct spa_runqueue_queue *rqq) +{ + /* + * Initialize after removal from the list so that list_empty() works + * as a means for testing whether the task is runnable + * If p is the last task in this priority slot then slotp will be + * a pointer to the head of the list in the sunqueue structure + * NB we can't use p->prio as is for bitmap as task may have + * been promoted so we update it. 
+ */ + struct list_head *slotp = p->run_list.next; + + list_del_init(&p->run_list); + if (list_empty(slotp)) { + p->prio = list_entry(slotp, struct spa_prio_slot, list)->prio; + __clear_bit(p->prio, rqq->bitmap); + } +} + +static void enqueue_task(struct task_struct *p, struct spa_runqueue_queue *rqq) +{ + sched_info_queued(p); + list_add_tail(&p->run_list, &rqq->queue[p->prio].list); + __set_bit(p->prio, rqq->bitmap); +} + +/* + * Used by the migration code - we pull tasks from the head of the + * remote queue so we want these tasks to show up at the head of the + * local queue: + */ +static inline void enqueue_task_head(struct task_struct *p, struct spa_runqueue_queue *rqq) +{ + list_add(&p->run_list, &rqq->queue[p->prio].list); + __set_bit(p->prio, rqq->bitmap); +} + +/* + * Control value for promotion mechanism NB this controls severity of "nice" + */ +unsigned long base_prom_interval = ((DEF_TIMESLICE * 15) / 10); + +#define PROMOTION_FLOOR MAX_RT_PRIO +#define PROMOTION_CEILING SPA_BGND_PRIO +#define in_promotable_range(prio) \ + ((prio) > PROMOTION_FLOOR && (prio) < PROMOTION_CEILING) + +static inline void restart_promotions(struct runqueue *rq) +{ + rq->qu.spa.next_prom_due = jiffies + base_prom_interval; + rq->qu.spa.pcount = 2; +} + +#define check_restart_promotions(rq) \ +do { \ + if (rq->nr_running == 2) \ + restart_promotions(rq); \ +} while (0) + +/* make it (relatively) easy to switch to using a timer */ +static inline void stop_promotions(struct runqueue *rq) +{ +} + +#define check_stop_promotions(rq) \ +do { \ + if (rq->nr_running == 1) \ + stop_promotions(rq); \ +} while (0) + +/* + * Are promotions due? + */ +static inline int promotions_due(const struct runqueue *rq) +{ + return unlikely(time_after_eq(jiffies, rq->qu.spa.next_prom_due)); +} + +static inline void update_curr_prio_for_promotion(struct runqueue *rq) +{ + if (likely(in_promotable_range(rq->curr->prio))) + rq->curr->prio--; +} + +/* + * Assume spa_runq lock is NOT already held. 
+ */ +static void do_promotions(struct runqueue *rq) +{ + int idx = PROMOTION_FLOOR; + + spin_lock(&rq->lock); + if (unlikely(rq->nr_running < 2)) + goto out_unlock; + if (rq->nr_running > rq->qu.spa.pcount) { + rq->qu.spa.pcount++; + goto out_unlock; + } + for (;;) { + int new_prio; + idx = find_next_bit(rq->qu.spa.bitmap, PROMOTION_CEILING, idx + 1); + if (idx > (PROMOTION_CEILING - 1)) + break; + + new_prio = idx - 1; + __list_splice(&rq->qu.spa.queue[idx].list, rq->qu.spa.queue[new_prio].list.prev); + INIT_LIST_HEAD(&rq->qu.spa.queue[idx].list); + __clear_bit(idx, rq->qu.spa.bitmap); + __set_bit(new_prio, rq->qu.spa.bitmap); + } + /* The only prio field that needs update is the current task's */ + update_curr_prio_for_promotion(rq); + rq->qu.spa.pcount = 2; +out_unlock: + rq->qu.spa.next_prom_due = jiffies + base_prom_interval; + spin_unlock(&rq->lock); +} + +/* + * effective_prio - return the priority that is based on the static + * priority + */ +#define should_run_in_background(p) \ + (task_is_bgnd(p) && !((p)->sdu.spa.flags & SPAF_UISLEEP)) +#define exceeding_cap(p) \ + (avg_cpu_usage_rate(p) > (p)->sdu.spa.min_cpu_rate_cap) +#ifdef CONFIG_CPUSCHED_SPA_NF +static int spa_nf_effective_prio(task_t *p) +{ + if (rt_task(p)) + return p->prio; + + if (unlikely(should_run_in_background(p))) + return SPA_BGND_PRIO; + + /* using the minimum of the hard and soft caps makes things smoother */ + if (unlikely(exceeding_cap(p))) + return SPA_SOFT_CAP_PRIO; + + return p->static_prio; +} +#endif + +#ifdef CONFIG_CPUSCHED_ZAPHOD +static int spa_zaphod_effective_prio(task_t *p) +{ + if (rt_task(p)) + return p->prio; + + if (unlikely(should_run_in_background(p))) + return SPA_BGND_PRIO; + + /* using the minimum of the hard and soft caps makes things smoother */ + if (unlikely(exceeding_cap(p))) + return SPA_SOFT_CAP_PRIO; + + return zaphod_effective_prio(p); +} +#endif + +static int (*effective_prio)(struct task_struct *p) = +#ifdef CONFIG_CPUSCHED_SPA_NF 
+spa_nf_effective_prio; +#else +spa_zaphod_effective_prio; +#endif + +/* + * __activate_task - move a task to the runqueue. + */ +static inline void __activate_task(task_t *p, runqueue_t *rq) +{ + struct spa_runqueue_queue *rqq = &rq->qu.spa; + + enqueue_task(p, rqq); + inc_nr_running(p, rq); + check_restart_promotions(rq); +} + +#ifdef CONFIG_CPUSCHED_SPA_NF +#ifdef CONFIG_CPUSCHED_ZAPHOD +static void do_nothing_to_task(task_t *p) {} +static void (*reassess_at_activation)(task_t *p) = do_nothing_to_task; +#else +static inline void reassess_at_activation(task_t *p) {} +#endif +#else +#define reassess_at_activation(p) zaphod_reassess_at_activation(p) +#endif + +/* + * activate_task - move a task to the runqueue and do priority recalculation + */ +static void activate_task(task_t *p, runqueue_t *rq) +{ + if (rt_task(p)) + p->sdu.spa.time_slice = sched_rr_time_slice; + else { + reassess_at_activation(p); + p->prio = effective_prio(p); + /* hard capped tasks that never use their full time slice evade + * the sinbin so we need to reduce the size of their time slice + * to reduce the size of the hole that they slip through. + * It would be unwise to close it completely. + */ + if (unlikely(p->sdu.spa.cpustats.cpu_usage_rate > p->sdu.spa.cpu_rate_hard_cap)) + p->sdu.spa.time_slice = hard_cap_timeslice(p); + else + p->sdu.spa.time_slice = normal_task_timeslice(p); + } + p->sdu.spa.flags &= ~SPAF_UISLEEP; + __activate_task(p, rq); +} + +/* + * deactivate_task - remove a task from the runqueue. + */ +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) +{ + dec_nr_running(p, rq); + dequeue_task(p, &rq->qu.spa); + check_stop_promotions(rq); +} + +/* + * Check to see if p preempts rq->curr and resched if it does. In compute + * mode we do not preempt for at least cache_delay and set rq->preempted. 
+ */ +static inline void preempt_if_warranted(task_t *p, struct runqueue *rq) +{ + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); +} + +/*** + * spa_wake_up_task - wake up a thread + * @p: the to-be-woken-up thread + * @old_state: the task's state before being woken + * @sync: do a synchronous wakeup? + * @rq: The run queue on which the task is to be placed (already locked) + */ +static void spa_wake_up_task(struct task_struct *p, struct runqueue *rq, unsigned int old_state, int sync) +{ + /* + * This is the end of one scheduling cycle and the start of the next + */ + update_cpustats_at_wake_up(p, adjusted_sched_clock(p)); + + if (old_state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; + + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * don't trigger a preemption, if the woken up task will run on + * this cpu. (in this case the 'I will reschedule' promise of + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) + */ + activate_task(p, rq); + if (!sync || (rq != this_rq())) + preempt_if_warranted(p, rq); +} + +#ifdef CONFIG_CPUSCHED_SPA_NF +#ifdef CONFIG_CPUSCHED_ZAPHOD +static void (*spa_fork_extras)(task_t *p) = do_nothing_to_task; +#else +static inline void spa_fork_extras(task_t *p) {} +#endif +#else +#define spa_fork_extras(p) zaphod_fork(p) +#endif + +/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. + */ +static void spa_fork(task_t *p) +{ + unsigned long long now; /* initialize_cpustats() takes a 64 bit timestamp; unsigned long would truncate it on 32 bit */ + + init_timer(&p->sdu.spa.sinbin_timer); + p->sdu.spa.sinbin_timer.data = (unsigned long) p; + /* + * Give the task a new timeslice. 
+ */ + p->sdu.spa.time_slice = spa_task_timeslice(p); + local_irq_disable(); + now = sched_clock(); + local_irq_enable(); + /* + * Initialize the scheduling statistics + */ + initialize_cpustats(p, now); + spa_fork_extras(p); +} + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. + */ +#ifdef CONFIG_SMP +#define rq_is_this_rq(rq) (likely((rq) == this_rq())) +#else +#define rq_is_this_rq(rq) 1 +#endif +static void spa_wake_up_new_task(task_t * p, unsigned long clone_flags) +{ + unsigned long flags; + runqueue_t *rq; + + rq = task_rq_lock(p, &flags); + + BUG_ON(p->state != TASK_RUNNING); + + if (rq_is_this_rq(rq)) { + if (!(clone_flags & CLONE_VM)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. + */ + if (unlikely(!task_is_queued(current))) { + p->prio = effective_prio(p); + __activate_task(p, rq); + } else { + p->prio = current->prio; + list_add_tail(&p->run_list, &current->run_list); /* queue the child directly ahead of its parent */ + inc_nr_running(p, rq); + check_restart_promotions(rq); + } + set_need_resched(); + } else { + p->prio = effective_prio(p); + /* Run child last */ + __activate_task(p, rq); + } + } else { + p->prio = effective_prio(p); + __activate_task(p, rq); + preempt_if_warranted(p, rq); + } + + task_rq_unlock(rq, &flags); +} + +/* + * (Optionally) log scheduler statistics at exit. 
+ */ +static int log_at_exit = 0; +static void spa_exit(task_t * p) +{ + struct task_accrued_cpustats stats; + + if (!log_at_exit) + return; + + get_task_accrued_cpustats(p, &stats); + printk("SCHED_EXIT[%d] (%s) %llu %llu %llu %llu %llu %llu %lu %lu @ %llu\n", + p->pid, p->comm, + stats.total_sleep, stats.total_cpu, stats.total_delay, + stats.total_sinbin, stats.total_wake_ups, stats.intr_wake_ups, + p->nvcsw, p->nivcsw, stats.timestamp); +} + +/* + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +static inline +void pull_task(runqueue_t *src_rq, task_t *p, runqueue_t *this_rq, int this_cpu) +{ + dequeue_task(p, &src_rq->qu.spa); + dec_nr_running(p, src_rq); + check_stop_promotions(src_rq); + /* not the current task on its cpu so increment delay stats */ + delta_delay_cpustats(p, adjusted_sched_clock(p)); + set_task_cpu(p, this_cpu); + adjust_timestamp(p, this_rq, src_rq); + inc_nr_running(p, this_rq); + enqueue_task(p, &this_rq->qu.spa); + check_restart_promotions(this_rq); + preempt_if_warranted(p, this_rq); +} + +#ifdef CONFIG_SMP +/* + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. + * + * Called with both runqueues locked. 
+ */ +static int spa_move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) +{ + struct list_head *head, *curr; + int idx, pulled = 0; + struct task_struct *tmp; + + if (max_nr_move <= 0 || busiest->nr_running <= 1) + goto out; + + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(busiest->qu.spa.bitmap); + else + idx = find_next_bit(busiest->qu.spa.bitmap, SPA_IDLE_PRIO, idx); + if (idx >= SPA_IDLE_PRIO) + goto out; + + head = &busiest->qu.spa.queue[idx].list; + curr = head->prev; +skip_queue: + tmp = list_entry(curr, task_t, run_list); + /* Take the opportunity to update task's prio field just in + * in case it's been promoted. This makes sure that the task doesn't + * lose any promotions it has received during the move. + */ + tmp->prio = idx; + + curr = curr->prev; + + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + +#ifdef CONFIG_SCHEDSTATS + if (task_hot(tmp, busiest->timestamp_last_tick, sd)) + schedstat_inc(sd, lb_hot_gained[idle]); +#endif + + pull_task(busiest, tmp, this_rq, this_cpu); + pulled++; + + /* We only want to steal up to the prescribed number of tasks. 
*/ + if (pulled < max_nr_move) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } +out: + return pulled; +} +#endif + +#ifdef CONFIG_CPUSCHED_SPA_NF +#ifdef CONFIG_CPUSCHED_ZAPHOD +static void spa_nf_runq_data_tick(unsigned int cpu, unsigned long numr) {} +static void (*spa_reassess_at_end_of_ts)(task_t *p) = do_nothing_to_task; +static void (*spa_runq_data_tick)(unsigned int cpu, unsigned long numr) = spa_nf_runq_data_tick; +#else +static inline void spa_reassess_at_end_of_ts(task_t *p) {} +#define spa_runq_data_tick(p, numr) +#endif +#else +#define spa_reassess_at_end_of_ts(p) zaphod_reassess_at_end_of_ts(p) +#define spa_runq_data_tick(p, numr) zaphod_runq_data_tick(p, numr) +#endif + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + */ +static void spa_tick(struct task_struct *p, struct runqueue *rq, unsigned long long now) +{ + int cpu = smp_processor_id(); + struct spa_runqueue_queue *rqq = &rq->qu.spa; + + spa_runq_data_tick(cpu, rq->nr_running); + + if (p == rq->idle) { + if (wake_priority_sleeper(rq)) + goto out; + rebalance_tick(cpu, rq, SCHED_IDLE); + return; + } + + /* + * SCHED_FIFO tasks never run out of timeslice. + */ + if (unlikely(p->policy == SCHED_FIFO)) + goto out; + + spin_lock(&rq->lock); + /* + * The task was running during this tick - update the + * time slice counter. Note: we do not update a thread's + * priority until it either goes to sleep or uses up its + * timeslice. This makes it possible for interactive tasks + * to use up their timeslices at their highest priority levels. 
+ */ + if (!--p->sdu.spa.time_slice) { + dequeue_task(p, rqq); + set_tsk_need_resched(p); + update_cpustats_at_end_of_ts(p, now); + if (unlikely(p->policy == SCHED_RR)) + p->sdu.spa.time_slice = sched_rr_time_slice; + else { + spa_reassess_at_end_of_ts(p); + p->prio = effective_prio(p); + p->sdu.spa.time_slice = normal_task_timeslice(p); + } + enqueue_task(p, rqq); + } + spin_unlock(&rq->lock); +out: + if (unlikely(promotions_due(rq))) + do_promotions(rq); + rebalance_tick(cpu, rq, NOT_IDLE); +} + +/* + * Take an active task off the runqueue for a short while + * Assumes that task's runqueue is already locked + */ +static inline void put_task_in_sinbin(struct task_struct *p, unsigned long durn) +{ + if (durn == 0) + return; + deactivate_task(p, task_rq(p)); + p->sdu.spa.flags |= SPAF_SINBINNED; + p->sdu.spa.sinbin_timer.expires = jiffies + durn; + add_timer(&p->sdu.spa.sinbin_timer); +} + +#ifdef CONFIG_CPUSCHED_SPA_NF +#ifdef CONFIG_CPUSCHED_ZAPHOD +static void (*reassess_at_sinbin_release)(task_t *p) = do_nothing_to_task; +#else +static inline void reassess_at_sinbin_release(task_t *p) {} +#endif +#else +#define reassess_at_sinbin_release(p) zaphod_reassess_at_sinbin_release(p) +#endif + +/* + * Release a task from the sinbin + */ +void sinbin_release_fn(unsigned long arg) +{ + unsigned long flags; + struct task_struct *p = (struct task_struct*)arg; + struct runqueue *rq = task_rq_lock(p, &flags); + + /* + * Sinbin time is included in delay time + */ + delta_delay_cpustats(p, adjusted_sched_clock(p)); + p->sdu.spa.flags &= ~SPAF_SINBINNED; + if (!rt_task(p)) { + reassess_at_sinbin_release(p); + p->prio = effective_prio(p); + } + __activate_task(p, rq); + + task_rq_unlock(rq, &flags); +} + +static inline int task_needs_sinbinning(const struct task_struct *p) +{ + return unlikely(avg_cpu_usage_rate(p) > p->sdu.spa.cpu_rate_hard_cap) && + (p->state == TASK_RUNNING) && !rt_task(p) && + ((p->flags & PF_EXITING) == 0); /* PF_* bits belong to the task's own flags word, not the SPAF_* word in sdu.spa.flags */ +} + +static inline unsigned long 
required_sinbin_durn(const struct task_struct *p) +{ + return delay_in_jiffies_for_usage(p, p->sdu.spa.cpu_rate_hard_cap); +} + +#ifdef CONFIG_SCHED_SMT +static struct task_struct *spa_head_of_queue(union runqueue_queue *rqq) +{ + struct task_struct *tmp; + int idx = sched_find_first_bit(rqq->spa.bitmap); + + tmp = list_entry(rqq->spa.queue[idx].list.next, task_t, run_list); + /* Take the opportunity to update task's prio field just in + * in case it's been promoted. + */ + tmp->prio = idx; + + return tmp; +} + +/* maximum expected priority difference for SCHED_NORMAL tasks */ +#define MAX_SN_PD (SPA_IDLE_PRIO - MAX_RT_PRIO) +static int spa_dependent_sleeper_trumps(const struct task_struct *p1, + const struct task_struct * p2, struct sched_domain *sd) +{ + int dp = p2->prio - p1->prio; + + if ((dp > 0) && (sd->per_cpu_gain < 100) && p2->mm && !rt_task(p2)) { + unsigned long rq_ts_rm; + + if (rt_task(p1)) + return 1; + + rq_ts_rm = ((MAX_SN_PD - dp) * time_slice * sd->per_cpu_gain) / + (100 * MAX_SN_PD); + + return p1->sdu.spa.time_slice > rq_ts_rm; + } + + return 0; +} +#endif + +/* + * schedule() is the main scheduler function. + */ +static void spa_schedule(void) +{ + long *switch_count; + int cpu, idx; + struct task_struct *prev = current, *next; + struct runqueue *rq = this_rq(); + unsigned long long now = sched_clock(); + struct list_head *queue; + + spin_lock_irq(&rq->lock); + + if (unlikely(current->flags & PF_DEAD)) + current->state = EXIT_DEAD; + /* + * if entering off of a kernel preemption go straight + * to picking the next task. 
+ */ + switch_count = &prev->nivcsw; + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + switch_count = &prev->nvcsw; + if (unlikely((prev->state & TASK_INTERRUPTIBLE) && + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else { + if (prev->state == TASK_UNINTERRUPTIBLE) { + rq->nr_uninterruptible++; + prev->sdu.spa.flags |= SPAF_UISLEEP; + } + deactivate_task(prev, rq); + } + } + + delta_cpu_cpustats(prev, now); + prev->sched_time = prev->sdu.spa.cpustats.total_cpu; + if (task_needs_sinbinning(prev) && likely(!signal_pending(prev))) + put_task_in_sinbin(prev, required_sinbin_durn(prev)); + + cpu = smp_processor_id(); + if (unlikely(!rq->nr_running)) { +go_idle: + idle_balance(cpu, rq); + if (!rq->nr_running) { + next = rq->idle; + wake_sleeping_dependent(cpu, rq); + /* + * wake_sleeping_dependent() might have released + * the runqueue, so break out if we got new + * tasks meanwhile: + */ + if (!rq->nr_running) + goto switch_tasks; + } + } else { + if (dependent_sleeper(cpu, rq)) { + next = rq->idle; + goto switch_tasks; + } + /* + * dependent_sleeper() releases and reacquires the runqueue + * lock, hence go into the idle loop if the rq went + * empty meanwhile: + */ + if (unlikely(!rq->nr_running)) + goto go_idle; + } + + idx = sched_find_first_bit(rq->qu.spa.bitmap); + queue = &rq->qu.spa.queue[idx].list; + next = list_entry(queue->next, task_t, run_list); + /* Take the opportunity to update task's prio field just in + * in case it's been promoted. 
+ */ + next->prio = idx; +switch_tasks: + if (next == rq->idle) + schedstat_inc(rq, sched_goidle); + prefetch(next); + clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); + + next->timestamp = prev->last_ran = now; + + sched_info_switch(prev, next); + if (likely(prev != next)) { + delta_delay_cpustats(next, now); + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + + prepare_arch_switch(rq, next); + prev = context_switch(rq, prev, next); + barrier(); + + finish_task_switch(prev); + } else + spin_unlock_irq(&rq->lock); +} + +static void spa_set_normal_task_nice(task_t *p, long nice) +{ + int old_static_prio, delta; + struct runqueue *rq = task_rq(p); + struct spa_runqueue_queue *rqq = &rq->qu.spa; + + old_static_prio = p->static_prio; + p->static_prio = NICE_TO_PRIO(nice); +#ifdef CONFIG_CPUSCHED_ZAPHOD + if (zaphod_in_charge()) + zaphod_reassess_at_renice(p); +#endif + + if (p->prio == SPA_BGND_PRIO) + return; + + delta = p->static_prio - old_static_prio; + if (delta == 0) + return; + + if (task_is_queued(p)) { + dec_prio_bias(rq, old_static_prio); + inc_prio_bias(rq, p->static_prio); + dequeue_task(p, rqq); + /* This check is done here rather than outside the if statement + * as there is a need to avoid a race condition with p->prio in + * dequeue_task() + */ + if (unlikely(delta > (SPA_SOFT_CAP_PRIO - p->prio))) + delta = (SPA_SOFT_CAP_PRIO - p->prio); + else if (unlikely(delta < (MAX_RT_PRIO - p->prio))) + delta = (MAX_RT_PRIO - p->prio); + p->prio += delta; + enqueue_task(p, rqq); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + if (delta < 0 || (delta > 0 && task_running(rq, p))) + resched_task(rq->curr); + } else { + /* See comment in other branch of if statement */ + if (unlikely(delta > (SPA_SOFT_CAP_PRIO - p->prio))) + delta = (SPA_SOFT_CAP_PRIO - p->prio); + else if (unlikely(delta < (MAX_RT_PRIO - p->prio))) + delta = (MAX_RT_PRIO - p->prio); + p->prio += delta; + } 
+} + +/* + * setscheduler - change the scheduling policy and/or RT priority of a thread. + */ +static void spa_setscheduler(task_t *p, int policy, int prio) +{ + int oldprio; + int queued; + runqueue_t *rq = task_rq(p); + + queued = task_is_queued(p); + if (queued) + deactivate_task(p, rq); + oldprio = p->prio; + __setscheduler(p, policy, prio); + if (queued) { + __activate_task(p, rq); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else + preempt_if_warranted(p, rq); + } +} + +/* + * Require: 0 <= new_cap <= 1000 + */ +int set_cpu_rate_cap(struct task_struct *p, unsigned long new_cap) +{ + int is_allowed; + unsigned long flags; + struct runqueue *rq; + long delta; + + /* this function could be called when other schedulers are in + * charge (with catastrophic results) so let's check + */ + if (!spa_in_charge()) + return -ENOSYS; + + if (new_cap > 1000) + return -EINVAL; + is_allowed = capable(CAP_SYS_NICE); + /* + * We have to be careful, if called from /proc code, + * the task might be in the middle of scheduling on another CPU. 
+ */ + new_cap = ppt_to_proportion(new_cap); + rq = task_rq_lock(p, &flags); + delta = new_cap - p->sdu.spa.cpu_rate_cap; + if (!is_allowed) { + /* + * Ordinary users can set/change caps on their own tasks + * provided that the new setting is MORE constraining + */ + if (((current->euid != p->uid) && (current->uid != p->uid)) || (delta > 0)) { + task_rq_unlock(rq, &flags); + return -EPERM; + } + } + /* + * The RT tasks don't have caps, but we still allow the caps to be + * set - but as expected it won't have any effect on scheduling until + * the task becomes SCHED_NORMAL: + */ + p->sdu.spa.cpu_rate_cap = new_cap; + if (p->sdu.spa.cpu_rate_cap < p->sdu.spa.cpu_rate_hard_cap) + p->sdu.spa.min_cpu_rate_cap = p->sdu.spa.cpu_rate_cap; + else + p->sdu.spa.min_cpu_rate_cap = p->sdu.spa.cpu_rate_hard_cap; /* min_cpu_rate_cap is the smaller of the two caps */ + +#ifdef CONFIG_CPUSCHED_ZAPHOD + if (zaphod_in_charge()) + zaphod_reassess_at_renice(p); +#endif + + if (!rt_task(p) && task_is_queued(p)) { + int prio_delta = -p->prio; /* ends up as new prio - old prio */ + struct spa_runqueue_queue *rqq = &rq->qu.spa; + + dequeue_task(p, rqq); + prio_delta += p->prio = effective_prio(p); + enqueue_task(p, rqq); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + if (prio_delta < 0 || (prio_delta > 0 && task_running(rq, p))) + resched_task(rq->curr); + } + task_rq_unlock(rq, &flags); + + return 0; +} + +EXPORT_SYMBOL(set_cpu_rate_cap); + +unsigned long get_cpu_rate_cap(struct task_struct *p) +{ + if (!spa_in_charge()) + return 1000; + + return proportion_to_ppt(p->sdu.spa.cpu_rate_cap); +} + +EXPORT_SYMBOL(get_cpu_rate_cap); + +/* + * Require: 1 <= new_cap <= 1000 + */ +int set_cpu_rate_hard_cap(struct task_struct *p, unsigned long new_cap) +{ + int is_allowed; + unsigned long flags; + struct runqueue *rq; + long delta; + + /* this function could be called when other schedulers are in + * charge (with catastrophic results) so let's check + */ + if (!spa_in_charge()) + return -ENOSYS; + + if ((new_cap > 1000) || (new_cap == 
0)) /* zero hard caps are not allowed */ + return -EINVAL; + is_allowed = capable(CAP_SYS_NICE); + new_cap = ppt_to_proportion(new_cap); + /* + * We have to be careful, if called from /proc code, + * the task might be in the middle of scheduling on another CPU. + */ + rq = task_rq_lock(p, &flags); + delta = new_cap - p->sdu.spa.cpu_rate_hard_cap; + if (!is_allowed) { + /* + * Ordinary users can set/change caps on their own tasks + * provided that the new setting is MORE constraining + */ + if (((current->euid != p->uid) && (current->uid != p->uid)) || (delta > 0)) { + task_rq_unlock(rq, &flags); + return -EPERM; + } + } + /* + * The RT tasks don't have caps, but we still allow the caps to be + * set - but as expected it won't have any effect on scheduling until + * the task becomes SCHED_NORMAL: + */ + p->sdu.spa.cpu_rate_hard_cap = new_cap; + if (p->sdu.spa.cpu_rate_cap < p->sdu.spa.cpu_rate_hard_cap) + p->sdu.spa.min_cpu_rate_cap = p->sdu.spa.cpu_rate_cap; + else + p->sdu.spa.min_cpu_rate_cap = p->sdu.spa.cpu_rate_hard_cap; /* min_cpu_rate_cap is the smaller of the two caps */ + +#ifdef CONFIG_CPUSCHED_ZAPHOD + if (zaphod_in_charge()) + zaphod_reassess_at_renice(p); +#endif + + /* (POSSIBLY) TODO: if it's sinbinned and the cap is relaxed then + * release it from the sinbin + */ + task_rq_unlock(rq, &flags); + return 0; +} + +EXPORT_SYMBOL(set_cpu_rate_hard_cap); + +unsigned long get_cpu_rate_hard_cap(struct task_struct *p) +{ + if (!spa_in_charge()) + return 1000; + + return proportion_to_ppt(p->sdu.spa.cpu_rate_hard_cap); +} + +EXPORT_SYMBOL(get_cpu_rate_hard_cap); + +int get_task_accrued_cpustats(struct task_struct *tsk, struct task_accrued_cpustats *stats) +{ + int on_runq = 0; + int on_cpu = 0; + int is_sinbinned = 0; + unsigned long long rq_timestamp; + unsigned long flags; + struct runqueue *rq; + + if (!spa_in_charge()) + return -ENOSYS; + + rq = task_rq_lock(tsk, &flags); + + rq_timestamp = rq->timestamp_last_tick; + stats->total_wake_ups = tsk->sdu.spa.cpustats.total_wake_ups; + stats->intr_wake_ups = 
tsk->sdu.spa.cpustats.intr_wake_ups; + stats->total_sleep = tsk->sdu.spa.cpustats.total_sleep; + stats->total_cpu = tsk->sdu.spa.cpustats.total_cpu; + stats->total_delay = tsk->sdu.spa.cpustats.total_delay; + stats->total_sinbin = tsk->sdu.spa.cpustats.total_sinbin; + stats->timestamp = tsk->timestamp; + is_sinbinned = task_is_sinbinned(tsk); + if ((on_runq = task_is_queued(tsk))) + on_cpu = task_running(rq, tsk); + + task_rq_unlock(rq, &flags); + + /* + * Update values to the previous tick (only) + */ + if (rq_timestamp > stats->timestamp) { + unsigned long long delta = rq_timestamp - stats->timestamp; + + stats->timestamp = rq_timestamp; + if (on_cpu) { + stats->total_cpu += delta; + } else if (on_runq || is_sinbinned) { + stats->total_delay += delta; + if (is_sinbinned) + stats->total_sinbin += delta; + } else { + stats->total_sleep += delta; + } + } + + return 0; +} + +EXPORT_SYMBOL(get_task_accrued_cpustats); + +/* + * Get scheduling statistics for the nominated CPU + */ +int get_cpu_cpustats(unsigned int cpu, struct cpu_cpustats *stats) +{ + int idle; + unsigned long long idle_timestamp; + struct runqueue *rq = cpu_rq(cpu); + struct runq_cpustats *csrq; + + if (!spa_in_charge()) + return -ENOSYS; + + /* + * No need to crash the whole machine if they've asked for stats for + * a non existent CPU. 
+ */ + if ((csrq = cpu_runq_cpustats(cpu)) == NULL) + return -EFAULT; + + local_irq_disable(); /* interrupts are re-enabled by spin_unlock_irq() below */ + spin_lock(&rq->lock); + idle = rq->curr == rq->idle; +#ifdef CONFIG_SMP + if (rq->timestamp_last_tick > rq->curr->timestamp) + stats->timestamp = rq->timestamp_last_tick; + else +#endif + stats->timestamp = rq->curr->timestamp; + idle_timestamp = rq->idle->timestamp; + if (idle_timestamp > stats->timestamp) + stats->timestamp = idle_timestamp; + stats->total_idle = rq->idle->sdu.spa.cpustats.total_cpu; + stats->total_busy = rq->idle->sdu.spa.cpustats.total_delay; + stats->total_delay = csrq->total_delay; + stats->total_rt_delay = csrq->total_rt_delay; + stats->total_intr_delay = csrq->total_intr_delay; + stats->total_rt_intr_delay = csrq->total_rt_intr_delay; + stats->total_fork_delay = csrq->total_fork_delay; + stats->total_sinbin = csrq->total_sinbin; + stats->nr_switches = rq->nr_switches; + spin_unlock_irq(&rq->lock); + + /* + * Update idle/busy time to the current tick + */ + if (idle) + stats->total_idle += (stats->timestamp - idle_timestamp); + else + stats->total_busy += (stats->timestamp - idle_timestamp); + + return 0; +} + +EXPORT_SYMBOL(get_cpu_cpustats); + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * this function yields the current CPU by requeueing the calling thread + * (non RT tasks move to the next occupied priority slot at or after their + * own, if any). If no other threads are running on this CPU it will return. + */ + +static long spa_sys_yield(void) +{ + runqueue_t *rq = this_rq_lock(); + struct spa_runqueue_queue *rqq = &rq->qu.spa; + + schedstat_inc(rq, yld_cnt); + /* If there's other tasks on this CPU make sure that at least + * one of them get some CPU before this task's next bite of the + * cherry. Dequeue before looking for the appropriate run + * queue so that we don't find our queue if we were the sole + * occupant of that queue. + */ + dequeue_task(current, rqq); + /* + * special rule: RT tasks will just roundrobin. 
+ */ + if (likely(!rt_task(current))) { + int idx = find_next_bit(rqq->bitmap, SPA_IDLE_PRIO, current->prio); + + if (idx < SPA_IDLE_PRIO) { + if ((idx < SPA_BGND_PRIO) || task_is_bgnd(current)) + current->prio = idx; + else + current->prio = SPA_BGND_PRIO - 1; /* non background tasks stay above SPA_BGND_PRIO */ + } + } + enqueue_task(current, rqq); + + if (rq->nr_running == 1) + schedstat_inc(rq, yld_both_empty); + + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: + */ + __release(rq->lock); + _raw_spin_unlock(&rq->lock); + preempt_enable_no_resched(); + + schedule(); + + return 0; +} + +static void spa_yield(void) +{ + set_current_state(TASK_RUNNING); + spa_sys_yield(); +} + +static void spa_init_idle(task_t *idle, int cpu) +{ + idle->prio = SPA_IDLE_PRIO; + /* + * Initialize scheduling statistics counters as they may provide + * valuable information about the CPU e.g. avg_cpu_time_per_cycle for the + * idle task will be an estimate of the average time the CPU is idle. + * sched_init() may not be ready so use INITIAL_CPUSTATS_TIMESTAMP instead. 
+ */ + initialize_cpustats(idle, INITIAL_CPUSTATS_TIMESTAMP); +} + +#ifdef CONFIG_SMP +/* source and destination queues will already be locked */ +static void spa_migrate_queued_task(struct task_struct *p, int dest_cpu) +{ + struct runqueue *rq_src = task_rq(p); + struct runqueue *rq_dest = cpu_rq(dest_cpu); + + deactivate_task(p, rq_src); + /* not the current task on its cpu so increment delay stats */ + delta_delay_cpustats(p, adjusted_sched_clock(p)); + set_task_cpu(p, dest_cpu); + adjust_timestamp(p, rq_dest, rq_src); + activate_task(p, rq_dest); + preempt_if_warranted(p, rq_dest); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void spa_set_select_idle_first(struct runqueue *rq) +{ + __setscheduler(rq->idle, SCHED_FIFO, MAX_RT_PRIO - 1); + /* Add idle task to _front_ of its priority queue */ + enqueue_task_head(rq->idle, &rq->qu.spa); + inc_nr_running(rq->idle, rq); +} + +static void spa_set_select_idle_last(struct runqueue *rq) +{ + deactivate_task(rq->idle, rq); + rq->idle->static_prio = SPA_IDLE_PRIO; + __setscheduler(rq->idle, SCHED_NORMAL, 0); +} + +static void spa_migrate_dead_tasks(unsigned int dead_cpu) +{ + unsigned i; + struct runqueue *rq = cpu_rq(dead_cpu); + + for (i = 0; i < SPA_IDLE_PRIO; i++) { + struct list_head *list = &rq->qu.spa.queue[i].list; + while (!list_empty(list)) + migrate_dead(dead_cpu, list_entry(list->next, task_t, run_list)); + } +} +#endif +#endif + +static void spa_sched_init(void) +{ + int i, cpu; + + for (i = 0; i < NR_CPUS; i++) { + init_runq_cpustats(i); +#ifdef CONFIG_CPUSCHED_ZAPHOD + if (zaphod_in_charge()) + zaphod_init_cpu_runq_data(i); +#endif + } + + cpu = smp_processor_id(); + init_task.sdu.spa.time_slice = HZ; + init_task.sdu.spa.cpu_rate_cap = PROPORTION_ONE; + init_task.sdu.spa.cpu_rate_hard_cap = PROPORTION_ONE; + init_task.sdu.spa.min_cpu_rate_cap = PROPORTION_ONE; + init_task.sdu.spa.sinbin_timer.function = sinbin_release_fn; +#ifdef CONFIG_CPUSCHED_ZAPHOD + if (zaphod_in_charge()) { +#ifdef CONFIG_CPUSCHED_SPA_NF + 
effective_prio = spa_zaphod_effective_prio; + reassess_at_activation = zaphod_reassess_at_activation; + spa_fork_extras = zaphod_fork; + spa_runq_data_tick = zaphod_runq_data_tick; + spa_reassess_at_end_of_ts = zaphod_reassess_at_end_of_ts; +#endif + init_task.sdu.spa.zaphod = zaphod_task_data_init(); + } +#endif +} + +#ifdef CONFIG_MAGIC_SYSRQ +static void spa_normalize_rt_task(struct task_struct *p) +{ + int queued; + unsigned long flags; + runqueue_t *rq; + + rq = task_rq_lock(p, &flags); + + queued = task_is_queued(p); + if (queued) + deactivate_task(p, rq); + __setscheduler(p, SCHED_NORMAL, 0); + if (queued) { + __activate_task(p, rq); + resched_task(rq->curr); + } + + task_rq_unlock(rq, &flags); +} +#endif + +static inline unsigned long rnd_msecs_to_jiffies(unsigned long msecs) +{ + return (msecs * HZ + 500) / 1000; /* +500 is half the divisor: round to the nearest jiffy */ +} + +static inline unsigned long rnd_jiffies_to_msecs(unsigned long msecs) +{ + return (msecs * 1000 + HZ / 2) / HZ; /* NB the argument is a jiffies count; +HZ/2 rounds to the nearest msec */ +} + +#define no_change(a) (a) + +SCHED_DRV_SYSFS_UINT_RW_STATIC(time_slice, rnd_msecs_to_jiffies, rnd_jiffies_to_msecs, MIN_TIMESLICE, MAX_TIMESLICE); +SCHED_DRV_SYSFS_UINT_RW_STATIC(sched_rr_time_slice, rnd_msecs_to_jiffies, rnd_jiffies_to_msecs, MIN_TIMESLICE, MAX_TIMESLICE); +SCHED_DRV_SYSFS_UINT_RW_STATIC(base_prom_interval, rnd_msecs_to_jiffies, rnd_jiffies_to_msecs, MIN_TIMESLICE, ULONG_MAX); +SCHED_DRV_SYSFS_UINT_RW_STATIC(log_at_exit, no_change, no_change, 0, 1); +SCHED_DRV_SYSFS_UINT_RW_STATIC(bgnd_time_slice_multiplier, no_change, no_change, 1, 100); + +static int show_cpustats(char *page) +{ + int i; + int len = 0; + int avail = 1; + struct cpu_cpustats total = {0, }; + unsigned long long timestamp = (unsigned long long)-1LL; + + for_each_online_cpu(i) { + struct cpu_cpustats stats; + + if (get_cpu_cpustats(i, &stats) != 0) { + avail = 0; + break; + } + if (stats.timestamp < timestamp) + timestamp = stats.timestamp; + total.total_idle += stats.total_idle; + total.total_busy += stats.total_busy; + total.total_delay += 
stats.total_delay; + total.total_rt_delay += stats.total_rt_delay; + total.total_intr_delay += stats.total_intr_delay; + total.total_rt_intr_delay += stats.total_rt_intr_delay; + total.total_fork_delay += stats.total_fork_delay; + total.total_sinbin += stats.total_sinbin; + total.nr_switches += stats.nr_switches; + } + if (avail) + len = sprintf(page, "%llu %llu %llu %llu %llu %llu %llu %llu %llu @ %llu\n", + total.total_idle, + total.total_busy, + total.total_delay, + total.total_intr_delay, + total.total_rt_delay, + total.total_rt_intr_delay, + total.total_fork_delay, + total.total_sinbin, + total.nr_switches, + timestamp); + else + len = sprintf(page, "Data unavailable\n"); + + return len; +} + +static struct sched_drv_sysfs_entry cpustats_sdse = { + .attr = { .name = "cpustats", .mode = S_IRUGO }, + .show = show_cpustats, + .store = NULL, +}; + +#ifdef CONFIG_CPUSCHED_SPA_NF +static struct attribute *spa_nf_attrs[] = { + &SCHED_DRV_SYSFS_ATTR(time_slice), + &SCHED_DRV_SYSFS_ATTR(sched_rr_time_slice), + &SCHED_DRV_SYSFS_ATTR(bgnd_time_slice_multiplier), + &SCHED_DRV_SYSFS_ATTR(base_prom_interval), + &SCHED_DRV_SYSFS_ATTR(log_at_exit), + &SCHED_DRV_SYSFS_ATTR(cpustats), + NULL, +}; +#endif + +SCHED_DRV_DECLARE_SYSFS_ENTRY(max_ia_bonus); +SCHED_DRV_DECLARE_SYSFS_ENTRY(initial_ia_bonus); +SCHED_DRV_DECLARE_SYSFS_ENTRY(max_tpt_bonus); +SCHED_DRV_DECLARE_SYSFS_ENTRY(ia_threshold); +SCHED_DRV_DECLARE_SYSFS_ENTRY(cpu_hog_threshold); +SCHED_DRV_DECLARE_SYSFS_ENTRY(zaphod_mode); + +#ifdef CONFIG_CPUSCHED_ZAPHOD +static struct attribute *zaphod_attrs[] = { + &SCHED_DRV_SYSFS_ATTR(time_slice), + &SCHED_DRV_SYSFS_ATTR(sched_rr_time_slice), + &SCHED_DRV_SYSFS_ATTR(bgnd_time_slice_multiplier), + &SCHED_DRV_SYSFS_ATTR(base_prom_interval), + &SCHED_DRV_SYSFS_ATTR(log_at_exit), + &SCHED_DRV_SYSFS_ATTR(cpustats), + &SCHED_DRV_SYSFS_ATTR(max_ia_bonus), + &SCHED_DRV_SYSFS_ATTR(initial_ia_bonus), + &SCHED_DRV_SYSFS_ATTR(max_tpt_bonus), + &SCHED_DRV_SYSFS_ATTR(ia_threshold), + 
&SCHED_DRV_SYSFS_ATTR(cpu_hog_threshold), + &SCHED_DRV_SYSFS_ATTR(zaphod_mode), + NULL, +}; +#endif + +#ifdef CONFIG_CPUSCHED_SPA_NF +const struct sched_drv spa_nf_sched_drv = { + .name = "spa_no_frills", + .init_runqueue_queue = spa_init_runqueue_queue, + .set_oom_time_slice = spa_set_oom_time_slice, + .task_timeslice = spa_task_timeslice, + .wake_up_task = spa_wake_up_task, + .fork = spa_fork, + .wake_up_new_task = spa_wake_up_new_task, + .exit = spa_exit, +#ifdef CONFIG_SMP + .move_tasks = spa_move_tasks, +#endif + .tick = spa_tick, +#ifdef CONFIG_SCHED_SMT + .head_of_queue = spa_head_of_queue, + .dependent_sleeper_trumps = spa_dependent_sleeper_trumps, +#endif + .schedule = spa_schedule, + .set_normal_task_nice = spa_set_normal_task_nice, + .setscheduler = spa_setscheduler, + .sys_yield = spa_sys_yield, + .yield = spa_yield, + .init_idle = spa_init_idle, + .sched_init = spa_sched_init, +#ifdef CONFIG_SMP + .migrate_queued_task = spa_migrate_queued_task, +#ifdef CONFIG_HOTPLUG_CPU + .set_select_idle_first = spa_set_select_idle_first, + .set_select_idle_last = spa_set_select_idle_last, + .migrate_dead_tasks = spa_migrate_dead_tasks, +#endif +#endif +#ifdef CONFIG_MAGIC_SYSRQ + .normalize_rt_task = spa_normalize_rt_task, +#endif + .attrs = spa_nf_attrs, +}; +#endif + +#ifdef CONFIG_CPUSCHED_ZAPHOD +const struct sched_drv zaphod_sched_drv = { + .name = "zaphod", + .init_runqueue_queue = spa_init_runqueue_queue, + .set_oom_time_slice = spa_set_oom_time_slice, + .task_timeslice = spa_task_timeslice, + .wake_up_task = spa_wake_up_task, + .fork = spa_fork, + .wake_up_new_task = spa_wake_up_new_task, + .exit = spa_exit, + .tick = spa_tick, +#ifdef CONFIG_SMP + .move_tasks = spa_move_tasks, +#endif + .tick = spa_tick, +#ifdef CONFIG_SCHED_SMT + .head_of_queue = spa_head_of_queue, + .dependent_sleeper_trumps = spa_dependent_sleeper_trumps, +#endif + .schedule = spa_schedule, + .set_normal_task_nice = spa_set_normal_task_nice, + .setscheduler = spa_setscheduler, + 
.sys_yield = spa_sys_yield, + .yield = spa_yield, + .init_idle = spa_init_idle, + .sched_init = spa_sched_init, +#ifdef CONFIG_SMP + .migrate_queued_task = spa_migrate_queued_task, +#ifdef CONFIG_HOTPLUG_CPU + .set_select_idle_first = spa_set_select_idle_first, + .set_select_idle_last = spa_set_select_idle_last, + .migrate_dead_tasks = spa_migrate_dead_tasks, +#endif +#endif +#ifdef CONFIG_MAGIC_SYSRQ + .normalize_rt_task = spa_normalize_rt_task, +#endif + .attrs = zaphod_attrs, +}; +#endif diff -Naur linux-2.6.12-rc5-mm1/kernel/sched_zaphod.c linux-2.6.12-rc5-mm1-plug/kernel/sched_zaphod.c --- linux-2.6.12-rc5-mm1/kernel/sched_zaphod.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.12-rc5-mm1-plug/kernel/sched_zaphod.c 2005-05-25 17:05:49.638976880 -0700 @@ -0,0 +1,484 @@ +/* + * kernel/sched_zaphod.c + * + * CPU scheduler mode + * + * Copyright (C) 2004 Aurema Pty Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include + +#include + +#define MIN_NORMAL_PRIO ZAPHOD_MIN_NORMAL_PRIO +#define IDLE_PRIO ZAPHOD_IDLE_PRIO +#define BGND_PRIO ZAPHOD_BGND_PRIO +#define TASK_ZD(p) (p)->sdu.spa.zaphod +#define MIN_RATE_CAP(p) (p)->sdu.spa.min_cpu_rate_cap +#define task_is_bgnd(p) (unlikely((p)->sdu.spa.cpu_rate_cap == 0)) + +#define EB_YARDSTICK_DECAY_INTERVAL 100 + +enum zaphod_mode_enum { + ZAPHOD_MODE_PRIORITY_BASED, + ZAPHOD_MODE_ENTITLEMENT_BASED +}; + +static enum zaphod_mode_enum zaphod_mode = ZAPHOD_MODE_PRIORITY_BASED; + +static const char *zaphod_mode_names[] = { + "pb", /* ZAPHOD_MODE_PRIORITY_BASED */ + "eb", /* ZAPHOD_MODE_ENTITLEMENT_BASED */ + NULL /* end of list marker */ +}; + +static DEFINE_PER_CPU(struct sched_zaphod_runq_data, zaphod_runqs); +#define cpu_zrq(cpu) (&per_cpu(zaphod_runqs, cpu)) +#define task_zrq(p) cpu_zrq(task_cpu(p)) +/* + * Convert nice to shares + * Proportional symmetry is aimed for: i.e. 
+ * (nice_to_shares(0) / nice_to_shares(19)) == (nice_to_shares(-20) / nice_to_shares(0)) + * Make sure that this function is robust for variations of EB_SHARES_PER_NICE + */ +static inline unsigned int nice_to_shares(int nice) +{ + unsigned int result = DEFAULT_EB_SHARES; + + if (nice > 0) + result -= (nice * (20 * EB_SHARES_PER_NICE - 1)) / 19; + else if (nice < 0) + result += (nice * nice * ((20 * EB_SHARES_PER_NICE - 1) * EB_SHARES_PER_NICE)) / 20; + + return result; +} + +static inline int shares_to_nice(unsigned int shares) +{ + int result = 0; + + if (shares > DEFAULT_EB_SHARES) + result = -int_sqrt((20 * (shares - DEFAULT_EB_SHARES)) / + (EB_SHARES_PER_NICE * (20 * EB_SHARES_PER_NICE - 1))); + else if (shares < DEFAULT_EB_SHARES) + result = (19 * (DEFAULT_EB_SHARES - shares)) / + (20 * EB_SHARES_PER_NICE - 1); + + return result; +} + +#define MAX_TOTAL_BONUS (BGND_PRIO - ZAPHOD_MAX_PRIO - 1) +#define MAX_MAX_IA_BONUS ((MAX_TOTAL_BONUS + 1) / 2) +#define MAX_MAX_TPT_BONUS (MAX_TOTAL_BONUS - MAX_MAX_IA_BONUS) +#define DEFAULT_MAX_IA_BONUS ((MAX_MAX_IA_BONUS < 7) ? MAX_MAX_IA_BONUS : 7) +#define DEFAULT_MAX_TPT_BONUS ((DEFAULT_MAX_IA_BONUS - 2) ? 
: 1) + + +#define SCHED_IA_BONUS_OFFSET 8 +#define SCHED_IA_BONUS_ALPHA ((1 << SCHED_IA_BONUS_OFFSET) - 1) +#define SCHED_IA_BONUS_MUL(a, b) (((a) * (b)) >> SCHED_IA_BONUS_OFFSET) +/* + * Get the rounded integer value of the interactive bonus + */ +#define SCHED_IA_BONUS_RND(x) \ + (((x) + (1 << (SCHED_IA_BONUS_OFFSET - 1))) >> (SCHED_IA_BONUS_OFFSET)) + +unsigned int max_ia_bonus = DEFAULT_MAX_IA_BONUS; +unsigned int max_max_ia_bonus = MAX_MAX_IA_BONUS; +unsigned int initial_ia_bonus = 1; +unsigned int max_tpt_bonus = DEFAULT_MAX_TPT_BONUS; +unsigned int max_max_tpt_bonus = MAX_MAX_TPT_BONUS; + +/* + * Find the square root of a proportion (bit by bit, most significant first) + * Require: x <= PROPORTION_ONE + */ +static unsigned long proportion_sqrt(unsigned long x) +{ + /* use 64 bits internally to avoid overflow */ + unsigned long long res, b, ulx; + int bshift; + + /* + * Take shortcut AND prevent overflow + */ + if (x == PROPORTION_ONE) + return PROPORTION_ONE; + + res = 0; + b = (1UL << (PROPORTION_OFFSET - 1)); + bshift = PROPORTION_OFFSET - 1; + ulx = (unsigned long long)x << PROPORTION_OFFSET; /* widen first so the shift is done in 64 bits */ + + for (; ulx && b; b >>= 1, bshift--) { + unsigned long long temp = (((res << 1) + b) << bshift); /* (res + b)^2 - res^2 */ + + if (ulx >= temp) { + res += b; + ulx -= temp; + } + } + + return res; +} + +/* + * Tasks that have a CPU usage rate greater than this threshold (in parts per + * thousand) are considered to be CPU bound and start to lose interactive bonus + * points + */ +#define DEFAULT_CPU_HOG_THRESHOLD 900 +unsigned long cpu_hog_threshold = PROP_FM_PPT(DEFAULT_CPU_HOG_THRESHOLD); + +/* + * Tasks that would sleep for more than 900 parts per thousand of the time if + * they had the CPU to themselves are considered to be interactive provided + * that their average sleep duration per scheduling cycle isn't too long + */ +#define DEFAULT_IA_THRESHOLD 900 +unsigned long ia_threshold = PROP_FM_PPT(DEFAULT_IA_THRESHOLD); +#define LOWER_MAX_IA_SLEEP SCHED_AVG_REAL(15 * 60LL * NSEC_PER_SEC) +#define UPPER_MAX_IA_SLEEP SCHED_AVG_REAL(2 * 60 * 
60LL * NSEC_PER_SEC) + +/* + * Calculate CPU usage rate and sleepiness. + * This never gets called on real time tasks + */ +static unsigned long calc_sleepiness(task_t *p) +{ + unsigned long long bl; + + bl = TASK_CPUSTATS(p).avg_sleep_per_cycle + TASK_CPUSTATS(p).avg_cpu_per_cycle; + /* + * Take a shortcut and avoid possible divide by zero + */ + if (unlikely(bl == 0)) + return PROPORTION_ONE; + else + return calc_proportion(TASK_CPUSTATS(p).avg_sleep_per_cycle, bl); +} + +static inline void decay_sched_ia_bonus(struct task_struct *p) +{ + TASK_ZD(p).interactive_bonus *= SCHED_IA_BONUS_ALPHA; + TASK_ZD(p).interactive_bonus >>= SCHED_IA_BONUS_OFFSET; +} + +/* + * Check whether a task with an interactive bonus still qualifies and if not + * decrease its bonus + * This never gets called on real time tasks + */ +static void reassess_cpu_boundness(task_t *p) +{ + if (max_ia_bonus == 0) { + TASK_ZD(p).interactive_bonus = 0; + return; + } + /* + * No point going any further if there's no bonus to lose + */ + if (TASK_ZD(p).interactive_bonus == 0) + return; + + if (TASK_CPUSTATS(p).cpu_usage_rate > cpu_hog_threshold) + decay_sched_ia_bonus(p); +} + +/* + * Check whether a task qualifies for an interactive bonus and if it does + * increase its bonus + * This never gets called on real time tasks + */ +static void reassess_interactiveness(task_t *p) +{ + unsigned long sleepiness; + + if (max_ia_bonus == 0) { + TASK_ZD(p).interactive_bonus = 0; + return; + } + /* + * No sleep means not interactive (in most cases), but + */ + if (unlikely(TASK_CPUSTATS(p).avg_sleep_per_cycle > LOWER_MAX_IA_SLEEP)) { + /* + * Really long sleeps mean it's probably not interactive + */ + if (unlikely(TASK_CPUSTATS(p).avg_sleep_per_cycle > UPPER_MAX_IA_SLEEP)) + decay_sched_ia_bonus(p); + return; + } + + sleepiness = calc_sleepiness(p); + if (sleepiness > ia_threshold) { + decay_sched_ia_bonus(p); + TASK_ZD(p).interactive_bonus += map_proportion_rnd(sleepiness, max_ia_bonus); + } +} + +/* + * 
Check whether a task qualifies for a throughput bonus and if it does + * give it one + * This never gets called on real time tasks + */ +#define NRUN_AVG_OFFSET 6 +#define NRUN_AVG_ALPHA ((1 << NRUN_AVG_OFFSET) - 1) +#define NRUN_AVG_ONE (1UL << NRUN_AVG_OFFSET) +#define NRUN_AVG_MUL(a, b) (((a) * (b)) >> NRUN_AVG_OFFSET) +static void recalc_throughput_bonus(task_t *p) +{ + unsigned long long ratio; + unsigned long long expected_delay; + unsigned long long adjusted_delay; + struct sched_zaphod_runq_data *zrq = task_zrq(p); + unsigned long long load = zrq->avg_nr_running; + + TASK_ZD(p).throughput_bonus = 0; + if (max_tpt_bonus == 0) + return; + + if (load <= NRUN_AVG_ONE) + expected_delay = 0; + else + expected_delay = NRUN_AVG_MUL(TASK_CPUSTATS(p).avg_cpu_per_cycle, (load - NRUN_AVG_ONE)); + + /* + * No unexpected delay means no bonus, but + * NB this test also avoids a possible divide by zero error if + * cpu is also zero and negative bonuses + */ + if (TASK_CPUSTATS(p).avg_delay_per_cycle <= expected_delay) + return; + + adjusted_delay = TASK_CPUSTATS(p).avg_delay_per_cycle - expected_delay; + ratio = calc_proportion(adjusted_delay, adjusted_delay + TASK_CPUSTATS(p).avg_cpu_per_cycle); + ratio = proportion_sqrt(ratio); + TASK_ZD(p).throughput_bonus = map_proportion_rnd(ratio, max_tpt_bonus); +} + +/* + * Calculate priority based priority (without bonuses). + * This never gets called on real time tasks + */ +static void calculate_pb_pre_bonus_priority(task_t *p) +{ + TASK_ZD(p).pre_bonus_priority = p->static_prio + MAX_TOTAL_BONUS; +} + +/* + * We're just trying to protect a reading and writing of the yardstick. 
+ * We're not too fussed about protecting the calculation so the following is + * adequate + */ +static inline void decay_eb_yardstick(struct sched_zaphod_runq_data *zrq) +{ + static const unsigned long decay_per_interval = PROP_FM_PPT(990); + unsigned long curry = atomic_read(&zrq->eb_yardstick); + unsigned long pny; /* potential new yardstick */ + struct task_struct *p = current; + + curry = map_proportion(decay_per_interval, curry); + atomic_set(&zrq->eb_ticks_to_decay, EB_YARDSTICK_DECAY_INTERVAL); /* restart the countdown to the next decay */ + if (unlikely(rt_task(p) || task_is_bgnd(p))) + goto out; + if (TASK_CPUSTATS(p).cpu_usage_rate < MIN_RATE_CAP(p)) + pny = TASK_CPUSTATS(p).cpu_usage_rate / TASK_ZD(p).eb_shares; + else + pny = MIN_RATE_CAP(p) / TASK_ZD(p).eb_shares; + if (pny > curry) + curry = pny; +out: + if (unlikely(curry >= PROPORTION_ONE)) + curry = PROPORTION_ONE - 1; /* keep the stored yardstick strictly below PROPORTION_ONE */ + atomic_set(&zrq->eb_yardstick, curry); +} + +/* + * Calculate entitlement based priority (without bonuses). + * This never gets called on real time tasks + */ +#define EB_PAR 19 +static void calculate_eb_pre_bonus_priority(task_t *p) +{ + /* + * Prevent possible divide by zero and take shortcut + */ + if (unlikely(MIN_RATE_CAP(p) == 0)) { + TASK_ZD(p).pre_bonus_priority = BGND_PRIO - 1; + } else if (TASK_CPUSTATS(p).cpu_usage_rate > MIN_RATE_CAP(p)) { + struct sched_zaphod_runq_data *zrq = task_zrq(p); + unsigned long cap_per_share = MIN_RATE_CAP(p) / TASK_ZD(p).eb_shares; + unsigned long prop = calc_proportion(MIN_RATE_CAP(p), TASK_CPUSTATS(p).cpu_usage_rate); + + TASK_ZD(p).pre_bonus_priority = (BGND_PRIO - 1); + TASK_ZD(p).pre_bonus_priority -= map_proportion_rnd(prop, EB_PAR + 1); + if (cap_per_share > atomic_read(&zrq->eb_yardstick)) { + if (likely(cap_per_share < PROPORTION_ONE)) + atomic_set(&zrq->eb_yardstick, cap_per_share); + else + atomic_set(&zrq->eb_yardstick, PROPORTION_ONE - 1); + } + + } else { + struct sched_zaphod_runq_data *zrq = task_zrq(p); + unsigned long usage_per_share = TASK_CPUSTATS(p).cpu_usage_rate / 
TASK_ZD(p).eb_shares; + + if (usage_per_share > atomic_read(&zrq->eb_yardstick)) { + if (likely(usage_per_share < PROPORTION_ONE)) + atomic_set(&zrq->eb_yardstick, usage_per_share); + else + atomic_set(&zrq->eb_yardstick, PROPORTION_ONE - 1); + TASK_ZD(p).pre_bonus_priority = MAX_RT_PRIO + MAX_TOTAL_BONUS + EB_PAR; + } else { + unsigned long prop; + + prop = calc_proportion(usage_per_share, atomic_read(&zrq->eb_yardstick)); + TASK_ZD(p).pre_bonus_priority = MAX_RT_PRIO + MAX_TOTAL_BONUS; + TASK_ZD(p).pre_bonus_priority += map_proportion_rnd(prop, EB_PAR); + } + } +} + +static inline void calculate_pre_bonus_priority(task_t *p) +{ + if (zaphod_mode == ZAPHOD_MODE_ENTITLEMENT_BASED) + calculate_eb_pre_bonus_priority(p); + else + calculate_pb_pre_bonus_priority(p); +} + +void zaphod_init_cpu_runq_data(unsigned int cpu) +{ + struct sched_zaphod_runq_data *zrq = &per_cpu(zaphod_runqs, cpu); + + zrq->avg_nr_running = 0; + atomic_set(&zrq->eb_yardstick, 0); + atomic_set(&zrq->eb_ticks_to_decay, EB_YARDSTICK_DECAY_INTERVAL + cpu); +} + +struct sched_zaphod_runq_data *zaphod_cpu_runq_data(unsigned int cpu) +{ + return cpu_zrq(cpu); +} + +void zaphod_runq_data_tick(unsigned int cpu, unsigned long numr) +{ + struct sched_zaphod_runq_data *zrq = cpu_zrq(cpu); + unsigned long nval = NRUN_AVG_MUL(zrq->avg_nr_running, NRUN_AVG_ALPHA); + nval += numr; + + zrq->avg_nr_running = nval; + + if (atomic_dec_and_test(&zrq->eb_ticks_to_decay)) + decay_eb_yardstick(zrq); +} + +void zaphod_fork(struct task_struct *p) +{ + TASK_ZD(p).interactive_bonus = (max_ia_bonus >= initial_ia_bonus) ? 
+ initial_ia_bonus : max_ia_bonus; + TASK_ZD(p).throughput_bonus = 0; +} + +unsigned int zaphod_effective_prio(struct task_struct *p) +{ + unsigned int bonus = 0; + + /* no bonuses for tasks that have exceeded their cap */ + if (likely(TASK_CPUSTATS(p).cpu_usage_rate < MIN_RATE_CAP(p))) { + bonus = SCHED_IA_BONUS_RND(TASK_ZD(p).interactive_bonus); + bonus += TASK_ZD(p).throughput_bonus; + } + + return TASK_ZD(p).pre_bonus_priority - bonus; +} + +void zaphod_reassess_at_activation(struct task_struct *p) +{ + recalc_throughput_bonus(p); + reassess_interactiveness(p); + calculate_pre_bonus_priority(p); +} + +void zaphod_reassess_at_end_of_ts(struct task_struct *p) +{ + recalc_throughput_bonus(p); + reassess_cpu_boundness(p); + /* + * Arguably the interactive bonus should be updated here + * as well. But that depends on whether we wish to encourage + * interactive tasks to maintain a high bonus or CPU bound + * tasks to lose some of their bonus. + */ + calculate_pre_bonus_priority(p); +} + +void zaphod_reassess_at_sinbin_release(struct task_struct *p) +{ + calculate_pre_bonus_priority(p); +} + +void zaphod_reassess_at_renice(struct task_struct *p) +{ + TASK_ZD(p).eb_shares = nice_to_shares(task_nice(p)); + if (!rt_task(p)) + calculate_pre_bonus_priority(p); +} + +#include + +#define no_change(a) (a) +SCHED_DRV_SYSFS_UINT_RW(max_ia_bonus, no_change, no_change, 0, max_max_ia_bonus); +SCHED_DRV_SYSFS_UINT_RW(initial_ia_bonus, no_change, no_change, 0, max_max_ia_bonus); +SCHED_DRV_SYSFS_UINT_RW(max_tpt_bonus, no_change, no_change, 0, max_max_tpt_bonus); +SCHED_DRV_SYSFS_UINT_RW(ia_threshold, ppt_to_proportion, proportion_to_ppt, 0, 1000); +SCHED_DRV_SYSFS_UINT_RW(cpu_hog_threshold, ppt_to_proportion, proportion_to_ppt, 0, 1000); + +static ssize_t show_zaphod_mode(char *page) +{ + return sprintf(page, "%s\n", zaphod_mode_names[zaphod_mode]); +} + +static ssize_t store_zaphod_mode(const char *page, size_t count) +{ + int i; + int clen = strlen(page); + + { + char *nlp =
strrchr(page, '\n'); + + if (nlp != NULL) + clen = nlp - page; + } + + for (i = 0; zaphod_mode_names[i] != NULL; i++) + if (strlen(zaphod_mode_names[i]) == (size_t)clen && strncmp(page, zaphod_mode_names[i], clen) == 0) /* whole-name match only: an empty write or a bare prefix must not select a mode */ + break; + if (zaphod_mode_names[i] == NULL) + return -EINVAL; + else /* set the zaphod mode */ + zaphod_mode = i; + + return count; +} + +struct sched_drv_sysfs_entry zaphod_mode_sdse = { + .attr = { .name = "mode", .mode = S_IRUGO | S_IWUSR }, + .show = show_zaphod_mode, + .store = store_zaphod_mode, +}; diff -Naur linux-2.6.12-rc5-mm1/kernel/staircase.c linux-2.6.12-rc5-mm1-plug/kernel/staircase.c --- linux-2.6.12-rc5-mm1/kernel/staircase.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.12-rc5-mm1-plug/kernel/staircase.c 2005-05-25 17:05:49.640976576 -0700 @@ -0,0 +1,1019 @@ +/* + * kernel/staircase.c + * Copyright (C) 1991-2005 Linus Torvalds + * + * 2005-02-13 Staircase scheduler by Con Kolivas + * Staircase v10.7 + */ +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Unique staircase process flags used by scheduler. + */ +#define SF_FORKED 0x00000001 /* I have just forked */ +#define SF_YIELDED 0x00000002 /* I have just yielded */ +#define SF_UISLEEP 0x00000004 /* Uninterruptible sleep */ + +#define task_is_queued(p) (!list_empty(&(p)->run_list)) + +static void staircase_init_runqueue_queue(union runqueue_queue *qup) +{ + int k; + + qup->staircase.cache_ticks = 0; + qup->staircase.preempted = 0; + + for (k = 0; k < STAIRCASE_MAX_PRIO; k++) { + INIT_LIST_HEAD(qup->staircase.queue + k); + __clear_bit(k, qup->staircase.bitmap); + } + // delimiter for bitsearch + __set_bit(STAIRCASE_MAX_PRIO, qup->staircase.bitmap); +} + +static void staircase_set_oom_time_slice(struct task_struct *p, unsigned long t) +{ + p->sdu.staircase.slice = p->sdu.staircase.time_slice = t; +} + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range.
+ */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) +#define MAX_USER_PRIO (USER_PRIO(STAIRCASE_MAX_PRIO)) + +/* + * Some helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +#define NSJIFFY (1000000000 / HZ) /* One jiffy in ns */ + +int sched_compute = 0; +/* + *This is the time all tasks within the same priority round robin. + *compute setting is reserved for dedicated computational scheduling + *and has ten times larger intervals. + */ +#define _RR_INTERVAL ((10 * HZ / 1000) ? : 1) +#define RR_INTERVAL() (_RR_INTERVAL * (1 + 9 * sched_compute)) + +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) + +/* + * Get nanosecond clock difference without overflowing unsigned long. + */ +static inline unsigned long ns_diff(unsigned long long v1, unsigned long long v2) +{ + unsigned long long vdiff; + if (unlikely(v1 < v2)) + /* + * Rarely the clock goes backwards. There should always be + * a positive difference so return 1. 
+ */ + vdiff = 1; + else + vdiff = v1 - v2; + if (vdiff > (1ULL << 31)) /* 1ULL: a plain int 1 << 31 overflows and the clamp never fires */ + vdiff = 1ULL << 31; + return (unsigned long)vdiff; +} + +/* + * Adding/removing a task to/from a priority array: + */ +static inline void dequeue_task(struct task_struct *p, struct staircase_runqueue_queue *rqq) +{ + list_del_init(&p->run_list); + if (list_empty(rqq->queue + p->prio)) + __clear_bit(p->prio, rqq->bitmap); + p->sdu.staircase.ns_debit = 0; +} + +static void enqueue_task(struct task_struct *p, struct staircase_runqueue_queue *rqq) +{ + sched_info_queued(p); + list_add_tail(&p->run_list, rqq->queue + p->prio); + __set_bit(p->prio, rqq->bitmap); +} + +static void requeue_task(struct task_struct *p, struct staircase_runqueue_queue *rq) +{ + list_move_tail(&p->run_list, rq->queue + p->prio); +} + +/* + * Used by the migration code - we pull tasks from the head of the + * remote queue so we want these tasks to show up at the head of the + * local queue: + */ +static inline void enqueue_task_head(struct task_struct *p, struct staircase_runqueue_queue *rqq) +{ + list_add(&p->run_list, rqq->queue + p->prio); + __set_bit(p->prio, rqq->bitmap); +} + +/* + * __activate_task - move a task to the runqueue. + */ +static inline void __activate_task(task_t *p, runqueue_t *rq) +{ + enqueue_task(p, &rq->qu.staircase); + inc_nr_running(p, rq); +} + +#ifdef CONFIG_HOTPLUG_CPU +/* + * __activate_idle_task - move idle task to the _front_ of runqueue. + */ +static inline void __activate_idle_task(task_t *p, runqueue_t *rq) +{ + enqueue_task_head(p, &rq->qu.staircase); + inc_nr_running(p, rq); +} +#endif + +/* + * burst - extra intervals an interactive task can run for at best priority + * instead of descending priorities.
+ */ +static inline unsigned int burst(const task_t *p) +{ + if (likely(!rt_task(p))) { + unsigned int task_user_prio = TASK_USER_PRIO(p); + return 39 - task_user_prio; + } else + return p->sdu.staircase.burst; +} + +static void inc_burst(task_t *p) +{ + unsigned int best_burst; + best_burst = burst(p); + if (p->sdu.staircase.burst < best_burst) + p->sdu.staircase.burst++; +} + +static void dec_burst(task_t *p) +{ + if (p->sdu.staircase.burst) + p->sdu.staircase.burst--; +} + +static inline unsigned int rr_interval(const task_t * p) +{ + unsigned int rr_interval = RR_INTERVAL(); + int nice = TASK_NICE(p); + + if (nice < 0 && !rt_task(p)) + rr_interval += -(nice); + + return rr_interval; +} + +/* + * slice - the duration a task runs before getting requeued at its best + * priority and has its burst decremented. + */ +static inline unsigned int slice(const task_t *p) +{ + unsigned int slice, rr; + + slice = rr = rr_interval(p); + if (likely(!rt_task(p))) + slice += burst(p) * rr; + + return slice; +} + +/* + * sched_interactive - sysctl which allows interactive tasks to have bursts + */ +int sched_interactive = 1; + +/* + * effective_prio - dynamic priority dependent on burst. + * The priority normally decreases by one each RR_INTERVAL. + * As the burst increases the priority stays at the top "stair" or + * priority for longer. 
+ */ +static int effective_prio(task_t *p) +{ + int prio; + unsigned int full_slice, used_slice, first_slice; + unsigned int best_burst, rr; + if (rt_task(p)) + return p->prio; + + best_burst = burst(p); + full_slice = slice(p); + rr = rr_interval(p); + used_slice = full_slice - p->sdu.staircase.slice; + if (p->sdu.staircase.burst > best_burst) + p->sdu.staircase.burst = best_burst; + first_slice = rr; + if (sched_interactive && !sched_compute && p->mm) + first_slice *= (p->sdu.staircase.burst + 1); + prio = STAIRCASE_MAX_PRIO - 1 - best_burst; + + if (used_slice < first_slice) + return prio; + prio += 1 + (used_slice - first_slice) / rr; + if (prio > STAIRCASE_MAX_PRIO - 1) + prio = STAIRCASE_MAX_PRIO - 1; + + return prio; +} + +static void continue_slice(task_t *p) +{ + unsigned long total_run = NS_TO_JIFFIES(p->sdu.staircase.totalrun); + + if (total_run >= p->sdu.staircase.slice) { + p->sdu.staircase.totalrun = 0; + dec_burst(p); + } else { + unsigned int remainder; + p->sdu.staircase.slice -= total_run; + remainder = p->sdu.staircase.slice % rr_interval(p); + if (remainder) + p->sdu.staircase.time_slice = remainder; + } +} + +/* + * recalc_task_prio - this checks for tasks that run ultra short timeslices + * or have just forked a thread/process and make them continue their old + * slice instead of starting a new one at high priority. 
+ */ +static void recalc_task_prio(task_t *p, unsigned long long now, unsigned long rq_load) +{ + unsigned long sleep_time; + + if (rq_load > 31) + rq_load = 31; + sleep_time = ns_diff(now, p->timestamp) / (1UL << rq_load); /* 1UL: rq_load may be 31, which overflows a plain int shift */ + + p->sdu.staircase.totalrun += p->sdu.staircase.runtime; + if (NS_TO_JIFFIES(p->sdu.staircase.totalrun) >= p->sdu.staircase.slice && + NS_TO_JIFFIES(sleep_time) < p->sdu.staircase.slice) { + p->sdu.staircase.sflags &= ~SF_FORKED; + dec_burst(p); + goto new_slice; + } + + if (p->sdu.staircase.sflags & SF_FORKED) { + continue_slice(p); + p->sdu.staircase.sflags &= ~SF_FORKED; + return; + } + + if (sched_compute) { + continue_slice(p); + return; + } + + if (sleep_time >= p->sdu.staircase.totalrun) { + if (!(p->sdu.staircase.sflags & SF_UISLEEP)) + inc_burst(p); + goto new_slice; + } + + p->sdu.staircase.totalrun -= sleep_time; + continue_slice(p); + return; +new_slice: + p->sdu.staircase.totalrun = 0; +} + +/* + * activate_task - move a task to the runqueue and do priority recalculation + * + * Update all the scheduling statistics stuff. (sleep average + * calculation, priority modifiers, etc.) + */ +static void activate_task(task_t *p, runqueue_t *rq, int local) +{ + unsigned long long now; + + now = sched_clock(); +#ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ + runqueue_t *this_rq = this_rq(); + now = (now - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; + } +#endif + p->sdu.staircase.slice = slice(p); + p->sdu.staircase.time_slice = rr_interval(p); + recalc_task_prio(p, now, rq->nr_running); + p->sdu.staircase.sflags &= ~SF_UISLEEP; + p->prio = effective_prio(p); + p->timestamp = now; + __activate_task(p, rq); +} + +/* + * deactivate_task - remove a task from the runqueue.
+ */ +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) +{ + dec_nr_running(p, rq); + dequeue_task(p, &rq->qu.staircase); +} + +/* + * cache_delay is the time preemption is delayed in sched_compute mode + * and is set to 5*cache_decay_ticks on SMP or a nominal 10ms on UP. + */ +static int cache_delay = 10 * HZ / 1000; + +/* + * Check to see if p preempts rq->curr and resched if it does. In compute + * mode we do not preempt for at least cache_delay and set rq->preempted. + */ +static void preempt(task_t *p, struct runqueue *rq) +{ + if (!TASK_PREEMPTS_CURR(p, rq)) + return; + + if (p->prio == rq->curr->prio && + ((p->sdu.staircase.totalrun || p->sdu.staircase.slice != slice(p)) || + rt_task(rq->curr))) + return; + + if (!sched_compute || rq->qu.staircase.cache_ticks >= cache_delay || + !p->mm || rt_task(p)) + resched_task(rq->curr); + rq->qu.staircase.preempted = 1; +} + +/*** + * staircase_wake_up_task - wake up a thread + * @p: the to-be-woken-up thread + * @old_state: the task's state before being woken + * @sync: do a synchronous wakeup? + * @rq: The run queue on which the task is to be placed (already locked) + */ +static void staircase_wake_up_task(struct task_struct *p, struct runqueue *rq, unsigned int old_state, int sync) +{ + int same_cpu = (rq == this_rq()); + + if (old_state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; + + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * don't trigger a preemption, if the woken up task will run on + * this cpu. (in this case the 'I will reschedule' promise of + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) + */ + activate_task(p, rq, same_cpu); + if (!sync || !same_cpu) + preempt(p, rq); +} + +/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current.
+ */ +static void staircase_fork(task_t *p) +{ +} + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. + */ +static void staircase_wake_up_new_task(task_t * p, unsigned long clone_flags) +{ + unsigned long flags; + int this_cpu, cpu; + runqueue_t *rq, *this_rq; + + rq = task_rq_lock(p, &flags); + cpu = task_cpu(p); + this_cpu = smp_processor_id(); + + BUG_ON(p->state != TASK_RUNNING); + + /* + * Forked process gets no burst to prevent fork bombs. + */ + p->sdu.staircase.burst = 0; + + if (likely(cpu == this_cpu)) { + current->sdu.staircase.sflags |= SF_FORKED; + + if (!(clone_flags & CLONE_VM)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. + */ + if (unlikely(!task_is_queued(current))) { + p->prio = effective_prio(p); + __activate_task(p, rq); + } else { + p->prio = current->prio; + list_add_tail(&p->run_list, &current->run_list); + inc_nr_running(p, rq); + } + set_need_resched(); + } else { + p->prio = effective_prio(p); + /* Run child last */ + __activate_task(p, rq); + } + /* + * We skip the following code due to cpu == this_cpu + */ + this_rq = rq; + } else { + this_rq = cpu_rq(this_cpu); + + /* + * Not the local CPU - must adjust timestamp. This should + * get optimised away in the !CONFIG_SMP case.
+ */ + p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; + p->prio = effective_prio(p); + __activate_task(p, rq); + preempt(p, rq); + + /* + * Parent and child are on different CPUs, now get the + * parent runqueue to update the parent's ->sdu.staircase.sleep_avg: + */ + task_rq_unlock(rq, &flags); + this_rq = task_rq_lock(current, &flags); + current->sdu.staircase.sflags |= SF_FORKED; + } + + task_rq_unlock(this_rq, &flags); +} + +/* + * Potentially available exiting-child timeslices are + * retrieved here - this way the parent does not get + * penalized for creating too many threads. + * + * (this cannot be used to 'generate' timeslices + * artificially, because any timeslice recovered here + * was given away by the parent in the first place.) + */ +static void staircase_exit(task_t * p) +{ +} + +/* + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +static inline +void pull_task(runqueue_t *src_rq, task_t *p, runqueue_t *this_rq, int this_cpu) +{ + dequeue_task(p, &src_rq->qu.staircase); + dec_nr_running(p, src_rq); + set_task_cpu(p, this_cpu); + inc_nr_running(p, this_rq); + enqueue_task(p, &this_rq->qu.staircase); + p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + + this_rq->timestamp_last_tick; + /* + * Note that idle threads have a prio of STAIRCASE_MAX_PRIO, for this test + * to be always true for them. + */ + preempt(p, this_rq); +} + +#ifdef CONFIG_SMP +/* + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. + * + * Called with both runqueues locked. 
+ */ +static int staircase_move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) +{ + struct list_head *head, *curr; + int idx, pulled = 0; + task_t *tmp; + + if (max_nr_move <= 0 || busiest->nr_running <= 1) + goto out; + + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(busiest->qu.staircase.bitmap); + else + idx = find_next_bit(busiest->qu.staircase.bitmap, STAIRCASE_MAX_PRIO, idx); + if (idx >= STAIRCASE_MAX_PRIO) + goto out; + + head = busiest->qu.staircase.queue + idx; + curr = head->prev; +skip_queue: + tmp = list_entry(curr, task_t, run_list); + + curr = curr->prev; + + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + +#ifdef CONFIG_SCHEDSTATS + if (task_hot(tmp, busiest->timestamp_last_tick, sd)) + schedstat_inc(sd, lb_hot_gained[idle]); +#endif + + pull_task(busiest, tmp, this_rq, this_cpu); + pulled++; + + /* We only want to steal up to the prescribed number of tasks. */ + if (pulled < max_nr_move) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } +out: + return pulled; +} +#endif + +static void time_slice_expired(task_t *p, runqueue_t *rq) +{ + struct staircase_runqueue_queue *rqq = &rq->qu.staircase; + + set_tsk_need_resched(p); + dequeue_task(p, rqq); + p->prio = effective_prio(p); + p->sdu.staircase.time_slice = rr_interval(p); + enqueue_task(p, rqq); +} + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. 
+ */ +static void staircase_tick(struct task_struct *p, struct runqueue *rq, unsigned long long now) +{ + int cpu = smp_processor_id(); + unsigned long debit; + + if (p == rq->idle) { + if (wake_priority_sleeper(rq)) + goto out; + rebalance_tick(cpu, rq, SCHED_IDLE); + return; + } + + /* Task might have expired already, but not scheduled off yet */ + if (unlikely(!task_is_queued(p))) { + set_tsk_need_resched(p); + goto out; + } + + /* + * SCHED_FIFO tasks never run out of timeslice. + */ + if (unlikely(p->policy == SCHED_FIFO)) + goto out; + + spin_lock(&rq->lock); + debit = ns_diff(rq->timestamp_last_tick, p->timestamp); + p->sdu.staircase.ns_debit += debit; + if (p->sdu.staircase.ns_debit < NSJIFFY) + goto out_unlock; + p->sdu.staircase.ns_debit %= NSJIFFY; + /* + * Tasks lose burst each time they use up a full slice(). + */ + if (!--p->sdu.staircase.slice) { + dec_burst(p); + p->sdu.staircase.slice = slice(p); + time_slice_expired(p, rq); + p->sdu.staircase.totalrun = 0; + goto out_unlock; + } + /* + * Tasks that run out of time_slice but still have slice left get + * requeued with a lower priority && RR_INTERVAL time_slice. 
+ */ + if (!--p->sdu.staircase.time_slice) { + time_slice_expired(p, rq); + goto out_unlock; + } + rq->qu.staircase.cache_ticks++; + if (rq->qu.staircase.preempted && rq->qu.staircase.cache_ticks >= cache_delay) + set_tsk_need_resched(p); +out_unlock: + spin_unlock(&rq->lock); +out: + rebalance_tick(cpu, rq, NOT_IDLE); +} + +#ifdef CONFIG_SCHED_SMT +static struct task_struct *staircase_head_of_queue(union runqueue_queue *rqq) +{ + return list_entry(rqq->staircase.queue[sched_find_first_bit(rqq->staircase.bitmap)].next, + task_t, run_list); +} + +static int staircase_dependent_sleeper_trumps(const struct task_struct *p1, + const struct task_struct * p2, struct sched_domain *sd) +{ + return ((p1->sdu.staircase.time_slice * (100 - sd->per_cpu_gain) / 100) > + slice(p2) || rt_task(p1)) && + p2->mm && p1->mm && !rt_task(p2); +} +#endif + +/* + * schedule() is the main scheduler function. + */ +static void staircase_schedule(void) +{ + long *switch_count; + int cpu, idx; + struct task_struct *prev = current, *next; + struct runqueue *rq = this_rq(); + unsigned long long now = sched_clock(); + unsigned long debit; + struct list_head *queue; + + spin_lock_irq(&rq->lock); + + prev->sdu.staircase.runtime = ns_diff(now, prev->timestamp); + debit = ns_diff(now, rq->timestamp_last_tick) % NSJIFFY; + prev->sdu.staircase.ns_debit += debit; + + if (unlikely(current->flags & PF_DEAD)) + current->state = EXIT_DEAD; + /* + * if entering off of a kernel preemption go straight + * to picking the next task. 
+ */ + switch_count = &prev->nivcsw; + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + switch_count = &prev->nvcsw; + if (unlikely((prev->state & TASK_INTERRUPTIBLE) && + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else { + if (prev->state == TASK_UNINTERRUPTIBLE) { + rq->nr_uninterruptible++; + prev->sdu.staircase.sflags |= SF_UISLEEP; + } + deactivate_task(prev, rq); + } + } + + cpu = smp_processor_id(); + if (unlikely(!rq->nr_running)) { +go_idle: + idle_balance(cpu, rq); + if (!rq->nr_running) { + next = rq->idle; + wake_sleeping_dependent(cpu, rq); + /* + * wake_sleeping_dependent() might have released + * the runqueue, so break out if we got new + * tasks meanwhile: + */ + if (!rq->nr_running) + goto switch_tasks; + } + } else { + if (dependent_sleeper(cpu, rq)) { + next = rq->idle; + goto switch_tasks; + } + /* + * dependent_sleeper() releases and reacquires the runqueue + * lock, hence go into the idle loop if the rq went + * empty meanwhile: + */ + if (unlikely(!rq->nr_running)) + goto go_idle; + } + + idx = sched_find_first_bit(rq->qu.staircase.bitmap); + queue = rq->qu.staircase.queue + idx; + next = list_entry(queue->next, task_t, run_list); +switch_tasks: + if (next == rq->idle) + schedstat_inc(rq, sched_goidle); + prefetch(next); + clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); + + update_cpu_clock(prev, rq, now); + prev->timestamp = prev->last_ran = now; + if (unlikely(next->sdu.staircase.sflags & SF_YIELDED)) { + /* + * Tasks that have yield()ed get requeued at normal priority + */ + int newprio = effective_prio(next); + next->sdu.staircase.sflags &= ~SF_YIELDED; + if (newprio != next->prio) { + struct staircase_runqueue_queue *rqq = &rq->qu.staircase; + + dequeue_task(next, rqq); + next->prio = newprio; + enqueue_task_head(next, rqq); + } + } + + sched_info_switch(prev, next); + if (likely(prev != next)) { + rq->qu.staircase.preempted = 0; + rq->qu.staircase.cache_ticks = 0; + next->timestamp = now; 
+ rq->nr_switches++; + rq->curr = next; + ++*switch_count; + + prepare_arch_switch(rq, next); + prev = context_switch(rq, prev, next); + barrier(); + + finish_task_switch(prev); + } else + spin_unlock_irq(&rq->lock); +} + +static void staircase_set_normal_task_nice(task_t *p, long nice) +{ + int queued; + int old_prio, new_prio, delta; + struct runqueue *rq = task_rq(p); + struct staircase_runqueue_queue *rqq = &rq->qu.staircase; + + queued = task_is_queued(p); + if (queued) { + dequeue_task(p, rqq); + dec_prio_bias(rq, p->static_prio); + } + + old_prio = p->prio; + new_prio = NICE_TO_PRIO(nice); + delta = new_prio - old_prio; + p->static_prio = NICE_TO_PRIO(nice); + p->prio += delta; + + if (queued) { + inc_prio_bias(rq, p->static_prio); + enqueue_task(p, rqq); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + if (delta < 0 || (delta > 0 && task_running(rq, p))) + resched_task(rq->curr); + } +} + +/* + * setscheduler - change the scheduling policy and/or RT priority of a thread. + */ +static void staircase_setscheduler(task_t *p, int policy, int prio) +{ + int oldprio; + int queued; + runqueue_t *rq = task_rq(p); + + queued = task_is_queued(p); + if (queued) + deactivate_task(p, rq); + oldprio = p->prio; + __setscheduler(p, policy, prio); + if (queued) { + __activate_task(p, rq); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else + preempt(p, rq); + } +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * this function yields the current CPU by moving the calling thread + * to the expired array. If there are no other threads running on this + * CPU then this function will return. 
+ */ + +static long staircase_sys_yield(void) +{ + int newprio; + runqueue_t *rq = this_rq_lock(); + struct staircase_runqueue_queue *rqq = &rq->qu.staircase; + + schedstat_inc(rq, yld_cnt); + newprio = current->prio; + current->sdu.staircase.slice = slice(current); + current->sdu.staircase.time_slice = rr_interval(current); + if (likely(!rt_task(current))) { + current->sdu.staircase.sflags |= SF_YIELDED; + newprio = STAIRCASE_MAX_PRIO - 1; + } + + if (newprio != current->prio) { + dequeue_task(current, rqq); + current->prio = newprio; + enqueue_task(current, rqq); + } else + requeue_task(current, rqq); + + if (rq->nr_running == 1) + schedstat_inc(rq, yld_both_empty); + + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: + */ + __release(rq->lock); + _raw_spin_unlock(&rq->lock); + preempt_enable_no_resched(); + + schedule(); + + return 0; +} + +static void staircase_yield(void) +{ + set_current_state(TASK_RUNNING); + staircase_sys_yield(); +} + +static void staircase_init_idle(task_t *idle, int cpu) +{ + idle->prio = STAIRCASE_MAX_PRIO; +} + +#ifdef CONFIG_SMP +/* source and destination queues will be already locked */ +static void staircase_migrate_queued_task(struct task_struct *p, int dest_cpu) +{ + struct runqueue *rq_src = task_rq(p); + struct runqueue *rq_dest = cpu_rq(dest_cpu); + + /* + * Sync timestamp with rq_dest's before activating. + * The same thing could be achieved by doing this step + * afterwards, and pretending it was a local activate. + * This way is cleaner and logically correct. 
+ */ + p->timestamp = p->timestamp - rq_src->timestamp_last_tick + + rq_dest->timestamp_last_tick; + deactivate_task(p, rq_src); + set_task_cpu(p, dest_cpu); + activate_task(p, rq_dest, 0); + preempt(p, rq_dest); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void staircase_set_select_idle_first(struct runqueue *rq) +{ + __setscheduler(rq->idle, SCHED_FIFO, MAX_RT_PRIO-1); + /* Add idle task to _front_ of it's priority queue */ + __activate_idle_task(rq->idle, rq); +} + +static void staircase_set_select_idle_last(struct runqueue *rq) +{ + deactivate_task(rq->idle, rq); + rq->idle->static_prio = STAIRCASE_MAX_PRIO; + __setscheduler(rq->idle, SCHED_NORMAL, 0); +} + +static void staircase_migrate_dead_tasks(unsigned int dead_cpu) +{ + unsigned i; + struct runqueue *rq = cpu_rq(dead_cpu); + + for (i = 0; i < STAIRCASE_MAX_PRIO; i++) { + struct list_head *list = &rq->qu.staircase.queue[i]; + while (!list_empty(list)) + migrate_dead(dead_cpu, list_entry(list->next, task_t, run_list)); + } +} +#endif +#endif + +static void staircase_sched_init(void) +{ + init_task.sdu.staircase.time_slice = HZ; + init_task.sdu.staircase.slice = HZ; +} + +#ifdef CONFIG_MAGIC_SYSRQ +static void staircase_normalize_rt_task(struct task_struct *p) +{ + int queued; + unsigned long flags; + runqueue_t *rq; + + rq = task_rq_lock(p, &flags); + + queued = task_is_queued(p); + if (queued) + deactivate_task(p, rq); + __setscheduler(p, SCHED_NORMAL, 0); + if (queued) { + __activate_task(p, rq); + resched_task(rq->curr); + } + + task_rq_unlock(rq, &flags); +} +#endif + +#ifdef CONFIG_SYSFS +#define no_change(a) (a) +SCHED_DRV_SYSFS_UINT_RW(cache_delay, msecs_to_jiffies, jiffies_to_msecs, 1, 1000); +SCHED_DRV_SYSFS_UINT_RW(sched_compute, no_change, no_change, 0, 1); +SCHED_DRV_SYSFS_UINT_RW(sched_interactive, no_change, no_change, 0, 1); + +static struct attribute *staircase_attrs[] = { + &SCHED_DRV_SYSFS_ATTR(cache_delay), + &SCHED_DRV_SYSFS_ATTR(sched_compute), + &SCHED_DRV_SYSFS_ATTR(sched_interactive), + 
NULL, +}; +#endif + +const struct sched_drv staircase_sched_drv = { + .name = "staircase", + .init_runqueue_queue = staircase_init_runqueue_queue, + .set_oom_time_slice = staircase_set_oom_time_slice, + .task_timeslice = slice, + .wake_up_task = staircase_wake_up_task, + .fork = staircase_fork, + .wake_up_new_task = staircase_wake_up_new_task, + .exit = staircase_exit, +#ifdef CONFIG_SMP + .move_tasks = staircase_move_tasks, +#endif + .tick = staircase_tick, +#ifdef CONFIG_SCHED_SMT + .head_of_queue = staircase_head_of_queue, + .dependent_sleeper_trumps = staircase_dependent_sleeper_trumps, +#endif + .schedule = staircase_schedule, + .set_normal_task_nice = staircase_set_normal_task_nice, + .setscheduler = staircase_setscheduler, + .sys_yield = staircase_sys_yield, + .yield = staircase_yield, + .init_idle = staircase_init_idle, + .sched_init = staircase_sched_init, +#ifdef CONFIG_SMP + .migrate_queued_task = staircase_migrate_queued_task, +#ifdef CONFIG_HOTPLUG_CPU + .set_select_idle_first = staircase_set_select_idle_first, + .set_select_idle_last = staircase_set_select_idle_last, + .migrate_dead_tasks = staircase_migrate_dead_tasks, +#endif +#endif +#ifdef CONFIG_MAGIC_SYSRQ + .normalize_rt_task = staircase_normalize_rt_task, +#endif +#ifdef CONFIG_SYSFS + .attrs = staircase_attrs, +#endif +}; diff -Naur linux-2.6.12-rc5-mm1/kernel/sys_ni.c linux-2.6.12-rc5-mm1-plug/kernel/sys_ni.c --- linux-2.6.12-rc5-mm1/kernel/sys_ni.c 2005-05-25 16:23:47.405414664 -0700 +++ linux-2.6.12-rc5-mm1-plug/kernel/sys_ni.c 2005-05-25 17:04:42.731148416 -0700 @@ -68,10 +68,6 @@ cond_syscall(compat_sys_mq_timedreceive); cond_syscall(compat_sys_mq_notify); cond_syscall(compat_sys_mq_getsetattr); -cond_syscall(sys_vperfctr_open); -cond_syscall(sys_vperfctr_control); -cond_syscall(sys_vperfctr_write); -cond_syscall(sys_vperfctr_read); cond_syscall(sys_mbind); cond_syscall(sys_get_mempolicy); cond_syscall(sys_set_mempolicy); diff -Naur linux-2.6.12-rc5-mm1/kernel/timer.c 
linux-2.6.12-rc5-mm1-plug/kernel/timer.c --- linux-2.6.12-rc5-mm1/kernel/timer.c 2005-05-25 16:23:47.408414208 -0700 +++ linux-2.6.12-rc5-mm1-plug/kernel/timer.c 2005-05-25 17:04:42.727149024 -0700 @@ -32,7 +32,6 @@ #include #include #include -#include #include #include @@ -847,7 +846,6 @@ account_user_time(p, jiffies_to_cputime(1)); else account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); - perfctr_sample_thread(&p->thread); run_local_timers(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); diff -Naur linux-2.6.12-rc5-mm1/mm/oom_kill.c linux-2.6.12-rc5-mm1-plug/mm/oom_kill.c --- linux-2.6.12-rc5-mm1/mm/oom_kill.c 2005-05-25 16:21:00.000000000 -0700 +++ linux-2.6.12-rc5-mm1-plug/mm/oom_kill.c 2005-05-25 17:05:49.657973992 -0700 @@ -196,7 +196,7 @@ * all the memory it needs. That way it should be able to * exit() and clear out its resources quickly... */ - p->time_slice = HZ; + set_oom_time_slice(p, HZ); set_tsk_thread_flag(p, TIF_MEMDIE); force_sig(SIGKILL, p);