diff -urN oldtree/include/linux/mmzone.h newtree/include/linux/mmzone.h --- oldtree/include/linux/mmzone.h 2006-03-08 18:48:02.316023750 +0000 +++ newtree/include/linux/mmzone.h 2006-03-08 20:52:38.991286750 +0000 @@ -121,7 +121,7 @@ struct zone { /* Fields commonly accessed by the page allocator */ unsigned long free_pages; - unsigned long pages_min, pages_low, pages_high; + unsigned long pages_min, pages_low, pages_high, pages_lots; /* * We don't know if the memory that we're going to allocate will be freeable * or/and it will be released eventually, so to avoid totally wasting several diff -urN oldtree/include/linux/mmzone.h.orig newtree/include/linux/mmzone.h.orig --- oldtree/include/linux/mmzone.h.orig 1970-01-01 00:00:00.000000000 +0000 +++ newtree/include/linux/mmzone.h.orig 2006-03-08 18:48:02.000000000 +0000 @@ -0,0 +1,613 @@ +#ifndef _LINUX_MMZONE_H +#define _LINUX_MMZONE_H + +#ifdef __KERNEL__ +#ifndef __ASSEMBLY__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Free memory management - zoned buddy allocator. */ +#ifndef CONFIG_FORCE_MAX_ZONEORDER +#define MAX_ORDER 11 +#else +#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER +#endif + +struct free_area { + struct list_head free_list; + unsigned long nr_free; +}; + +struct pglist_data; + +/* + * zone->lock and zone->lru_lock are two of the hottest locks in the kernel. + * So add a wild amount of padding here to ensure that they fall into separate + * cachelines. There are very few zone structures in the machine, so space + * consumption is not a concern here. + */ +#if defined(CONFIG_SMP) +struct zone_padding { + char x[0]; +} ____cacheline_internodealigned_in_smp; +#define ZONE_PADDING(name) struct zone_padding name; +#else +#define ZONE_PADDING(name) +#endif + +struct per_cpu_pages { + int count; /* number of pages in the list */ + int high; /* high watermark, emptying needed */ + int batch; /* chunk size for buddy add/remove */ + struct list_head list; /* the list of pages */ +}; + +struct per_cpu_pageset { + struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ +#ifdef CONFIG_NUMA + unsigned long numa_hit; /* allocated in intended node */ + unsigned long numa_miss; /* allocated in non intended node */ + unsigned long numa_foreign; /* was intended here, hit elsewhere */ + unsigned long interleave_hit; /* interleaver prefered this zone */ + unsigned long local_node; /* allocation from local node */ + unsigned long other_node; /* allocation from other node */ +#endif +} ____cacheline_aligned_in_smp; + +#ifdef CONFIG_NUMA +#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)]) +#else +#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)]) +#endif + +#define ZONE_DMA 0 +#define ZONE_DMA32 1 +#define ZONE_NORMAL 2 +#define ZONE_HIGHMEM 3 + +#define MAX_NR_ZONES 4 /* Sync this with ZONES_SHIFT */ +#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */ + + +/* + * When a memory allocation must conform to specific limitations (such + * as being suitable for DMA) the caller will pass in hints to the + * allocator in the gfp_mask, in the zone modifier bits. These bits + * are used to select a priority ordered list of memory zones which + * match the requested limits. GFP_ZONEMASK defines which bits within + * the gfp_mask should be considered as zone modifiers. Each valid + * combination of the zone modifier bits has a corresponding list + * of zones (in node_zonelists). Thus for two zone modifiers there + * will be a maximum of 4 (2 ** 2) zonelists, for 3 modifiers there will + * be 8 (2 ** 3) zonelists. GFP_ZONETYPES defines the number of possible + * combinations of zone modifiers in "zone modifier space". + * + * As an optimisation any zone modifier bits which are only valid when + * no other zone modifier bits are set (loners) should be placed in + * the highest order bits of this field. This allows us to reduce the + * extent of the zonelists thus saving space. For example in the case + * of three zone modifier bits, we could require up to eight zonelists. + * If the left most zone modifier is a "loner" then the highest valid + * zonelist would be four allowing us to allocate only five zonelists. + * Use the first form for GFP_ZONETYPES when the left most bit is not + * a "loner", otherwise use the second. + * + * NOTE! Make sure this matches the zones in + */ +#define GFP_ZONEMASK 0x07 +/* #define GFP_ZONETYPES (GFP_ZONEMASK + 1) */ /* Non-loner */ +#define GFP_ZONETYPES ((GFP_ZONEMASK + 1) / 2 + 1) /* Loner */ + +/* + * On machines where it is needed (eg PCs) we divide physical memory + * into multiple physical zones. On a 32bit PC we have 4 zones: + * + * ZONE_DMA < 16 MB ISA DMA capable memory + * ZONE_DMA32 0 MB Empty + * ZONE_NORMAL 16-896 MB direct mapped by the kernel + * ZONE_HIGHMEM > 896 MB only page cache and user processes + */ + +struct zone { + /* Fields commonly accessed by the page allocator */ + unsigned long free_pages; + unsigned long pages_min, pages_low, pages_high; + /* + * We don't know if the memory that we're going to allocate will be freeable + * or/and it will be released eventually, so to avoid totally wasting several + * GB of ram we must reserve some of the lower zone memory (otherwise we risk + * to run OOM on the lower zones despite there's tons of freeable ram + * on the higher zones). This array is recalculated at runtime if the + * sysctl_lowmem_reserve_ratio sysctl changes. + */ + unsigned long lowmem_reserve[MAX_NR_ZONES]; + +#ifdef CONFIG_NUMA + struct per_cpu_pageset *pageset[NR_CPUS]; +#else + struct per_cpu_pageset pageset[NR_CPUS]; +#endif + /* + * free areas of different sizes + */ + spinlock_t lock; +#ifdef CONFIG_MEMORY_HOTPLUG + /* see spanned/present_pages for more description */ + seqlock_t span_seqlock; +#endif + struct free_area free_area[MAX_ORDER]; + + + ZONE_PADDING(_pad1_) + + /* Fields commonly accessed by the page reclaim scanner */ + spinlock_t lru_lock; + struct list_head active_list; + struct list_head inactive_list; + unsigned long nr_scan_active; + unsigned long nr_scan_inactive; + unsigned long nr_active; + unsigned long nr_inactive; + unsigned long pages_scanned; /* since last reclaim */ + int all_unreclaimable; /* All pages pinned */ + + /* A count of how many reclaimers are scanning this zone */ + atomic_t reclaim_in_progress; + + /* + * timestamp (in jiffies) of the last zone reclaim that did not + * result in freeing of pages. This is used to avoid repeated scans + * if all memory in the zone is in use. + */ + unsigned long last_unsuccessful_zone_reclaim; + + /* + * prev_priority holds the scanning priority for this zone. It is + * defined as the scanning priority at which we achieved our reclaim + * target at the previous try_to_free_pages() or balance_pgdat() + * invokation. + * + * We use prev_priority as a measure of how much stress page reclaim is + * under - it drives the swappiness decision: whether to unmap mapped + * pages. + * + * temp_priority is used to remember the scanning priority at which + * this zone was successfully refilled to free_pages == pages_high. + * + * Access to both these fields is quite racy even on uniprocessor. But + * it is expected to average out OK. + */ + int temp_priority; + int prev_priority; + + + ZONE_PADDING(_pad2_) + /* Rarely used or read-mostly fields */ + + /* + * wait_table -- the array holding the hash table + * wait_table_size -- the size of the hash table array + * wait_table_bits -- wait_table_size == (1 << wait_table_bits) + * + * The purpose of all these is to keep track of the people + * waiting for a page to become available and make them + * runnable again when possible. The trouble is that this + * consumes a lot of space, especially when so few things + * wait on pages at a given time. So instead of using + * per-page waitqueues, we use a waitqueue hash table. + * + * The bucket discipline is to sleep on the same queue when + * colliding and wake all in that wait queue when removing. + * When something wakes, it must check to be sure its page is + * truly available, a la thundering herd. The cost of a + * collision is great, but given the expected load of the + * table, they should be so rare as to be outweighed by the + * benefits from the saved space. + * + * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the + * primary users of these fields, and in mm/page_alloc.c + * free_area_init_core() performs the initialization of them. + */ + wait_queue_head_t * wait_table; + unsigned long wait_table_size; + unsigned long wait_table_bits; + + /* + * Discontig memory support fields. + */ + struct pglist_data *zone_pgdat; + /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ + unsigned long zone_start_pfn; + + /* + * zone_start_pfn, spanned_pages and present_pages are all + * protected by span_seqlock. It is a seqlock because it has + * to be read outside of zone->lock, and it is done in the main + * allocator path. But, it is written quite infrequently. + * + * The lock is declared along with zone->lock because it is + * frequently read in proximity to zone->lock. It's good to + * give them a chance of being in the same cacheline. + */ + unsigned long spanned_pages; /* total size, including holes */ + unsigned long present_pages; /* amount of memory (excluding holes) */ + + /* + * rarely used fields: + */ + char *name; +} ____cacheline_internodealigned_in_smp; + + +/* + * The "priority" of VM scanning is how much of the queues we will scan in one + * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the + * queues ("queue_length >> 12") during an aging round. + */ +#define DEF_PRIORITY 12 + +/* + * One allocation request operates on a zonelist. A zonelist + * is a list of zones, the first one is the 'goal' of the + * allocation, the other zones are fallback zones, in decreasing + * priority. + * + * Right now a zonelist takes up less than a cacheline. We never + * modify it apart from boot-up, and only a few indices are used, + * so despite the zonelist table being relatively big, the cache + * footprint of this construct is very small. + */ +struct zonelist { + struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited +}; + + +/* + * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM + * (mostly NUMA machines?) to denote a higher-level memory zone than the + * zone denotes. + * + * On NUMA machines, each NUMA node would have a pg_data_t to describe + * it's memory layout. + * + * Memory statistics and page replacement data structures are maintained on a + * per-zone basis. + */ +struct bootmem_data; +typedef struct pglist_data { + struct zone node_zones[MAX_NR_ZONES]; + struct zonelist node_zonelists[GFP_ZONETYPES]; + int nr_zones; +#ifdef CONFIG_FLAT_NODE_MEM_MAP + struct page *node_mem_map; +#endif + struct bootmem_data *bdata; +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Must be held any time you expect node_start_pfn, node_present_pages + * or node_spanned_pages stay constant. Holding this will also + * guarantee that any pfn_valid() stays that way. + * + * Nests above zone->lock and zone->size_seqlock. + */ + spinlock_t node_size_lock; +#endif + unsigned long node_start_pfn; + unsigned long node_present_pages; /* total number of physical pages */ + unsigned long node_spanned_pages; /* total size of physical page + range, including holes */ + int node_id; + wait_queue_head_t kswapd_wait; + struct task_struct *kswapd; + int kswapd_max_order; +} pg_data_t; + +#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) +#define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) +#ifdef CONFIG_FLAT_NODE_MEM_MAP +#define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) +#else +#define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr)) +#endif +#define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) + +#include + +void __get_zone_counts(unsigned long *active, unsigned long *inactive, + unsigned long *free, struct pglist_data *pgdat); +void get_zone_counts(unsigned long *active, unsigned long *inactive, + unsigned long *free); +void build_all_zonelists(void); +void wakeup_kswapd(struct zone *zone, int order); +int zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags); + +#ifdef CONFIG_HAVE_MEMORY_PRESENT +void memory_present(int nid, unsigned long start, unsigned long end); +#else +static inline void memory_present(int nid, unsigned long start, unsigned long end) {} +#endif + +#ifdef CONFIG_NEED_NODE_MEMMAP_SIZE +unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); +#endif + +/* + * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. + */ +#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) + +static inline int populated_zone(struct zone *zone) +{ + return (!!zone->present_pages); +} + +static inline int is_highmem_idx(int idx) +{ + return (idx == ZONE_HIGHMEM); +} + +static inline int is_normal_idx(int idx) +{ + return (idx == ZONE_NORMAL); +} + +/** + * is_highmem - helper function to quickly check if a struct zone is a + * highmem zone or not. This is an attempt to keep references + * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. + * @zone - pointer to struct zone variable + */ +static inline int is_highmem(struct zone *zone) +{ + return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM; +} + +static inline int is_normal(struct zone *zone) +{ + return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL; +} + +static inline int is_dma32(struct zone *zone) +{ + return zone == zone->zone_pgdat->node_zones + ZONE_DMA32; +} + +static inline int is_dma(struct zone *zone) +{ + return zone == zone->zone_pgdat->node_zones + ZONE_DMA; +} + +/* These two functions are used to setup the per zone pages min values */ +struct ctl_table; +struct file; +int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; +int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); + +#include +/* Returns the number of the current Node. */ +#ifndef numa_node_id +#define numa_node_id() (cpu_to_node(raw_smp_processor_id())) +#endif + +#ifndef CONFIG_NEED_MULTIPLE_NODES + +extern struct pglist_data contig_page_data; +#define NODE_DATA(nid) (&contig_page_data) +#define NODE_MEM_MAP(nid) mem_map +#define MAX_NODES_SHIFT 1 + +#else /* CONFIG_NEED_MULTIPLE_NODES */ + +#include + +#endif /* !CONFIG_NEED_MULTIPLE_NODES */ + +extern struct pglist_data *first_online_pgdat(void); +extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); +extern struct zone *next_zone(struct zone *zone); + +/** + * for_each_pgdat - helper macro to iterate over all nodes + * @pgdat - pointer to a pg_data_t variable + */ +#define for_each_online_pgdat(pgdat) \ + for (pgdat = first_online_pgdat(); \ + pgdat; \ + pgdat = next_online_pgdat(pgdat)) +/** + * for_each_zone - helper macro to iterate over all memory zones + * @zone - pointer to struct zone variable + * + * The user only needs to declare the zone variable, for_each_zone + * fills it in. + */ +#define for_each_zone(zone) \ + for (zone = (first_online_pgdat())->node_zones; \ + zone; \ + zone = next_zone(zone)) + +#ifdef CONFIG_SPARSEMEM +#include +#endif + +#if BITS_PER_LONG == 32 +/* + * with 32 bit page->flags field, we reserve 9 bits for node/zone info. + * there are 4 zones (3 bits) and this leaves 9-3=6 bits for nodes. + */ +#define FLAGS_RESERVED 9 + +#elif BITS_PER_LONG == 64 +/* + * with 64 bit flags field, there's plenty of room. + */ +#define FLAGS_RESERVED 32 + +#else + +#error BITS_PER_LONG not defined + +#endif + +#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +#define early_pfn_to_nid(nid) (0UL) +#endif + +#ifdef CONFIG_FLATMEM +#define pfn_to_nid(pfn) (0) +#endif + +#define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT) +#define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT) + +#ifdef CONFIG_SPARSEMEM + +/* + * SECTION_SHIFT #bits space required to store a section # + * + * PA_SECTION_SHIFT physical address to/from section number + * PFN_SECTION_SHIFT pfn to/from section number + */ +#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) + +#define PA_SECTION_SHIFT (SECTION_SIZE_BITS) +#define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) + +#define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) + +#define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) +#define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) + +#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS +#error Allocator MAX_ORDER exceeds SECTION_SIZE +#endif + +struct page; +struct mem_section { + /* + * This is, logically, a pointer to an array of struct + * pages. However, it is stored with some other magic. + * (see sparse.c::sparse_init_one_section()) + * + * Making it a UL at least makes someone do a cast + * before using it wrong. + */ + unsigned long section_mem_map; +}; + +#ifdef CONFIG_SPARSEMEM_EXTREME +#define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) +#else +#define SECTIONS_PER_ROOT 1 +#endif + +#define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) +#define NR_SECTION_ROOTS (NR_MEM_SECTIONS / SECTIONS_PER_ROOT) +#define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) + +#ifdef CONFIG_SPARSEMEM_EXTREME +extern struct mem_section *mem_section[NR_SECTION_ROOTS]; +#else +extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; +#endif + +static inline struct mem_section *__nr_to_section(unsigned long nr) +{ + if (!mem_section[SECTION_NR_TO_ROOT(nr)]) + return NULL; + return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; +} +extern int __section_nr(struct mem_section* ms); + +/* + * We use the lower bits of the mem_map pointer to store + * a little bit of information. There should be at least + * 3 bits here due to 32-bit alignment. + */ +#define SECTION_MARKED_PRESENT (1UL<<0) +#define SECTION_HAS_MEM_MAP (1UL<<1) +#define SECTION_MAP_LAST_BIT (1UL<<2) +#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) + +static inline struct page *__section_mem_map_addr(struct mem_section *section) +{ + unsigned long map = section->section_mem_map; + map &= SECTION_MAP_MASK; + return (struct page *)map; +} + +static inline int valid_section(struct mem_section *section) +{ + return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); +} + +static inline int section_has_mem_map(struct mem_section *section) +{ + return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); +} + +static inline int valid_section_nr(unsigned long nr) +{ + return valid_section(__nr_to_section(nr)); +} + +static inline struct mem_section *__pfn_to_section(unsigned long pfn) +{ + return __nr_to_section(pfn_to_section_nr(pfn)); +} + +static inline int pfn_valid(unsigned long pfn) +{ + if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) + return 0; + return valid_section(__nr_to_section(pfn_to_section_nr(pfn))); +} + +/* + * These are _only_ used during initialisation, therefore they + * can use __initdata ... They could have names to indicate + * this restriction. + */ +#ifdef CONFIG_NUMA +#define pfn_to_nid(pfn) \ +({ \ + unsigned long __pfn_to_nid_pfn = (pfn); \ + page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \ +}) +#else +#define pfn_to_nid(pfn) (0) +#endif + +#define early_pfn_valid(pfn) pfn_valid(pfn) +void sparse_init(void); +#else +#define sparse_init() do {} while (0) +#define sparse_index_init(_sec, _nid) do {} while (0) +#endif /* CONFIG_SPARSEMEM */ + +#ifndef early_pfn_valid +#define early_pfn_valid(pfn) (1) +#endif + +void memory_present(int nid, unsigned long start, unsigned long end); +unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); + +#endif /* !__ASSEMBLY__ */ +#endif /* __KERNEL__ */ +#endif /* _LINUX_MMZONE_H */ diff -urN oldtree/include/linux/swap.h newtree/include/linux/swap.h --- oldtree/include/linux/swap.h 2006-03-08 18:48:02.356026250 +0000 +++ newtree/include/linux/swap.h 2006-03-08 20:55:17.937220250 +0000 @@ -175,7 +175,8 @@ /* linux/mm/vmscan.c */ extern unsigned long try_to_free_pages(struct zone **, gfp_t); extern unsigned long shrink_all_memory(unsigned long nr_pages); -extern int vm_swappiness; +extern int vm_mapped; +extern int vm_hardmaplimit; #ifdef CONFIG_NUMA extern int zone_reclaim_mode; diff -urN oldtree/include/linux/swap.h.orig newtree/include/linux/swap.h.orig --- oldtree/include/linux/swap.h.orig 1970-01-01 00:00:00.000000000 +0000 +++ newtree/include/linux/swap.h.orig 2006-03-08 18:48:02.000000000 +0000 @@ -0,0 +1,331 @@ +#ifndef _LINUX_SWAP_H +#define _LINUX_SWAP_H + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ +#define SWAP_FLAG_PRIO_MASK 0x7fff +#define SWAP_FLAG_PRIO_SHIFT 0 + +static inline int current_is_kswapd(void) +{ + return current->flags & PF_KSWAPD; +} + +/* + * MAX_SWAPFILES defines the maximum number of swaptypes: things which can + * be swapped to. The swap type and the offset into that swap type are + * encoded into pte's and into pgoff_t's in the swapcache. Using five bits + * for the type means that the maximum number of swapcache pages is 27 bits + * on 32-bit-pgoff_t architectures. And that assumes that the architecture packs + * the type/offset into the pte as 5/27 as well. + */ +#define MAX_SWAPFILES_SHIFT 5 +#define MAX_SWAPFILES (1 << MAX_SWAPFILES_SHIFT) + +/* + * Magic header for a swap area. The first part of the union is + * what the swap magic looks like for the old (limited to 128MB) + * swap area format, the second part of the union adds - in the + * old reserved area - some extra information. Note that the first + * kilobyte is reserved for boot loader or disk label stuff... + * + * Having the magic at the end of the PAGE_SIZE makes detecting swap + * areas somewhat tricky on machines that support multiple page sizes. + * For 2.5 we'll probably want to move the magic to just beyond the + * bootbits... + */ +union swap_header { + struct { + char reserved[PAGE_SIZE - 10]; + char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */ + } magic; + struct { + char bootbits[1024]; /* Space for disklabel etc. */ + unsigned int version; + unsigned int last_page; + unsigned int nr_badpages; + unsigned int padding[125]; + unsigned int badpages[1]; + } info; +}; + + /* A swap entry has to fit into a "unsigned long", as + * the entry is hidden in the "index" field of the + * swapper address space. + */ +typedef struct { + unsigned long val; +} swp_entry_t; + +/* + * current->reclaim_state points to one of these when a task is running + * memory reclaim + */ +struct reclaim_state { + unsigned long reclaimed_slab; +}; + +#ifdef __KERNEL__ + +struct address_space; +struct sysinfo; +struct writeback_control; +struct zone; + +/* + * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of + * disk blocks. A list of swap extents maps the entire swapfile. (Where the + * term `swapfile' refers to either a blockdevice or an IS_REG file. Apart + * from setup, they're handled identically. + * + * We always assume that blocks are of size PAGE_SIZE. + */ +struct swap_extent { + struct list_head list; + pgoff_t start_page; + pgoff_t nr_pages; + sector_t start_block; +}; + +/* + * Max bad pages in the new format.. + */ +#define __swapoffset(x) ((unsigned long)&((union swap_header *)0)->x) +#define MAX_SWAP_BADPAGES \ + ((__swapoffset(magic.magic) - __swapoffset(info.badpages)) / sizeof(int)) + +enum { + SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ + SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ + SWP_ACTIVE = (SWP_USED | SWP_WRITEOK), + /* add others here before... */ + SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ +}; + +#define SWAP_CLUSTER_MAX 32 + +#define SWAP_MAP_MAX 0x7fff +#define SWAP_MAP_BAD 0x8000 + +/* + * The in-memory structure used to track swap areas. + */ +struct swap_info_struct { + unsigned int flags; + int prio; /* swap priority */ + struct file *swap_file; + struct block_device *bdev; + struct list_head extent_list; + struct swap_extent *curr_swap_extent; + unsigned old_block_size; + unsigned short * swap_map; + unsigned int lowest_bit; + unsigned int highest_bit; + unsigned int cluster_next; + unsigned int cluster_nr; + unsigned int pages; + unsigned int max; + unsigned int inuse_pages; + int next; /* next entry on swap list */ +}; + +struct swap_list_t { + int head; /* head of priority-ordered swapfile list */ + int next; /* swapfile to be used next */ +}; + +/* Swap 50% full? Release swapcache more aggressively.. */ +#define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) + +/* linux/mm/oom_kill.c */ +extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order); + +/* linux/mm/memory.c */ +extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *); + +/* linux/mm/page_alloc.c */ +extern unsigned long totalram_pages; +extern unsigned long totalhigh_pages; +extern long nr_swap_pages; +extern unsigned int nr_free_pages(void); +extern unsigned int nr_free_pages_pgdat(pg_data_t *pgdat); +extern unsigned int nr_free_buffer_pages(void); +extern unsigned int nr_free_pagecache_pages(void); + +/* linux/mm/swap.c */ +extern void FASTCALL(lru_cache_add(struct page *)); +extern void FASTCALL(lru_cache_add_active(struct page *)); +extern void FASTCALL(lru_cache_add_tail(struct page *)); +extern void FASTCALL(activate_page(struct page *)); +extern void FASTCALL(mark_page_accessed(struct page *)); +extern void lru_add_drain(void); +extern int lru_add_drain_all(void); +extern int rotate_reclaimable_page(struct page *page); +extern void swap_setup(void); + +/* linux/mm/vmscan.c */ +extern unsigned long try_to_free_pages(struct zone **, gfp_t); +extern unsigned long shrink_all_memory(unsigned long nr_pages); +extern int vm_swappiness; + +#ifdef CONFIG_NUMA +extern int zone_reclaim_mode; +extern int zone_reclaim_interval; +extern int zone_reclaim(struct zone *, gfp_t, unsigned int); +#else +#define zone_reclaim_mode 0 +static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) +{ + return 0; +} +#endif + +#ifdef CONFIG_MIGRATION +extern int isolate_lru_page(struct page *p); +extern unsigned long putback_lru_pages(struct list_head *l); +extern int migrate_page(struct page *, struct page *); +extern void migrate_page_copy(struct page *, struct page *); +extern int migrate_page_remove_references(struct page *, struct page *, int); +extern unsigned long migrate_pages(struct list_head *l, struct list_head *t, + struct list_head *moved, struct list_head *failed); +extern int fail_migrate_page(struct page *, struct page *); +#else +static inline int isolate_lru_page(struct page *p) { return -ENOSYS; } +static inline int putback_lru_pages(struct list_head *l) { return 0; } +static inline int migrate_pages(struct list_head *l, struct list_head *t, + struct list_head *moved, struct list_head *failed) { return -ENOSYS; } +/* Possible settings for the migrate_page() method in address_operations */ +#define migrate_page NULL +#define fail_migrate_page NULL +#endif + +#ifdef CONFIG_MMU +/* linux/mm/shmem.c */ +extern int shmem_unuse(swp_entry_t entry, struct page *page); +#endif /* CONFIG_MMU */ + +extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *); + +#ifdef CONFIG_SWAP +/* linux/mm/page_io.c */ +extern int swap_readpage(struct file *, struct page *); +extern int swap_writepage(struct page *page, struct writeback_control *wbc); +extern int rw_swap_page_sync(int, swp_entry_t, struct page *); + +/* linux/mm/swap_state.c */ +extern struct address_space swapper_space; +#define total_swapcache_pages swapper_space.nrpages +extern void show_swap_cache_info(void); +extern int add_to_swap(struct page *, gfp_t); +extern void __delete_from_swap_cache(struct page *); +extern void delete_from_swap_cache(struct page *); +extern int move_to_swap_cache(struct page *, swp_entry_t); +extern int move_from_swap_cache(struct page *, unsigned long, + struct address_space *); +extern void free_page_and_swap_cache(struct page *); +extern void free_pages_and_swap_cache(struct page **, int); +extern struct page * lookup_swap_cache(swp_entry_t); +extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, + unsigned long addr); +extern int add_to_swap_cache(struct page *page, swp_entry_t entry); +/* linux/mm/swapfile.c */ +extern long total_swap_pages; +extern unsigned int nr_swapfiles; +extern void si_swapinfo(struct sysinfo *); +extern swp_entry_t get_swap_page(void); +extern swp_entry_t get_swap_page_of_type(int); +extern int swap_duplicate(swp_entry_t); +extern int valid_swaphandles(swp_entry_t, unsigned long *); +extern void swap_free(swp_entry_t); +extern void free_swap_and_cache(swp_entry_t); +extern int swap_type_of(dev_t); +extern unsigned int count_swap_pages(int, int); +extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t); +extern struct swap_info_struct *get_swap_info_struct(unsigned); +extern int can_share_swap_page(struct page *); +extern int remove_exclusive_swap_page(struct page *); +struct backing_dev_info; + +extern spinlock_t swap_lock; +extern int remove_vma_swap(struct vm_area_struct *vma, struct page *page); + +/* linux/mm/thrash.c */ +extern struct mm_struct * swap_token_mm; +extern unsigned long swap_token_default_timeout; +extern void grab_swap_token(void); +extern void __put_swap_token(struct mm_struct *); + +static inline int has_swap_token(struct mm_struct *mm) +{ + return (mm == swap_token_mm); +} + +static inline void put_swap_token(struct mm_struct *mm) +{ + if (has_swap_token(mm)) + __put_swap_token(mm); +} + +static inline void disable_swap_token(void) +{ + put_swap_token(swap_token_mm); +} + +#else /* CONFIG_SWAP */ + +#define total_swap_pages 0 +#define total_swapcache_pages 0UL + +#define si_swapinfo(val) \ + do { (val)->freeswap = (val)->totalswap = 0; } while (0) +/* only sparc can not include linux/pagemap.h in this file + * so leave page_cache_release and release_pages undeclared... */ +#define free_page_and_swap_cache(page) \ + page_cache_release(page) +#define free_pages_and_swap_cache(pages, nr) \ + release_pages((pages), (nr), 0); + +#define show_swap_cache_info() /*NOTHING*/ +#define free_swap_and_cache(swp) /*NOTHING*/ +#define swap_duplicate(swp) /*NOTHING*/ +#define swap_free(swp) /*NOTHING*/ +#define read_swap_cache_async(swp,vma,addr) NULL +#define lookup_swap_cache(swp) NULL +#define valid_swaphandles(swp, off) 0 +#define can_share_swap_page(p) 0 +#define move_to_swap_cache(p, swp) 1 +#define move_from_swap_cache(p, i, m) 1 +#define __delete_from_swap_cache(p) /*NOTHING*/ +#define delete_from_swap_cache(p) /*NOTHING*/ +#define swap_token_default_timeout 0 + +static inline int remove_exclusive_swap_page(struct page *p) +{ + return 0; +} + +static inline swp_entry_t get_swap_page(void) +{ + swp_entry_t entry; + entry.val = 0; + return entry; +} + +/* linux/mm/thrash.c */ +#define put_swap_token(x) do { } while(0) +#define grab_swap_token() do { } while(0) +#define has_swap_token(x) 0 +#define disable_swap_token() do { } while(0) + +#endif /* CONFIG_SWAP */ +#endif /* __KERNEL__*/ +#endif /* _LINUX_SWAP_H */ diff -urN oldtree/include/linux/sysctl.h newtree/include/linux/sysctl.h --- oldtree/include/linux/sysctl.h 2006-03-08 18:48:02.360026500 +0000 +++ newtree/include/linux/sysctl.h 2006-03-08 20:52:42.011475500 +0000 @@ -172,7 +172,7 @@ VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */ VM_PAGEBUF=17, /* struct: Control pagebuf parameters */ VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */ - VM_SWAPPINESS=19, /* Tendency to steal mapped memory */ + VM_MAPPED=19, /* percent mapped min while evicting cache */ VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */ VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ @@ -187,6 +187,7 @@ VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */ VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */ VM_SWAP_PREFETCH=33, /* swap prefetch */ + VM_HARDMAPLIMIT=34, /* Make mapped a hard limit */ }; diff -urN oldtree/include/linux/sysctl.h.orig newtree/include/linux/sysctl.h.orig --- oldtree/include/linux/sysctl.h.orig 1970-01-01 00:00:00.000000000 +0000 +++ newtree/include/linux/sysctl.h.orig 2006-03-08 18:48:02.000000000 +0000 @@ -0,0 +1,1024 @@ +/* + * sysctl.h: General linux system control interface + * + * Begun 24 March 1995, Stephen Tweedie + * + **************************************************************** + **************************************************************** + ** + ** The values in this file are exported to user space via + ** the sysctl() binary interface. However this interface + ** is unstable and deprecated and will be removed in the future. + ** For a stable interface use /proc/sys. + ** + **************************************************************** + **************************************************************** + */ + +#ifndef _LINUX_SYSCTL_H +#define _LINUX_SYSCTL_H + +#include +#include +#include + +struct file; +struct completion; + +#define CTL_MAXNAME 10 /* how many path components do we allow in a + call to sysctl? In other words, what is + the largest acceptable value for the nlen + member of a struct __sysctl_args to have? */ + +struct __sysctl_args { + int __user *name; + int nlen; + void __user *oldval; + size_t __user *oldlenp; + void __user *newval; + size_t newlen; + unsigned long __unused[4]; +}; + +/* Define sysctl names first */ + +/* Top-level names: */ + +/* For internal pattern-matching use only: */ +#ifdef __KERNEL__ +#define CTL_ANY -1 /* Matches any name */ +#define CTL_NONE 0 +#endif + +enum +{ + CTL_KERN=1, /* General kernel info and control */ + CTL_VM=2, /* VM management */ + CTL_NET=3, /* Networking */ + CTL_PROC=4, /* Process info */ + CTL_FS=5, /* Filesystems */ + CTL_DEBUG=6, /* Debugging */ + CTL_DEV=7, /* Devices */ + CTL_BUS=8, /* Busses */ + CTL_ABI=9, /* Binary emulation */ + CTL_CPU=10 /* CPU stuff (speed scaling, etc) */ +}; + +/* CTL_BUS names: */ +enum +{ + CTL_BUS_ISA=1 /* ISA */ +}; + +/* /proc/sys/fs/inotify/ */ +enum +{ + INOTIFY_MAX_USER_INSTANCES=1, /* max instances per user */ + INOTIFY_MAX_USER_WATCHES=2, /* max watches per user */ + INOTIFY_MAX_QUEUED_EVENTS=3 /* max queued events per instance */ +}; + +/* CTL_KERN names: */ +enum +{ + KERN_OSTYPE=1, /* string: system version */ + KERN_OSRELEASE=2, /* string: system release */ + KERN_OSREV=3, /* int: system revision */ + KERN_VERSION=4, /* string: compile time info */ + KERN_SECUREMASK=5, /* struct: maximum rights mask */ + KERN_PROF=6, /* table: profiling information */ + KERN_NODENAME=7, + KERN_DOMAINNAME=8, + + KERN_CAP_BSET=14, /* int: capability bounding set */ + KERN_PANIC=15, /* int: panic timeout */ + KERN_REALROOTDEV=16, /* real root device to mount after initrd */ + + KERN_SPARC_REBOOT=21, /* reboot command on Sparc */ + KERN_CTLALTDEL=22, /* int: allow ctl-alt-del to reboot */ + KERN_PRINTK=23, /* struct: control printk logging parameters */ + KERN_NAMETRANS=24, /* Name translation */ + KERN_PPC_HTABRECLAIM=25, /* turn htab reclaimation on/off on PPC */ + KERN_PPC_ZEROPAGED=26, /* turn idle page zeroing on/off on PPC */ + KERN_PPC_POWERSAVE_NAP=27, /* use nap mode for power saving */ + KERN_MODPROBE=28, + KERN_SG_BIG_BUFF=29, + KERN_ACCT=30, /* BSD process accounting parameters */ + KERN_PPC_L2CR=31, /* l2cr register on PPC */ + + KERN_RTSIGNR=32, /* Number of rt sigs queued */ + KERN_RTSIGMAX=33, /* Max queuable */ + + KERN_SHMMAX=34, /* long: Maximum shared memory segment */ + KERN_MSGMAX=35, /* int: Maximum size of a messege */ + KERN_MSGMNB=36, /* int: Maximum message queue size */ + KERN_MSGPOOL=37, /* int: Maximum system message pool size */ + KERN_SYSRQ=38, /* int: Sysreq enable */ + KERN_MAX_THREADS=39, /* int: Maximum nr of threads in the system */ + KERN_RANDOM=40, /* Random driver */ + KERN_SHMALL=41, /* int: Maximum size of shared memory */ + KERN_MSGMNI=42, /* int: msg queue identifiers */ + KERN_SEM=43, /* struct: sysv semaphore limits */ + KERN_SPARC_STOP_A=44, /* int: Sparc Stop-A enable */ + KERN_SHMMNI=45, /* int: shm array identifiers */ + KERN_OVERFLOWUID=46, /* int: overflow UID */ + KERN_OVERFLOWGID=47, /* int: overflow GID */ + KERN_SHMPATH=48, /* string: path to shm fs */ + KERN_HOTPLUG=49, /* string: path to uevent helper (deprecated) */ + KERN_IEEE_EMULATION_WARNINGS=50, /* int: unimplemented ieee instructions */ + KERN_S390_USER_DEBUG_LOGGING=51, /* int: dumps of user faults */ + KERN_CORE_USES_PID=52, /* int: use core or core.%pid */ + KERN_TAINTED=53, /* int: various kernel tainted flags */ + KERN_CADPID=54, /* int: PID of the process to notify on CAD */ + KERN_PIDMAX=55, /* int: PID # limit */ + KERN_CORE_PATTERN=56, /* string: pattern for core-file names */ + KERN_PANIC_ON_OOPS=57, /* int: whether we will panic on an oops */ + KERN_HPPA_PWRSW=58, /* int: hppa soft-power enable */ + KERN_HPPA_UNALIGNED=59, /* int: hppa unaligned-trap enable */ + KERN_PRINTK_RATELIMIT=60, /* int: tune printk ratelimiting */ + KERN_PRINTK_RATELIMIT_BURST=61, /* int: tune printk ratelimiting */ + KERN_PTY=62, /* dir: pty driver */ + KERN_NGROUPS_MAX=63, /* int: NGROUPS_MAX */ + KERN_SPARC_SCONS_PWROFF=64, /* int: serial console power-off halt */ + KERN_HZ_TIMER=65, /* int: hz timer on or off */ + KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */ + KERN_BOOTLOADER_TYPE=67, /* int: boot loader type */ + KERN_RANDOMIZE=68, /* int: randomize virtual address space */ + KERN_SETUID_DUMPABLE=69, /* int: behaviour of dumps for setuid core */ + KERN_SPIN_RETRY=70, /* int: number of spinlock retries */ + KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */ + KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */ +}; + + +/* CTL_VM names: */ +enum +{ + VM_UNUSED1=1, /* was: struct: Set vm swapping control */ + VM_UNUSED2=2, /* was; int: Linear or sqrt() swapout for hogs */ + VM_UNUSED3=3, /* was: struct: Set free page thresholds */ + VM_UNUSED4=4, /* Spare */ + VM_OVERCOMMIT_MEMORY=5, /* Turn off the virtual memory safety limit */ + VM_UNUSED5=6, /* was: struct: Set buffer memory thresholds */ + VM_UNUSED7=7, /* was: struct: Set cache memory thresholds */ + VM_UNUSED8=8, /* was: struct: Control kswapd behaviour */ + VM_UNUSED9=9, /* was: struct: Set page table cache parameters */ + VM_PAGE_CLUSTER=10, /* int: set number of pages to swap together */ + VM_DIRTY_BACKGROUND=11, /* dirty_background_ratio */ + VM_DIRTY_RATIO=12, /* dirty_ratio */ + VM_DIRTY_WB_CS=13, /* dirty_writeback_centisecs */ + VM_DIRTY_EXPIRE_CS=14, /* dirty_expire_centisecs */ + VM_NR_PDFLUSH_THREADS=15, /* nr_pdflush_threads */ + VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */ + VM_PAGEBUF=17, /* struct: Control pagebuf parameters */ + VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */ + VM_SWAPPINESS=19, /* Tendency to steal mapped memory */ + VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */ + VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ + VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ + VM_LAPTOP_MODE=23, /* vm laptop mode */ + VM_BLOCK_DUMP=24, /* block dump mode */ + VM_HUGETLB_GROUP=25, /* permitted hugetlb group */ + VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */ + VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */ + VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */ + VM_DROP_PAGECACHE=29, /* int: nuke lots of pagecache */ + VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */ + VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */ + VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */ + VM_SWAP_PREFETCH=33, /* swap prefetch */ +}; + + +/* CTL_NET names: */ +enum +{ + NET_CORE=1, + NET_ETHER=2, + NET_802=3, + NET_UNIX=4, + NET_IPV4=5, + NET_IPX=6, + NET_ATALK=7, + NET_NETROM=8, + NET_AX25=9, + NET_BRIDGE=10, + NET_ROSE=11, + NET_IPV6=12, + NET_X25=13, + NET_TR=14, + NET_DECNET=15, + NET_ECONET=16, + NET_SCTP=17, + NET_LLC=18, + NET_NETFILTER=19, + NET_DCCP=20, +}; + +/* /proc/sys/kernel/random */ +enum +{ + RANDOM_POOLSIZE=1, + RANDOM_ENTROPY_COUNT=2, + RANDOM_READ_THRESH=3, + RANDOM_WRITE_THRESH=4, + RANDOM_BOOT_ID=5, + RANDOM_UUID=6 +}; + +/* /proc/sys/kernel/pty */ +enum +{ + PTY_MAX=1, + PTY_NR=2 +}; + +/* /proc/sys/bus/isa */ +enum +{ + BUS_ISA_MEM_BASE=1, + BUS_ISA_PORT_BASE=2, + BUS_ISA_PORT_SHIFT=3 +}; + +/* /proc/sys/net/core */ +enum +{ + NET_CORE_WMEM_MAX=1, + NET_CORE_RMEM_MAX=2, + NET_CORE_WMEM_DEFAULT=3, + NET_CORE_RMEM_DEFAULT=4, +/* was NET_CORE_DESTROY_DELAY */ + NET_CORE_MAX_BACKLOG=6, + NET_CORE_FASTROUTE=7, + NET_CORE_MSG_COST=8, + NET_CORE_MSG_BURST=9, + NET_CORE_OPTMEM_MAX=10, + NET_CORE_HOT_LIST_LENGTH=11, + NET_CORE_DIVERT_VERSION=12, + NET_CORE_NO_CONG_THRESH=13, + NET_CORE_NO_CONG=14, + NET_CORE_LO_CONG=15, + NET_CORE_MOD_CONG=16, + NET_CORE_DEV_WEIGHT=17, + NET_CORE_SOMAXCONN=18, + NET_CORE_BUDGET=19, + NET_CORE_AEVENT_ETIME=20, + NET_CORE_AEVENT_RSEQTH=21, +}; + +/* /proc/sys/net/ethernet */ + +/* /proc/sys/net/802 */ + +/* /proc/sys/net/unix */ + +enum +{ + NET_UNIX_DESTROY_DELAY=1, + NET_UNIX_DELETE_DELAY=2, + NET_UNIX_MAX_DGRAM_QLEN=3, +}; + +/* /proc/sys/net/netfilter */ +enum +{ + NET_NF_CONNTRACK_MAX=1, + NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT=2, + NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV=3, + NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED=4, + NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT=5, + NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT=6, + NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK=7, + NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT=8, + NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE=9, + NET_NF_CONNTRACK_UDP_TIMEOUT=10, + NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM=11, + NET_NF_CONNTRACK_ICMP_TIMEOUT=12, + NET_NF_CONNTRACK_GENERIC_TIMEOUT=13, + NET_NF_CONNTRACK_BUCKETS=14, + NET_NF_CONNTRACK_LOG_INVALID=15, + NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS=16, + NET_NF_CONNTRACK_TCP_LOOSE=17, + NET_NF_CONNTRACK_TCP_BE_LIBERAL=18, + NET_NF_CONNTRACK_TCP_MAX_RETRANS=19, + NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED=20, + NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT=21, + NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED=22, + NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED=23, + NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT=24, + NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD=25, + NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT=26, + NET_NF_CONNTRACK_COUNT=27, + NET_NF_CONNTRACK_ICMPV6_TIMEOUT=28, + NET_NF_CONNTRACK_FRAG6_TIMEOUT=29, + NET_NF_CONNTRACK_FRAG6_LOW_THRESH=30, + NET_NF_CONNTRACK_FRAG6_HIGH_THRESH=31, +}; + +/* /proc/sys/net/ipv4 */ +enum +{ + /* v2.0 compatibile variables */ + NET_IPV4_FORWARD=8, + NET_IPV4_DYNADDR=9, + + NET_IPV4_CONF=16, + NET_IPV4_NEIGH=17, + NET_IPV4_ROUTE=18, + NET_IPV4_FIB_HASH=19, + NET_IPV4_NETFILTER=20, + + NET_IPV4_TCP_TIMESTAMPS=33, + NET_IPV4_TCP_WINDOW_SCALING=34, + NET_IPV4_TCP_SACK=35, + NET_IPV4_TCP_RETRANS_COLLAPSE=36, + NET_IPV4_DEFAULT_TTL=37, + NET_IPV4_AUTOCONFIG=38, + NET_IPV4_NO_PMTU_DISC=39, + NET_IPV4_TCP_SYN_RETRIES=40, + NET_IPV4_IPFRAG_HIGH_THRESH=41, + NET_IPV4_IPFRAG_LOW_THRESH=42, + NET_IPV4_IPFRAG_TIME=43, + NET_IPV4_TCP_MAX_KA_PROBES=44, + NET_IPV4_TCP_KEEPALIVE_TIME=45, + NET_IPV4_TCP_KEEPALIVE_PROBES=46, + NET_IPV4_TCP_RETRIES1=47, + NET_IPV4_TCP_RETRIES2=48, + NET_IPV4_TCP_FIN_TIMEOUT=49, + NET_IPV4_IP_MASQ_DEBUG=50, + NET_TCP_SYNCOOKIES=51, + NET_TCP_STDURG=52, + NET_TCP_RFC1337=53, + NET_TCP_SYN_TAILDROP=54, + NET_TCP_MAX_SYN_BACKLOG=55, + NET_IPV4_LOCAL_PORT_RANGE=56, + NET_IPV4_ICMP_ECHO_IGNORE_ALL=57, + NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS=58, + NET_IPV4_ICMP_SOURCEQUENCH_RATE=59, + NET_IPV4_ICMP_DESTUNREACH_RATE=60, + NET_IPV4_ICMP_TIMEEXCEED_RATE=61, + NET_IPV4_ICMP_PARAMPROB_RATE=62, + NET_IPV4_ICMP_ECHOREPLY_RATE=63, + NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES=64, + NET_IPV4_IGMP_MAX_MEMBERSHIPS=65, + NET_TCP_TW_RECYCLE=66, + NET_IPV4_ALWAYS_DEFRAG=67, + NET_IPV4_TCP_KEEPALIVE_INTVL=68, + NET_IPV4_INET_PEER_THRESHOLD=69, + NET_IPV4_INET_PEER_MINTTL=70, + NET_IPV4_INET_PEER_MAXTTL=71, + NET_IPV4_INET_PEER_GC_MINTIME=72, + NET_IPV4_INET_PEER_GC_MAXTIME=73, + NET_TCP_ORPHAN_RETRIES=74, + NET_TCP_ABORT_ON_OVERFLOW=75, + NET_TCP_SYNACK_RETRIES=76, + NET_TCP_MAX_ORPHANS=77, + NET_TCP_MAX_TW_BUCKETS=78, + NET_TCP_FACK=79, + NET_TCP_REORDERING=80, + NET_TCP_ECN=81, + NET_TCP_DSACK=82, + NET_TCP_MEM=83, + NET_TCP_WMEM=84, + NET_TCP_RMEM=85, + NET_TCP_APP_WIN=86, + NET_TCP_ADV_WIN_SCALE=87, + NET_IPV4_NONLOCAL_BIND=88, + NET_IPV4_ICMP_RATELIMIT=89, + NET_IPV4_ICMP_RATEMASK=90, + NET_TCP_TW_REUSE=91, + NET_TCP_FRTO=92, + NET_TCP_LOW_LATENCY=93, + NET_IPV4_IPFRAG_SECRET_INTERVAL=94, + NET_IPV4_IGMP_MAX_MSF=96, + NET_TCP_NO_METRICS_SAVE=97, + NET_TCP_DEFAULT_WIN_SCALE=105, + NET_TCP_MODERATE_RCVBUF=106, + NET_TCP_TSO_WIN_DIVISOR=107, + NET_TCP_BIC_BETA=108, + NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, + NET_TCP_CONG_CONTROL=110, + NET_TCP_ABC=111, + NET_IPV4_IPFRAG_MAX_DIST=112, + NET_TCP_MTU_PROBING=113, + NET_TCP_BASE_MSS=114, +}; + +enum { + NET_IPV4_ROUTE_FLUSH=1, + NET_IPV4_ROUTE_MIN_DELAY=2, + NET_IPV4_ROUTE_MAX_DELAY=3, + NET_IPV4_ROUTE_GC_THRESH=4, + NET_IPV4_ROUTE_MAX_SIZE=5, + NET_IPV4_ROUTE_GC_MIN_INTERVAL=6, + NET_IPV4_ROUTE_GC_TIMEOUT=7, + NET_IPV4_ROUTE_GC_INTERVAL=8, + NET_IPV4_ROUTE_REDIRECT_LOAD=9, + NET_IPV4_ROUTE_REDIRECT_NUMBER=10, + NET_IPV4_ROUTE_REDIRECT_SILENCE=11, + NET_IPV4_ROUTE_ERROR_COST=12, + NET_IPV4_ROUTE_ERROR_BURST=13, + NET_IPV4_ROUTE_GC_ELASTICITY=14, + NET_IPV4_ROUTE_MTU_EXPIRES=15, + NET_IPV4_ROUTE_MIN_PMTU=16, + NET_IPV4_ROUTE_MIN_ADVMSS=17, + NET_IPV4_ROUTE_SECRET_INTERVAL=18, + NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS=19, +}; + +enum +{ + NET_PROTO_CONF_ALL=-2, + NET_PROTO_CONF_DEFAULT=-3 + + /* And device ifindices ... */ +}; + +enum +{ + NET_IPV4_CONF_FORWARDING=1, + NET_IPV4_CONF_MC_FORWARDING=2, + NET_IPV4_CONF_PROXY_ARP=3, + NET_IPV4_CONF_ACCEPT_REDIRECTS=4, + NET_IPV4_CONF_SECURE_REDIRECTS=5, + NET_IPV4_CONF_SEND_REDIRECTS=6, + NET_IPV4_CONF_SHARED_MEDIA=7, + NET_IPV4_CONF_RP_FILTER=8, + NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE=9, + NET_IPV4_CONF_BOOTP_RELAY=10, + NET_IPV4_CONF_LOG_MARTIANS=11, + NET_IPV4_CONF_TAG=12, + NET_IPV4_CONF_ARPFILTER=13, + NET_IPV4_CONF_MEDIUM_ID=14, + NET_IPV4_CONF_NOXFRM=15, + NET_IPV4_CONF_NOPOLICY=16, + NET_IPV4_CONF_FORCE_IGMP_VERSION=17, + NET_IPV4_CONF_ARP_ANNOUNCE=18, + NET_IPV4_CONF_ARP_IGNORE=19, + NET_IPV4_CONF_PROMOTE_SECONDARIES=20, + __NET_IPV4_CONF_MAX +}; + +/* /proc/sys/net/ipv4/netfilter */ +enum +{ + NET_IPV4_NF_CONNTRACK_MAX=1, + NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT=2, + NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV=3, + NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED=4, + NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT=5, + NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT=6, + NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK=7, + NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT=8, + NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE=9, + NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT=10, + NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM=11, + NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT=12, + NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT=13, + NET_IPV4_NF_CONNTRACK_BUCKETS=14, + NET_IPV4_NF_CONNTRACK_LOG_INVALID=15, + NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS=16, + NET_IPV4_NF_CONNTRACK_TCP_LOOSE=17, + NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL=18, + NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS=19, + NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED=20, + NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT=21, + NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED=22, + NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED=23, + NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT=24, + NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD=25, + NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT=26, + NET_IPV4_NF_CONNTRACK_COUNT=27, +}; + +/* /proc/sys/net/ipv6 */ +enum { + NET_IPV6_CONF=16, + NET_IPV6_NEIGH=17, + NET_IPV6_ROUTE=18, + NET_IPV6_ICMP=19, + NET_IPV6_BINDV6ONLY=20, + NET_IPV6_IP6FRAG_HIGH_THRESH=21, + NET_IPV6_IP6FRAG_LOW_THRESH=22, + NET_IPV6_IP6FRAG_TIME=23, + NET_IPV6_IP6FRAG_SECRET_INTERVAL=24, + NET_IPV6_MLD_MAX_MSF=25, +}; + +enum { + NET_IPV6_ROUTE_FLUSH=1, + NET_IPV6_ROUTE_GC_THRESH=2, + NET_IPV6_ROUTE_MAX_SIZE=3, + NET_IPV6_ROUTE_GC_MIN_INTERVAL=4, + NET_IPV6_ROUTE_GC_TIMEOUT=5, + NET_IPV6_ROUTE_GC_INTERVAL=6, + NET_IPV6_ROUTE_GC_ELASTICITY=7, + NET_IPV6_ROUTE_MTU_EXPIRES=8, + NET_IPV6_ROUTE_MIN_ADVMSS=9, + NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS=10 +}; + +enum { + NET_IPV6_FORWARDING=1, + NET_IPV6_HOP_LIMIT=2, + NET_IPV6_MTU=3, + NET_IPV6_ACCEPT_RA=4, + NET_IPV6_ACCEPT_REDIRECTS=5, + NET_IPV6_AUTOCONF=6, + NET_IPV6_DAD_TRANSMITS=7, + NET_IPV6_RTR_SOLICITS=8, + NET_IPV6_RTR_SOLICIT_INTERVAL=9, + NET_IPV6_RTR_SOLICIT_DELAY=10, + NET_IPV6_USE_TEMPADDR=11, + NET_IPV6_TEMP_VALID_LFT=12, + NET_IPV6_TEMP_PREFERED_LFT=13, + NET_IPV6_REGEN_MAX_RETRY=14, + NET_IPV6_MAX_DESYNC_FACTOR=15, + NET_IPV6_MAX_ADDRESSES=16, + NET_IPV6_FORCE_MLD_VERSION=17, + NET_IPV6_ACCEPT_RA_DEFRTR=18, + NET_IPV6_ACCEPT_RA_PINFO=19, + NET_IPV6_ACCEPT_RA_RTR_PREF=20, + NET_IPV6_RTR_PROBE_INTERVAL=21, + NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN=22, + __NET_IPV6_MAX +}; + +/* /proc/sys/net/ipv6/icmp */ +enum { + NET_IPV6_ICMP_RATELIMIT=1 +}; + +/* /proc/sys/net//neigh/ */ +enum { + NET_NEIGH_MCAST_SOLICIT=1, + NET_NEIGH_UCAST_SOLICIT=2, + NET_NEIGH_APP_SOLICIT=3, + NET_NEIGH_RETRANS_TIME=4, + NET_NEIGH_REACHABLE_TIME=5, + NET_NEIGH_DELAY_PROBE_TIME=6, + NET_NEIGH_GC_STALE_TIME=7, + NET_NEIGH_UNRES_QLEN=8, + NET_NEIGH_PROXY_QLEN=9, + NET_NEIGH_ANYCAST_DELAY=10, + NET_NEIGH_PROXY_DELAY=11, + NET_NEIGH_LOCKTIME=12, + NET_NEIGH_GC_INTERVAL=13, + NET_NEIGH_GC_THRESH1=14, + NET_NEIGH_GC_THRESH2=15, + NET_NEIGH_GC_THRESH3=16, + NET_NEIGH_RETRANS_TIME_MS=17, + NET_NEIGH_REACHABLE_TIME_MS=18, + __NET_NEIGH_MAX +}; + +/* /proc/sys/net/dccp */ +enum { + NET_DCCP_DEFAULT=1, +}; + +/* /proc/sys/net/dccp/default */ +enum { + NET_DCCP_DEFAULT_SEQ_WINDOW = 1, + NET_DCCP_DEFAULT_RX_CCID = 2, + NET_DCCP_DEFAULT_TX_CCID = 3, + NET_DCCP_DEFAULT_ACK_RATIO = 4, + NET_DCCP_DEFAULT_SEND_ACKVEC = 5, + NET_DCCP_DEFAULT_SEND_NDP = 6, +}; + +/* /proc/sys/net/ipx */ +enum { + NET_IPX_PPROP_BROADCASTING=1, + NET_IPX_FORWARDING=2 +}; + +/* /proc/sys/net/llc */ +enum { + NET_LLC2=1, + NET_LLC_STATION=2, +}; + +/* /proc/sys/net/llc/llc2 */ +enum { + NET_LLC2_TIMEOUT=1, +}; + +/* /proc/sys/net/llc/station */ +enum { + NET_LLC_STATION_ACK_TIMEOUT=1, +}; + +/* /proc/sys/net/llc/llc2/timeout */ +enum { + NET_LLC2_ACK_TIMEOUT=1, + NET_LLC2_P_TIMEOUT=2, + NET_LLC2_REJ_TIMEOUT=3, + NET_LLC2_BUSY_TIMEOUT=4, +}; + +/* /proc/sys/net/appletalk */ +enum { + NET_ATALK_AARP_EXPIRY_TIME=1, + NET_ATALK_AARP_TICK_TIME=2, + NET_ATALK_AARP_RETRANSMIT_LIMIT=3, + NET_ATALK_AARP_RESOLVE_TIME=4 +}; + + +/* /proc/sys/net/netrom */ +enum { + NET_NETROM_DEFAULT_PATH_QUALITY=1, + NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER=2, + NET_NETROM_NETWORK_TTL_INITIALISER=3, + NET_NETROM_TRANSPORT_TIMEOUT=4, + NET_NETROM_TRANSPORT_MAXIMUM_TRIES=5, + NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY=6, + NET_NETROM_TRANSPORT_BUSY_DELAY=7, + NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE=8, + NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT=9, + NET_NETROM_ROUTING_CONTROL=10, + NET_NETROM_LINK_FAILS_COUNT=11, + NET_NETROM_RESET=12 +}; + +/* /proc/sys/net/ax25 */ +enum { + NET_AX25_IP_DEFAULT_MODE=1, + NET_AX25_DEFAULT_MODE=2, + NET_AX25_BACKOFF_TYPE=3, + NET_AX25_CONNECT_MODE=4, + NET_AX25_STANDARD_WINDOW=5, + NET_AX25_EXTENDED_WINDOW=6, + NET_AX25_T1_TIMEOUT=7, + NET_AX25_T2_TIMEOUT=8, + NET_AX25_T3_TIMEOUT=9, + NET_AX25_IDLE_TIMEOUT=10, + NET_AX25_N2=11, + NET_AX25_PACLEN=12, + NET_AX25_PROTOCOL=13, + NET_AX25_DAMA_SLAVE_TIMEOUT=14 +}; + +/* /proc/sys/net/rose */ +enum { + NET_ROSE_RESTART_REQUEST_TIMEOUT=1, + NET_ROSE_CALL_REQUEST_TIMEOUT=2, + NET_ROSE_RESET_REQUEST_TIMEOUT=3, + NET_ROSE_CLEAR_REQUEST_TIMEOUT=4, + NET_ROSE_ACK_HOLD_BACK_TIMEOUT=5, + NET_ROSE_ROUTING_CONTROL=6, + NET_ROSE_LINK_FAIL_TIMEOUT=7, + NET_ROSE_MAX_VCS=8, + NET_ROSE_WINDOW_SIZE=9, + NET_ROSE_NO_ACTIVITY_TIMEOUT=10 +}; + +/* /proc/sys/net/x25 */ +enum { + NET_X25_RESTART_REQUEST_TIMEOUT=1, + NET_X25_CALL_REQUEST_TIMEOUT=2, + NET_X25_RESET_REQUEST_TIMEOUT=3, + NET_X25_CLEAR_REQUEST_TIMEOUT=4, + NET_X25_ACK_HOLD_BACK_TIMEOUT=5 +}; + +/* /proc/sys/net/token-ring */ +enum +{ + NET_TR_RIF_TIMEOUT=1 +}; + +/* /proc/sys/net/decnet/ */ +enum { + NET_DECNET_NODE_TYPE = 1, + NET_DECNET_NODE_ADDRESS = 2, + NET_DECNET_NODE_NAME = 3, + NET_DECNET_DEFAULT_DEVICE = 4, + NET_DECNET_TIME_WAIT = 5, + NET_DECNET_DN_COUNT = 6, + NET_DECNET_DI_COUNT = 7, + NET_DECNET_DR_COUNT = 8, + NET_DECNET_DST_GC_INTERVAL = 9, + NET_DECNET_CONF = 10, + NET_DECNET_NO_FC_MAX_CWND = 11, + NET_DECNET_MEM = 12, + NET_DECNET_RMEM = 13, + NET_DECNET_WMEM = 14, + NET_DECNET_DEBUG_LEVEL = 255 +}; + +/* /proc/sys/net/decnet/conf/ */ +enum { + NET_DECNET_CONF_LOOPBACK = -2, + NET_DECNET_CONF_DDCMP = -3, + NET_DECNET_CONF_PPP = -4, + NET_DECNET_CONF_X25 = -5, + NET_DECNET_CONF_GRE = -6, + NET_DECNET_CONF_ETHER = -7 + + /* ... and ifindex of devices */ +}; + +/* /proc/sys/net/decnet/conf// */ +enum { + NET_DECNET_CONF_DEV_PRIORITY = 1, + NET_DECNET_CONF_DEV_T1 = 2, + NET_DECNET_CONF_DEV_T2 = 3, + NET_DECNET_CONF_DEV_T3 = 4, + NET_DECNET_CONF_DEV_FORWARDING = 5, + NET_DECNET_CONF_DEV_BLKSIZE = 6, + NET_DECNET_CONF_DEV_STATE = 7 +}; + +/* /proc/sys/net/sctp */ +enum { + NET_SCTP_RTO_INITIAL = 1, + NET_SCTP_RTO_MIN = 2, + NET_SCTP_RTO_MAX = 3, + NET_SCTP_RTO_ALPHA = 4, + NET_SCTP_RTO_BETA = 5, + NET_SCTP_VALID_COOKIE_LIFE = 6, + NET_SCTP_ASSOCIATION_MAX_RETRANS = 7, + NET_SCTP_PATH_MAX_RETRANS = 8, + NET_SCTP_MAX_INIT_RETRANSMITS = 9, + NET_SCTP_HB_INTERVAL = 10, + NET_SCTP_PRESERVE_ENABLE = 11, + NET_SCTP_MAX_BURST = 12, + NET_SCTP_ADDIP_ENABLE = 13, + NET_SCTP_PRSCTP_ENABLE = 14, + NET_SCTP_SNDBUF_POLICY = 15, + NET_SCTP_SACK_TIMEOUT = 16, + NET_SCTP_RCVBUF_POLICY = 17, +}; + +/* /proc/sys/net/bridge */ +enum { + NET_BRIDGE_NF_CALL_ARPTABLES = 1, + NET_BRIDGE_NF_CALL_IPTABLES = 2, + NET_BRIDGE_NF_CALL_IP6TABLES = 3, + NET_BRIDGE_NF_FILTER_VLAN_TAGGED = 4, +}; + +/* CTL_PROC names: */ + +/* CTL_FS names: */ +enum +{ + FS_NRINODE=1, /* int:current number of allocated inodes */ + FS_STATINODE=2, + FS_MAXINODE=3, /* int:maximum number of inodes that can be allocated */ + FS_NRDQUOT=4, /* int:current number of allocated dquots */ + FS_MAXDQUOT=5, /* int:maximum number of dquots that can be allocated */ + FS_NRFILE=6, /* int:current number of allocated filedescriptors */ + FS_MAXFILE=7, /* int:maximum number of filedescriptors that can be allocated */ + FS_DENTRY=8, + FS_NRSUPER=9, /* int:current number of allocated super_blocks */ + FS_MAXSUPER=10, /* int:maximum number of super_blocks that can be allocated */ + FS_OVERFLOWUID=11, /* int: overflow UID */ + FS_OVERFLOWGID=12, /* int: overflow GID */ + FS_LEASES=13, /* int: leases enabled */ + FS_DIR_NOTIFY=14, /* int: directory notification enabled */ + FS_LEASE_TIME=15, /* int: maximum time to wait for a lease break */ + FS_DQSTATS=16, /* disc quota usage statistics and control */ + FS_XFS=17, /* struct: control xfs parameters */ + FS_AIO_NR=18, /* current system-wide number of aio requests */ + FS_AIO_MAX_NR=19, /* system-wide maximum number of aio requests */ + FS_INOTIFY=20, /* inotify submenu */ +}; + +/* /proc/sys/fs/quota/ */ +enum { + FS_DQ_LOOKUPS = 1, + FS_DQ_DROPS = 2, + FS_DQ_READS = 3, + FS_DQ_WRITES = 4, + FS_DQ_CACHE_HITS = 5, + FS_DQ_ALLOCATED = 6, + FS_DQ_FREE = 7, + FS_DQ_SYNCS = 8, + FS_DQ_WARNINGS = 9, +}; + +/* CTL_DEBUG names: */ + +/* CTL_DEV names: */ +enum { + DEV_CDROM=1, + DEV_HWMON=2, + DEV_PARPORT=3, + DEV_RAID=4, + DEV_MAC_HID=5, + DEV_SCSI=6, + DEV_IPMI=7, + DEV_ADBHID=8, +}; + +/* /proc/sys/dev/cdrom */ +enum { + DEV_CDROM_INFO=1, + DEV_CDROM_AUTOCLOSE=2, + DEV_CDROM_AUTOEJECT=3, + DEV_CDROM_DEBUG=4, + DEV_CDROM_LOCK=5, + DEV_CDROM_CHECK_MEDIA=6 +}; + +/* /proc/sys/dev/parport */ +enum { + DEV_PARPORT_DEFAULT=-3 +}; + +/* /proc/sys/dev/raid */ +enum { + DEV_RAID_SPEED_LIMIT_MIN=1, + DEV_RAID_SPEED_LIMIT_MAX=2 +}; + +/* /proc/sys/dev/parport/default */ +enum { + DEV_PARPORT_DEFAULT_TIMESLICE=1, + DEV_PARPORT_DEFAULT_SPINTIME=2 +}; + +/* /proc/sys/dev/parport/parport n */ +enum { + DEV_PARPORT_SPINTIME=1, + DEV_PARPORT_BASE_ADDR=2, + DEV_PARPORT_IRQ=3, + DEV_PARPORT_DMA=4, + DEV_PARPORT_MODES=5, + DEV_PARPORT_DEVICES=6, + DEV_PARPORT_AUTOPROBE=16 +}; + +/* /proc/sys/dev/parport/parport n/devices/ */ +enum { + DEV_PARPORT_DEVICES_ACTIVE=-3, +}; + +/* /proc/sys/dev/parport/parport n/devices/device n */ +enum { + DEV_PARPORT_DEVICE_TIMESLICE=1, +}; + +/* /proc/sys/dev/mac_hid */ +enum { + DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES=1, + DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES=2, + DEV_MAC_HID_MOUSE_BUTTON_EMULATION=3, + DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE=4, + DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE=5, + DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES=6 +}; + +/* /proc/sys/dev/scsi */ +enum { + DEV_SCSI_LOGGING_LEVEL=1, +}; + +/* /proc/sys/dev/ipmi */ +enum { + DEV_IPMI_POWEROFF_POWERCYCLE=1, +}; + +/* /proc/sys/dev/adbhid */ +enum { + DEV_ADBHID_MANGLE_CAPS_LOCK_EVENTS=1, +}; + +/* /proc/sys/abi */ +enum +{ + ABI_DEFHANDLER_COFF=1, /* default handler for coff binaries */ + ABI_DEFHANDLER_ELF=2, /* default handler for ELF binaries */ + ABI_DEFHANDLER_LCALL7=3,/* default handler for procs using lcall7 */ + ABI_DEFHANDLER_LIBCSO=4,/* default handler for an libc.so ELF interp */ + ABI_TRACE=5, /* tracing flags */ + ABI_FAKE_UTSNAME=6, /* fake target utsname information */ +}; + +#ifdef __KERNEL__ +#include + +extern void sysctl_init(void); + +typedef struct ctl_table ctl_table; + +typedef int ctl_handler (ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context); + +typedef int proc_handler (ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos); + +extern int proc_dostring(ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +extern int proc_dointvec(ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +extern int proc_dointvec_bset(ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +extern int proc_dointvec_minmax(ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +extern int proc_dointvec_jiffies(ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +extern int proc_dointvec_userhz_jiffies(ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +extern int proc_dointvec_ms_jiffies(ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +extern int proc_doulongvec_minmax(ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +extern int proc_doulonglongvec_minmax(ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +extern int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int, + struct file *, void __user *, size_t *, loff_t *); + +extern int do_sysctl (int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen); + +extern int do_sysctl_strategy (ctl_table *table, + int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void ** context); + +extern ctl_handler sysctl_string; +extern ctl_handler sysctl_intvec; +extern ctl_handler sysctl_jiffies; +extern ctl_handler sysctl_ms_jiffies; + + +/* + * Register a set of sysctl names by calling register_sysctl_table + * with an initialised array of ctl_table's. An entry with zero + * ctl_name terminates the table. table->de will be set up by the + * registration and need not be initialised in advance. + * + * sysctl names can be mirrored automatically under /proc/sys. The + * procname supplied controls /proc naming. + * + * The table's mode will be honoured both for sys_sysctl(2) and + * proc-fs access. + * + * Leaf nodes in the sysctl tree will be represented by a single file + * under /proc; non-leaf nodes will be represented by directories. A + * null procname disables /proc mirroring at this node. + * + * sysctl(2) can automatically manage read and write requests through + * the sysctl table. The data and maxlen fields of the ctl_table + * struct enable minimal validation of the values being written to be + * performed, and the mode field allows minimal authentication. + * + * More sophisticated management can be enabled by the provision of a + * strategy routine with the table entry. This will be called before + * any automatic read or write of the data is performed. + * + * The strategy routine may return: + * <0: Error occurred (error is passed to user process) + * 0: OK - proceed with automatic read or write. + * >0: OK - read or write has been done by the strategy routine, so + * return immediately. + * + * There must be a proc_handler routine for any terminal nodes + * mirrored under /proc/sys (non-terminals are handled by a built-in + * directory handler). Several default handlers are available to + * cover common cases. + */ + +/* A sysctl table is an array of struct ctl_table: */ +struct ctl_table +{ + int ctl_name; /* Binary ID */ + const char *procname; /* Text ID for /proc/sys, or zero */ + void *data; + int maxlen; + mode_t mode; + ctl_table *child; + proc_handler *proc_handler; /* Callback for text formatting */ + ctl_handler *strategy; /* Callback function for all r/w */ + struct proc_dir_entry *de; /* /proc control block */ + void *extra1; + void *extra2; +}; + +/* struct ctl_table_header is used to maintain dynamic lists of + ctl_table trees. */ +struct ctl_table_header +{ + ctl_table *ctl_table; + struct list_head ctl_entry; + int used; + struct completion *unregistering; +}; + +struct ctl_table_header * register_sysctl_table(ctl_table * table, + int insert_at_head); +void unregister_sysctl_table(struct ctl_table_header * table); + +#else /* __KERNEL__ */ + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_SYSCTL_H */ diff -urN oldtree/kernel/Kconfig.hz newtree/kernel/Kconfig.hz --- oldtree/kernel/Kconfig.hz 2006-01-03 03:21:10.000000000 +0000 +++ newtree/kernel/Kconfig.hz 2006-03-08 20:52:32.414875750 +0000 @@ -21,14 +21,17 @@ help 100 HZ is a typical choice for servers, SMP and NUMA systems with lots of processors that may show reduced performance if - too many timer interrupts are occurring. + too many timer interrupts are occurring. Laptops may also show + improved battery life. - config HZ_250 + config HZ_250_NODEFAULT bool "250 HZ" help - 250 HZ is a good compromise choice allowing server performance - while also showing good interactive responsiveness even - on SMP and NUMA systems. + 250 HZ is a lousy compromise choice allowing server interactivity + while also showing desktop throughput and no extra power saving on + laptops. Good for when you can't make up your mind. + + Recommend 100 or 1000 instead. config HZ_1000 bool "1000 HZ" @@ -41,6 +44,6 @@ config HZ int default 100 if HZ_100 - default 250 if HZ_250 + default 250 if HZ_250_NODEFAULT default 1000 if HZ_1000 diff -urN oldtree/kernel/sysctl.c newtree/kernel/sysctl.c --- oldtree/kernel/sysctl.c 2006-03-08 18:48:02.976065000 +0000 +++ newtree/kernel/sysctl.c 2006-03-08 20:52:42.019476000 +0000 @@ -765,16 +765,24 @@ .proc_handler = &proc_dointvec, }, { - .ctl_name = VM_SWAPPINESS, - .procname = "swappiness", - .data = &vm_swappiness, - .maxlen = sizeof(vm_swappiness), + .ctl_name = VM_MAPPED, + .procname = "mapped", + .data = &vm_mapped, + .maxlen = sizeof(vm_mapped), .mode = 0644, .proc_handler = &proc_dointvec_minmax, .strategy = &sysctl_intvec, .extra1 = &zero, .extra2 = &one_hundred, }, + { + .ctl_name = VM_HARDMAPLIMIT, + .procname = "hardmaplimit", + .data = &vm_hardmaplimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #ifdef CONFIG_HUGETLB_PAGE { .ctl_name = VM_HUGETLB_PAGES, diff -urN oldtree/kernel/sysctl.c.orig newtree/kernel/sysctl.c.orig --- oldtree/kernel/sysctl.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/sysctl.c.orig 2006-03-08 18:48:02.000000000 +0000 @@ -0,0 +1,2658 @@ +/* + * sysctl.c: General linux system control interface + * + * Begun 24 March 1995, Stephen Tweedie + * Added /proc support, Dec 1995 + * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas. + * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver. + * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver. + * Dynamic registration fixes, Stephen Tweedie. + * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn. + * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris + * Horn. + * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer. + * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer. + * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill + * Wendling. + * The list_for_each() macro wasn't appropriate for the sysctl loop. + * Removed it and replaced it with older style, 03/23/00, Bill Wendling + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +extern int proc_nr_files(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); + +#if defined(CONFIG_SYSCTL) + +/* External variables not in a header file. */ +extern int C_A_D; +extern int sysctl_overcommit_memory; +extern int sysctl_overcommit_ratio; +extern int max_threads; +extern int sysrq_enabled; +extern int core_uses_pid; +extern int suid_dumpable; +extern char core_pattern[]; +extern int cad_pid; +extern int pid_max; +extern int min_free_kbytes; +extern int printk_ratelimit_jiffies; +extern int printk_ratelimit_burst; +extern int pid_max_min, pid_max_max; +extern int sysctl_drop_caches; +extern int percpu_pagelist_fraction; + +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) +int unknown_nmi_panic; +extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +#endif + +/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ +static int maxolduid = 65535; +static int minolduid; +static int min_percpu_pagelist_fract = 8; + +static int ngroups_max = NGROUPS_MAX; + +#ifdef CONFIG_KMOD +extern char modprobe_path[]; +#endif +#ifdef CONFIG_CHR_DEV_SG +extern int sg_big_buff; +#endif +#ifdef CONFIG_SYSVIPC +extern size_t shm_ctlmax; +extern size_t shm_ctlall; +extern int shm_ctlmni; +extern int msg_ctlmax; +extern int msg_ctlmnb; +extern int msg_ctlmni; +extern int sem_ctls[]; +#endif + +#ifdef __sparc__ +extern char reboot_command []; +extern int stop_a_enabled; +extern int scons_pwroff; +#endif + +#ifdef __hppa__ +extern int pwrsw_enabled; +extern int unaligned_enabled; +#endif + +#ifdef CONFIG_S390 +#ifdef CONFIG_MATHEMU +extern int sysctl_ieee_emulation_warnings; +#endif +extern int sysctl_userprocess_debug; +extern int spin_retry; +#endif + +extern int sysctl_hz_timer; + +#ifdef CONFIG_BSD_PROCESS_ACCT +extern int acct_parm[]; +#endif + +#ifdef CONFIG_IA64 +extern int no_unaligned_warning; +#endif + +static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, + ctl_table *, void **); +static int proc_doutsstring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); + +static ctl_table root_table[]; +static struct ctl_table_header root_table_header = + { root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) }; + +static ctl_table kern_table[]; +static ctl_table vm_table[]; +static ctl_table proc_table[]; +static ctl_table fs_table[]; +static ctl_table debug_table[]; +static ctl_table dev_table[]; +extern ctl_table random_table[]; +#ifdef CONFIG_UNIX98_PTYS +extern ctl_table pty_table[]; +#endif +#ifdef CONFIG_INOTIFY +extern ctl_table inotify_table[]; +#endif + +#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT +int sysctl_legacy_va_layout; +#endif + +/* /proc declarations: */ + +#ifdef CONFIG_PROC_FS + +static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *); +static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); +static int proc_opensys(struct inode *, struct file *); + +struct file_operations proc_sys_file_operations = { + .open = proc_opensys, + .read = proc_readsys, + .write = proc_writesys, +}; + +extern struct proc_dir_entry *proc_sys_root; + +static void register_proc_table(ctl_table *, struct proc_dir_entry *, void *); +static void unregister_proc_table(ctl_table *, struct proc_dir_entry *); +#endif + +/* The default sysctl tables: */ + +static ctl_table root_table[] = { + { + .ctl_name = CTL_KERN, + .procname = "kernel", + .mode = 0555, + .child = kern_table, + }, + { + .ctl_name = CTL_VM, + .procname = "vm", + .mode = 0555, + .child = vm_table, + }, +#ifdef CONFIG_NET + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = net_table, + }, +#endif + { + .ctl_name = CTL_PROC, + .procname = "proc", + .mode = 0555, + .child = proc_table, + }, + { + .ctl_name = CTL_FS, + .procname = "fs", + .mode = 0555, + .child = fs_table, + }, + { + .ctl_name = CTL_DEBUG, + .procname = "debug", + .mode = 0555, + .child = debug_table, + }, + { + .ctl_name = CTL_DEV, + .procname = "dev", + .mode = 0555, + .child = dev_table, + }, + + { .ctl_name = 0 } +}; + +static ctl_table kern_table[] = { + { + .ctl_name = KERN_OSTYPE, + .procname = "ostype", + .data = system_utsname.sysname, + .maxlen = sizeof(system_utsname.sysname), + .mode = 0444, + .proc_handler = &proc_doutsstring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_OSRELEASE, + .procname = "osrelease", + .data = system_utsname.release, + .maxlen = sizeof(system_utsname.release), + .mode = 0444, + .proc_handler = &proc_doutsstring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_VERSION, + .procname = "version", + .data = system_utsname.version, + .maxlen = sizeof(system_utsname.version), + .mode = 0444, + .proc_handler = &proc_doutsstring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_NODENAME, + .procname = "hostname", + .data = system_utsname.nodename, + .maxlen = sizeof(system_utsname.nodename), + .mode = 0644, + .proc_handler = &proc_doutsstring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_DOMAINNAME, + .procname = "domainname", + .data = system_utsname.domainname, + .maxlen = sizeof(system_utsname.domainname), + .mode = 0644, + .proc_handler = &proc_doutsstring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_PANIC, + .procname = "panic", + .data = &panic_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_CORE_USES_PID, + .procname = "core_uses_pid", + .data = &core_uses_pid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_CORE_PATTERN, + .procname = "core_pattern", + .data = core_pattern, + .maxlen = 64, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_TAINTED, + .procname = "tainted", + .data = &tainted, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_CAP_BSET, + .procname = "cap-bound", + .data = &cap_bset, + .maxlen = sizeof(kernel_cap_t), + .mode = 0600, + .proc_handler = &proc_dointvec_bset, + }, +#ifdef CONFIG_BLK_DEV_INITRD + { + .ctl_name = KERN_REALROOTDEV, + .procname = "real-root-dev", + .data = &real_root_dev, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef __sparc__ + { + .ctl_name = KERN_SPARC_REBOOT, + .procname = "reboot-cmd", + .data = reboot_command, + .maxlen = 256, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_SPARC_STOP_A, + .procname = "stop-a", + .data = &stop_a_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_SPARC_SCONS_PWROFF, + .procname = "scons-poweroff", + .data = &scons_pwroff, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef __hppa__ + { + .ctl_name = KERN_HPPA_PWRSW, + .procname = "soft-power", + .data = &pwrsw_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_HPPA_UNALIGNED, + .procname = "unaligned-trap", + .data = &unaligned_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = KERN_CTLALTDEL, + .procname = "ctrl-alt-del", + .data = &C_A_D, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PRINTK, + .procname = "printk", + .data = &console_loglevel, + .maxlen = 4*sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_KMOD + { + .ctl_name = KERN_MODPROBE, + .procname = "modprobe", + .data = &modprobe_path, + .maxlen = KMOD_PATH_LEN, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, +#endif +#ifdef CONFIG_HOTPLUG + { + .ctl_name = KERN_HOTPLUG, + .procname = "hotplug", + .data = &uevent_helper, + .maxlen = UEVENT_HELPER_PATH_LEN, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, +#endif +#ifdef CONFIG_CHR_DEV_SG + { + .ctl_name = KERN_SG_BIG_BUFF, + .procname = "sg-big-buff", + .data = &sg_big_buff, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_BSD_PROCESS_ACCT + { + .ctl_name = KERN_ACCT, + .procname = "acct", + .data = &acct_parm, + .maxlen = 3*sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_SYSVIPC + { + .ctl_name = KERN_SHMMAX, + .procname = "shmmax", + .data = &shm_ctlmax, + .maxlen = sizeof (size_t), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, + { + .ctl_name = KERN_SHMALL, + .procname = "shmall", + .data = &shm_ctlall, + .maxlen = sizeof (size_t), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, + { + .ctl_name = KERN_SHMMNI, + .procname = "shmmni", + .data = &shm_ctlmni, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_MSGMAX, + .procname = "msgmax", + .data = &msg_ctlmax, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_MSGMNI, + .procname = "msgmni", + .data = &msg_ctlmni, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_MSGMNB, + .procname = "msgmnb", + .data = &msg_ctlmnb, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_SEM, + .procname = "sem", + .data = &sem_ctls, + .maxlen = 4*sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_MAGIC_SYSRQ + { + .ctl_name = KERN_SYSRQ, + .procname = "sysrq", + .data = &sysrq_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = KERN_CADPID, + .procname = "cad_pid", + .data = &cad_pid, + .maxlen = sizeof (int), + .mode = 0600, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_MAX_THREADS, + .procname = "threads-max", + .data = &max_threads, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_RANDOM, + .procname = "random", + .mode = 0555, + .child = random_table, + }, +#ifdef CONFIG_UNIX98_PTYS + { + .ctl_name = KERN_PTY, + .procname = "pty", + .mode = 0555, + .child = pty_table, + }, +#endif + { + .ctl_name = KERN_OVERFLOWUID, + .procname = "overflowuid", + .data = &overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, + { + .ctl_name = KERN_OVERFLOWGID, + .procname = "overflowgid", + .data = &overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, +#ifdef CONFIG_S390 +#ifdef CONFIG_MATHEMU + { + .ctl_name = KERN_IEEE_EMULATION_WARNINGS, + .procname = "ieee_emulation_warnings", + .data = &sysctl_ieee_emulation_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_NO_IDLE_HZ + { + .ctl_name = KERN_HZ_TIMER, + .procname = "hz_timer", + .data = &sysctl_hz_timer, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = KERN_S390_USER_DEBUG_LOGGING, + .procname = "userprocess_debug", + .data = &sysctl_userprocess_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = KERN_PIDMAX, + .procname = "pid_max", + .data = &pid_max, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = sysctl_intvec, + .extra1 = &pid_max_min, + .extra2 = &pid_max_max, + }, + { + .ctl_name = KERN_PANIC_ON_OOPS, + .procname = "panic_on_oops", + .data = &panic_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PRINTK_RATELIMIT, + .procname = "printk_ratelimit", + .data = &printk_ratelimit_jiffies, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = KERN_PRINTK_RATELIMIT_BURST, + .procname = "printk_ratelimit_burst", + .data = &printk_ratelimit_burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_NGROUPS_MAX, + .procname = "ngroups_max", + .data = &ngroups_max, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) + { + .ctl_name = KERN_UNKNOWN_NMI_PANIC, + .procname = "unknown_nmi_panic", + .data = &unknown_nmi_panic, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_unknown_nmi_panic, + }, +#endif +#if defined(CONFIG_X86) + { + .ctl_name = KERN_BOOTLOADER_TYPE, + .procname = "bootloader_type", + .data = &bootloader_type, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, +#endif +#if defined(CONFIG_MMU) + { + .ctl_name = KERN_RANDOMIZE, + .procname = "randomize_va_space", + .data = &randomize_va_space, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .ctl_name = KERN_SPIN_RETRY, + .procname = "spin_retry", + .data = &spin_retry, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_ACPI_SLEEP + { + .ctl_name = KERN_ACPI_VIDEO_FLAGS, + .procname = "acpi_video_flags", + .data = &acpi_video_flags, + .maxlen = sizeof (unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, +#endif +#ifdef CONFIG_IA64 + { + .ctl_name = KERN_IA64_UNALIGNED, + .procname = "ignore-unaligned-usertrap", + .data = &no_unaligned_warning, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { .ctl_name = 0 } +}; + +/* Constants for minimum and maximum testing in vm_table. + We use these as one-element integer vectors. */ +static int zero; +static int one_hundred = 100; + + +static ctl_table vm_table[] = { + { + .ctl_name = VM_OVERCOMMIT_MEMORY, + .procname = "overcommit_memory", + .data = &sysctl_overcommit_memory, + .maxlen = sizeof(sysctl_overcommit_memory), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = VM_OVERCOMMIT_RATIO, + .procname = "overcommit_ratio", + .data = &sysctl_overcommit_ratio, + .maxlen = sizeof(sysctl_overcommit_ratio), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = VM_PAGE_CLUSTER, + .procname = "page-cluster", + .data = &page_cluster, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = VM_DIRTY_BACKGROUND, + .procname = "dirty_background_ratio", + .data = &dirty_background_ratio, + .maxlen = sizeof(dirty_background_ratio), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .ctl_name = VM_DIRTY_RATIO, + .procname = "dirty_ratio", + .data = &vm_dirty_ratio, + .maxlen = sizeof(vm_dirty_ratio), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .ctl_name = VM_DIRTY_WB_CS, + .procname = "dirty_writeback_centisecs", + .data = &dirty_writeback_interval, + .maxlen = sizeof(dirty_writeback_interval), + .mode = 0644, + .proc_handler = &dirty_writeback_centisecs_handler, + }, + { + .ctl_name = VM_DIRTY_EXPIRE_CS, + .procname = "dirty_expire_centisecs", + .data = &dirty_expire_interval, + .maxlen = sizeof(dirty_expire_interval), + .mode = 0644, + .proc_handler = &proc_dointvec_userhz_jiffies, + }, + { + .ctl_name = VM_NR_PDFLUSH_THREADS, + .procname = "nr_pdflush_threads", + .data = &nr_pdflush_threads, + .maxlen = sizeof nr_pdflush_threads, + .mode = 0444 /* read-only*/, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = VM_SWAPPINESS, + .procname = "swappiness", + .data = &vm_swappiness, + .maxlen = sizeof(vm_swappiness), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, +#ifdef CONFIG_HUGETLB_PAGE + { + .ctl_name = VM_HUGETLB_PAGES, + .procname = "nr_hugepages", + .data = &max_huge_pages, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &hugetlb_sysctl_handler, + .extra1 = (void *)&hugetlb_zero, + .extra2 = (void *)&hugetlb_infinity, + }, + { + .ctl_name = VM_HUGETLB_GROUP, + .procname = "hugetlb_shm_group", + .data = &sysctl_hugetlb_shm_group, + .maxlen = sizeof(gid_t), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = VM_LOWMEM_RESERVE_RATIO, + .procname = "lowmem_reserve_ratio", + .data = &sysctl_lowmem_reserve_ratio, + .maxlen = sizeof(sysctl_lowmem_reserve_ratio), + .mode = 0644, + .proc_handler = &lowmem_reserve_ratio_sysctl_handler, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = VM_DROP_PAGECACHE, + .procname = "drop_caches", + .data = &sysctl_drop_caches, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = drop_caches_sysctl_handler, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = VM_MIN_FREE_KBYTES, + .procname = "min_free_kbytes", + .data = &min_free_kbytes, + .maxlen = sizeof(min_free_kbytes), + .mode = 0644, + .proc_handler = &min_free_kbytes_sysctl_handler, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, + { + .ctl_name = VM_PERCPU_PAGELIST_FRACTION, + .procname = "percpu_pagelist_fraction", + .data = &percpu_pagelist_fraction, + .maxlen = sizeof(percpu_pagelist_fraction), + .mode = 0644, + .proc_handler = &percpu_pagelist_fraction_sysctl_handler, + .strategy = &sysctl_intvec, + .extra1 = &min_percpu_pagelist_fract, + }, +#ifdef CONFIG_MMU + { + .ctl_name = VM_MAX_MAP_COUNT, + .procname = "max_map_count", + .data = &sysctl_max_map_count, + .maxlen = sizeof(sysctl_max_map_count), + .mode = 0644, + .proc_handler = &proc_dointvec + }, +#endif + { + .ctl_name = VM_LAPTOP_MODE, + .procname = "laptop_mode", + .data = &laptop_mode, + .maxlen = sizeof(laptop_mode), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = VM_BLOCK_DUMP, + .procname = "block_dump", + .data = &block_dump, + .maxlen = sizeof(block_dump), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, + { + .ctl_name = VM_VFS_CACHE_PRESSURE, + .procname = "vfs_cache_pressure", + .data = &sysctl_vfs_cache_pressure, + .maxlen = sizeof(sysctl_vfs_cache_pressure), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, +#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT + { + .ctl_name = VM_LEGACY_VA_LAYOUT, + .procname = "legacy_va_layout", + .data = &sysctl_legacy_va_layout, + .maxlen = sizeof(sysctl_legacy_va_layout), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, +#endif +#ifdef CONFIG_SWAP + { + .ctl_name = VM_SWAP_TOKEN_TIMEOUT, + .procname = "swap_token_timeout", + .data = &swap_token_default_timeout, + .maxlen = sizeof(swap_token_default_timeout), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, +#endif +#ifdef CONFIG_NUMA + { + .ctl_name = VM_ZONE_RECLAIM_MODE, + .procname = "zone_reclaim_mode", + .data = &zone_reclaim_mode, + .maxlen = sizeof(zone_reclaim_mode), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, + { + .ctl_name = VM_ZONE_RECLAIM_INTERVAL, + .procname = "zone_reclaim_interval", + .data = &zone_reclaim_interval, + .maxlen = sizeof(zone_reclaim_interval), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, +#endif +#ifdef CONFIG_SWAP_PREFETCH + { + .ctl_name = VM_SWAP_PREFETCH, + .procname = "swap_prefetch", + .data = &swap_prefetch, + .maxlen = sizeof(swap_prefetch), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { .ctl_name = 0 } +}; + +static ctl_table proc_table[] = { + { .ctl_name = 0 } +}; + +static ctl_table fs_table[] = { + { + .ctl_name = FS_NRINODE, + .procname = "inode-nr", + .data = &inodes_stat, + .maxlen = 2*sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_STATINODE, + .procname = "inode-state", + .data = &inodes_stat, + .maxlen = 7*sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_NRFILE, + .procname = "file-nr", + .data = &files_stat, + .maxlen = 3*sizeof(int), + .mode = 0444, + .proc_handler = &proc_nr_files, + }, + { + .ctl_name = FS_MAXFILE, + .procname = "file-max", + .data = &files_stat.max_files, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_DENTRY, + .procname = "dentry-state", + .data = &dentry_stat, + .maxlen = 6*sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_OVERFLOWUID, + .procname = "overflowuid", + .data = &fs_overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, + { + .ctl_name = FS_OVERFLOWGID, + .procname = "overflowgid", + .data = &fs_overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, + { + .ctl_name = FS_LEASES, + .procname = "leases-enable", + .data = &leases_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_DNOTIFY + { + .ctl_name = FS_DIR_NOTIFY, + .procname = "dir-notify-enable", + .data = &dir_notify_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_MMU + { + .ctl_name = FS_LEASE_TIME, + .procname = "lease-break-time", + .data = &lease_break_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = FS_AIO_NR, + .procname = "aio-nr", + .data = &aio_nr, + .maxlen = sizeof(aio_nr), + .mode = 0444, + .proc_handler = &proc_doulongvec_minmax, + }, + { + .ctl_name = FS_AIO_MAX_NR, + .procname = "aio-max-nr", + .data = &aio_max_nr, + .maxlen = sizeof(aio_max_nr), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax, + }, +#ifdef CONFIG_INOTIFY + { + .ctl_name = FS_INOTIFY, + .procname = "inotify", + .mode = 0555, + .child = inotify_table, + }, +#endif +#endif + { + .ctl_name = KERN_SETUID_DUMPABLE, + .procname = "suid_dumpable", + .data = &suid_dumpable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +static ctl_table debug_table[] = { + { .ctl_name = 0 } +}; + +static ctl_table dev_table[] = { + { .ctl_name = 0 } +}; + +extern void init_irq_proc (void); + +static DEFINE_SPINLOCK(sysctl_lock); + +/* called under sysctl_lock */ +static int use_table(struct ctl_table_header *p) +{ + if (unlikely(p->unregistering)) + return 0; + p->used++; + return 1; +} + +/* called under sysctl_lock */ +static void unuse_table(struct ctl_table_header *p) +{ + if (!--p->used) + if (unlikely(p->unregistering)) + complete(p->unregistering); +} + +/* called under sysctl_lock, will reacquire if has to wait */ +static void start_unregistering(struct ctl_table_header *p) +{ + /* + * if p->used is 0, nobody will ever touch that entry again; + * we'll eliminate all paths to it before dropping sysctl_lock + */ + if (unlikely(p->used)) { + struct completion wait; + init_completion(&wait); + p->unregistering = &wait; + spin_unlock(&sysctl_lock); + wait_for_completion(&wait); + spin_lock(&sysctl_lock); + } + /* + * do not remove from the list until nobody holds it; walking the + * list in do_sysctl() relies on that. + */ + list_del_init(&p->ctl_entry); +} + +void __init sysctl_init(void) +{ +#ifdef CONFIG_PROC_FS + register_proc_table(root_table, proc_sys_root, &root_table_header); + init_irq_proc(); +#endif +} + +int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + struct list_head *tmp; + int error = -ENOTDIR; + + if (nlen <= 0 || nlen >= CTL_MAXNAME) + return -ENOTDIR; + if (oldval) { + int old_len; + if (!oldlenp || get_user(old_len, oldlenp)) + return -EFAULT; + } + spin_lock(&sysctl_lock); + tmp = &root_table_header.ctl_entry; + do { + struct ctl_table_header *head = + list_entry(tmp, struct ctl_table_header, ctl_entry); + void *context = NULL; + + if (!use_table(head)) + continue; + + spin_unlock(&sysctl_lock); + + error = parse_table(name, nlen, oldval, oldlenp, + newval, newlen, head->ctl_table, + &context); + kfree(context); + + spin_lock(&sysctl_lock); + unuse_table(head); + if (error != -ENOTDIR) + break; + } while ((tmp = tmp->next) != &root_table_header.ctl_entry); + spin_unlock(&sysctl_lock); + return error; +} + +asmlinkage long sys_sysctl(struct __sysctl_args __user *args) +{ + struct __sysctl_args tmp; + int error; + + if (copy_from_user(&tmp, args, sizeof(tmp))) + return -EFAULT; + + lock_kernel(); + error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp, + tmp.newval, tmp.newlen); + unlock_kernel(); + return error; +} + +/* + * ctl_perm does NOT grant the superuser all rights automatically, because + * some sysctl variables are readonly even to root. + */ + +static int test_perm(int mode, int op) +{ + if (!current->euid) + mode >>= 6; + else if (in_egroup_p(0)) + mode >>= 3; + if ((mode & op & 0007) == op) + return 0; + return -EACCES; +} + +static inline int ctl_perm(ctl_table *table, int op) +{ + int error; + error = security_sysctl(table, op); + if (error) + return error; + return test_perm(table->mode, op); +} + +static int parse_table(int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + ctl_table *table, void **context) +{ + int n; +repeat: + if (!nlen) + return -ENOTDIR; + if (get_user(n, name)) + return -EFAULT; + for ( ; table->ctl_name; table++) { + if (n == table->ctl_name || table->ctl_name == CTL_ANY) { + int error; + if (table->child) { + if (ctl_perm(table, 001)) + return -EPERM; + if (table->strategy) { + error = table->strategy( + table, name, nlen, + oldval, oldlenp, + newval, newlen, context); + if (error) + return error; + } + name++; + nlen--; + table = table->child; + goto repeat; + } + error = do_sysctl_strategy(table, name, nlen, + oldval, oldlenp, + newval, newlen, context); + return error; + } + } + return -ENOTDIR; +} + +/* Perform the actual read/write of a sysctl table entry. */ +int do_sysctl_strategy (ctl_table *table, + int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + int op = 0, rc; + size_t len; + + if (oldval) + op |= 004; + if (newval) + op |= 002; + if (ctl_perm(table, op)) + return -EPERM; + + if (table->strategy) { + rc = table->strategy(table, name, nlen, oldval, oldlenp, + newval, newlen, context); + if (rc < 0) + return rc; + if (rc > 0) + return 0; + } + + /* If there is no strategy routine, or if the strategy returns + * zero, proceed with automatic r/w */ + if (table->data && table->maxlen) { + if (oldval && oldlenp) { + if (get_user(len, oldlenp)) + return -EFAULT; + if (len) { + if (len > table->maxlen) + len = table->maxlen; + if(copy_to_user(oldval, table->data, len)) + return -EFAULT; + if(put_user(len, oldlenp)) + return -EFAULT; + } + } + if (newval && newlen) { + len = newlen; + if (len > table->maxlen) + len = table->maxlen; + if(copy_from_user(table->data, newval, len)) + return -EFAULT; + } + } + return 0; +} + +/** + * register_sysctl_table - register a sysctl hierarchy + * @table: the top-level table structure + * @insert_at_head: whether the entry should be inserted in front or at the end + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. An entry with a ctl_name of 0 terminates the table. + * + * The members of the &ctl_table structure are used as follows: + * + * ctl_name - This is the numeric sysctl value used by sysctl(2). The number + * must be unique within that level of sysctl + * + * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not + * enter a sysctl file + * + * data - a pointer to data for use by proc_handler + * + * maxlen - the maximum size in bytes of the data + * + * mode - the file permissions for the /proc/sys file, and for sysctl(2) + * + * child - a pointer to the child sysctl table if this entry is a directory, or + * %NULL. + * + * proc_handler - the text handler routine (described below) + * + * strategy - the strategy routine (described below) + * + * de - for internal use by the sysctl routines + * + * extra1, extra2 - extra pointers usable by the proc handler routines + * + * Leaf nodes in the sysctl tree will be represented by a single file + * under /proc; non-leaf nodes will be represented by directories. + * + * sysctl(2) can automatically manage read and write requests through + * the sysctl table. The data and maxlen fields of the ctl_table + * struct enable minimal validation of the values being written to be + * performed, and the mode field allows minimal authentication. + * + * More sophisticated management can be enabled by the provision of a + * strategy routine with the table entry. This will be called before + * any automatic read or write of the data is performed. + * + * The strategy routine may return + * + * < 0 - Error occurred (error is passed to user process) + * + * 0 - OK - proceed with automatic read or write. + * + * > 0 - OK - read or write has been done by the strategy routine, so + * return immediately. + * + * There must be a proc_handler routine for any terminal nodes + * mirrored under /proc/sys (non-terminals are handled by a built-in + * directory handler). Several default handlers are available to + * cover common cases - + * + * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), + * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), + * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() + * + * It is the handler's job to read the input buffer from user memory + * and process it. The handler should return 0 on success. + * + * This routine returns %NULL on a failure to register, and a pointer + * to the table header on success. + */ +struct ctl_table_header *register_sysctl_table(ctl_table * table, + int insert_at_head) +{ + struct ctl_table_header *tmp; + tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); + if (!tmp) + return NULL; + tmp->ctl_table = table; + INIT_LIST_HEAD(&tmp->ctl_entry); + tmp->used = 0; + tmp->unregistering = NULL; + spin_lock(&sysctl_lock); + if (insert_at_head) + list_add(&tmp->ctl_entry, &root_table_header.ctl_entry); + else + list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); + spin_unlock(&sysctl_lock); +#ifdef CONFIG_PROC_FS + register_proc_table(table, proc_sys_root, tmp); +#endif + return tmp; +} + +/** + * unregister_sysctl_table - unregister a sysctl table hierarchy + * @header: the header returned from register_sysctl_table + * + * Unregisters the sysctl table and all children. proc entries may not + * actually be removed until they are no longer used by anyone. + */ +void unregister_sysctl_table(struct ctl_table_header * header) +{ + might_sleep(); + spin_lock(&sysctl_lock); + start_unregistering(header); +#ifdef CONFIG_PROC_FS + unregister_proc_table(header->ctl_table, proc_sys_root); +#endif + spin_unlock(&sysctl_lock); + kfree(header); +} + +/* + * /proc/sys support + */ + +#ifdef CONFIG_PROC_FS + +/* Scan the sysctl entries in table and add them all into /proc */ +static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set) +{ + struct proc_dir_entry *de; + int len; + mode_t mode; + + for (; table->ctl_name; table++) { + /* Can't do anything without a proc name. */ + if (!table->procname) + continue; + /* Maybe we can't do anything with it... */ + if (!table->proc_handler && !table->child) { + printk(KERN_WARNING "SYSCTL: Can't register %s\n", + table->procname); + continue; + } + + len = strlen(table->procname); + mode = table->mode; + + de = NULL; + if (table->proc_handler) + mode |= S_IFREG; + else { + mode |= S_IFDIR; + for (de = root->subdir; de; de = de->next) { + if (proc_match(len, table->procname, de)) + break; + } + /* If the subdir exists already, de is non-NULL */ + } + + if (!de) { + de = create_proc_entry(table->procname, mode, root); + if (!de) + continue; + de->set = set; + de->data = (void *) table; + if (table->proc_handler) + de->proc_fops = &proc_sys_file_operations; + } + table->de = de; + if (de->mode & S_IFDIR) + register_proc_table(table->child, de, set); + } +} + +/* + * Unregister a /proc sysctl table and any subdirectories. + */ +static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root) +{ + struct proc_dir_entry *de; + for (; table->ctl_name; table++) { + if (!(de = table->de)) + continue; + if (de->mode & S_IFDIR) { + if (!table->child) { + printk (KERN_ALERT "Help - malformed sysctl tree on free\n"); + continue; + } + unregister_proc_table(table->child, de); + + /* Don't unregister directories which still have entries.. */ + if (de->subdir) + continue; + } + + /* + * In any case, mark the entry as goner; we'll keep it + * around if it's busy, but we'll know to do nothing with + * its fields. We are under sysctl_lock here. + */ + de->data = NULL; + + /* Don't unregister proc entries that are still being used.. */ + if (atomic_read(&de->count)) + continue; + + table->de = NULL; + remove_proc_entry(table->procname, root); + } +} + +static ssize_t do_rw_proc(int write, struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + int op; + struct proc_dir_entry *de = PDE(file->f_dentry->d_inode); + struct ctl_table *table; + size_t res; + ssize_t error = -ENOTDIR; + + spin_lock(&sysctl_lock); + if (de && de->data && use_table(de->set)) { + /* + * at that point we know that sysctl was not unregistered + * and won't be until we finish + */ + spin_unlock(&sysctl_lock); + table = (struct ctl_table *) de->data; + if (!table || !table->proc_handler) + goto out; + error = -EPERM; + op = (write ? 002 : 004); + if (ctl_perm(table, op)) + goto out; + + /* careful: calling conventions are nasty here */ + res = count; + error = (*table->proc_handler)(table, write, file, + buf, &res, ppos); + if (!error) + error = res; + out: + spin_lock(&sysctl_lock); + unuse_table(de->set); + } + spin_unlock(&sysctl_lock); + return error; +} + +static int proc_opensys(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_WRITE) { + /* + * sysctl entries that are not writable, + * are _NOT_ writable, capabilities or not. + */ + if (!(inode->i_mode & S_IWUSR)) + return -EPERM; + } + + return 0; +} + +static ssize_t proc_readsys(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + return do_rw_proc(0, file, buf, count, ppos); +} + +static ssize_t proc_writesys(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + return do_rw_proc(1, file, (char __user *) buf, count, ppos); +} + +/** + * proc_dostring - read a string sysctl + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes a string from/to the user buffer. If the kernel + * buffer provided is not large enough to hold the string, the + * string is truncated. The copied string is %NULL-terminated. + * If the string is being read by the user process, it is copied + * and a newline '\n' is added. It is truncated if the buffer is + * not large enough. + * + * Returns 0 on success. + */ +int proc_dostring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + size_t len; + char __user *p; + char c; + + if (!table->data || !table->maxlen || !*lenp || + (*ppos && !write)) { + *lenp = 0; + return 0; + } + + if (write) { + len = 0; + p = buffer; + while (len < *lenp) { + if (get_user(c, p++)) + return -EFAULT; + if (c == 0 || c == '\n') + break; + len++; + } + if (len >= table->maxlen) + len = table->maxlen-1; + if(copy_from_user(table->data, buffer, len)) + return -EFAULT; + ((char *) table->data)[len] = 0; + *ppos += *lenp; + } else { + len = strlen(table->data); + if (len > table->maxlen) + len = table->maxlen; + if (len > *lenp) + len = *lenp; + if (len) + if(copy_to_user(buffer, table->data, len)) + return -EFAULT; + if (len < *lenp) { + if(put_user('\n', ((char __user *) buffer) + len)) + return -EFAULT; + len++; + } + *lenp = len; + *ppos += len; + } + return 0; +} + +/* + * Special case of dostring for the UTS structure. This has locks + * to observe. Should this be in kernel/sys.c ???? + */ + +static int proc_doutsstring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int r; + + if (!write) { + down_read(&uts_sem); + r=proc_dostring(table,0,filp,buffer,lenp, ppos); + up_read(&uts_sem); + } else { + down_write(&uts_sem); + r=proc_dostring(table,1,filp,buffer,lenp, ppos); + up_write(&uts_sem); + } + return r; +} + +static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + *valp = *negp ? -*lvalp : *lvalp; + } else { + int val = *valp; + if (val < 0) { + *negp = -1; + *lvalp = (unsigned long)-val; + } else { + *negp = 0; + *lvalp = (unsigned long)val; + } + } + return 0; +} + +static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(int *negp, unsigned long *lvalp, int *valp, + int write, void *data), + void *data) +{ +#define TMPBUFLEN 21 + int *i, vleft, first=1, neg, val; + unsigned long lval; + size_t left, len; + + char buf[TMPBUFLEN], *p; + char __user *s = buffer; + + if (!table->data || !table->maxlen || !*lenp || + (*ppos && !write)) { + *lenp = 0; + return 0; + } + + i = (int *) table->data; + vleft = table->maxlen / sizeof(*i); + left = *lenp; + + if (!conv) + conv = do_proc_dointvec_conv; + + for (; left && vleft--; i++, first=0) { + if (write) { + while (left) { + char c; + if (get_user(c, s)) + return -EFAULT; + if (!isspace(c)) + break; + left--; + s++; + } + if (!left) + break; + neg = 0; + len = left; + if (len > sizeof(buf) - 1) + len = sizeof(buf) - 1; + if (copy_from_user(buf, s, len)) + return -EFAULT; + buf[len] = 0; + p = buf; + if (*p == '-' && left > 1) { + neg = 1; + left--, p++; + } + if (*p < '0' || *p > '9') + break; + + lval = simple_strtoul(p, &p, 0); + + len = p-buf; + if ((len < left) && *p && !isspace(*p)) + break; + if (neg) + val = -val; + s += len; + left -= len; + + if (conv(&neg, &lval, i, 1, data)) + break; + } else { + p = buf; + if (!first) + *p++ = '\t'; + + if (conv(&neg, &lval, i, 0, data)) + break; + + sprintf(p, "%s%lu", neg ? "-" : "", lval); + len = strlen(buf); + if (len > left) + len = left; + if(copy_to_user(s, buf, len)) + return -EFAULT; + left -= len; + s += len; + } + } + + if (!write && !first && left) { + if(put_user('\n', s)) + return -EFAULT; + left--, s++; + } + if (write) { + while (left) { + char c; + if (get_user(c, s++)) + return -EFAULT; + if (!isspace(c)) + break; + left--; + } + } + if (write && first) + return -EINVAL; + *lenp -= left; + *ppos += *lenp; + return 0; +#undef TMPBUFLEN +} + +/** + * proc_dointvec - read a vector of integers + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + NULL,NULL); +} + +#define OP_SET 0 +#define OP_AND 1 +#define OP_OR 2 +#define OP_MAX 3 +#define OP_MIN 4 + +static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + int op = *(int *)data; + if (write) { + int val = *negp ? -*lvalp : *lvalp; + switch(op) { + case OP_SET: *valp = val; break; + case OP_AND: *valp &= val; break; + case OP_OR: *valp |= val; break; + case OP_MAX: if(*valp < val) + *valp = val; + break; + case OP_MIN: if(*valp > val) + *valp = val; + break; + } + } else { + int val = *valp; + if (val < 0) { + *negp = -1; + *lvalp = (unsigned long)-val; + } else { + *negp = 0; + *lvalp = (unsigned long)val; + } + } + return 0; +} + +/* + * init may raise the set. + */ + +int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int op; + + if (!capable(CAP_SYS_MODULE)) { + return -EPERM; + } + + op = (current->pid == 1) ? OP_SET : OP_AND; + return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + do_proc_dointvec_bset_conv,&op); +} + +struct do_proc_dointvec_minmax_conv_param { + int *min; + int *max; +}; + +static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + struct do_proc_dointvec_minmax_conv_param *param = data; + if (write) { + int val = *negp ? -*lvalp : *lvalp; + if ((param->min && *param->min > val) || + (param->max && *param->max < val)) + return -EINVAL; + *valp = val; + } else { + int val = *valp; + if (val < 0) { + *negp = -1; + *lvalp = (unsigned long)-val; + } else { + *negp = 0; + *lvalp = (unsigned long)val; + } + } + return 0; +} + +/** + * proc_dointvec_minmax - read a vector of integers with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. + */ +int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_dointvec_minmax_conv_param param = { + .min = (int *) table->extra1, + .max = (int *) table->extra2, + }; + return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, + do_proc_dointvec_minmax_conv, ¶m); +} + +static int do_proc_doulongvec_minmax(ctl_table *table, int write, + struct file *filp, + void __user *buffer, + size_t *lenp, loff_t *ppos, + unsigned long convmul, + unsigned long convdiv) +{ +#define TMPBUFLEN 21 + unsigned long *i, *min, *max, val; + int vleft, first=1, neg; + size_t len, left; + char buf[TMPBUFLEN], *p; + char __user *s = buffer; + + if (!table->data || !table->maxlen || !*lenp || + (*ppos && !write)) { + *lenp = 0; + return 0; + } + + i = (unsigned long *) table->data; + min = (unsigned long *) table->extra1; + max = (unsigned long *) table->extra2; + vleft = table->maxlen / sizeof(unsigned long); + left = *lenp; + + for (; left && vleft--; i++, min++, max++, first=0) { + if (write) { + while (left) { + char c; + if (get_user(c, s)) + return -EFAULT; + if (!isspace(c)) + break; + left--; + s++; + } + if (!left) + break; + neg = 0; + len = left; + if (len > TMPBUFLEN-1) + len = TMPBUFLEN-1; + if (copy_from_user(buf, s, len)) + return -EFAULT; + buf[len] = 0; + p = buf; + if (*p == '-' && left > 1) { + neg = 1; + left--, p++; + } + if (*p < '0' || *p > '9') + break; + val = simple_strtoul(p, &p, 0) * convmul / convdiv ; + len = p-buf; + if ((len < left) && *p && !isspace(*p)) + break; + if (neg) + val = -val; + s += len; + left -= len; + + if(neg) + continue; + if ((min && val < *min) || (max && val > *max)) + continue; + *i = val; + } else { + p = buf; + if (!first) + *p++ = '\t'; + sprintf(p, "%lu", convdiv * (*i) / convmul); + len = strlen(buf); + if (len > left) + len = left; + if(copy_to_user(s, buf, len)) + return -EFAULT; + left -= len; + s += len; + } + } + + if (!write && !first && left) { + if(put_user('\n', s)) + return -EFAULT; + left--, s++; + } + if (write) { + while (left) { + char c; + if (get_user(c, s++)) + return -EFAULT; + if (!isspace(c)) + break; + left--; + } + } + if (write && first) + return -EINVAL; + *lenp -= left; + *ppos += *lenp; + return 0; +#undef TMPBUFLEN +} + +/** + * proc_doulongvec_minmax - read a vector of long integers with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long + * values from/to the user buffer, treated as an ASCII string. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. + */ +int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); +} + +static int do_proc_doulonglongvec_minmax(ctl_table *table, int write, + struct file *filp, + void __user *buffer, + size_t *lenp, loff_t *ppos, + unsigned long long convmul, + unsigned long long convdiv) +{ +#define TMPBUFLEN 21 + unsigned long long *i, *min, *max, val; + int vleft, first=1, neg; + size_t len, left; + char buf[TMPBUFLEN], *p; + char __user *s = buffer; + + if (!table->data || !table->maxlen || !*lenp || + (*ppos && !write)) { + *lenp = 0; + return 0; + } + + i = (unsigned long long *) table->data; + min = (unsigned long long *) table->extra1; + max = (unsigned long long *) table->extra2; + vleft = table->maxlen / sizeof(unsigned long long); + left = *lenp; + + for (; left && vleft--; i++, min++, max++, first=0) { + if (write) { + while (left) { + char c; + if (get_user(c, s)) + return -EFAULT; + if (!isspace(c)) + break; + left--; + s++; + } + if (!left) + break; + neg = 0; + len = left; + if (len > TMPBUFLEN-1) + len = TMPBUFLEN-1; + if (copy_from_user(buf, s, len)) + return -EFAULT; + buf[len] = 0; + p = buf; + if (*p == '-' && left > 1) { + neg = 1; + left--, p++; + } + if (*p < '0' || *p > '9') + break; + val = simple_strtoull(p, &p, 0) * convmul; + do_div(val, convdiv); + len = p-buf; + if ((len < left) && *p && !isspace(*p)) + break; + if (neg) + val = -val; + s += len; + left -= len; + + if(neg) + continue; + if ((min && val < *min) || (max && val > *max)) + continue; + *i = val; + } else { + p = buf; + if (!first) + *p++ = '\t'; + val = convdiv * (*i); + do_div(val, convmul); + sprintf(p, "%llu", val); + len = strlen(buf); + if (len > left) + len = left; + if (copy_to_user(s, buf, len)) + return -EFAULT; + left -= len; + s += len; + } + } + + if (!write && !first && left) { + if(put_user('\n', s)) + return -EFAULT; + left--, s++; + } + if (write) { + while (left) { + char c; + if (get_user(c, s++)) + return -EFAULT; + if (!isspace(c)) + break; + left--; + } + } + if (write && first) + return -EINVAL; + *lenp -= left; + *ppos += *lenp; + return 0; +#undef TMPBUFLEN +} + +int proc_doulonglongvec_minmax(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_doulonglongvec_minmax(table, write, filp, buffer, lenp, ppos, 1LLU, 1LLU); +} + +/** + * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long + * values from/to the user buffer, treated as an ASCII string. The values + * are treated as milliseconds, and converted to jiffies when they are stored. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. + */ +int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, + struct file *filp, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return do_proc_doulongvec_minmax(table, write, filp, buffer, + lenp, ppos, HZ, 1000l); +} + + +static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + if (*lvalp > LONG_MAX / HZ) + return 1; + *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); + } else { + int val = *valp; + unsigned long lval; + if (val < 0) { + *negp = -1; + lval = (unsigned long)-val; + } else { + *negp = 0; + lval = (unsigned long)val; + } + *lvalp = lval / HZ; + } + return 0; +} + +static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ) + return 1; + *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); + } else { + int val = *valp; + unsigned long lval; + if (val < 0) { + *negp = -1; + lval = (unsigned long)-val; + } else { + *negp = 0; + lval = (unsigned long)val; + } + *lvalp = jiffies_to_clock_t(lval); + } + return 0; +} + +static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + *valp = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); + } else { + int val = *valp; + unsigned long lval; + if (val < 0) { + *negp = -1; + lval = (unsigned long)-val; + } else { + *negp = 0; + lval = (unsigned long)val; + } + *lvalp = jiffies_to_msecs(lval); + } + return 0; +} + +/** + * proc_dointvec_jiffies - read a vector of integers as seconds + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in seconds, and are converted into + * jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + do_proc_dointvec_jiffies_conv,NULL); +} + +/** + * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: pointer to the file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/USER_HZ seconds, and + * are converted into jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + do_proc_dointvec_userhz_jiffies_conv,NULL); +} + +/** + * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * @ppos: the current position in the file + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/1000 seconds, and + * are converted into jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, + do_proc_dointvec_ms_jiffies_conv, NULL); +} + +#else /* CONFIG_PROC_FS */ + +int proc_dostring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +static int proc_doutsstring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_doulonglongvec_minmax(ctl_table *table, int write, + struct file *filp, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} +int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, + struct file *filp, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + + +#endif /* CONFIG_PROC_FS */ + + +/* + * General sysctl support routines + */ + +/* The generic string strategy routine: */ +int sysctl_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + if (!table->data || !table->maxlen) + return -ENOTDIR; + + if (oldval && oldlenp) { + size_t bufsize; + if (get_user(bufsize, oldlenp)) + return -EFAULT; + if (bufsize) { + size_t len = strlen(table->data), copied; + + /* This shouldn't trigger for a well-formed sysctl */ + if (len > table->maxlen) + len = table->maxlen; + + /* Copy up to a max of bufsize-1 bytes of the string */ + copied = (len >= bufsize) ? bufsize - 1 : len; + + if (copy_to_user(oldval, table->data, copied) || + put_user(0, (char __user *)(oldval + copied))) + return -EFAULT; + if (put_user(len, oldlenp)) + return -EFAULT; + } + } + if (newval && newlen) { + size_t len = newlen; + if (len > table->maxlen) + len = table->maxlen; + if(copy_from_user(table->data, newval, len)) + return -EFAULT; + if (len == table->maxlen) + len--; + ((char *) table->data)[len] = 0; + } + return 1; +} + +/* + * This function makes sure that all of the integers in the vector + * are between the minimum and maximum values given in the arrays + * table->extra1 and table->extra2, respectively. + */ +int sysctl_intvec(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + + if (newval && newlen) { + int __user *vec = (int __user *) newval; + int *min = (int *) table->extra1; + int *max = (int *) table->extra2; + size_t length; + int i; + + if (newlen % sizeof(int) != 0) + return -EINVAL; + + if (!table->extra1 && !table->extra2) + return 0; + + if (newlen > table->maxlen) + newlen = table->maxlen; + length = newlen / sizeof(int); + + for (i = 0; i < length; i++) { + int value; + if (get_user(value, vec + i)) + return -EFAULT; + if (min && value < min[i]) + return -EINVAL; + if (max && value > max[i]) + return -EINVAL; + } + } + return 0; +} + +/* Strategy function to convert jiffies to seconds */ +int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + if (oldval) { + size_t olen; + if (oldlenp) { + if (get_user(olen, oldlenp)) + return -EFAULT; + if (olen!=sizeof(int)) + return -EINVAL; + } + if (put_user(*(int *)(table->data)/HZ, (int __user *)oldval) || + (oldlenp && put_user(sizeof(int),oldlenp))) + return -EFAULT; + } + if (newval && newlen) { + int new; + if (newlen != sizeof(int)) + return -EINVAL; + if (get_user(new, (int __user *)newval)) + return -EFAULT; + *(int *)(table->data) = new*HZ; + } + return 1; +} + +/* Strategy function to convert jiffies to seconds */ +int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + if (oldval) { + size_t olen; + if (oldlenp) { + if (get_user(olen, oldlenp)) + return -EFAULT; + if (olen!=sizeof(int)) + return -EINVAL; + } + if (put_user(jiffies_to_msecs(*(int *)(table->data)), (int __user *)oldval) || + (oldlenp && put_user(sizeof(int),oldlenp))) + return -EFAULT; + } + if (newval && newlen) { + int new; + if (newlen != sizeof(int)) + return -EINVAL; + if (get_user(new, (int __user *)newval)) + return -EFAULT; + *(int *)(table->data) = msecs_to_jiffies(new); + } + return 1; +} + +#else /* CONFIG_SYSCTL */ + + +asmlinkage long sys_sysctl(struct __sysctl_args __user *args) +{ + return -ENOSYS; +} + +int sysctl_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + return -ENOSYS; +} + +int sysctl_intvec(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + return -ENOSYS; +} + +int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + return -ENOSYS; +} + +int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) +{ + return -ENOSYS; +} + +int proc_dostring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_doulonglongvec_minmax(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, + struct file *filp, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +struct ctl_table_header * register_sysctl_table(ctl_table * table, + int insert_at_head) +{ + return NULL; +} + +void unregister_sysctl_table(struct ctl_table_header * table) +{ +} + +#endif /* CONFIG_SYSCTL */ + +/* + * No sense putting this after each symbol definition, twice, + * exception granted :-) + */ +EXPORT_SYMBOL(proc_dointvec); +EXPORT_SYMBOL(proc_dointvec_jiffies); +EXPORT_SYMBOL(proc_dointvec_minmax); +EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); +EXPORT_SYMBOL(proc_dointvec_ms_jiffies); +EXPORT_SYMBOL(proc_dostring); +EXPORT_SYMBOL(proc_doulongvec_minmax); +EXPORT_SYMBOL(proc_doulonglongvec_minmax); +EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); +EXPORT_SYMBOL(register_sysctl_table); +EXPORT_SYMBOL(sysctl_intvec); +EXPORT_SYMBOL(sysctl_jiffies); +EXPORT_SYMBOL(sysctl_ms_jiffies); +EXPORT_SYMBOL(sysctl_string); +EXPORT_SYMBOL(unregister_sysctl_table); diff -urN oldtree/mm/page_alloc.c newtree/mm/page_alloc.c --- oldtree/mm/page_alloc.c 2006-03-08 18:48:03.008067000 +0000 +++ newtree/mm/page_alloc.c 2006-03-08 20:52:39.007287750 +0000 @@ -1523,6 +1523,7 @@ " min:%lukB" " low:%lukB" " high:%lukB" + " lots:%lukB" " active:%lukB" " inactive:%lukB" " present:%lukB" @@ -1534,6 +1535,7 @@ K(zone->pages_min), K(zone->pages_low), K(zone->pages_high), + K(zone->pages_lots), K(zone->nr_active), K(zone->nr_inactive), K(zone->present_pages), @@ -2317,6 +2319,7 @@ "\n min %lu" "\n low %lu" "\n high %lu" + "\n lots %lu" "\n active %lu" "\n inactive %lu" "\n scanned %lu (a: %lu i: %lu)" @@ -2326,6 +2329,7 @@ zone->pages_min, zone->pages_low, zone->pages_high, + zone->pages_lots, zone->nr_active, zone->nr_inactive, zone->pages_scanned, @@ -2628,6 +2632,7 @@ zone->pages_low = zone->pages_min + tmp / 4; zone->pages_high = zone->pages_min + tmp / 2; + zone->pages_lots = zone->pages_min + tmp; spin_unlock_irqrestore(&zone->lru_lock, flags); } } diff -urN oldtree/mm/page_alloc.c.orig newtree/mm/page_alloc.c.orig --- oldtree/mm/page_alloc.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ newtree/mm/page_alloc.c.orig 2006-03-08 18:48:03.000000000 +0000 @@ -0,0 +1,2862 @@ +/* + * linux/mm/page_alloc.c + * + * Manages the free list, the system allocates free pages here. + * Note that kmalloc() lives in slab.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 + * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 + * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 + * (lots of bits borrowed from Ingo Molnar & Andrew Morton) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "internal.h" + +/* + * MCD - HACK: Find somewhere to initialize this EARLY, or make this + * initializer cleaner + */ +nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; +EXPORT_SYMBOL(node_online_map); +nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; +EXPORT_SYMBOL(node_possible_map); +unsigned long totalram_pages __read_mostly; +unsigned long totalhigh_pages __read_mostly; +long nr_swap_pages; +int percpu_pagelist_fraction; + +static void __free_pages_ok(struct page *page, unsigned int order); + +/* + * results with 256, 32 in the lowmem_reserve sysctl: + * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) + * 1G machine -> (16M dma, 784M normal, 224M high) + * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA + * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL + * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA + * + * TBD: should special case ZONE_DMA32 machines here - in those we normally + * don't need any ZONE_NORMAL reservation + */ +int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 }; + +EXPORT_SYMBOL(totalram_pages); + +/* + * Used by page_zone() to look up the address of the struct zone whose + * id is encoded in the upper bits of page->flags + */ +struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; +EXPORT_SYMBOL(zone_table); + +static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; +int min_free_kbytes = 1024; + +unsigned long __initdata nr_kernel_pages; +unsigned long __initdata nr_all_pages; + +#ifdef CONFIG_DEBUG_VM +static int page_outside_zone_boundaries(struct zone *zone, struct page *page) +{ + int ret = 0; + unsigned seq; + unsigned long pfn = page_to_pfn(page); + + do { + seq = zone_span_seqbegin(zone); + if (pfn >= zone->zone_start_pfn + zone->spanned_pages) + ret = 1; + else if (pfn < zone->zone_start_pfn) + ret = 1; + } while (zone_span_seqretry(zone, seq)); + + return ret; +} + +static int page_is_consistent(struct zone *zone, struct page *page) +{ +#ifdef CONFIG_HOLES_IN_ZONE + if (!pfn_valid(page_to_pfn(page))) + return 0; +#endif + if (zone != page_zone(page)) + return 0; + + return 1; +} +/* + * Temporary debugging check for pages not lying within a given zone. + */ +static int bad_range(struct zone *zone, struct page *page) +{ + if (page_outside_zone_boundaries(zone, page)) + return 1; + if (!page_is_consistent(zone, page)) + return 1; + + return 0; +} + +#else +static inline int bad_range(struct zone *zone, struct page *page) +{ + return 0; +} +#endif + +static void bad_page(struct page *page) +{ + printk(KERN_EMERG "Bad page state in process '%s'\n" + KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" + KERN_EMERG "Trying to fix it up, but a reboot is needed\n" + KERN_EMERG "Backtrace:\n", + current->comm, page, (int)(2*sizeof(unsigned long)), + (unsigned long)page->flags, page->mapping, + page_mapcount(page), page_count(page)); + dump_stack(); + { + int i; + unsigned char *ptr = (unsigned char *)page; + ptr -= 64; + + printk(KERN_EMERG "Hexdump:"); + for (i=0;i<192;i++) { + if ((i%16) == 0) { + printk("\n"); + printk(KERN_EMERG "%03x:", i); + } + printk(" %02x", ptr[i]); + } + printk("\n"); + } + page->flags &= ~(1 << PG_lru | + 1 << PG_private | + 1 << PG_locked | + 1 << PG_active | + 1 << PG_dirty | + 1 << PG_reclaim | + 1 << PG_slab | + 1 << PG_swapcache | + 1 << PG_writeback ); + set_page_count(page, 0); + reset_page_mapcount(page); + page->mapping = NULL; + add_taint(TAINT_BAD_PAGE); +} + +/* + * Higher-order pages are called "compound pages". They are structured thusly: + * + * The first PAGE_SIZE page is called the "head page". + * + * The remaining PAGE_SIZE pages are called "tail pages". + * + * All pages have PG_compound set. All pages have their ->private pointing at + * the head page (even the head page has this). + * + * The first tail page's ->lru.next holds the address of the compound page's + * put_page() function. Its ->lru.prev holds the order of allocation. + * This usage means that zero-order pages may not be compound. + */ + +static void free_compound_page(struct page *page) +{ + __free_pages_ok(page, (unsigned long)page[1].lru.prev); +} + +static void prep_compound_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + + page[1].lru.next = (void *)free_compound_page; /* set dtor */ + page[1].lru.prev = (void *)order; + for (i = 0; i < nr_pages; i++) { + struct page *p = page + i; + + __SetPageCompound(p); + set_page_private(p, (unsigned long)page); + } +} + +static void destroy_compound_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + + if (unlikely((unsigned long)page[1].lru.prev != order)) + bad_page(page); + + for (i = 0; i < nr_pages; i++) { + struct page *p = page + i; + + if (unlikely(!PageCompound(p) | + (page_private(p) != (unsigned long)page))) + bad_page(page); + __ClearPageCompound(p); + } +} + +static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) +{ + int i; + + BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); + /* + * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO + * and __GFP_HIGHMEM from hard or soft interrupt context. + */ + BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); +} + +/* + * function for dealing with page's order in buddy system. + * zone->lock is already acquired when we use these. + * So, we don't need atomic page->flags operations here. + */ +static inline unsigned long page_order(struct page *page) { + return page_private(page); +} + +static inline void set_page_order(struct page *page, int order) { + set_page_private(page, order); + __SetPagePrivate(page); +} + +static inline void rmv_page_order(struct page *page) +{ + __ClearPagePrivate(page); + set_page_private(page, 0); +} + +/* + * Locate the struct page for both the matching buddy in our + * pair (buddy1) and the combined O(n+1) page they form (page). + * + * 1) Any buddy B1 will have an order O twin B2 which satisfies + * the following equation: + * B2 = B1 ^ (1 << O) + * For example, if the starting buddy (buddy2) is #8 its order + * 1 buddy is #10: + * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 + * + * 2) Any buddy B will have an order O+1 parent P which + * satisfies the following equation: + * P = B & ~(1 << O) + * + * Assumption: *_mem_map is contigious at least up to MAX_ORDER + */ +static inline struct page * +__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) +{ + unsigned long buddy_idx = page_idx ^ (1 << order); + + return page + (buddy_idx - page_idx); +} + +static inline unsigned long +__find_combined_index(unsigned long page_idx, unsigned int order) +{ + return (page_idx & ~(1 << order)); +} + +/* + * This function checks whether a page is free && is the buddy + * we can do coalesce a page and its buddy if + * (a) the buddy is not in a hole && + * (b) the buddy is free && + * (c) the buddy is on the buddy system && + * (d) a page and its buddy have the same order. + * for recording page's order, we use page_private(page) and PG_private. + * + */ +static inline int page_is_buddy(struct page *page, int order) +{ +#ifdef CONFIG_HOLES_IN_ZONE + if (!pfn_valid(page_to_pfn(page))) + return 0; +#endif + + if (PagePrivate(page) && + (page_order(page) == order) && + page_count(page) == 0) + return 1; + return 0; +} + +/* + * Freeing function for a buddy system allocator. + * + * The concept of a buddy system is to maintain direct-mapped table + * (containing bit values) for memory blocks of various "orders". + * The bottom level table contains the map for the smallest allocatable + * units of memory (here, pages), and each level above it describes + * pairs of units from the levels below, hence, "buddies". + * At a high level, all that happens here is marking the table entry + * at the bottom level available, and propagating the changes upward + * as necessary, plus some accounting needed to play nicely with other + * parts of the VM system. + * At each level, we keep a list of pages, which are heads of continuous + * free pages of length of (1 << order) and marked with PG_Private.Page's + * order is recorded in page_private(page) field. + * So when we are allocating or freeing one, we can derive the state of the + * other. That is, if we allocate a small block, and both were + * free, the remainder of the region must be split into blocks. + * If a block is freed, and its buddy is also free, then this + * triggers coalescing into a block of larger size. + * + * -- wli + */ + +static inline void __free_one_page(struct page *page, + struct zone *zone, unsigned int order) +{ + unsigned long page_idx; + int order_size = 1 << order; + + if (unlikely(PageCompound(page))) + destroy_compound_page(page, order); + + page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); + + BUG_ON(page_idx & (order_size - 1)); + BUG_ON(bad_range(zone, page)); + + zone->free_pages += order_size; + while (order < MAX_ORDER-1) { + unsigned long combined_idx; + struct free_area *area; + struct page *buddy; + + buddy = __page_find_buddy(page, page_idx, order); + if (!page_is_buddy(buddy, order)) + break; /* Move the buddy up one level. */ + + list_del(&buddy->lru); + area = zone->free_area + order; + area->nr_free--; + rmv_page_order(buddy); + combined_idx = __find_combined_index(page_idx, order); + page = page + (combined_idx - page_idx); + page_idx = combined_idx; + order++; + } + set_page_order(page, order); + list_add(&page->lru, &zone->free_area[order].free_list); + zone->free_area[order].nr_free++; +} + +static inline int free_pages_check(struct page *page) +{ + if (unlikely(page_mapcount(page) | + (page->mapping != NULL) | + (page_count(page) != 0) | + (page->flags & ( + 1 << PG_lru | + 1 << PG_private | + 1 << PG_locked | + 1 << PG_active | + 1 << PG_reclaim | + 1 << PG_slab | + 1 << PG_swapcache | + 1 << PG_writeback | + 1 << PG_reserved )))) + bad_page(page); + if (PageDirty(page)) + __ClearPageDirty(page); + /* + * For now, we report if PG_reserved was found set, but do not + * clear it, and do not free the page. But we shall soon need + * to do more, for when the ZERO_PAGE count wraps negative. + */ + return PageReserved(page); +} + +/* + * Frees a list of pages. + * Assumes all pages on list are in same zone, and of same order. + * count is the number of pages to free. + * + * If the zone was previously in an "all pages pinned" state then look to + * see if this freeing clears that state. + * + * And clear the zone's pages_scanned counter, to hold off the "all pages are + * pinned" detection logic. + */ +static void free_pages_bulk(struct zone *zone, int count, + struct list_head *list, int order) +{ + spin_lock(&zone->lock); + zone->all_unreclaimable = 0; + zone->pages_scanned = 0; + while (count--) { + struct page *page; + + BUG_ON(list_empty(list)); + page = list_entry(list->prev, struct page, lru); + /* have to delete it as __free_one_page list manipulates */ + list_del(&page->lru); + __free_one_page(page, zone, order); + } + spin_unlock(&zone->lock); +} + +static void free_one_page(struct zone *zone, struct page *page, int order) +{ + LIST_HEAD(list); + list_add(&page->lru, &list); + free_pages_bulk(zone, 1, &list, order); +} + +static void __free_pages_ok(struct page *page, unsigned int order) +{ + unsigned long flags; + int i; + int reserved = 0; + + arch_free_page(page, order); + if (!PageHighMem(page)) + mutex_debug_check_no_locks_freed(page_address(page), + PAGE_SIZE< low) { + area--; + high--; + size >>= 1; + BUG_ON(bad_range(zone, &page[size])); + list_add(&page[size].lru, &area->free_list); + area->nr_free++; + set_page_order(&page[size], high); + } +} + +/* + * This page is about to be returned from the page allocator + */ +static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +{ + if (unlikely(page_mapcount(page) | + (page->mapping != NULL) | + (page_count(page) != 0) | + (page->flags & ( + 1 << PG_lru | + 1 << PG_private | + 1 << PG_locked | + 1 << PG_active | + 1 << PG_dirty | + 1 << PG_reclaim | + 1 << PG_slab | + 1 << PG_swapcache | + 1 << PG_writeback | + 1 << PG_reserved )))) + bad_page(page); + + /* + * For now, we report if PG_reserved was found set, but do not + * clear it, and do not allocate the page: as a safety net. + */ + if (PageReserved(page)) + return 1; + + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | + 1 << PG_referenced | 1 << PG_arch_1 | + 1 << PG_checked | 1 << PG_mappedtodisk); + set_page_private(page, 0); + set_page_refcounted(page); + kernel_map_pages(page, 1 << order, 1); + + if (gfp_flags & __GFP_ZERO) + prep_zero_page(page, order, gfp_flags); + + if (order && (gfp_flags & __GFP_COMP)) + prep_compound_page(page, order); + + return 0; +} + +/* + * Do the hard work of removing an element from the buddy allocator. + * Call me with the zone->lock already held. + */ +static struct page *__rmqueue(struct zone *zone, unsigned int order) +{ + struct free_area * area; + unsigned int current_order; + struct page *page; + + for (current_order = order; current_order < MAX_ORDER; ++current_order) { + area = zone->free_area + current_order; + if (list_empty(&area->free_list)) + continue; + + page = list_entry(area->free_list.next, struct page, lru); + list_del(&page->lru); + rmv_page_order(page); + area->nr_free--; + zone->free_pages -= 1UL << order; + expand(zone, page, order, current_order, area); + return page; + } + + return NULL; +} + +/* + * Obtain a specified number of elements from the buddy allocator, all under + * a single hold of the lock, for efficiency. Add them to the supplied list. + * Returns the number of new pages which were placed at *list. + */ +static int rmqueue_bulk(struct zone *zone, unsigned int order, + unsigned long count, struct list_head *list) +{ + int i; + + spin_lock(&zone->lock); + for (i = 0; i < count; ++i) { + struct page *page = __rmqueue(zone, order); + if (unlikely(page == NULL)) + break; + list_add_tail(&page->lru, list); + } + spin_unlock(&zone->lock); + return i; +} + +#ifdef CONFIG_NUMA +/* + * Called from the slab reaper to drain pagesets on a particular node that + * belong to the currently executing processor. + */ +void drain_node_pages(int nodeid) +{ + int i, z; + unsigned long flags; + + local_irq_save(flags); + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = NODE_DATA(nodeid)->node_zones + z; + struct per_cpu_pageset *pset; + + pset = zone_pcp(zone, smp_processor_id()); + for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { + struct per_cpu_pages *pcp; + + pcp = &pset->pcp[i]; + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; + } + } + local_irq_restore(flags); +} +#endif + +#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) +static void __drain_pages(unsigned int cpu) +{ + unsigned long flags; + struct zone *zone; + int i; + + for_each_zone(zone) { + struct per_cpu_pageset *pset; + + pset = zone_pcp(zone, cpu); + for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { + struct per_cpu_pages *pcp; + + pcp = &pset->pcp[i]; + local_irq_save(flags); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; + local_irq_restore(flags); + } + } +} +#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ + +#ifdef CONFIG_PM + +void mark_free_pages(struct zone *zone) +{ + unsigned long zone_pfn, flags; + int order; + struct list_head *curr; + + if (!zone->spanned_pages) + return; + + spin_lock_irqsave(&zone->lock, flags); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); + + for (order = MAX_ORDER - 1; order >= 0; --order) + list_for_each(curr, &zone->free_area[order].free_list) { + unsigned long start_pfn, i; + + start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); + + for (i=0; i < (1<lock, flags); +} + +/* + * Spill all of this CPU's per-cpu pages back into the buddy allocator. + */ +void drain_local_pages(void) +{ + unsigned long flags; + + local_irq_save(flags); + __drain_pages(smp_processor_id()); + local_irq_restore(flags); +} +#endif /* CONFIG_PM */ + +static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu) +{ +#ifdef CONFIG_NUMA + pg_data_t *pg = z->zone_pgdat; + pg_data_t *orig = zonelist->zones[0]->zone_pgdat; + struct per_cpu_pageset *p; + + p = zone_pcp(z, cpu); + if (pg == orig) { + p->numa_hit++; + } else { + p->numa_miss++; + zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; + } + if (pg == NODE_DATA(numa_node_id())) + p->local_node++; + else + p->other_node++; +#endif +} + +/* + * Free a 0-order page + */ +static void fastcall free_hot_cold_page(struct page *page, int cold) +{ + struct zone *zone = page_zone(page); + struct per_cpu_pages *pcp; + unsigned long flags; + + arch_free_page(page, 0); + + if (PageAnon(page)) + page->mapping = NULL; + if (free_pages_check(page)) + return; + + kernel_map_pages(page, 1, 0); + + pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; + local_irq_save(flags); + __inc_page_state(pgfree); + list_add(&page->lru, &pcp->list); + pcp->count++; + if (pcp->count >= pcp->high) { + free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + pcp->count -= pcp->batch; + } + local_irq_restore(flags); + put_cpu(); +} + +void fastcall free_hot_page(struct page *page) +{ + free_hot_cold_page(page, 0); +} + +void fastcall free_cold_page(struct page *page) +{ + free_hot_cold_page(page, 1); +} + +/* + * split_page takes a non-compound higher-order page, and splits it into + * n (1< 0 path. Saves a branch + * or two. + */ +static struct page *buffered_rmqueue(struct zonelist *zonelist, + struct zone *zone, int order, gfp_t gfp_flags) +{ + unsigned long flags; + struct page *page; + int cold = !!(gfp_flags & __GFP_COLD); + int cpu; + +again: + cpu = get_cpu(); + if (likely(order == 0)) { + struct per_cpu_pages *pcp; + + pcp = &zone_pcp(zone, cpu)->pcp[cold]; + local_irq_save(flags); + if (!pcp->count) { + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, &pcp->list); + if (unlikely(!pcp->count)) + goto failed; + } + page = list_entry(pcp->list.next, struct page, lru); + list_del(&page->lru); + pcp->count--; + } else { + spin_lock_irqsave(&zone->lock, flags); + page = __rmqueue(zone, order); + spin_unlock(&zone->lock); + if (!page) + goto failed; + } + + __mod_page_state_zone(zone, pgalloc, 1 << order); + zone_statistics(zonelist, zone, cpu); + local_irq_restore(flags); + put_cpu(); + + BUG_ON(bad_range(zone, page)); + if (prep_new_page(page, order, gfp_flags)) + goto again; + return page; + +failed: + local_irq_restore(flags); + put_cpu(); + return NULL; +} + +#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ +#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ +#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ +#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ +#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ + +/* + * Return 1 if free pages are above 'mark'. This takes into account the order + * of the allocation. + */ +int zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags) +{ + /* free_pages my go negative - that's OK */ + long min = mark, free_pages = z->free_pages - (1 << order) + 1; + int o; + + if (alloc_flags & ALLOC_HIGH) + min -= min / 2; + if (alloc_flags & ALLOC_HARDER) + min -= min / 4; + + if (free_pages <= min + z->lowmem_reserve[classzone_idx]) + return 0; + for (o = 0; o < order; o++) { + /* At the next order, this order's pages become unavailable */ + free_pages -= z->free_area[o].nr_free << o; + + /* Require fewer higher order pages to be free */ + min >>= 1; + + if (free_pages <= min) + return 0; + } + return 1; +} + +/* + * get_page_from_freeliest goes through the zonelist trying to allocate + * a page. + */ +static struct page * +get_page_from_freelist(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, int alloc_flags) +{ + struct zone **z = zonelist->zones; + struct page *page = NULL; + int classzone_idx = zone_idx(*z); + + /* + * Go through the zonelist once, looking for a zone with enough free. + * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + */ + do { + if ((alloc_flags & ALLOC_CPUSET) && + !cpuset_zone_allowed(*z, gfp_mask)) + continue; + + if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { + unsigned long mark; + if (alloc_flags & ALLOC_WMARK_MIN) + mark = (*z)->pages_min; + else if (alloc_flags & ALLOC_WMARK_LOW) + mark = (*z)->pages_low; + else + mark = (*z)->pages_high; + if (!zone_watermark_ok(*z, order, mark, + classzone_idx, alloc_flags)) + if (!zone_reclaim_mode || + !zone_reclaim(*z, gfp_mask, order)) + continue; + } + + page = buffered_rmqueue(zonelist, *z, order, gfp_mask); + if (page) { + break; + } + } while (*(++z) != NULL); + return page; +} + +#ifdef CONFIG_PAGE_OWNER +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) +{ + return p > (void *)tinfo && + p < (void *)tinfo + THREAD_SIZE - 3; +} + +static inline void __stack_trace(struct page *page, unsigned long *stack, + unsigned long bp) +{ + int i = 0; + unsigned long addr; + struct thread_info *tinfo = (struct thread_info *) + ((unsigned long)stack & (~(THREAD_SIZE - 1))); + + memset(page->trace, 0, sizeof(long) * 8); + +#ifdef CONFIG_FRAME_POINTER + while (valid_stack_ptr(tinfo, (void *)bp)) { + addr = *(unsigned long *)(bp + sizeof(long)); + page->trace[i] = addr; + if (++i >= 8) + break; + bp = *(unsigned long *)bp; + } +#else + while (valid_stack_ptr(tinfo, stack)) { + addr = *stack++; + if (__kernel_text_address(addr)) { + page->trace[i] = addr; + if (++i >= 8) + break; + } + } +#endif +} + +static inline void set_page_owner(struct page *page, + unsigned int order, unsigned int gfp_mask) +{ + unsigned long address, bp; +#ifdef CONFIG_X86_64 + asm ("movq %%rbp, %0" : "=r" (bp) : ); +#else + asm ("movl %%ebp, %0" : "=r" (bp) : ); +#endif + page->order = (int) order; + page->gfp_mask = gfp_mask; + __stack_trace(page, &address, bp); +} +#endif /* CONFIG_PAGE_OWNER */ + +/* + * This is the 'heart' of the zoned buddy allocator. + */ +struct page * fastcall +__alloc_pages(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist) +{ + const gfp_t wait = gfp_mask & __GFP_WAIT; + struct zone **z; + struct page *page; + struct reclaim_state reclaim_state; + struct task_struct *p = current; + int do_retry; + int alloc_flags; + int did_some_progress; + + might_sleep_if(wait); + +restart: + z = zonelist->zones; /* the list of zones suitable for gfp_mask */ + + if (unlikely(*z == NULL)) { + /* Should this ever happen?? */ + return NULL; + } + + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, + zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); + if (page) + goto got_pg; + + do { + if (cpuset_zone_allowed(*z, gfp_mask)) + wakeup_kswapd(*z, order); + } while (*(++z)); + + /* + * OK, we're below the kswapd watermark and have kicked background + * reclaim. Now things get more complex, so set up alloc_flags according + * to how we want to proceed. + * + * The caller may dip into page reserves a bit more if the caller + * cannot run direct reclaim, or if the caller has realtime scheduling + * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will + * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). + */ + alloc_flags = ALLOC_WMARK_MIN; + if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) + alloc_flags |= ALLOC_HARDER; + if (gfp_mask & __GFP_HIGH) + alloc_flags |= ALLOC_HIGH; + alloc_flags |= ALLOC_CPUSET; + + /* + * Go through the zonelist again. Let __GFP_HIGH and allocations + * coming from realtime tasks go deeper into reserves. + * + * This is the last chance, in general, before the goto nopage. + * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. + * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + */ + page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); + if (page) + goto got_pg; + + /* This allocation should allow future memory freeing. */ + + if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) + && !in_interrupt()) { + if (!(gfp_mask & __GFP_NOMEMALLOC)) { +nofail_alloc: + /* go through the zonelist yet again, ignoring mins */ + page = get_page_from_freelist(gfp_mask, order, + zonelist, ALLOC_NO_WATERMARKS); + if (page) + goto got_pg; + if (gfp_mask & __GFP_NOFAIL) { + blk_congestion_wait(WRITE, HZ/50); + goto nofail_alloc; + } + } + goto nopage; + } + + /* Atomic allocations - we can't balance anything */ + if (!wait) + goto nopage; + +rebalance: + cond_resched(); + + /* We now go into synchronous reclaim */ + cpuset_memory_pressure_bump(); + p->flags |= PF_MEMALLOC; + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + + did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); + + p->reclaim_state = NULL; + p->flags &= ~PF_MEMALLOC; + + cond_resched(); + + if (likely(did_some_progress)) { + page = get_page_from_freelist(gfp_mask, order, + zonelist, alloc_flags); + if (page) + goto got_pg; + } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { + /* + * Go through the zonelist yet one more time, keep + * very high watermark here, this is only to catch + * a parallel oom killing, we must fail if we're still + * under heavy pressure. + */ + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, + zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); + if (page) + goto got_pg; + + out_of_memory(zonelist, gfp_mask, order); + goto restart; + } + + /* + * Don't let big-order allocations loop unless the caller explicitly + * requests that. Wait for some write requests to complete then retry. + * + * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order + * <= 3, but that may not be true in other implementations. + */ + do_retry = 0; + if (!(gfp_mask & __GFP_NORETRY)) { + if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) + do_retry = 1; + if (gfp_mask & __GFP_NOFAIL) + do_retry = 1; + } + if (do_retry) { + blk_congestion_wait(WRITE, HZ/50); + goto rebalance; + } + +nopage: + if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { + printk(KERN_WARNING "%s: page allocation failure." + " order:%d, mode:0x%x\n", + p->comm, order, gfp_mask); + dump_stack(); + show_mem(); + } +got_pg: +#ifdef CONFIG_PAGE_OWNER + if (page) + set_page_owner(page, order, gfp_mask); +#endif + return page; +} + +EXPORT_SYMBOL(__alloc_pages); + +/* + * Common helper functions. + */ +fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) +{ + struct page * page; + page = alloc_pages(gfp_mask, order); + if (!page) + return 0; + return (unsigned long) page_address(page); +} + +EXPORT_SYMBOL(__get_free_pages); + +fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) +{ + struct page * page; + + /* + * get_zeroed_page() returns a 32-bit address, which cannot represent + * a highmem page + */ + BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); + + page = alloc_pages(gfp_mask | __GFP_ZERO, 0); + if (page) + return (unsigned long) page_address(page); + return 0; +} + +EXPORT_SYMBOL(get_zeroed_page); + +void __pagevec_free(struct pagevec *pvec) +{ + int i = pagevec_count(pvec); + + while (--i >= 0) + free_hot_cold_page(pvec->pages[i], pvec->cold); +} + +fastcall void __free_pages(struct page *page, unsigned int order) +{ + if (put_page_testzero(page)) { + if (order == 0) + free_hot_page(page); + else + __free_pages_ok(page, order); +#ifdef CONFIG_PAGE_OWNER + page->order = -1; +#endif + } +} + +EXPORT_SYMBOL(__free_pages); + +fastcall void free_pages(unsigned long addr, unsigned int order) +{ + if (addr != 0) { + BUG_ON(!virt_addr_valid((void *)addr)); + __free_pages(virt_to_page((void *)addr), order); + } +} + +EXPORT_SYMBOL(free_pages); + +/* + * Total amount of free (allocatable) RAM: + */ +unsigned int nr_free_pages(void) +{ + unsigned int sum = 0; + struct zone *zone; + + for_each_zone(zone) + sum += zone->free_pages; + + return sum; +} + +EXPORT_SYMBOL(nr_free_pages); + +#ifdef CONFIG_NUMA +unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) +{ + unsigned int i, sum = 0; + + for (i = 0; i < MAX_NR_ZONES; i++) + sum += pgdat->node_zones[i].free_pages; + + return sum; +} +#endif + +static unsigned int nr_free_zone_pages(int offset) +{ + /* Just pick one node, since fallback list is circular */ + pg_data_t *pgdat = NODE_DATA(numa_node_id()); + unsigned int sum = 0; + + struct zonelist *zonelist = pgdat->node_zonelists + offset; + struct zone **zonep = zonelist->zones; + struct zone *zone; + + for (zone = *zonep++; zone; zone = *zonep++) { + unsigned long size = zone->present_pages; + unsigned long high = zone->pages_high; + if (size > high) + sum += size - high; + } + + return sum; +} + +/* + * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL + */ +unsigned int nr_free_buffer_pages(void) +{ + return nr_free_zone_pages(gfp_zone(GFP_USER)); +} + +/* + * Amount of free RAM allocatable within all zones + */ +unsigned int nr_free_pagecache_pages(void) +{ + return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); +} + +#ifdef CONFIG_HIGHMEM +unsigned int nr_free_highpages (void) +{ + pg_data_t *pgdat; + unsigned int pages = 0; + + for_each_online_pgdat(pgdat) + pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; + + return pages; +} +#endif + +#ifdef CONFIG_NUMA +static void show_node(struct zone *zone) +{ + printk("Node %d ", zone->zone_pgdat->node_id); +} +#else +#define show_node(zone) do { } while (0) +#endif + +/* + * Accumulate the page_state information across all CPUs. + * The result is unavoidably approximate - it can change + * during and after execution of this function. + */ +static DEFINE_PER_CPU(struct page_state, page_states) = {0}; + +atomic_t nr_pagecache = ATOMIC_INIT(0); +EXPORT_SYMBOL(nr_pagecache); +#ifdef CONFIG_SMP +DEFINE_PER_CPU(long, nr_pagecache_local) = 0; +#endif + +static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) +{ + int cpu = 0; + + memset(ret, 0, nr * sizeof(unsigned long)); + cpus_and(*cpumask, *cpumask, cpu_online_map); + + cpu = first_cpu(*cpumask); + while (cpu < NR_CPUS) { + unsigned long *in, *out, off; + + if (!cpu_isset(cpu, *cpumask)) + continue; + + in = (unsigned long *)&per_cpu(page_states, cpu); + + cpu = next_cpu(cpu, *cpumask); + + if (likely(cpu < NR_CPUS)) + prefetch(&per_cpu(page_states, cpu)); + + out = (unsigned long *)ret; + for (off = 0; off < nr; off++) + *out++ += *in++; + } +} + +void get_page_state_node(struct page_state *ret, int node) +{ + int nr; + cpumask_t mask = node_to_cpumask(node); + + nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); + nr /= sizeof(unsigned long); + + __get_page_state(ret, nr+1, &mask); +} + +void get_page_state(struct page_state *ret) +{ + int nr; + cpumask_t mask = CPU_MASK_ALL; + + nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); + nr /= sizeof(unsigned long); + + __get_page_state(ret, nr + 1, &mask); +} + +void get_full_page_state(struct page_state *ret) +{ + cpumask_t mask = CPU_MASK_ALL; + + __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); +} + +unsigned long read_page_state_offset(unsigned long offset) +{ + unsigned long ret = 0; + int cpu; + + for_each_online_cpu(cpu) { + unsigned long in; + + in = (unsigned long)&per_cpu(page_states, cpu) + offset; + ret += *((unsigned long *)in); + } + return ret; +} + +void __mod_page_state_offset(unsigned long offset, unsigned long delta) +{ + void *ptr; + + ptr = &__get_cpu_var(page_states); + *(unsigned long *)(ptr + offset) += delta; +} +EXPORT_SYMBOL(__mod_page_state_offset); + +void mod_page_state_offset(unsigned long offset, unsigned long delta) +{ + unsigned long flags; + void *ptr; + + local_irq_save(flags); + ptr = &__get_cpu_var(page_states); + *(unsigned long *)(ptr + offset) += delta; + local_irq_restore(flags); +} +EXPORT_SYMBOL(mod_page_state_offset); + +void __get_zone_counts(unsigned long *active, unsigned long *inactive, + unsigned long *free, struct pglist_data *pgdat) +{ + struct zone *zones = pgdat->node_zones; + int i; + + *active = 0; + *inactive = 0; + *free = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { + *active += zones[i].nr_active; + *inactive += zones[i].nr_inactive; + *free += zones[i].free_pages; + } +} + +void get_zone_counts(unsigned long *active, + unsigned long *inactive, unsigned long *free) +{ + struct pglist_data *pgdat; + + *active = 0; + *inactive = 0; + *free = 0; + for_each_online_pgdat(pgdat) { + unsigned long l, m, n; + __get_zone_counts(&l, &m, &n, pgdat); + *active += l; + *inactive += m; + *free += n; + } +} + +void si_meminfo(struct sysinfo *val) +{ + val->totalram = totalram_pages; + val->sharedram = 0; + val->freeram = nr_free_pages(); + val->bufferram = nr_blockdev_pages(); +#ifdef CONFIG_HIGHMEM + val->totalhigh = totalhigh_pages; + val->freehigh = nr_free_highpages(); +#else + val->totalhigh = 0; + val->freehigh = 0; +#endif + val->mem_unit = PAGE_SIZE; +} + +EXPORT_SYMBOL(si_meminfo); + +#ifdef CONFIG_NUMA +void si_meminfo_node(struct sysinfo *val, int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + + val->totalram = pgdat->node_present_pages; + val->freeram = nr_free_pages_pgdat(pgdat); + val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; + val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; + val->mem_unit = PAGE_SIZE; +} +#endif + +#define K(x) ((x) << (PAGE_SHIFT-10)) + +/* + * Show free area list (used inside shift_scroll-lock stuff) + * We also calculate the percentage fragmentation. We do this by counting the + * memory on each free list with the exception of the first item on the list. + */ +void show_free_areas(void) +{ + struct page_state ps; + int cpu, temperature; + unsigned long active; + unsigned long inactive; + unsigned long free; + struct zone *zone; + + for_each_zone(zone) { + show_node(zone); + printk("%s per-cpu:", zone->name); + + if (!populated_zone(zone)) { + printk(" empty\n"); + continue; + } else + printk("\n"); + + for_each_online_cpu(cpu) { + struct per_cpu_pageset *pageset; + + pageset = zone_pcp(zone, cpu); + + for (temperature = 0; temperature < 2; temperature++) + printk("cpu %d %s: high %d, batch %d used:%d\n", + cpu, + temperature ? "cold" : "hot", + pageset->pcp[temperature].high, + pageset->pcp[temperature].batch, + pageset->pcp[temperature].count); + } + } + + get_page_state(&ps); + get_zone_counts(&active, &inactive, &free); + + printk("Free pages: %11ukB (%ukB HighMem)\n", + K(nr_free_pages()), + K(nr_free_highpages())); + + printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " + "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", + active, + inactive, + ps.nr_dirty, + ps.nr_writeback, + ps.nr_unstable, + nr_free_pages(), + ps.nr_slab, + ps.nr_mapped, + ps.nr_page_table_pages); + + for_each_zone(zone) { + int i; + + show_node(zone); + printk("%s" + " free:%lukB" + " min:%lukB" + " low:%lukB" + " high:%lukB" + " active:%lukB" + " inactive:%lukB" + " present:%lukB" + " pages_scanned:%lu" + " all_unreclaimable? %s" + "\n", + zone->name, + K(zone->free_pages), + K(zone->pages_min), + K(zone->pages_low), + K(zone->pages_high), + K(zone->nr_active), + K(zone->nr_inactive), + K(zone->present_pages), + zone->pages_scanned, + (zone->all_unreclaimable ? "yes" : "no") + ); + printk("lowmem_reserve[]:"); + for (i = 0; i < MAX_NR_ZONES; i++) + printk(" %lu", zone->lowmem_reserve[i]); + printk("\n"); + } + + for_each_zone(zone) { + unsigned long nr, flags, order, total = 0; + + show_node(zone); + printk("%s: ", zone->name); + if (!populated_zone(zone)) { + printk("empty\n"); + continue; + } + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + nr = zone->free_area[order].nr_free; + total += nr << order; + printk("%lu*%lukB ", nr, K(1UL) << order); + } + spin_unlock_irqrestore(&zone->lock, flags); + printk("= %lukB\n", K(total)); + } + + show_swap_cache_info(); +} + +/* + * Builds allocation fallback zone lists. + * + * Add all populated zones of a node to the zonelist. + */ +static int __init build_zonelists_node(pg_data_t *pgdat, + struct zonelist *zonelist, int nr_zones, int zone_type) +{ + struct zone *zone; + + BUG_ON(zone_type > ZONE_HIGHMEM); + + do { + zone = pgdat->node_zones + zone_type; + if (populated_zone(zone)) { +#ifndef CONFIG_HIGHMEM + BUG_ON(zone_type > ZONE_NORMAL); +#endif + zonelist->zones[nr_zones++] = zone; + check_highest_zone(zone_type); + } + zone_type--; + + } while (zone_type >= 0); + return nr_zones; +} + +static inline int highest_zone(int zone_bits) +{ + int res = ZONE_NORMAL; + if (zone_bits & (__force int)__GFP_HIGHMEM) + res = ZONE_HIGHMEM; + if (zone_bits & (__force int)__GFP_DMA32) + res = ZONE_DMA32; + if (zone_bits & (__force int)__GFP_DMA) + res = ZONE_DMA; + return res; +} + +#ifdef CONFIG_NUMA +#define MAX_NODE_LOAD (num_online_nodes()) +static int __initdata node_load[MAX_NUMNODES]; +/** + * find_next_best_node - find the next node that should appear in a given node's fallback list + * @node: node whose fallback list we're appending + * @used_node_mask: nodemask_t of already used nodes + * + * We use a number of factors to determine which is the next node that should + * appear on a given node's fallback list. The node should not have appeared + * already in @node's fallback list, and it should be the next closest node + * according to the distance array (which contains arbitrary distance values + * from each node to each node in the system), and should also prefer nodes + * with no CPUs, since presumably they'll have very little allocation pressure + * on them otherwise. + * It returns -1 if no node is found. + */ +static int __init find_next_best_node(int node, nodemask_t *used_node_mask) +{ + int n, val; + int min_val = INT_MAX; + int best_node = -1; + + /* Use the local node if we haven't already */ + if (!node_isset(node, *used_node_mask)) { + node_set(node, *used_node_mask); + return node; + } + + for_each_online_node(n) { + cpumask_t tmp; + + /* Don't want a node to appear more than once */ + if (node_isset(n, *used_node_mask)) + continue; + + /* Use the distance array to find the distance */ + val = node_distance(node, n); + + /* Penalize nodes under us ("prefer the next node") */ + val += (n < node); + + /* Give preference to headless and unused nodes */ + tmp = node_to_cpumask(n); + if (!cpus_empty(tmp)) + val += PENALTY_FOR_NODE_WITH_CPUS; + + /* Slight preference for less loaded node */ + val *= (MAX_NODE_LOAD*MAX_NUMNODES); + val += node_load[n]; + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + if (best_node >= 0) + node_set(best_node, *used_node_mask); + + return best_node; +} + +static void __init build_zonelists(pg_data_t *pgdat) +{ + int i, j, k, node, local_node; + int prev_node, load; + struct zonelist *zonelist; + nodemask_t used_mask; + + /* initialize zonelists */ + for (i = 0; i < GFP_ZONETYPES; i++) { + zonelist = pgdat->node_zonelists + i; + zonelist->zones[0] = NULL; + } + + /* NUMA-aware ordering of nodes */ + local_node = pgdat->node_id; + load = num_online_nodes(); + prev_node = local_node; + nodes_clear(used_mask); + while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { + int distance = node_distance(local_node, node); + + /* + * If another node is sufficiently far away then it is better + * to reclaim pages in a zone before going off node. + */ + if (distance > RECLAIM_DISTANCE) + zone_reclaim_mode = 1; + + /* + * We don't want to pressure a particular node. + * So adding penalty to the first node in same + * distance group to make it round-robin. + */ + + if (distance != node_distance(local_node, prev_node)) + node_load[node] += load; + prev_node = node; + load--; + for (i = 0; i < GFP_ZONETYPES; i++) { + zonelist = pgdat->node_zonelists + i; + for (j = 0; zonelist->zones[j] != NULL; j++); + + k = highest_zone(i); + + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + zonelist->zones[j] = NULL; + } + } +} + +#else /* CONFIG_NUMA */ + +static void __init build_zonelists(pg_data_t *pgdat) +{ + int i, j, k, node, local_node; + + local_node = pgdat->node_id; + for (i = 0; i < GFP_ZONETYPES; i++) { + struct zonelist *zonelist; + + zonelist = pgdat->node_zonelists + i; + + j = 0; + k = highest_zone(i); + j = build_zonelists_node(pgdat, zonelist, j, k); + /* + * Now we build the zonelist so that it contains the zones + * of all the other nodes. + * We don't want to pressure a particular node, so when + * building the zones for node N, we make sure that the + * zones coming right after the local ones are those from + * node N+1 (modulo N) + */ + for (node = local_node + 1; node < MAX_NUMNODES; node++) { + if (!node_online(node)) + continue; + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + } + for (node = 0; node < local_node; node++) { + if (!node_online(node)) + continue; + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + } + + zonelist->zones[j] = NULL; + } +} + +#endif /* CONFIG_NUMA */ + +void __init build_all_zonelists(void) +{ + int i; + + for_each_online_node(i) + build_zonelists(NODE_DATA(i)); + printk("Built %i zonelists\n", num_online_nodes()); + cpuset_init_current_mems_allowed(); +} + +/* + * Helper functions to size the waitqueue hash table. + * Essentially these want to choose hash table sizes sufficiently + * large so that collisions trying to wait on pages are rare. + * But in fact, the number of active page waitqueues on typical + * systems is ridiculously low, less than 200. So this is even + * conservative, even though it seems large. + * + * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to + * waitqueues, i.e. the size of the waitq table given the number of pages. + */ +#define PAGES_PER_WAITQUEUE 256 + +static inline unsigned long wait_table_size(unsigned long pages) +{ + unsigned long size = 1; + + pages /= PAGES_PER_WAITQUEUE; + + while (size < pages) + size <<= 1; + + /* + * Once we have dozens or even hundreds of threads sleeping + * on IO we've got bigger problems than wait queue collision. + * Limit the size of the wait table to a reasonable size. + */ + size = min(size, 4096UL); + + return max(size, 4UL); +} + +/* + * This is an integer logarithm so that shifts can be used later + * to extract the more random high bits from the multiplicative + * hash function before the remainder is taken. + */ +static inline unsigned long wait_table_bits(unsigned long size) +{ + return ffz(~size); +} + +#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) + +static void __init calculate_zone_totalpages(struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long *zholes_size) +{ + unsigned long realtotalpages, totalpages = 0; + int i; + + for (i = 0; i < MAX_NR_ZONES; i++) + totalpages += zones_size[i]; + pgdat->node_spanned_pages = totalpages; + + realtotalpages = totalpages; + if (zholes_size) + for (i = 0; i < MAX_NR_ZONES; i++) + realtotalpages -= zholes_size[i]; + pgdat->node_present_pages = realtotalpages; + printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); +} + + +/* + * Initially all pages are reserved - free ones are freed + * up by free_all_bootmem() once the early boot process is + * done. Non-atomic initialization, single-pass. + */ +void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, + unsigned long start_pfn) +{ + struct page *page; + unsigned long end_pfn = start_pfn + size; + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + if (!early_pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + set_page_links(page, zone, nid, pfn); + init_page_count(page); + reset_page_mapcount(page); + SetPageReserved(page); + INIT_LIST_HEAD(&page->lru); +#ifdef WANT_PAGE_VIRTUAL + /* The shift won't overflow because ZONE_NORMAL is below 4G. */ + if (!is_highmem_idx(zone)) + set_page_address(page, __va(pfn << PAGE_SHIFT)); +#endif +#ifdef CONFIG_PAGE_OWNER + page->order = -1; +#endif + } +} + +void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, + unsigned long size) +{ + int order; + for (order = 0; order < MAX_ORDER ; order++) { + INIT_LIST_HEAD(&zone->free_area[order].free_list); + zone->free_area[order].nr_free = 0; + } +} + +#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) +void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, + unsigned long size) +{ + unsigned long snum = pfn_to_section_nr(pfn); + unsigned long end = pfn_to_section_nr(pfn + size); + + if (FLAGS_HAS_NODE) + zone_table[ZONETABLE_INDEX(nid, zid)] = zone; + else + for (; snum <= end; snum++) + zone_table[ZONETABLE_INDEX(snum, zid)] = zone; +} + +#ifndef __HAVE_ARCH_MEMMAP_INIT +#define memmap_init(size, nid, zone, start_pfn) \ + memmap_init_zone((size), (nid), (zone), (start_pfn)) +#endif + +static int __cpuinit zone_batchsize(struct zone *zone) +{ + int batch; + + /* + * The per-cpu-pages pools are set to around 1000th of the + * size of the zone. But no more than 1/2 of a meg. + * + * OK, so we don't know how big the cache is. So guess. + */ + batch = zone->present_pages / 1024; + if (batch * PAGE_SIZE > 512 * 1024) + batch = (512 * 1024) / PAGE_SIZE; + batch /= 4; /* We effectively *= 4 below */ + if (batch < 1) + batch = 1; + + /* + * Clamp the batch to a 2^n - 1 value. Having a power + * of 2 value was found to be more likely to have + * suboptimal cache aliasing properties in some cases. + * + * For example if 2 tasks are alternately allocating + * batches of pages, one task can end up with a lot + * of pages of one half of the possible page colors + * and the other with pages of the other colors. + */ + batch = (1 << (fls(batch + batch/2)-1)) - 1; + + return batch; +} + +inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +{ + struct per_cpu_pages *pcp; + + memset(p, 0, sizeof(*p)); + + pcp = &p->pcp[0]; /* hot */ + pcp->count = 0; + pcp->high = 6 * batch; + pcp->batch = max(1UL, 1 * batch); + INIT_LIST_HEAD(&pcp->list); + + pcp = &p->pcp[1]; /* cold*/ + pcp->count = 0; + pcp->high = 2 * batch; + pcp->batch = max(1UL, batch/2); + INIT_LIST_HEAD(&pcp->list); +} + +/* + * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist + * to the value high for the pageset p. + */ + +static void setup_pagelist_highmark(struct per_cpu_pageset *p, + unsigned long high) +{ + struct per_cpu_pages *pcp; + + pcp = &p->pcp[0]; /* hot list */ + pcp->high = high; + pcp->batch = max(1UL, high/4); + if ((high/4) > (PAGE_SHIFT * 8)) + pcp->batch = PAGE_SHIFT * 8; +} + + +#ifdef CONFIG_NUMA +/* + * Boot pageset table. One per cpu which is going to be used for all + * zones and all nodes. The parameters will be set in such a way + * that an item put on a list will immediately be handed over to + * the buddy list. This is safe since pageset manipulation is done + * with interrupts disabled. + * + * Some NUMA counter updates may also be caught by the boot pagesets. + * + * The boot_pagesets must be kept even after bootup is complete for + * unused processors and/or zones. They do play a role for bootstrapping + * hotplugged processors. + * + * zoneinfo_show() and maybe other functions do + * not check if the processor is online before following the pageset pointer. + * Other parts of the kernel may not check if the zone is available. + */ +static struct per_cpu_pageset boot_pageset[NR_CPUS]; + +/* + * Dynamically allocate memory for the + * per cpu pageset array in struct zone. + */ +static int __cpuinit process_zones(int cpu) +{ + struct zone *zone, *dzone; + + for_each_zone(zone) { + + zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), + GFP_KERNEL, cpu_to_node(cpu)); + if (!zone_pcp(zone, cpu)) + goto bad; + + setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); + + if (percpu_pagelist_fraction) + setup_pagelist_highmark(zone_pcp(zone, cpu), + (zone->present_pages / percpu_pagelist_fraction)); + } + + return 0; +bad: + for_each_zone(dzone) { + if (dzone == zone) + break; + kfree(zone_pcp(dzone, cpu)); + zone_pcp(dzone, cpu) = NULL; + } + return -ENOMEM; +} + +static inline void free_zone_pagesets(int cpu) +{ + struct zone *zone; + + for_each_zone(zone) { + struct per_cpu_pageset *pset = zone_pcp(zone, cpu); + + zone_pcp(zone, cpu) = NULL; + kfree(pset); + } +} + +static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + int ret = NOTIFY_OK; + + switch (action) { + case CPU_UP_PREPARE: + if (process_zones(cpu)) + ret = NOTIFY_BAD; + break; + case CPU_UP_CANCELED: + case CPU_DEAD: + free_zone_pagesets(cpu); + break; + default: + break; + } + return ret; +} + +static struct notifier_block pageset_notifier = + { &pageset_cpuup_callback, NULL, 0 }; + +void __init setup_per_cpu_pageset(void) +{ + int err; + + /* Initialize per_cpu_pageset for cpu 0. + * A cpuup callback will do this for every cpu + * as it comes online + */ + err = process_zones(smp_processor_id()); + BUG_ON(err); + register_cpu_notifier(&pageset_notifier); +} + +#endif + +static __meminit +void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) +{ + int i; + struct pglist_data *pgdat = zone->zone_pgdat; + + /* + * The per-page waitqueue mechanism uses hashed waitqueues + * per zone. + */ + zone->wait_table_size = wait_table_size(zone_size_pages); + zone->wait_table_bits = wait_table_bits(zone->wait_table_size); + zone->wait_table = (wait_queue_head_t *) + alloc_bootmem_node(pgdat, zone->wait_table_size + * sizeof(wait_queue_head_t)); + + for(i = 0; i < zone->wait_table_size; ++i) + init_waitqueue_head(zone->wait_table + i); +} + +static __meminit void zone_pcp_init(struct zone *zone) +{ + int cpu; + unsigned long batch = zone_batchsize(zone); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { +#ifdef CONFIG_NUMA + /* Early boot. Slab allocator not functional yet */ + zone_pcp(zone, cpu) = &boot_pageset[cpu]; + setup_pageset(&boot_pageset[cpu],0); +#else + setup_pageset(zone_pcp(zone,cpu), batch); +#endif + } + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", + zone->name, zone->present_pages, batch); +} + +static __meminit void init_currently_empty_zone(struct zone *zone, + unsigned long zone_start_pfn, unsigned long size) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + + zone_wait_table_init(zone, size); + pgdat->nr_zones = zone_idx(zone) + 1; + + zone->zone_start_pfn = zone_start_pfn; + + memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); + + zone_init_free_lists(pgdat, zone, zone->spanned_pages); +} + +/* + * Set up the zone data structures: + * - mark all pages reserved + * - mark all memory queues empty + * - clear the memory bitmaps + */ +static void __init free_area_init_core(struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long *zholes_size) +{ + unsigned long j; + int nid = pgdat->node_id; + unsigned long zone_start_pfn = pgdat->node_start_pfn; + + pgdat_resize_init(pgdat); + pgdat->nr_zones = 0; + init_waitqueue_head(&pgdat->kswapd_wait); + pgdat->kswapd_max_order = 0; + + for (j = 0; j < MAX_NR_ZONES; j++) { + struct zone *zone = pgdat->node_zones + j; + unsigned long size, realsize; + + realsize = size = zones_size[j]; + if (zholes_size) + realsize -= zholes_size[j]; + + if (j < ZONE_HIGHMEM) + nr_kernel_pages += realsize; + nr_all_pages += realsize; + + zone->spanned_pages = size; + zone->present_pages = realsize; + zone->name = zone_names[j]; + spin_lock_init(&zone->lock); + spin_lock_init(&zone->lru_lock); + zone_seqlock_init(zone); + zone->zone_pgdat = pgdat; + zone->free_pages = 0; + + zone->temp_priority = zone->prev_priority = DEF_PRIORITY; + + zone_pcp_init(zone); + INIT_LIST_HEAD(&zone->active_list); + INIT_LIST_HEAD(&zone->inactive_list); + zone->nr_scan_active = 0; + zone->nr_scan_inactive = 0; + zone->nr_active = 0; + zone->nr_inactive = 0; + atomic_set(&zone->reclaim_in_progress, 0); + if (!size) + continue; + + zonetable_add(zone, nid, j, zone_start_pfn, size); + init_currently_empty_zone(zone, zone_start_pfn, size); + zone_start_pfn += size; + } +} + +static void __init alloc_node_mem_map(struct pglist_data *pgdat) +{ + /* Skip empty nodes */ + if (!pgdat->node_spanned_pages) + return; + +#ifdef CONFIG_FLAT_NODE_MEM_MAP + /* ia64 gets its own node_mem_map, before this, without bootmem */ + if (!pgdat->node_mem_map) { + unsigned long size; + struct page *map; + + size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); + map = alloc_remap(pgdat->node_id, size); + if (!map) + map = alloc_bootmem_node(pgdat, size); + pgdat->node_mem_map = map; + } +#ifdef CONFIG_FLATMEM + /* + * With no DISCONTIG, the global mem_map is just set as node 0's + */ + if (pgdat == NODE_DATA(0)) + mem_map = NODE_DATA(0)->node_mem_map; +#endif +#endif /* CONFIG_FLAT_NODE_MEM_MAP */ +} + +void __init free_area_init_node(int nid, struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long node_start_pfn, + unsigned long *zholes_size) +{ + pgdat->node_id = nid; + pgdat->node_start_pfn = node_start_pfn; + calculate_zone_totalpages(pgdat, zones_size, zholes_size); + + alloc_node_mem_map(pgdat); + + free_area_init_core(pgdat, zones_size, zholes_size); +} + +#ifndef CONFIG_NEED_MULTIPLE_NODES +static bootmem_data_t contig_bootmem_data; +struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; + +EXPORT_SYMBOL(contig_page_data); +#endif + +void __init free_area_init(unsigned long *zones_size) +{ + free_area_init_node(0, NODE_DATA(0), zones_size, + __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); +} + +#ifdef CONFIG_PROC_FS + +#include + +static void *frag_start(struct seq_file *m, loff_t *pos) +{ + pg_data_t *pgdat; + loff_t node = *pos; + for (pgdat = first_online_pgdat(); + pgdat && node; + pgdat = next_online_pgdat(pgdat)) + --node; + + return pgdat; +} + +static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + (*pos)++; + return next_online_pgdat(pgdat); +} + +static void frag_stop(struct seq_file *m, void *arg) +{ +} + +/* + * This walks the free areas for each zone. + */ +static int frag_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + int order; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!populated_zone(zone)) + continue; + + spin_lock_irqsave(&zone->lock, flags); + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].nr_free); + spin_unlock_irqrestore(&zone->lock, flags); + seq_putc(m, '\n'); + } + return 0; +} + +struct seq_operations fragmentation_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = frag_show, +}; + +/* + * Output information about zones in @pgdat. + */ +static int zoneinfo_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = arg; + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { + int i; + + if (!populated_zone(zone)) + continue; + + spin_lock_irqsave(&zone->lock, flags); + seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); + seq_printf(m, + "\n pages free %lu" + "\n min %lu" + "\n low %lu" + "\n high %lu" + "\n active %lu" + "\n inactive %lu" + "\n scanned %lu (a: %lu i: %lu)" + "\n spanned %lu" + "\n present %lu", + zone->free_pages, + zone->pages_min, + zone->pages_low, + zone->pages_high, + zone->nr_active, + zone->nr_inactive, + zone->pages_scanned, + zone->nr_scan_active, zone->nr_scan_inactive, + zone->spanned_pages, + zone->present_pages); + seq_printf(m, + "\n protection: (%lu", + zone->lowmem_reserve[0]); + for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) + seq_printf(m, ", %lu", zone->lowmem_reserve[i]); + seq_printf(m, + ")" + "\n pagesets"); + for_each_online_cpu(i) { + struct per_cpu_pageset *pageset; + int j; + + pageset = zone_pcp(zone, i); + for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { + if (pageset->pcp[j].count) + break; + } + if (j == ARRAY_SIZE(pageset->pcp)) + continue; + for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { + seq_printf(m, + "\n cpu: %i pcp: %i" + "\n count: %i" + "\n high: %i" + "\n batch: %i", + i, j, + pageset->pcp[j].count, + pageset->pcp[j].high, + pageset->pcp[j].batch); + } +#ifdef CONFIG_NUMA + seq_printf(m, + "\n numa_hit: %lu" + "\n numa_miss: %lu" + "\n numa_foreign: %lu" + "\n interleave_hit: %lu" + "\n local_node: %lu" + "\n other_node: %lu", + pageset->numa_hit, + pageset->numa_miss, + pageset->numa_foreign, + pageset->interleave_hit, + pageset->local_node, + pageset->other_node); +#endif + } + seq_printf(m, + "\n all_unreclaimable: %u" + "\n prev_priority: %i" + "\n temp_priority: %i" + "\n start_pfn: %lu", + zone->all_unreclaimable, + zone->prev_priority, + zone->temp_priority, + zone->zone_start_pfn); + spin_unlock_irqrestore(&zone->lock, flags); + seq_putc(m, '\n'); + } + return 0; +} + +struct seq_operations zoneinfo_op = { + .start = frag_start, /* iterate over all zones. The same as in + * fragmentation. */ + .next = frag_next, + .stop = frag_stop, + .show = zoneinfo_show, +}; + +static char *vmstat_text[] = { + "nr_dirty", + "nr_writeback", + "nr_unstable", + "nr_page_table_pages", + "nr_mapped", + "nr_slab", + + "pgpgin", + "pgpgout", + "pswpin", + "pswpout", + + "pgalloc_high", + "pgalloc_normal", + "pgalloc_dma32", + "pgalloc_dma", + + "pgfree", + "pgactivate", + "pgdeactivate", + + "pgfault", + "pgmajfault", + + "pgrefill_high", + "pgrefill_normal", + "pgrefill_dma32", + "pgrefill_dma", + + "pgsteal_high", + "pgsteal_normal", + "pgsteal_dma32", + "pgsteal_dma", + + "pgscan_kswapd_high", + "pgscan_kswapd_normal", + "pgscan_kswapd_dma32", + "pgscan_kswapd_dma", + + "pgscan_direct_high", + "pgscan_direct_normal", + "pgscan_direct_dma32", + "pgscan_direct_dma", + + "pginodesteal", + "slabs_scanned", + "kswapd_steal", + "kswapd_inodesteal", + "pageoutrun", + "allocstall", + + "pgrotated", + "nr_bounce", +}; + +static void *vmstat_start(struct seq_file *m, loff_t *pos) +{ + struct page_state *ps; + + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + + ps = kmalloc(sizeof(*ps), GFP_KERNEL); + m->private = ps; + if (!ps) + return ERR_PTR(-ENOMEM); + get_full_page_state(ps); + ps->pgpgin /= 2; /* sectors -> kbytes */ + ps->pgpgout /= 2; + return (unsigned long *)ps + *pos; +} + +static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) +{ + (*pos)++; + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + return (unsigned long *)m->private + *pos; +} + +static int vmstat_show(struct seq_file *m, void *arg) +{ + unsigned long *l = arg; + unsigned long off = l - (unsigned long *)m->private; + + seq_printf(m, "%s %lu\n", vmstat_text[off], *l); + return 0; +} + +static void vmstat_stop(struct seq_file *m, void *arg) +{ + kfree(m->private); + m->private = NULL; +} + +struct seq_operations vmstat_op = { + .start = vmstat_start, + .next = vmstat_next, + .stop = vmstat_stop, + .show = vmstat_show, +}; + +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_HOTPLUG_CPU +static int page_alloc_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (unsigned long)hcpu; + long *count; + unsigned long *src, *dest; + + if (action == CPU_DEAD) { + int i; + + /* Drain local pagecache count. */ + count = &per_cpu(nr_pagecache_local, cpu); + atomic_add(*count, &nr_pagecache); + *count = 0; + local_irq_disable(); + __drain_pages(cpu); + + /* Add dead cpu's page_states to our own. */ + dest = (unsigned long *)&__get_cpu_var(page_states); + src = (unsigned long *)&per_cpu(page_states, cpu); + + for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long); + i++) { + dest[i] += src[i]; + src[i] = 0; + } + + local_irq_enable(); + } + return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ + +void __init page_alloc_init(void) +{ + hotcpu_notifier(page_alloc_cpu_notify, 0); +} + +/* + * setup_per_zone_lowmem_reserve - called whenever + * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone + * has a correct pages reserved value, so an adequate number of + * pages are left in the zone after a successful __alloc_pages(). + */ +static void setup_per_zone_lowmem_reserve(void) +{ + struct pglist_data *pgdat; + int j, idx; + + for_each_online_pgdat(pgdat) { + for (j = 0; j < MAX_NR_ZONES; j++) { + struct zone *zone = pgdat->node_zones + j; + unsigned long present_pages = zone->present_pages; + + zone->lowmem_reserve[j] = 0; + + for (idx = j-1; idx >= 0; idx--) { + struct zone *lower_zone; + + if (sysctl_lowmem_reserve_ratio[idx] < 1) + sysctl_lowmem_reserve_ratio[idx] = 1; + + lower_zone = pgdat->node_zones + idx; + lower_zone->lowmem_reserve[j] = present_pages / + sysctl_lowmem_reserve_ratio[idx]; + present_pages += lower_zone->present_pages; + } + } + } +} + +/* + * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures + * that the pages_{min,low,high} values for each zone are set correctly + * with respect to min_free_kbytes. + */ +void setup_per_zone_pages_min(void) +{ + unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); + unsigned long lowmem_pages = 0; + struct zone *zone; + unsigned long flags; + + /* Calculate total number of !ZONE_HIGHMEM pages */ + for_each_zone(zone) { + if (!is_highmem(zone)) + lowmem_pages += zone->present_pages; + } + + for_each_zone(zone) { + unsigned long tmp; + spin_lock_irqsave(&zone->lru_lock, flags); + tmp = (pages_min * zone->present_pages) / lowmem_pages; + if (is_highmem(zone)) { + /* + * __GFP_HIGH and PF_MEMALLOC allocations usually don't + * need highmem pages, so cap pages_min to a small + * value here. + * + * The (pages_high-pages_low) and (pages_low-pages_min) + * deltas controls asynch page reclaim, and so should + * not be capped for highmem. + */ + int min_pages; + + min_pages = zone->present_pages / 1024; + if (min_pages < SWAP_CLUSTER_MAX) + min_pages = SWAP_CLUSTER_MAX; + if (min_pages > 128) + min_pages = 128; + zone->pages_min = min_pages; + } else { + /* + * If it's a lowmem zone, reserve a number of pages + * proportionate to the zone's size. + */ + zone->pages_min = tmp; + } + + zone->pages_low = zone->pages_min + tmp / 4; + zone->pages_high = zone->pages_min + tmp / 2; + spin_unlock_irqrestore(&zone->lru_lock, flags); + } +} + +/* + * Initialise min_free_kbytes. + * + * For small machines we want it small (128k min). For large machines + * we want it large (64MB max). But it is not linear, because network + * bandwidth does not increase linearly with machine size. We use + * + * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: + * min_free_kbytes = sqrt(lowmem_kbytes * 16) + * + * which yields + * + * 16MB: 512k + * 32MB: 724k + * 64MB: 1024k + * 128MB: 1448k + * 256MB: 2048k + * 512MB: 2896k + * 1024MB: 4096k + * 2048MB: 5792k + * 4096MB: 8192k + * 8192MB: 11584k + * 16384MB: 16384k + */ +static int __init init_per_zone_pages_min(void) +{ + unsigned long lowmem_kbytes; + + lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); + + min_free_kbytes = int_sqrt(lowmem_kbytes * 16); + if (min_free_kbytes < 128) + min_free_kbytes = 128; + if (min_free_kbytes > 65536) + min_free_kbytes = 65536; + setup_per_zone_pages_min(); + setup_per_zone_lowmem_reserve(); + return 0; +} +module_init(init_per_zone_pages_min) + +/* + * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so + * that we can call two helper functions whenever min_free_kbytes + * changes. + */ +int min_free_kbytes_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec(table, write, file, buffer, length, ppos); + setup_per_zone_pages_min(); + return 0; +} + +/* + * lowmem_reserve_ratio_sysctl_handler - just a wrapper around + * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() + * whenever sysctl_lowmem_reserve_ratio changes. + * + * The reserve ratio obviously has absolutely no relation with the + * pages_min watermarks. The lowmem reserve ratio can only make sense + * if in function of the boot time zone sizes. + */ +int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec_minmax(table, write, file, buffer, length, ppos); + setup_per_zone_lowmem_reserve(); + return 0; +} + +/* + * percpu_pagelist_fraction - changes the pcp->high for each zone on each + * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist + * can have before it gets flushed back to buddy allocator. + */ + +int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + unsigned int cpu; + int ret; + + ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (!write || (ret == -EINVAL)) + return ret; + for_each_zone(zone) { + for_each_online_cpu(cpu) { + unsigned long high; + high = zone->present_pages / percpu_pagelist_fraction; + setup_pagelist_highmark(zone_pcp(zone, cpu), high); + } + } + return 0; +} + +__initdata int hashdist = HASHDIST_DEFAULT; + +#ifdef CONFIG_NUMA +static int __init set_hashdist(char *str) +{ + if (!str) + return 0; + hashdist = simple_strtoul(str, &str, 0); + return 1; +} +__setup("hashdist=", set_hashdist); +#endif + +/* + * allocate a large system hash table from bootmem + * - it is assumed that the hash table must contain an exact power-of-2 + * quantity of entries + * - limit is the number of hash buckets, not the total allocation size + */ +void *__init alloc_large_system_hash(const char *tablename, + unsigned long bucketsize, + unsigned long numentries, + int scale, + int flags, + unsigned int *_hash_shift, + unsigned int *_hash_mask, + unsigned long limit) +{ + unsigned long long max = limit; + unsigned long log2qty, size; + void *table = NULL; + + /* allow the kernel cmdline to have a say */ + if (!numentries) { + /* round applicable memory size up to nearest megabyte */ + numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; + numentries += (1UL << (20 - PAGE_SHIFT)) - 1; + numentries >>= 20 - PAGE_SHIFT; + numentries <<= 20 - PAGE_SHIFT; + + /* limit to 1 bucket per 2^scale bytes of low memory */ + if (scale > PAGE_SHIFT) + numentries >>= (scale - PAGE_SHIFT); + else + numentries <<= (PAGE_SHIFT - scale); + } + /* rounded up to nearest power of 2 in size */ + numentries = 1UL << (long_log2(numentries) + 1); + + /* limit allocation size to 1/16 total memory by default */ + if (max == 0) { + max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; + do_div(max, bucketsize); + } + + if (numentries > max) + numentries = max; + + log2qty = long_log2(numentries); + + do { + size = bucketsize << log2qty; + if (flags & HASH_EARLY) + table = alloc_bootmem(size); + else if (hashdist) + table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); + else { + unsigned long order; + for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) + ; + table = (void*) __get_free_pages(GFP_ATOMIC, order); + } + } while (!table && size > PAGE_SIZE && --log2qty); + + if (!table) + panic("Failed to allocate %s hash table\n", tablename); + + printk("%s hash table entries: %d (order: %d, %lu bytes)\n", + tablename, + (1U << log2qty), + long_log2(size) - PAGE_SHIFT, + size); + + if (_hash_shift) + *_hash_shift = log2qty; + if (_hash_mask) + *_hash_mask = (1 << log2qty) - 1; + + return table; +} + +#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE +/* + * pfn <-> page translation. out-of-line version. + * (see asm-generic/memory_model.h) + */ +#if defined(CONFIG_FLATMEM) +struct page *pfn_to_page(unsigned long pfn) +{ + return mem_map + (pfn - ARCH_PFN_OFFSET); +} +unsigned long page_to_pfn(struct page *page) +{ + return (page - mem_map) + ARCH_PFN_OFFSET; +} +#elif defined(CONFIG_DISCONTIGMEM) +struct page *pfn_to_page(unsigned long pfn) +{ + int nid = arch_pfn_to_nid(pfn); + return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid); +} +unsigned long page_to_pfn(struct page *page) +{ + struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + return (page - pgdat->node_mem_map) + pgdat->node_start_pfn; +} +#elif defined(CONFIG_SPARSEMEM) +struct page *pfn_to_page(unsigned long pfn) +{ + return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn; +} + +unsigned long page_to_pfn(struct page *page) +{ + long section_id = page_to_section(page); + return page - __section_mem_map_addr(__nr_to_section(section_id)); +} +#endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */ +EXPORT_SYMBOL(pfn_to_page); +EXPORT_SYMBOL(page_to_pfn); +#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ diff -urN oldtree/mm/vmscan.c newtree/mm/vmscan.c --- oldtree/mm/vmscan.c 2006-03-08 18:48:03.020067750 +0000 +++ newtree/mm/vmscan.c 2006-03-08 20:52:42.027476500 +0000 @@ -107,10 +107,11 @@ #endif /* - * From 0 .. 100. Higher means more swappy. + * From 0 .. 100. Lower means more swappy. */ -int vm_swappiness = 60; -static long total_memory; +int vm_mapped __read_mostly = 66; +int vm_hardmaplimit __read_mostly = 1; +static long total_memory __read_mostly; static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); @@ -1231,10 +1232,14 @@ * The distress ratio is important - we don't want to start * going oom. * - * A 100% value of vm_swappiness overrides this algorithm - * altogether. + * This distress value is ignored if we apply a hardmaplimit except + * in extreme distress. + * + * A 0% value of vm_mapped overrides this algorithm altogether. */ - swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; + swap_tendency = mapped_ratio * 100 / (vm_mapped + 1); + if (!vm_hardmaplimit || distress == 100) + swap_tendency += distress; /* * Now use this metric to decide whether to start moving mapped @@ -1584,6 +1589,7 @@ */ for (i = pgdat->nr_zones - 1; i >= 0; i--) { struct zone *zone = pgdat->node_zones + i; + unsigned long watermark; if (!populated_zone(zone)) continue; @@ -1592,8 +1598,17 @@ priority != DEF_PRIORITY) continue; + /* + * The watermark is relaxed depending on the + * level of "priority" till it drops to + * pages_high. + */ + watermark = zone->pages_high + + (zone->pages_high * priority / + DEF_PRIORITY); + if (!zone_watermark_ok(zone, order, - zone->pages_high, 0, 0)) { + watermark, 0, 0)) { end_zone = i; goto scan; } @@ -1629,8 +1644,11 @@ continue; if (nr_pages == 0) { /* Not software suspend */ + unsigned long watermark = zone->pages_high + + (zone->pages_high * priority / + DEF_PRIORITY); if (!zone_watermark_ok(zone, order, - zone->pages_high, end_zone, 0)) + watermark, end_zone, 0)) all_zones_ok = 0; } zone->temp_priority = priority; diff -urN oldtree/mm/vmscan.c.orig newtree/mm/vmscan.c.orig --- oldtree/mm/vmscan.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ newtree/mm/vmscan.c.orig 2006-03-08 20:52:39.019288500 +0000 @@ -0,0 +1,2007 @@ +/* + * linux/mm/vmscan.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Swap reorganised 29.12.95, Stephen Tweedie. + * kswapd added: 7.1.96 sct + * Removed kswapd_ctl limits, and swap out as many pages as needed + * to bring the system back to freepages.high: 2.4.97, Rik van Riel. + * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). + * Multiqueue VM started 5.8.00, Rik van Riel. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for try_to_release_page(), + buffer_heads_over_limit */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "internal.h" + +/* possible outcome of pageout() */ +typedef enum { + /* failed to write page out, page is locked */ + PAGE_KEEP, + /* move page to the active list, page is locked */ + PAGE_ACTIVATE, + /* page has been sent to the disk successfully, page is unlocked */ + PAGE_SUCCESS, + /* page is clean and locked */ + PAGE_CLEAN, +} pageout_t; + +struct scan_control { + /* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + unsigned long nr_mapped; /* From page_state */ + + /* This context's GFP mask */ + gfp_t gfp_mask; + + int may_writepage; + + /* Can pages be swapped as part of reclaim? */ + int may_swap; + + /* This context's SWAP_CLUSTER_MAX. If freeing memory for + * suspend, we effectively ignore SWAP_CLUSTER_MAX. + * In this context, it doesn't matter that we scan the + * whole list at once. */ + int swap_cluster_max; +}; + +#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) + +#ifdef ARCH_HAS_PREFETCH +#define prefetch_prev_lru_page(_page, _base, _field) \ + do { \ + if ((_page)->lru.prev != _base) { \ + struct page *prev; \ + \ + prev = lru_to_page(&(_page->lru)); \ + prefetch(&prev->_field); \ + } \ + } while (0) +#else +#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) +#endif + +#ifdef ARCH_HAS_PREFETCHW +#define prefetchw_prev_lru_page(_page, _base, _field) \ + do { \ + if ((_page)->lru.prev != _base) { \ + struct page *prev; \ + \ + prev = lru_to_page(&(_page->lru)); \ + prefetchw(&prev->_field); \ + } \ + } while (0) +#else +#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) +#endif + +/* + * From 0 .. 100. Higher means more swappy. + */ +int vm_swappiness = 60; +static long total_memory; + +static LIST_HEAD(shrinker_list); +static DECLARE_RWSEM(shrinker_rwsem); + +/* + * Add a shrinker callback to be called from the vm + */ +struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker) +{ + struct shrinker *shrinker; + + shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL); + if (shrinker) { + shrinker->shrinker = theshrinker; + shrinker->seeks = seeks; + shrinker->nr = 0; + shrinker->s_stats = alloc_percpu(struct shrinker_stats); + if (!shrinker->s_stats) { + kfree(shrinker); + return NULL; + } + down_write(&shrinker_rwsem); + list_add_tail(&shrinker->list, &shrinker_list); + up_write(&shrinker_rwsem); + } + return shrinker; +} +EXPORT_SYMBOL(set_shrinker); + +/* + * Remove one + */ +void remove_shrinker(struct shrinker *shrinker) +{ + down_write(&shrinker_rwsem); + list_del(&shrinker->list); + up_write(&shrinker_rwsem); + free_percpu(shrinker->s_stats); + kfree(shrinker); +} +EXPORT_SYMBOL(remove_shrinker); + +#define SHRINK_BATCH 128 +/* + * Call the shrink functions to age shrinkable caches + * + * Here we assume it costs one seek to replace a lru page and that it also + * takes a seek to recreate a cache object. With this in mind we age equal + * percentages of the lru and ageable caches. This should balance the seeks + * generated by these structures. + * + * If the vm encounted mapped pages on the LRU it increase the pressure on + * slab to avoid swapping. + * + * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. + * + * `lru_pages' represents the number of on-LRU pages in all the zones which + * are eligible for the caller's allocation attempt. It is used for balancing + * slab reclaim versus page reclaim. + * + * Returns the number of slab objects which we shrunk. + */ +unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, + unsigned long lru_pages) +{ + struct shrinker *shrinker; + unsigned long ret = 0; + + if (scanned == 0) + scanned = SWAP_CLUSTER_MAX; + + if (!down_read_trylock(&shrinker_rwsem)) + return 1; /* Assume we'll be able to shrink next time */ + + list_for_each_entry(shrinker, &shrinker_list, list) { + unsigned long long delta; + unsigned long total_scan; + unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask); + + delta = (4 * scanned) / shrinker->seeks; + delta *= max_pass; + do_div(delta, lru_pages + 1); + shrinker->nr += delta; + if (shrinker->nr < 0) { + printk(KERN_ERR "%s: nr=%ld\n", + __FUNCTION__, shrinker->nr); + shrinker->nr = max_pass; + } + + /* + * Avoid risking looping forever due to too large nr value: + * never try to free more than twice the estimate number of + * freeable entries. + */ + if (shrinker->nr > max_pass * 2) + shrinker->nr = max_pass * 2; + + total_scan = shrinker->nr; + shrinker->nr = 0; + + while (total_scan >= SHRINK_BATCH) { + long this_scan = SHRINK_BATCH; + int shrink_ret; + int nr_before; + + nr_before = (*shrinker->shrinker)(0, gfp_mask); + shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); + if (shrink_ret == -1) + break; + if (shrink_ret < nr_before) { + ret += nr_before - shrink_ret; + shrinker_stat_add(shrinker, nr_freed, + (nr_before - shrink_ret)); + } + shrinker_stat_add(shrinker, nr_req, this_scan); + mod_page_state(slabs_scanned, this_scan); + total_scan -= this_scan; + + cond_resched(); + } + + shrinker->nr += total_scan; + } + up_read(&shrinker_rwsem); + return ret; +} + +/* Called without lock on whether page is mapped, so answer is unstable */ +static inline int page_mapping_inuse(struct page *page) +{ + struct address_space *mapping; + + /* Page is in somebody's page tables. */ + if (page_mapped(page)) + return 1; + + /* Be more reluctant to reclaim swapcache than pagecache */ + if (PageSwapCache(page)) + return 1; + + mapping = page_mapping(page); + if (!mapping) + return 0; + + /* File is mmap'd by somebody? */ + return mapping_mapped(mapping); +} + +static inline int is_page_cache_freeable(struct page *page) +{ + return page_count(page) - !!PagePrivate(page) == 2; +} + +static int may_write_to_queue(struct backing_dev_info *bdi) +{ + if (current->flags & PF_SWAPWRITE) + return 1; + if (!bdi_write_congested(bdi)) + return 1; + if (bdi == current->backing_dev_info) + return 1; + return 0; +} + +/* + * We detected a synchronous write error writing a page out. Probably + * -ENOSPC. We need to propagate that into the address_space for a subsequent + * fsync(), msync() or close(). + * + * The tricky part is that after writepage we cannot touch the mapping: nothing + * prevents it from being freed up. But we have a ref on the page and once + * that page is locked, the mapping is pinned. + * + * We're allowed to run sleeping lock_page() here because we know the caller has + * __GFP_FS. + */ +static void handle_write_error(struct address_space *mapping, + struct page *page, int error) +{ + lock_page(page); + if (page_mapping(page) == mapping) { + if (error == -ENOSPC) + set_bit(AS_ENOSPC, &mapping->flags); + else + set_bit(AS_EIO, &mapping->flags); + } + unlock_page(page); +} + +/* + * pageout is called by shrink_page_list() for each dirty page. + * Calls ->writepage(). + */ +static pageout_t pageout(struct page *page, struct address_space *mapping) +{ + /* + * If the page is dirty, only perform writeback if that write + * will be non-blocking. To prevent this allocation from being + * stalled by pagecache activity. But note that there may be + * stalls if we need to run get_block(). We could test + * PagePrivate for that. + * + * If this process is currently in generic_file_write() against + * this page's queue, we can perform writeback even if that + * will block. + * + * If the page is swapcache, write it back even if that would + * block, for some throttling. This happens by accident, because + * swap_backing_dev_info is bust: it doesn't reflect the + * congestion state of the swapdevs. Easy to fix, if needed. + * See swapfile.c:page_queue_congested(). + */ + if (!is_page_cache_freeable(page)) + return PAGE_KEEP; + if (!mapping) { + /* + * Some data journaling orphaned pages can have + * page->mapping == NULL while being dirty with clean buffers. + */ + if (PagePrivate(page)) { + if (try_to_free_buffers(page)) { + ClearPageDirty(page); + printk("%s: orphaned page\n", __FUNCTION__); + return PAGE_CLEAN; + } + } + return PAGE_KEEP; + } + if (mapping->a_ops->writepage == NULL) + return PAGE_ACTIVATE; + if (!may_write_to_queue(mapping->backing_dev_info)) + return PAGE_KEEP; + + if (clear_page_dirty_for_io(page)) { + int res; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = SWAP_CLUSTER_MAX, + .nonblocking = 1, + .for_reclaim = 1, + }; + + SetPageReclaim(page); + res = mapping->a_ops->writepage(page, &wbc); + if (res < 0) + handle_write_error(mapping, page, res); + if (res == AOP_WRITEPAGE_ACTIVATE) { + ClearPageReclaim(page); + return PAGE_ACTIVATE; + } + if (!PageWriteback(page)) { + /* synchronous write or broken a_ops? */ + ClearPageReclaim(page); + } + + return PAGE_SUCCESS; + } + + return PAGE_CLEAN; +} + +static int remove_mapping(struct address_space *mapping, struct page *page) +{ + if (!mapping) + return 0; /* truncate got there first */ + + write_lock_irq(&mapping->tree_lock); + + /* + * The non-racy check for busy page. It is critical to check + * PageDirty _after_ making sure that the page is freeable and + * not in use by anybody. (pagecache + us == 2) + */ + if (unlikely(page_count(page) != 2)) + goto cannot_free; + smp_rmb(); + if (unlikely(PageDirty(page))) + goto cannot_free; + + if (PageSwapCache(page)) { + swp_entry_t swap = { .val = page_private(page) }; + add_to_swapped_list(page); + __delete_from_swap_cache(page); + write_unlock_irq(&mapping->tree_lock); + swap_free(swap); + __put_page(page); /* The pagecache ref */ + return 1; + } + + __remove_from_page_cache(page); + write_unlock_irq(&mapping->tree_lock); + __put_page(page); + return 1; + +cannot_free: + write_unlock_irq(&mapping->tree_lock); + return 0; +} + +/* + * shrink_page_list() returns the number of reclaimed pages + */ +static unsigned long shrink_page_list(struct list_head *page_list, + struct scan_control *sc) +{ + LIST_HEAD(ret_pages); + struct pagevec freed_pvec; + int pgactivate = 0; + unsigned long nr_reclaimed = 0; + + cond_resched(); + + pagevec_init(&freed_pvec, 1); + while (!list_empty(page_list)) { + struct address_space *mapping; + struct page *page; + int may_enter_fs; + int referenced; + + cond_resched(); + + page = lru_to_page(page_list); + list_del(&page->lru); + + if (TestSetPageLocked(page)) + goto keep; + + BUG_ON(PageActive(page)); + + sc->nr_scanned++; + + if (!sc->may_swap && page_mapped(page)) + goto keep_locked; + + /* Double the slab pressure for mapped and swapcache pages */ + if (page_mapped(page) || PageSwapCache(page)) + sc->nr_scanned++; + + if (PageWriteback(page)) + goto keep_locked; + + referenced = page_referenced(page, 1); + /* In active use or really unfreeable? Activate it. */ + if (referenced && page_mapping_inuse(page)) + goto activate_locked; + +#ifdef CONFIG_SWAP + /* + * Anonymous process memory has backing store? + * Try to allocate it some swap space here. + */ + if (PageAnon(page) && !PageSwapCache(page)) + if (!add_to_swap(page, GFP_ATOMIC)) + goto activate_locked; +#endif /* CONFIG_SWAP */ + + mapping = page_mapping(page); + may_enter_fs = (sc->gfp_mask & __GFP_FS) || + (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + + /* + * The page is mapped into the page tables of one or more + * processes. Try to unmap it here. + */ + if (page_mapped(page) && mapping) { + switch (try_to_unmap(page, 0)) { + case SWAP_FAIL: + goto activate_locked; + case SWAP_AGAIN: + goto keep_locked; + case SWAP_SUCCESS: + ; /* try to free the page below */ + } + } + + if (PageDirty(page)) { + if (referenced) + goto keep_locked; + if (!may_enter_fs) + goto keep_locked; + if (!sc->may_writepage) + goto keep_locked; + + /* Page is dirty, try to write it out here */ + switch(pageout(page, mapping)) { + case PAGE_KEEP: + goto keep_locked; + case PAGE_ACTIVATE: + goto activate_locked; + case PAGE_SUCCESS: + if (PageWriteback(page) || PageDirty(page)) + goto keep; + /* + * A synchronous write - probably a ramdisk. Go + * ahead and try to reclaim the page. + */ + if (TestSetPageLocked(page)) + goto keep; + if (PageDirty(page) || PageWriteback(page)) + goto keep_locked; + mapping = page_mapping(page); + case PAGE_CLEAN: + ; /* try to free the page below */ + } + } + + /* + * If the page has buffers, try to free the buffer mappings + * associated with this page. If we succeed we try to free + * the page as well. + * + * We do this even if the page is PageDirty(). + * try_to_release_page() does not perform I/O, but it is + * possible for a page to have PageDirty set, but it is actually + * clean (all its buffers are clean). This happens if the + * buffers were written out directly, with submit_bh(). ext3 + * will do this, as well as the blockdev mapping. + * try_to_release_page() will discover that cleanness and will + * drop the buffers and mark the page clean - it can be freed. + * + * Rarely, pages can have buffers and no ->mapping. These are + * the pages which were not successfully invalidated in + * truncate_complete_page(). We try to drop those buffers here + * and if that worked, and the page is no longer mapped into + * process address space (page_count == 1) it can be freed. + * Otherwise, leave the page on the LRU so it is swappable. + */ + if (PagePrivate(page)) { + if (!try_to_release_page(page, sc->gfp_mask)) + goto activate_locked; + /* + * file system may manually remove page from the page + * cache in ->releasepage(). Check for this. + */ + mapping = page_mapping(page); + if (!mapping && page_count(page) == 1) + goto free_it; + } + + if (!remove_mapping(mapping, page)) + goto keep_locked; + +free_it: + unlock_page(page); + nr_reclaimed++; + if (!pagevec_add(&freed_pvec, page)) + __pagevec_release_nonlru(&freed_pvec); + continue; + +activate_locked: + SetPageActive(page); + pgactivate++; +keep_locked: + unlock_page(page); +keep: + list_add(&page->lru, &ret_pages); + BUG_ON(PageLRU(page)); + } + list_splice(&ret_pages, page_list); + if (pagevec_count(&freed_pvec)) + __pagevec_release_nonlru(&freed_pvec); + mod_page_state(pgactivate, pgactivate); + return nr_reclaimed; +} + +#ifdef CONFIG_MIGRATION +static inline void move_to_lru(struct page *page) +{ + list_del(&page->lru); + if (PageActive(page)) { + /* + * lru_cache_add_active checks that + * the PG_active bit is off. + */ + ClearPageActive(page); + lru_cache_add_active(page); + } else { + lru_cache_add(page); + } + put_page(page); +} + +/* + * Add isolated pages on the list back to the LRU. + * + * returns the number of pages put back. + */ +unsigned long putback_lru_pages(struct list_head *l) +{ + struct page *page; + struct page *page2; + unsigned long count = 0; + + list_for_each_entry_safe(page, page2, l, lru) { + move_to_lru(page); + count++; + } + return count; +} + +/* + * Non migratable page + */ +int fail_migrate_page(struct page *newpage, struct page *page) +{ + return -EIO; +} +EXPORT_SYMBOL(fail_migrate_page); + +/* + * swapout a single page + * page is locked upon entry, unlocked on exit + */ +static int swap_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (page_mapped(page) && mapping) + if (try_to_unmap(page, 1) != SWAP_SUCCESS) + goto unlock_retry; + + if (PageDirty(page)) { + /* Page is dirty, try to write it out here */ + switch(pageout(page, mapping)) { + case PAGE_KEEP: + case PAGE_ACTIVATE: + goto unlock_retry; + + case PAGE_SUCCESS: + goto retry; + + case PAGE_CLEAN: + ; /* try to free the page below */ + } + } + + if (PagePrivate(page)) { + if (!try_to_release_page(page, GFP_KERNEL) || + (!mapping && page_count(page) == 1)) + goto unlock_retry; + } + + if (remove_mapping(mapping, page)) { + /* Success */ + unlock_page(page); + return 0; + } + +unlock_retry: + unlock_page(page); + +retry: + return -EAGAIN; +} +EXPORT_SYMBOL(swap_page); + +/* + * Page migration was first developed in the context of the memory hotplug + * project. The main authors of the migration code are: + * + * IWAMOTO Toshihiro + * Hirokazu Takahashi + * Dave Hansen + * Christoph Lameter + */ + +/* + * Remove references for a page and establish the new page with the correct + * basic settings to be able to stop accesses to the page. + */ +int migrate_page_remove_references(struct page *newpage, + struct page *page, int nr_refs) +{ + struct address_space *mapping = page_mapping(page); + struct page **radix_pointer; + + /* + * Avoid doing any of the following work if the page count + * indicates that the page is in use or truncate has removed + * the page. + */ + if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) + return -EAGAIN; + + /* + * Establish swap ptes for anonymous pages or destroy pte + * maps for files. + * + * In order to reestablish file backed mappings the fault handlers + * will take the radix tree_lock which may then be used to stop + * processses from accessing this page until the new page is ready. + * + * A process accessing via a swap pte (an anonymous page) will take a + * page_lock on the old page which will block the process until the + * migration attempt is complete. At that time the PageSwapCache bit + * will be examined. If the page was migrated then the PageSwapCache + * bit will be clear and the operation to retrieve the page will be + * retried which will find the new page in the radix tree. Then a new + * direct mapping may be generated based on the radix tree contents. + * + * If the page was not migrated then the PageSwapCache bit + * is still set and the operation may continue. + */ + if (try_to_unmap(page, 1) == SWAP_FAIL) + /* A vma has VM_LOCKED set -> Permanent failure */ + return -EPERM; + + /* + * Give up if we were unable to remove all mappings. + */ + if (page_mapcount(page)) + return -EAGAIN; + + write_lock_irq(&mapping->tree_lock); + + radix_pointer = (struct page **)radix_tree_lookup_slot( + &mapping->page_tree, + page_index(page)); + + if (!page_mapping(page) || page_count(page) != nr_refs || + *radix_pointer != page) { + write_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + + /* + * Now we know that no one else is looking at the page. + * + * Certain minimal information about a page must be available + * in order for other subsystems to properly handle the page if they + * find it through the radix tree update before we are finished + * copying the page. + */ + get_page(newpage); + newpage->index = page->index; + newpage->mapping = page->mapping; + if (PageSwapCache(page)) { + SetPageSwapCache(newpage); + set_page_private(newpage, page_private(page)); + } + + *radix_pointer = newpage; + __put_page(page); + write_unlock_irq(&mapping->tree_lock); + + return 0; +} +EXPORT_SYMBOL(migrate_page_remove_references); + +/* + * Copy the page to its new location + */ +void migrate_page_copy(struct page *newpage, struct page *page) +{ + copy_highpage(newpage, page); + + if (PageError(page)) + SetPageError(newpage); + if (PageReferenced(page)) + SetPageReferenced(newpage); + if (PageUptodate(page)) + SetPageUptodate(newpage); + if (PageActive(page)) + SetPageActive(newpage); + if (PageChecked(page)) + SetPageChecked(newpage); + if (PageMappedToDisk(page)) + SetPageMappedToDisk(newpage); + + if (PageDirty(page)) { + clear_page_dirty_for_io(page); + set_page_dirty(newpage); + } + + ClearPageSwapCache(page); + ClearPageActive(page); + ClearPagePrivate(page); + set_page_private(page, 0); + page->mapping = NULL; + + /* + * If any waiters have accumulated on the new page then + * wake them up. + */ + if (PageWriteback(newpage)) + end_page_writeback(newpage); +} +EXPORT_SYMBOL(migrate_page_copy); + +/* + * Common logic to directly migrate a single page suitable for + * pages that do not use PagePrivate. + * + * Pages are locked upon entry and exit. + */ +int migrate_page(struct page *newpage, struct page *page) +{ + int rc; + + BUG_ON(PageWriteback(page)); /* Writeback must be complete */ + + rc = migrate_page_remove_references(newpage, page, 2); + + if (rc) + return rc; + + migrate_page_copy(newpage, page); + + /* + * Remove auxiliary swap entries and replace + * them with real ptes. + * + * Note that a real pte entry will allow processes that are not + * waiting on the page lock to use the new page via the page tables + * before the new page is unlocked. + */ + remove_from_swap(newpage); + return 0; +} +EXPORT_SYMBOL(migrate_page); + +/* + * migrate_pages + * + * Two lists are passed to this function. The first list + * contains the pages isolated from the LRU to be migrated. + * The second list contains new pages that the pages isolated + * can be moved to. If the second list is NULL then all + * pages are swapped out. + * + * The function returns after 10 attempts or if no pages + * are movable anymore because to has become empty + * or no retryable pages exist anymore. + * + * Return: Number of pages not migrated when "to" ran empty. + */ +unsigned long migrate_pages(struct list_head *from, struct list_head *to, + struct list_head *moved, struct list_head *failed) +{ + unsigned long retry; + unsigned long nr_failed = 0; + int pass = 0; + struct page *page; + struct page *page2; + int swapwrite = current->flags & PF_SWAPWRITE; + int rc; + + if (!swapwrite) + current->flags |= PF_SWAPWRITE; + +redo: + retry = 0; + + list_for_each_entry_safe(page, page2, from, lru) { + struct page *newpage = NULL; + struct address_space *mapping; + + cond_resched(); + + rc = 0; + if (page_count(page) == 1) + /* page was freed from under us. So we are done. */ + goto next; + + if (to && list_empty(to)) + break; + + /* + * Skip locked pages during the first two passes to give the + * functions holding the lock time to release the page. Later we + * use lock_page() to have a higher chance of acquiring the + * lock. + */ + rc = -EAGAIN; + if (pass > 2) + lock_page(page); + else + if (TestSetPageLocked(page)) + goto next; + + /* + * Only wait on writeback if we have already done a pass where + * we we may have triggered writeouts for lots of pages. + */ + if (pass > 0) { + wait_on_page_writeback(page); + } else { + if (PageWriteback(page)) + goto unlock_page; + } + + /* + * Anonymous pages must have swap cache references otherwise + * the information contained in the page maps cannot be + * preserved. + */ + if (PageAnon(page) && !PageSwapCache(page)) { + if (!add_to_swap(page, GFP_KERNEL)) { + rc = -ENOMEM; + goto unlock_page; + } + } + + if (!to) { + rc = swap_page(page); + goto next; + } + + newpage = lru_to_page(to); + lock_page(newpage); + + /* + * Pages are properly locked and writeback is complete. + * Try to migrate the page. + */ + mapping = page_mapping(page); + if (!mapping) + goto unlock_both; + + if (mapping->a_ops->migratepage) { + /* + * Most pages have a mapping and most filesystems + * should provide a migration function. Anonymous + * pages are part of swap space which also has its + * own migration function. This is the most common + * path for page migration. + */ + rc = mapping->a_ops->migratepage(newpage, page); + goto unlock_both; + } + + /* + * Default handling if a filesystem does not provide + * a migration function. We can only migrate clean + * pages so try to write out any dirty pages first. + */ + if (PageDirty(page)) { + switch (pageout(page, mapping)) { + case PAGE_KEEP: + case PAGE_ACTIVATE: + goto unlock_both; + + case PAGE_SUCCESS: + unlock_page(newpage); + goto next; + + case PAGE_CLEAN: + ; /* try to migrate the page below */ + } + } + + /* + * Buffers are managed in a filesystem specific way. + * We must have no buffers or drop them. + */ + if (!page_has_buffers(page) || + try_to_release_page(page, GFP_KERNEL)) { + rc = migrate_page(newpage, page); + goto unlock_both; + } + + /* + * On early passes with mapped pages simply + * retry. There may be a lock held for some + * buffers that may go away. Later + * swap them out. + */ + if (pass > 4) { + /* + * Persistently unable to drop buffers..... As a + * measure of last resort we fall back to + * swap_page(). + */ + unlock_page(newpage); + newpage = NULL; + rc = swap_page(page); + goto next; + } + +unlock_both: + unlock_page(newpage); + +unlock_page: + unlock_page(page); + +next: + if (rc == -EAGAIN) { + retry++; + } else if (rc) { + /* Permanent failure */ + list_move(&page->lru, failed); + nr_failed++; + } else { + if (newpage) { + /* Successful migration. Return page to LRU */ + move_to_lru(newpage); + } + list_move(&page->lru, moved); + } + } + if (retry && pass++ < 10) + goto redo; + + if (!swapwrite) + current->flags &= ~PF_SWAPWRITE; + + return nr_failed + retry; +} + +/* + * Isolate one page from the LRU lists and put it on the + * indicated list with elevated refcount. + * + * Result: + * 0 = page not on LRU list + * 1 = page removed from LRU list and added to the specified list. + */ +int isolate_lru_page(struct page *page) +{ + int ret = 0; + + if (PageLRU(page)) { + struct zone *zone = page_zone(page); + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page)) { + ret = 1; + get_page(page); + ClearPageLRU(page); + if (PageActive(page)) + del_page_from_active_list(zone, page); + else + del_page_from_inactive_list(zone, page); + } + spin_unlock_irq(&zone->lru_lock); + } + + return ret; +} +#endif + +/* + * zone->lru_lock is heavily contended. Some of the functions that + * shrink the lists perform better by taking out a batch of pages + * and working on them outside the LRU lock. + * + * For pagecache intensive workloads, this function is the hottest + * spot in the kernel (apart from copy_*_user functions). + * + * Appropriate locks must be held before calling this function. + * + * @nr_to_scan: The number of pages to look through on the list. + * @src: The LRU list to pull pages off. + * @dst: The temp list to put pages on to. + * @scanned: The number of pages that were scanned. + * + * returns how many pages were moved onto *@dst. + */ +static unsigned long isolate_lru_pages(unsigned long nr_to_scan, + struct list_head *src, struct list_head *dst, + unsigned long *scanned) +{ + unsigned long nr_taken = 0; + struct page *page; + unsigned long scan; + + for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { + struct list_head *target; + page = lru_to_page(src); + prefetchw_prev_lru_page(page, src, flags); + + BUG_ON(!PageLRU(page)); + + list_del(&page->lru); + target = src; + if (likely(get_page_unless_zero(page))) { + /* + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. + */ + ClearPageLRU(page); + target = dst; + nr_taken++; + } /* else it is being freed elsewhere */ + + list_add(&page->lru, target); + } + + *scanned = scan; + return nr_taken; +} + +/* + * shrink_inactive_list() is a helper for shrink_zone(). It returns the number + * of reclaimed pages + */ +static unsigned long shrink_inactive_list(unsigned long max_scan, + struct zone *zone, struct scan_control *sc) +{ + LIST_HEAD(page_list); + struct pagevec pvec; + unsigned long nr_scanned = 0; + unsigned long nr_reclaimed = 0; + + pagevec_init(&pvec, 1); + + lru_add_drain(); + spin_lock_irq(&zone->lru_lock); + do { + struct page *page; + unsigned long nr_taken; + unsigned long nr_scan; + unsigned long nr_freed; + + nr_taken = isolate_lru_pages(sc->swap_cluster_max, + &zone->inactive_list, + &page_list, &nr_scan); + zone->nr_inactive -= nr_taken; + zone->pages_scanned += nr_scan; + spin_unlock_irq(&zone->lru_lock); + + nr_scanned += nr_scan; + nr_freed = shrink_page_list(&page_list, sc); + nr_reclaimed += nr_freed; + local_irq_disable(); + if (current_is_kswapd()) { + __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); + __mod_page_state(kswapd_steal, nr_freed); + } else + __mod_page_state_zone(zone, pgscan_direct, nr_scan); + __mod_page_state_zone(zone, pgsteal, nr_freed); + + if (nr_taken == 0) + goto done; + + spin_lock(&zone->lru_lock); + /* + * Put back any unfreeable pages. + */ + while (!list_empty(&page_list)) { + page = lru_to_page(&page_list); + BUG_ON(PageLRU(page)); + SetPageLRU(page); + list_del(&page->lru); + if (PageActive(page)) + add_page_to_active_list(zone, page); + else + add_page_to_inactive_list(zone, page); + if (!pagevec_add(&pvec, page)) { + spin_unlock_irq(&zone->lru_lock); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + } while (nr_scanned < max_scan); + spin_unlock(&zone->lru_lock); +done: + local_irq_enable(); + pagevec_release(&pvec); + return nr_reclaimed; +} + +/* + * This moves pages from the active list to the inactive list. + * + * We move them the other way if the page is referenced by one or more + * processes, from rmap. + * + * If the pages are mostly unmapped, the processing is fast and it is + * appropriate to hold zone->lru_lock across the whole operation. But if + * the pages are mapped, the processing is slow (page_referenced()) so we + * should drop zone->lru_lock around each page. It's impossible to balance + * this, so instead we remove the pages from the LRU while processing them. + * It is safe to rely on PG_active against the non-LRU pages in here because + * nobody will play with that bit on a non-LRU page. + * + * The downside is that we have to touch page->_count against each page. + * But we had to alter page->flags anyway. + */ +static void shrink_active_list(unsigned long nr_pages, struct zone *zone, + struct scan_control *sc) +{ + unsigned long pgmoved; + int pgdeactivate = 0; + unsigned long pgscanned; + LIST_HEAD(l_hold); /* The pages which were snipped off */ + LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ + LIST_HEAD(l_active); /* Pages to go onto the active_list */ + struct page *page; + struct pagevec pvec; + int reclaim_mapped = 0; + + if (sc->may_swap) { + long mapped_ratio; + long distress; + long swap_tendency; + + /* + * `distress' is a measure of how much trouble we're having + * reclaiming pages. 0 -> no problems. 100 -> great trouble. + */ + distress = 100 >> zone->prev_priority; + + /* + * The point of this algorithm is to decide when to start + * reclaiming mapped memory instead of just pagecache. Work out + * how much memory + * is mapped. + */ + mapped_ratio = (sc->nr_mapped * 100) / total_memory; + + /* + * Now decide how much we really want to unmap some pages. The + * mapped ratio is downgraded - just because there's a lot of + * mapped memory doesn't necessarily mean that page reclaim + * isn't succeeding. + * + * The distress ratio is important - we don't want to start + * going oom. + * + * A 100% value of vm_swappiness overrides this algorithm + * altogether. + */ + swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; + + /* + * Now use this metric to decide whether to start moving mapped + * memory onto the inactive list. + */ + if (swap_tendency >= 100) + reclaim_mapped = 1; + } + + lru_add_drain(); + spin_lock_irq(&zone->lru_lock); + pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, + &l_hold, &pgscanned); + zone->pages_scanned += pgscanned; + zone->nr_active -= pgmoved; + spin_unlock_irq(&zone->lru_lock); + + while (!list_empty(&l_hold)) { + cond_resched(); + page = lru_to_page(&l_hold); + list_del(&page->lru); + if (page_mapped(page)) { + if (!reclaim_mapped || + (total_swap_pages == 0 && PageAnon(page)) || + page_referenced(page, 0)) { + list_add(&page->lru, &l_active); + continue; + } + } + list_add(&page->lru, &l_inactive); + } + + pagevec_init(&pvec, 1); + pgmoved = 0; + spin_lock_irq(&zone->lru_lock); + while (!list_empty(&l_inactive)) { + page = lru_to_page(&l_inactive); + prefetchw_prev_lru_page(page, &l_inactive, flags); + BUG_ON(PageLRU(page)); + SetPageLRU(page); + BUG_ON(!PageActive(page)); + ClearPageActive(page); + + list_move(&page->lru, &zone->inactive_list); + pgmoved++; + if (!pagevec_add(&pvec, page)) { + zone->nr_inactive += pgmoved; + spin_unlock_irq(&zone->lru_lock); + pgdeactivate += pgmoved; + pgmoved = 0; + if (buffer_heads_over_limit) + pagevec_strip(&pvec); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + zone->nr_inactive += pgmoved; + pgdeactivate += pgmoved; + if (buffer_heads_over_limit) { + spin_unlock_irq(&zone->lru_lock); + pagevec_strip(&pvec); + spin_lock_irq(&zone->lru_lock); + } + + pgmoved = 0; + while (!list_empty(&l_active)) { + page = lru_to_page(&l_active); + prefetchw_prev_lru_page(page, &l_active, flags); + BUG_ON(PageLRU(page)); + SetPageLRU(page); + BUG_ON(!PageActive(page)); + list_move(&page->lru, &zone->active_list); + pgmoved++; + if (!pagevec_add(&pvec, page)) { + zone->nr_active += pgmoved; + pgmoved = 0; + spin_unlock_irq(&zone->lru_lock); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + zone->nr_active += pgmoved; + spin_unlock(&zone->lru_lock); + + __mod_page_state_zone(zone, pgrefill, pgscanned); + __mod_page_state(pgdeactivate, pgdeactivate); + local_irq_enable(); + + pagevec_release(&pvec); +} + +/* + * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. + */ +static unsigned long shrink_zone(int priority, struct zone *zone, + struct scan_control *sc) +{ + unsigned long nr_active; + unsigned long nr_inactive; + unsigned long nr_to_scan; + unsigned long nr_reclaimed = 0; + + atomic_inc(&zone->reclaim_in_progress); + + /* + * Add one to `nr_to_scan' just to make sure that the kernel will + * slowly sift through the active list. + */ + zone->nr_scan_active += (zone->nr_active >> priority) + 1; + nr_active = zone->nr_scan_active; + if (nr_active >= sc->swap_cluster_max) + zone->nr_scan_active = 0; + else + nr_active = 0; + + zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1; + nr_inactive = zone->nr_scan_inactive; + if (nr_inactive >= sc->swap_cluster_max) + zone->nr_scan_inactive = 0; + else + nr_inactive = 0; + + while (nr_active || nr_inactive) { + if (nr_active) { + nr_to_scan = min(nr_active, + (unsigned long)sc->swap_cluster_max); + nr_active -= nr_to_scan; + shrink_active_list(nr_to_scan, zone, sc); + } + + if (nr_inactive) { + nr_to_scan = min(nr_inactive, + (unsigned long)sc->swap_cluster_max); + nr_inactive -= nr_to_scan; + nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, + sc); + } + } + + throttle_vm_writeout(); + + atomic_dec(&zone->reclaim_in_progress); + return nr_reclaimed; +} + +/* + * This is the direct reclaim path, for page-allocating processes. We only + * try to reclaim pages from zones which will satisfy the caller's allocation + * request. + * + * We reclaim from a zone even if that zone is over pages_high. Because: + * a) The caller may be trying to free *extra* pages to satisfy a higher-order + * allocation or + * b) The zones may be over pages_high but they must go *over* pages_high to + * satisfy the `incremental min' zone defense algorithm. + * + * Returns the number of reclaimed pages. + * + * If a zone is deemed to be full of pinned pages then just give it a light + * scan then give up on it. + */ +static unsigned long shrink_zones(int priority, struct zone **zones, + struct scan_control *sc) +{ + unsigned long nr_reclaimed = 0; + int i; + + for (i = 0; zones[i] != NULL; i++) { + struct zone *zone = zones[i]; + + if (!populated_zone(zone)) + continue; + + if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) + continue; + + zone->temp_priority = priority; + if (zone->prev_priority > priority) + zone->prev_priority = priority; + + if (zone->all_unreclaimable && priority != DEF_PRIORITY) + continue; /* Let kswapd poll it */ + + nr_reclaimed += shrink_zone(priority, zone, sc); + } + return nr_reclaimed; +} + +/* + * This is the main entry point to direct page reclaim. + * + * If a full scan of the inactive list fails to free enough memory then we + * are "out of memory" and something needs to be killed. + * + * If the caller is !__GFP_FS then the probability of a failure is reasonably + * high - the zone may be full of dirty or under-writeback pages, which this + * caller can't do much about. We kick pdflush and take explicit naps in the + * hope that some of these pages can be written. But if the allocating task + * holds filesystem locks which prevent writeout this might not work, and the + * allocation attempt will fail. + */ +unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) +{ + int priority; + int ret = 0; + unsigned long total_scanned = 0; + unsigned long nr_reclaimed = 0; + struct reclaim_state *reclaim_state = current->reclaim_state; + unsigned long lru_pages = 0; + int i; + struct scan_control sc = { + .gfp_mask = gfp_mask, + .may_writepage = !laptop_mode, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .may_swap = 1, + }; + + delay_swap_prefetch(); + + inc_page_state(allocstall); + + for (i = 0; zones[i] != NULL; i++) { + struct zone *zone = zones[i]; + + if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) + continue; + + zone->temp_priority = DEF_PRIORITY; + lru_pages += zone->nr_active + zone->nr_inactive; + } + + for (priority = DEF_PRIORITY; priority >= 0; priority--) { + sc.nr_mapped = read_page_state(nr_mapped); + sc.nr_scanned = 0; + if (!priority) + disable_swap_token(); + nr_reclaimed += shrink_zones(priority, zones, &sc); + shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); + if (reclaim_state) { + nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } + total_scanned += sc.nr_scanned; + if (nr_reclaimed >= sc.swap_cluster_max) { + ret = 1; + goto out; + } + + /* + * Try to write back as many pages as we just scanned. This + * tends to cause slow streaming writers to write data to the + * disk smoothly, at the dirtying rate, which is nice. But + * that's undesirable in laptop mode, where we *want* lumpy + * writeout. So in laptop mode, write out the whole world. + */ + if (total_scanned > sc.swap_cluster_max + + sc.swap_cluster_max / 2) { + wakeup_pdflush(laptop_mode ? 0 : total_scanned); + sc.may_writepage = 1; + } + + /* Take a nap, wait for some writeback to complete */ + if (sc.nr_scanned && priority < DEF_PRIORITY - 2) + blk_congestion_wait(WRITE, HZ/10); + } +out: + for (i = 0; zones[i] != 0; i++) { + struct zone *zone = zones[i]; + + if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) + continue; + + zone->prev_priority = zone->temp_priority; + } + return ret; +} + +/* + * For kswapd, balance_pgdat() will work across all this node's zones until + * they are all at pages_high. + * + * If `nr_pages' is non-zero then it is the number of pages which are to be + * reclaimed, regardless of the zone occupancies. This is a software suspend + * special. + * + * Returns the number of pages which were actually freed. + * + * There is special handling here for zones which are full of pinned pages. + * This can happen if the pages are all mlocked, or if they are all used by + * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. + * What we do is to detect the case where all pages in the zone have been + * scanned twice and there has been zero successful reclaim. Mark the zone as + * dead and from now on, only perform a short scan. Basically we're polling + * the zone for when the problem goes away. + * + * kswapd scans the zones in the highmem->normal->dma direction. It skips + * zones which have free_pages > pages_high, but once a zone is found to have + * free_pages <= pages_high, we scan that zone and the lower zones regardless + * of the number of free pages in the lower zones. This interoperates with + * the page allocator fallback scheme to ensure that aging of pages is balanced + * across the zones. + */ +static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, + int order) +{ + unsigned long to_free = nr_pages; + int all_zones_ok; + int priority; + int i; + unsigned long total_scanned; + unsigned long nr_reclaimed; + struct reclaim_state *reclaim_state = current->reclaim_state; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .may_swap = 1, + .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX, + }; + +loop_again: + total_scanned = 0; + nr_reclaimed = 0; + sc.may_writepage = !laptop_mode, + sc.nr_mapped = read_page_state(nr_mapped); + + inc_page_state(pageoutrun); + + for (i = 0; i < pgdat->nr_zones; i++) { + struct zone *zone = pgdat->node_zones + i; + + zone->temp_priority = DEF_PRIORITY; + } + + for (priority = DEF_PRIORITY; priority >= 0; priority--) { + int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ + unsigned long lru_pages = 0; + + /* The swap token gets in the way of swapout... */ + if (!priority) + disable_swap_token(); + + all_zones_ok = 1; + + if (nr_pages == 0) { + /* + * Scan in the highmem->dma direction for the highest + * zone which needs scanning + */ + for (i = pgdat->nr_zones - 1; i >= 0; i--) { + struct zone *zone = pgdat->node_zones + i; + unsigned long watermark; + + if (!populated_zone(zone)) + continue; + + if (zone->all_unreclaimable && + priority != DEF_PRIORITY) + continue; + + /* + * The watermark is relaxed depending on the + * level of "priority" till it drops to + * pages_high. + */ + watermark = zone->pages_high + + (zone->pages_high * priority / + DEF_PRIORITY); + + if (!zone_watermark_ok(zone, order, + watermark, 0, 0)) { + end_zone = i; + goto scan; + } + } + goto out; + } else { + end_zone = pgdat->nr_zones - 1; + } +scan: + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + + lru_pages += zone->nr_active + zone->nr_inactive; + } + + /* + * Now scan the zone in the dma->highmem direction, stopping + * at the last zone which needs scanning. + * + * We do this because the page allocator works in the opposite + * direction. This prevents the page allocator from allocating + * pages behind kswapd's direction of progress, which would + * cause too much scanning of the lower zones. + */ + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + int nr_slab; + + if (!populated_zone(zone)) + continue; + + if (zone->all_unreclaimable && priority != DEF_PRIORITY) + continue; + + if (nr_pages == 0) { /* Not software suspend */ + unsigned long watermark = zone->pages_high + + (zone->pages_high * priority / + DEF_PRIORITY); + if (!zone_watermark_ok(zone, order, + watermark, end_zone, 0)) + all_zones_ok = 0; + } + zone->temp_priority = priority; + if (zone->prev_priority > priority) + zone->prev_priority = priority; + sc.nr_scanned = 0; + nr_reclaimed += shrink_zone(priority, zone, &sc); + reclaim_state->reclaimed_slab = 0; + nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, + lru_pages); + nr_reclaimed += reclaim_state->reclaimed_slab; + total_scanned += sc.nr_scanned; + if (zone->all_unreclaimable) + continue; + if (nr_slab == 0 && zone->pages_scanned >= + (zone->nr_active + zone->nr_inactive) * 4) + zone->all_unreclaimable = 1; + /* + * If we've done a decent amount of scanning and + * the reclaim ratio is low, start doing writepage + * even in laptop mode + */ + if (total_scanned > SWAP_CLUSTER_MAX * 2 && + total_scanned > nr_reclaimed + nr_reclaimed / 2) + sc.may_writepage = 1; + } + if (nr_pages && to_free > nr_reclaimed) + continue; /* swsusp: need to do more work */ + if (all_zones_ok) + break; /* kswapd: all done */ + /* + * OK, kswapd is getting into trouble. Take a nap, then take + * another pass across the zones. + */ + if (total_scanned && priority < DEF_PRIORITY - 2) + blk_congestion_wait(WRITE, HZ/10); + + /* + * We do this so kswapd doesn't build up large priorities for + * example when it is freeing in parallel with allocators. It + * matches the direct reclaim path behaviour in terms of impact + * on zone->*_priority. + */ + if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages) + break; + } +out: + for (i = 0; i < pgdat->nr_zones; i++) { + struct zone *zone = pgdat->node_zones + i; + + zone->prev_priority = zone->temp_priority; + } + if (!all_zones_ok) { + cond_resched(); + goto loop_again; + } + + return nr_reclaimed; +} + +/* + * The background pageout daemon, started as a kernel thread + * from the init process. + * + * This basically trickles out pages so that we have _some_ + * free memory available even if there is no other activity + * that frees anything up. This is needed for things like routing + * etc, where we otherwise might have all activity going on in + * asynchronous contexts that cannot page things out. + * + * If there are applications that are active memory-allocators + * (most normal use), this basically shouldn't matter. + */ +static int kswapd(void *p) +{ + unsigned long order; + pg_data_t *pgdat = (pg_data_t*)p; + struct task_struct *tsk = current; + DEFINE_WAIT(wait); + struct reclaim_state reclaim_state = { + .reclaimed_slab = 0, + }; + cpumask_t cpumask; + + daemonize("kswapd%d", pgdat->node_id); + cpumask = node_to_cpumask(pgdat->node_id); + if (!cpus_empty(cpumask)) + set_cpus_allowed(tsk, cpumask); + current->reclaim_state = &reclaim_state; + + /* + * Tell the memory management that we're a "memory allocator", + * and that if we need more memory we should get access to it + * regardless (see "__alloc_pages()"). "kswapd" should + * never get caught in the normal page freeing logic. + * + * (Kswapd normally doesn't need memory anyway, but sometimes + * you need a small amount of memory in order to be able to + * page out something else, and this flag essentially protects + * us from recursively trying to free more memory as we're + * trying to free the first piece of memory in the first place). + */ + tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; + + order = 0; + for ( ; ; ) { + unsigned long new_order; + + try_to_freeze(); + + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + new_order = pgdat->kswapd_max_order; + pgdat->kswapd_max_order = 0; + if (order < new_order) { + /* + * Don't sleep if someone wants a larger 'order' + * allocation + */ + order = new_order; + } else { + schedule(); + order = pgdat->kswapd_max_order; + } + finish_wait(&pgdat->kswapd_wait, &wait); + + balance_pgdat(pgdat, 0, order); + } + return 0; +} + +/* + * A zone is low on free memory, so wake its kswapd task to service it. + */ +void wakeup_kswapd(struct zone *zone, int order) +{ + pg_data_t *pgdat; + + if (!populated_zone(zone)) + return; + + pgdat = zone->zone_pgdat; + if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) + return; + if (pgdat->kswapd_max_order < order) + pgdat->kswapd_max_order = order; + if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) + return; + if (!waitqueue_active(&pgdat->kswapd_wait)) + return; + wake_up_interruptible(&pgdat->kswapd_wait); +} + +#ifdef CONFIG_PM +/* + * Try to free `nr_pages' of memory, system-wide. Returns the number of freed + * pages. + */ +unsigned long shrink_all_memory(unsigned long nr_pages) +{ + pg_data_t *pgdat; + unsigned long nr_to_free = nr_pages; + unsigned long ret = 0; + unsigned retry = 2; + struct reclaim_state reclaim_state = { + .reclaimed_slab = 0, + }; + + delay_swap_prefetch(); + + current->reclaim_state = &reclaim_state; +repeat: + for_each_online_pgdat(pgdat) { + unsigned long freed; + + freed = balance_pgdat(pgdat, nr_to_free, 0); + ret += freed; + nr_to_free -= freed; + if ((long)nr_to_free <= 0) + break; + } + if (retry-- && ret < nr_pages) { + blk_congestion_wait(WRITE, HZ/5); + goto repeat; + } + current->reclaim_state = NULL; + return ret; +} +#endif + +#ifdef CONFIG_HOTPLUG_CPU +/* It's optimal to keep kswapds on the same CPUs as their memory, but + not required for correctness. So if the last cpu in a node goes + away, we get changed to run anywhere: as the first one comes back, + restore their cpu bindings. */ +static int __devinit cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + pg_data_t *pgdat; + cpumask_t mask; + + if (action == CPU_ONLINE) { + for_each_online_pgdat(pgdat) { + mask = node_to_cpumask(pgdat->node_id); + if (any_online_cpu(mask) != NR_CPUS) + /* One of our CPUs online: restore mask */ + set_cpus_allowed(pgdat->kswapd, mask); + } + } + return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static int __init kswapd_init(void) +{ + pg_data_t *pgdat; + + swap_setup(); + for_each_online_pgdat(pgdat) { + pid_t pid; + + pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); + BUG_ON(pid < 0); + pgdat->kswapd = find_task_by_pid(pid); + } + total_memory = nr_free_pagecache_pages(); + hotcpu_notifier(cpu_callback, 0); + return 0; +} + +module_init(kswapd_init) + +#ifdef CONFIG_NUMA +/* + * Zone reclaim mode + * + * If non-zero call zone_reclaim when the number of free pages falls below + * the watermarks. + * + * In the future we may add flags to the mode. However, the page allocator + * should only have to check that zone_reclaim_mode != 0 before calling + * zone_reclaim(). + */ +int zone_reclaim_mode __read_mostly; + +#define RECLAIM_OFF 0 +#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ +#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ +#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ +#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ + +/* + * Mininum time between zone reclaim scans + */ +int zone_reclaim_interval __read_mostly = 30*HZ; + +/* + * Priority for ZONE_RECLAIM. This determines the fraction of pages + * of a node considered for each zone_reclaim. 4 scans 1/16th of + * a zone. + */ +#define ZONE_RECLAIM_PRIORITY 4 + +/* + * Try to free up some pages from this zone through reclaim. + */ +int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +{ + /* Minimum pages needed in order to stay on node */ + const unsigned long nr_pages = 1 << order; + struct task_struct *p = current; + struct reclaim_state reclaim_state; + cpumask_t mask; + int node_id; + int priority; + unsigned long nr_reclaimed = 0; + struct scan_control sc = { + .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), + .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .nr_mapped = read_page_state(nr_mapped), + .swap_cluster_max = max_t(unsigned long, nr_pages, + SWAP_CLUSTER_MAX), + .gfp_mask = gfp_mask, + }; + + /* + * Do not reclaim if there was a recent unsuccessful attempt at zone + * reclaim. In that case we let allocations go off node for the + * zone_reclaim_interval. Otherwise we would scan for each off-node + * page allocation. + */ + if (time_before(jiffies, + zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) + return 0; + + /* + * Avoid concurrent zone reclaims, do not reclaim in a zone that does + * not have reclaimable pages and if we should not delay the allocation + * then do not scan. + */ + if (!(gfp_mask & __GFP_WAIT) || + zone->all_unreclaimable || + atomic_read(&zone->reclaim_in_progress) > 0) + return 0; + + node_id = zone->zone_pgdat->node_id; + mask = node_to_cpumask(node_id); + if (!cpus_empty(mask) && node_id != numa_node_id()) + return 0; + + disable_swap_token(); + + cond_resched(); + /* + * We need to be able to allocate from the reserves for RECLAIM_SWAP + * and we also need to be able to write out pages for RECLAIM_WRITE + * and RECLAIM_SWAP. + */ + p->flags |= PF_MEMALLOC | PF_SWAPWRITE; + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + + /* + * Free memory by calling shrink zone with increasing priorities + * until we have enough memory freed. + */ + priority = ZONE_RECLAIM_PRIORITY; + do { + nr_reclaimed += shrink_zone(priority, zone, &sc); + priority--; + } while (priority >= 0 && nr_reclaimed < nr_pages); + + if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { + /* + * shrink_slab() does not currently allow us to determine how + * many pages were freed in this zone. So we just shake the slab + * a bit and then go off node for this particular allocation + * despite possibly having freed enough memory to allocate in + * this zone. If we freed local memory then the next + * allocations will be local again. + * + * shrink_slab will free memory on all zones and may take + * a long time. + */ + shrink_slab(sc.nr_scanned, gfp_mask, order); + } + + p->reclaim_state = NULL; + current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); + + if (nr_reclaimed == 0) { + /* + * We were unable to reclaim enough pages to stay on node. We + * now allow off node accesses for a certain time period before + * trying again to reclaim pages from the local zone. + */ + zone->last_unsuccessful_zone_reclaim = jiffies; + } + + return nr_reclaimed >= nr_pages; +} +#endif