mm: read large files to the tail of the inactive lru list

Add the vm.tail_largefiles sysctl (VM_TAIL_LARGEFILES, default 1). When it
is non-zero and the file being read is larger than half of the unmapped
ram, the generic file read path in mm/filemap.c adds newly read pagecache
pages through add_to_page_cache_lru_tail(), which queues them with
lru_cache_add_tail() instead of the normal lru add, so they are the first
to be dropped if not referenced soon.

large_isize() compares against limit / 6 and limit / 2 instead of
multiplying nr_pages, which gives the same answer without wrapping for
very large page counts, and nr_mapped() returns unsigned long to match the
counters it sums.

NOTE(review): the size check reads vm_total_pages (defined in mm/vmscan.c,
marked __read_mostly below) rather than a separate total_pages symbol that
nothing in this patch defines - confirm against the target tree.
NOTE(review): whitespace in context lines was reconstructed; apply with
"patch -l" if the context does not match exactly.

diff -urN oldtree/Documentation/filesystems/proc.txt newtree/Documentation/filesystems/proc.txt
--- oldtree/Documentation/filesystems/proc.txt	2006-09-29 14:03:18.000000000 -0400
+++ newtree/Documentation/filesystems/proc.txt	2006-09-29 14:57:40.000000000 -0400
@@ -1324,6 +1324,15 @@
 As this is a non-destructive operation and dirty objects are not freeable, the
 user should run `sync' first.
 
+tail_largefiles
+---------------
+
+When enabled, reads from large files go to the tail of the inactive lru list.
+This means that any cache from reading large files is dropped very quickly,
+preventing loss of mapped ram and useful pagecache when large files are read.
+This does, however, make caching less effective when working with large files.
+Enabled (1) by default; write 0 to disable.
+
 
 2.5 /proc/sys/dev - Device specific parameters
 ----------------------------------------------
diff -urN oldtree/Documentation/sysctl/vm.txt newtree/Documentation/sysctl/vm.txt
--- oldtree/Documentation/sysctl/vm.txt	2006-09-29 14:51:43.000000000 -0400
+++ newtree/Documentation/sysctl/vm.txt	2006-09-29 14:57:40.000000000 -0400
@@ -39,7 +39,7 @@
 
 dirty_ratio, dirty_background_ratio, dirty_expire_centisecs,
 dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode,
-block_dump, swap_token_timeout, drop-caches:
+block_dump, swap_token_timeout, drop-caches, tail_largefiles:
 
 See Documentation/filesystems/proc.txt
 
diff -urN oldtree/include/linux/sysctl.h newtree/include/linux/sysctl.h
--- oldtree/include/linux/sysctl.h	2006-09-29 14:51:44.000000000 -0400
+++ newtree/include/linux/sysctl.h	2006-09-29 14:58:54.000000000 -0400
@@ -199,6 +199,7 @@
 	VM_MIN_SLAB=35,		/* Percent pages ignored by zone reclaim */
 	VM_SWAP_PREFETCH=36,	/* swap prefetch */
 	VM_HARDMAPLIMIT=37,	/* Make mapped a hard limit */
+	VM_TAIL_LARGEFILES=38,	/* Read large files to lru tail */
 };
 
 
diff -urN oldtree/include/linux/writeback.h newtree/include/linux/writeback.h
--- oldtree/include/linux/writeback.h	2006-09-29 14:03:22.000000000 -0400
+++ newtree/include/linux/writeback.h	2006-09-29 14:57:40.000000000 -0400
@@ -87,6 +87,8 @@
 void throttle_vm_writeout(void);
 void writeback_congestion_end(void);
 
+extern long vm_total_pages;
+
 /* These are exported to sysctl. */
 extern int dirty_background_ratio;
 extern int vm_dirty_ratio;
diff -urN oldtree/kernel/sysctl.c newtree/kernel/sysctl.c
--- oldtree/kernel/sysctl.c	2006-09-29 14:51:44.000000000 -0400
+++ newtree/kernel/sysctl.c	2006-09-29 14:58:20.000000000 -0400
@@ -74,6 +74,7 @@
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
 extern int compat_log;
+extern int vm_tail_largefiles;
 
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 int unknown_nmi_panic;
@@ -858,6 +859,14 @@
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= VM_TAIL_LARGEFILES,
+		.procname	= "tail_largefiles",
+		.data		= &vm_tail_largefiles,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #ifdef CONFIG_HUGETLB_PAGE
 	{
 		.ctl_name	= VM_HUGETLB_PAGES,
diff -urN oldtree/mm/filemap.c newtree/mm/filemap.c
--- oldtree/mm/filemap.c	2006-09-29 14:03:22.000000000 -0400
+++ newtree/mm/filemap.c	2006-09-29 14:57:40.000000000 -0400
@@ -466,6 +466,21 @@
 	return ret;
 }
 
+/*
+ * Like add_to_page_cache(), but on success the page is also handed to
+ * lru_cache_add_tail() so that it is queued at the end of the inactive
+ * list. Returns the result of add_to_page_cache().
+ */
+int add_to_page_cache_lru_tail(struct page *page,
+	struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
+{
+	int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
+
+	if (ret == 0)
+		lru_cache_add_tail(page);
+	return ret;
+}
+
 #ifdef CONFIG_NUMA
 struct page *page_cache_alloc(struct address_space *x)
 {
@@ -868,6 +883,39 @@
 	ra->ra_pages /= 4;
 }
 
+/*
+ * Sysctl which determines whether we should read from large files to the
+ * tail of the inactive lru list.
+ */
+int vm_tail_largefiles __read_mostly = 1;
+
+/* Sum of the NR_FILE_MAPPED and NR_ANON_PAGES global page counters. */
+static inline unsigned long nr_mapped(void)
+{
+	return global_page_state(NR_FILE_MAPPED) +
+		global_page_state(NR_ANON_PAGES);
+}
+
+/*
+ * This examines how large in pages a file size is and returns 1 if it is
+ * more than half the unmapped ram. Avoid reading the mapped page counters
+ * unless the file is more than a sixth of total ram, i.e. it is likely to
+ * be large enough. The limits are divided rather than nr_pages multiplied
+ * so that a huge page count cannot wrap.
+ */
+static int large_isize(unsigned long nr_pages)
+{
+	unsigned long total = vm_total_pages;
+
+	if (nr_pages > total / 6) {
+		unsigned long unmapped_ram = total - nr_mapped();
+
+		if (nr_pages > unmapped_ram / 2)
+			return 1;
+	}
+	return 0;
+}
+
 /**
  * do_generic_mapping_read - generic file read routine
  * @mapping:	address_space to be read
@@ -1076,8 +1124,19 @@
 				goto out;
 			}
 		}
-		error = add_to_page_cache_lru(cached_page, mapping,
-						index, GFP_KERNEL);
+
+		/*
+		 * If we know the file is large we add the pages read to the
+		 * end of the lru as we're unlikely to be able to cache the
+		 * whole file in ram so make those pages the first to be
+		 * dropped if not referenced soon.
+		 */
+		if (vm_tail_largefiles && large_isize(end_index))
+			error = add_to_page_cache_lru_tail(cached_page,
+						mapping, index, GFP_KERNEL);
+		else
+			error = add_to_page_cache_lru(cached_page, mapping,
+						index, GFP_KERNEL);
 		if (error) {
 			if (error == -EEXIST)
 				goto find_page;
diff -urN oldtree/mm/swap.c newtree/mm/swap.c
--- oldtree/mm/swap.c	2006-09-29 14:16:42.000000000 -0400
+++ newtree/mm/swap.c	2006-09-29 14:57:40.000000000 -0400
@@ -434,8 +434,7 @@
 
 /*
  * Function used uniquely to put pages back to the lru at the end of the
- * inactive list to preserve the lru order. Currently only used by swap
- * prefetch.
+ * inactive list to preserve the lru order.
  */
 void fastcall lru_cache_add_tail(struct page *page)
 {
diff -urN oldtree/mm/vmscan.c newtree/mm/vmscan.c
--- oldtree/mm/vmscan.c	2006-09-29 14:55:45.000000000 -0400
+++ newtree/mm/vmscan.c	2006-09-29 14:57:40.000000000 -0400
@@ -115,7 +115,7 @@
  */
 int vm_mapped __read_mostly = 66;
 int vm_hardmaplimit __read_mostly = 1;
-long vm_total_pages;	/* The total number of pages which the VM controls */
+long vm_total_pages __read_mostly;	/* The total number of pages which the VM controls */
 
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);