diff -urN oldtree/Documentation/DocBook/Makefile newtree/Documentation/DocBook/Makefile --- oldtree/Documentation/DocBook/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/Documentation/DocBook/Makefile 2006-04-01 08:53:50.582662750 -0500 @@ -28,7 +28,7 @@ ### # The targets that may be used. -.PHONY: xmldocs sgmldocs psdocs pdfdocs htmldocs mandocs installmandocs +PHONY += xmldocs sgmldocs psdocs pdfdocs htmldocs mandocs installmandocs BOOKS := $(addprefix $(obj)/,$(DOCBOOKS)) xmldocs: $(BOOKS) @@ -211,3 +211,9 @@ #man put files in man subdir - traverse down subdir- := man/ + + +# Declare the contents of the .PHONY variable as phony. We keep that +# information in a variable so we can use it in if_changed and friends. + +.PHONY: $(PHONY) diff -urN oldtree/Documentation/sysctl/vm.txt newtree/Documentation/sysctl/vm.txt --- oldtree/Documentation/sysctl/vm.txt 2006-04-01 04:48:27.000000000 -0500 +++ newtree/Documentation/sysctl/vm.txt 2006-04-01 08:54:41.749860500 -0500 @@ -29,6 +29,7 @@ - drop-caches - zone_reclaim_mode - zone_reclaim_interval +- swap_prefetch ============================================================== @@ -178,3 +179,22 @@ Reduce the interval if undesired off node allocations occur. However, too frequent scans will have a negative impact onoff node allocation performance. +============================================================== + +swap_prefetch + +This enables or disables the swap prefetching feature. When the virtual +memory subsystem has been extremely idle for at least 5 seconds it will start +copying back pages from swap into the swapcache and keep a copy in swap. In +practice it can take many minutes before the vm is idle enough. + +This value is a bitwise OR of the following: +1 = Normal background swap prefetching when load is light +2 = Aggressively swap prefetch as much as possible + +When 2 is set, after the maximum amount possible has been prefetched, this bit +is unset.
ie Setting the value to 3 will prefetch aggressively then drop to 1. +This is useful for doing aggressive prefetching for short periods in scripts +such as after resuming from software suspend. + +The default value is 1. diff -urN oldtree/Makefile newtree/Makefile --- oldtree/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/Makefile 2006-04-01 08:53:50.586663000 -0500 @@ -95,7 +95,7 @@ endif # That's our default target when none is given on the command line -.PHONY: _all +PHONY := _all _all: ifneq ($(KBUILD_OUTPUT),) @@ -106,7 +106,7 @@ $(if $(KBUILD_OUTPUT),, \ $(error output directory "$(saved-output)" does not exist)) -.PHONY: $(MAKECMDGOALS) +PHONY += $(MAKECMDGOALS) $(filter-out _all,$(MAKECMDGOALS)) _all: $(if $(KBUILD_VERBOSE:1=),@)$(MAKE) -C $(KBUILD_OUTPUT) \ @@ -123,7 +123,7 @@ # If building an external module we do not care about the all: rule # but instead _all depend on modules -.PHONY: all +PHONY += all ifeq ($(KBUILD_EXTMOD),) _all: all else @@ -369,14 +369,14 @@ # Rules shared between *config targets and build targets # Basic helpers built in scripts/ -.PHONY: scripts_basic +PHONY += scripts_basic scripts_basic: $(Q)$(MAKE) $(build)=scripts/basic # To avoid any implicit rule to kick in, define an empty command. scripts/basic/%: scripts_basic ; -.PHONY: outputmakefile +PHONY += outputmakefile # outputmakefile generate a Makefile to be placed in output directory, if # using a seperate output directory. This allows convinient use # of make in output directory @@ -452,7 +452,7 @@ # Additional helpers built in scripts/ # Carefully list dependencies so we do not try to build scripts twice # in parrallel -.PHONY: scripts +PHONY += scripts scripts: scripts_basic include/config/MARKER $(Q)$(MAKE) $(build)=$(@) @@ -752,7 +752,7 @@ # make menuconfig etc. 
# Error messages still appears in the original language -.PHONY: $(vmlinux-dirs) +PHONY += $(vmlinux-dirs) $(vmlinux-dirs): prepare scripts $(Q)$(MAKE) $(build)=$@ @@ -805,10 +805,10 @@ # version.h and scripts_basic is processed / created. # Listed in dependency order -.PHONY: prepare archprepare prepare0 prepare1 prepare2 prepare3 +PHONY += prepare archprepare prepare0 prepare1 prepare2 prepare3 # prepare-all is deprecated, use prepare as valid replacement -.PHONY: prepare-all +PHONY += prepare-all # prepare3 is used to check if we are building in a separate output directory, # and if so do: @@ -910,7 +910,7 @@ # --------------------------------------------------------------------------- -.PHONY: depend dep +PHONY += depend dep depend dep: @echo '*** Warning: make $@ is unnecessary now.' @@ -925,21 +925,21 @@ # Build modules -.PHONY: modules +PHONY += modules modules: $(vmlinux-dirs) $(if $(KBUILD_BUILTIN),vmlinux) @echo ' Building modules, stage 2.'; $(Q)$(MAKE) -rR -f $(srctree)/scripts/Makefile.modpost # Target to prepare building external modules -.PHONY: modules_prepare +PHONY += modules_prepare modules_prepare: prepare scripts # Target to install modules -.PHONY: modules_install +PHONY += modules_install modules_install: _modinst_ _modinst_post -.PHONY: _modinst_ +PHONY += _modinst_ _modinst_: @if [ -z "`$(DEPMOD) -V 2>/dev/null | grep module-init-tools`" ]; then \ echo "Warning: you may need to install module-init-tools"; \ @@ -966,7 +966,7 @@ else depmod_opts := -b $(INSTALL_MOD_PATH) -r endif -.PHONY: _modinst_post +PHONY += _modinst_post _modinst_post: _modinst_ if [ -r System.map -a -x $(DEPMOD) ]; then $(DEPMOD) -ae -F System.map $(depmod_opts) $(KERNELRELEASE); fi @@ -1009,7 +1009,7 @@ clean: rm-files := $(CLEAN_FILES) clean-dirs := $(addprefix _clean_,$(srctree) $(vmlinux-alldirs)) -.PHONY: $(clean-dirs) clean archclean +PHONY += $(clean-dirs) clean archclean $(clean-dirs): $(Q)$(MAKE) $(clean)=$(patsubst _clean_%,%,$@) @@ -1027,7 +1027,7 @@ 
mrproper: rm-files := $(wildcard $(MRPROPER_FILES)) mrproper-dirs := $(addprefix _mrproper_,Documentation/DocBook scripts) -.PHONY: $(mrproper-dirs) mrproper archmrproper +PHONY += $(mrproper-dirs) mrproper archmrproper $(mrproper-dirs): $(Q)$(MAKE) $(clean)=$(patsubst _mrproper_%,%,$@) @@ -1037,7 +1037,7 @@ # distclean # -.PHONY: distclean +PHONY += distclean distclean: mrproper @find $(srctree) $(RCS_FIND_IGNORE) \ @@ -1053,7 +1053,7 @@ # rpm target kept for backward compatibility package-dir := $(srctree)/scripts/package -.PHONY: %-pkg rpm +PHONY += %-pkg rpm %pkg: FORCE $(Q)$(MAKE) -f $(package-dir)/Makefile $@ @@ -1145,11 +1145,11 @@ # We are always building modules KBUILD_MODULES := 1 -.PHONY: crmodverdir +PHONY += crmodverdir crmodverdir: $(Q)mkdir -p $(MODVERDIR) -.PHONY: $(objtree)/Module.symvers +PHONY += $(objtree)/Module.symvers $(objtree)/Module.symvers: @test -e $(objtree)/Module.symvers || ( \ echo; \ @@ -1158,7 +1158,7 @@ echo ) module-dirs := $(addprefix _module_,$(KBUILD_EXTMOD)) -.PHONY: $(module-dirs) modules +PHONY += $(module-dirs) modules $(module-dirs): crmodverdir $(objtree)/Module.symvers $(Q)$(MAKE) $(build)=$(patsubst _module_%,%,$@) @@ -1166,13 +1166,13 @@ @echo ' Building modules, stage 2.'; $(Q)$(MAKE) -rR -f $(srctree)/scripts/Makefile.modpost -.PHONY: modules_install +PHONY += modules_install modules_install: $(Q)$(MAKE) -rR -f $(srctree)/scripts/Makefile.modinst clean-dirs := $(addprefix _clean_,$(KBUILD_EXTMOD)) -.PHONY: $(clean-dirs) clean +PHONY += $(clean-dirs) clean $(clean-dirs): $(Q)$(MAKE) $(clean)=$(patsubst _clean_%,%,$@) @@ -1302,7 +1302,7 @@ endif #ifeq ($(config-targets),1) endif #ifeq ($(mixed-targets),1) -.PHONY: checkstack +PHONY += checkstack checkstack: $(OBJDUMP) -d vmlinux $$(find . -name '*.ko') | \ $(PERL) $(src)/scripts/checkstack.pl $(ARCH) @@ -1347,4 +1347,10 @@ endif # skip-makefile +PHONY += FORCE FORCE: + + +# Declare the contents of the .PHONY variable as phony. 
We keep that +# information in a variable so we can use it in if_changed and friends. +.PHONY: $(PHONY) diff -urN oldtree/arch/arm/Makefile newtree/arch/arm/Makefile --- oldtree/arch/arm/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/arch/arm/Makefile 2006-04-01 08:53:50.590663250 -0500 @@ -1,6 +1,9 @@ # # arch/arm/Makefile # +# This file is included by the global makefile so that you can add your own +# architecture-specific flags and dependencies. +# # This file is subject to the terms and conditions of the GNU General Public # License. See the file "COPYING" in the main directory of this archive # for more details. @@ -176,7 +179,7 @@ archprepare: maketools -.PHONY: maketools FORCE +PHONY += maketools FORCE maketools: include/linux/version.h include/asm-arm/.arch FORCE $(Q)$(MAKE) $(build)=arch/arm/tools include/asm-arm/mach-types.h diff -urN oldtree/arch/arm/boot/Makefile newtree/arch/arm/boot/Makefile --- oldtree/arch/arm/boot/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/arch/arm/boot/Makefile 2006-04-01 08:53:50.590663250 -0500 @@ -1,6 +1,9 @@ # # arch/arm/boot/Makefile # +# This file is included by the global makefile so that you can add your own +# architecture-specific flags and dependencies. +# # This file is subject to the terms and conditions of the GNU General Public # License. See the file "COPYING" in the main directory of this archive # for more details.
@@ -73,7 +76,7 @@ $(call if_changed,objcopy) @echo ' Kernel: $@ is ready' -.PHONY: initrd FORCE +PHONY += initrd FORCE initrd: @test "$(INITRD_PHYS)" != "" || \ (echo This machine does not support INITRD; exit -1) diff -urN oldtree/arch/arm/boot/bootp/Makefile newtree/arch/arm/boot/bootp/Makefile --- oldtree/arch/arm/boot/bootp/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/arch/arm/boot/bootp/Makefile 2006-04-01 08:53:50.594663500 -0500 @@ -1,6 +1,9 @@ # # linux/arch/arm/boot/bootp/Makefile # +# This file is included by the global makefile so that you can add your own +# architecture-specific flags and dependencies. +# LDFLAGS_bootp :=-p --no-undefined -X \ --defsym initrd_phys=$(INITRD_PHYS) \ @@ -21,4 +24,4 @@ $(obj)/initrd.o: $(INITRD) FORCE -.PHONY: $(INITRD) FORCE +PHONY += $(INITRD) FORCE diff -urN oldtree/arch/arm26/Makefile newtree/arch/arm26/Makefile --- oldtree/arch/arm26/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/arch/arm26/Makefile 2006-04-01 08:53:50.598663750 -0500 @@ -1,6 +1,9 @@ # # arch/arm26/Makefile # +# This file is included by the global makefile so that you can add your own +# architecture-specific flags and dependencies. +# # This file is subject to the terms and conditions of the GNU General Public # License. See the file "COPYING" in the main directory of this archive # for more details. @@ -49,9 +52,9 @@ boot := arch/arm26/boot -.PHONY: maketools FORCE +PHONY += maketools FORCE maketools: FORCE - + # Convert bzImage to zImage bzImage: vmlinux diff -urN oldtree/arch/arm26/boot/Makefile newtree/arch/arm26/boot/Makefile --- oldtree/arch/arm26/boot/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/arch/arm26/boot/Makefile 2006-04-01 08:53:50.602664000 -0500 @@ -1,6 +1,9 @@ # # arch/arm26/boot/Makefile # +# This file is included by the global makefile so that you can add your own +# architecture-specific flags and dependencies. 
+# # This file is subject to the terms and conditions of the GNU General Public # License. See the file "COPYING" in the main directory of this archive # for more details. @@ -60,7 +63,7 @@ @echo ' Kernel: $@ is ready' endif -.PHONY: initrd +PHONY += initrd initrd: @test "$(INITRD_PHYS)" != "" || \ (echo This machine does not support INITRD; exit -1) diff -urN oldtree/arch/i386/Makefile newtree/arch/i386/Makefile --- oldtree/arch/i386/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/arch/i386/Makefile 2006-04-01 08:53:50.606664250 -0500 @@ -99,8 +99,8 @@ boot := arch/i386/boot -.PHONY: zImage bzImage compressed zlilo bzlilo \ - zdisk bzdisk fdimage fdimage144 fdimage288 install +PHONY += zImage bzImage compressed zlilo bzlilo \ + zdisk bzdisk fdimage fdimage144 fdimage288 install all: bzImage diff -urN oldtree/arch/ia64/Makefile newtree/arch/ia64/Makefile --- oldtree/arch/ia64/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/arch/ia64/Makefile 2006-04-01 08:53:50.606664250 -0500 @@ -1,6 +1,9 @@ # # ia64/Makefile # +# This file is included by the global makefile so that you can add your own +# architecture-specific flags and dependencies. +# # This file is subject to the terms and conditions of the GNU General Public # License. See the file "COPYING" in the main directory of this archive # for more details. 
@@ -62,7 +65,7 @@ boot := arch/ia64/hp/sim/boot -.PHONY: boot compressed check +PHONY += boot compressed check all: compressed unwcheck diff -urN oldtree/arch/ia64/configs/tiger_defconfig newtree/arch/ia64/configs/tiger_defconfig --- oldtree/arch/ia64/configs/tiger_defconfig 2006-04-01 04:48:27.000000000 -0500 +++ newtree/arch/ia64/configs/tiger_defconfig 2006-04-01 08:52:29.321584250 -0500 @@ -105,10 +105,10 @@ # CONFIG_IA64_PAGE_SIZE_64KB is not set CONFIG_PGTABLE_3=y # CONFIG_PGTABLE_4 is not set -# CONFIG_HZ_100 is not set -CONFIG_HZ_250=y +CONFIG_HZ_100=y +# CONFIG_HZ_250 is not set # CONFIG_HZ_1000 is not set -CONFIG_HZ=250 +CONFIG_HZ=100 CONFIG_IA64_L1_CACHE_SHIFT=7 CONFIG_IA64_CYCLONE=y CONFIG_IOSAPIC=y diff -urN oldtree/arch/m32r/Makefile newtree/arch/m32r/Makefile --- oldtree/arch/m32r/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/arch/m32r/Makefile 2006-04-01 08:53:50.610664500 -0500 @@ -1,6 +1,9 @@ # # m32r/Makefile # +# This file is included by the global makefile so that you can add your own +# architecture-specific flags and dependencies. 
+# LDFLAGS := OBJCOPYFLAGS := -O binary -R .note -R .comment -S @@ -39,7 +42,7 @@ boot := arch/m32r/boot -.PHONY: zImage +PHONY += zImage all: zImage diff -urN oldtree/arch/powerpc/Makefile newtree/arch/powerpc/Makefile --- oldtree/arch/powerpc/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/arch/powerpc/Makefile 2006-04-01 08:53:50.614664750 -0500 @@ -150,7 +150,7 @@ BOOT_TARGETS = zImage zImage.initrd znetboot znetboot.initrd vmlinux.sm uImage -.PHONY: $(BOOT_TARGETS) +PHONY += $(BOOT_TARGETS) boot := arch/$(ARCH)/boot diff -urN oldtree/arch/ppc/Makefile newtree/arch/ppc/Makefile --- oldtree/arch/ppc/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/arch/ppc/Makefile 2006-04-01 08:53:50.618665000 -0500 @@ -82,7 +82,7 @@ BOOT_TARGETS = zImage zImage.initrd znetboot znetboot.initrd vmlinux.sm -.PHONY: $(BOOT_TARGETS) +PHONY += $(BOOT_TARGETS) all: uImage zImage diff -urN oldtree/arch/ppc/boot/Makefile newtree/arch/ppc/boot/Makefile --- oldtree/arch/ppc/boot/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/arch/ppc/boot/Makefile 2006-04-01 08:53:50.618665000 -0500 @@ -1,6 +1,9 @@ # # arch/ppc/boot/Makefile # +# This file is included by the global makefile so that you can add your own +# architecture-specific flags and dependencies. +# # This file is subject to the terms and conditions of the GNU General Public # License. See the file "COPYING" in the main directory of this archive # for more details. 
@@ -25,7 +28,7 @@ hostprogs-y := $(addprefix utils/, addnote mknote hack-coff mkprep mkbugboot mktree) -.PHONY: $(BOOT_TARGETS) $(bootdir-y) +PHONY += $(BOOT_TARGETS) $(bootdir-y) $(BOOT_TARGETS): $(bootdir-y) diff -urN oldtree/arch/ppc/boot/openfirmware/Makefile newtree/arch/ppc/boot/openfirmware/Makefile --- oldtree/arch/ppc/boot/openfirmware/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/arch/ppc/boot/openfirmware/Makefile 2006-04-01 08:53:50.622665250 -0500 @@ -1,5 +1,8 @@ # Makefile for making bootable images on various OpenFirmware machines. # +# This file is included by the global makefile so that you can add your own +# architecture-specific flags and dependencies. +# # Paul Mackerras January 1997 # XCOFF bootable images for PowerMacs # Geert Uytterhoeven September 1997 @@ -86,7 +89,7 @@ # The targets used on the make command-line -.PHONY: zImage zImage.initrd +PHONY += zImage zImage.initrd zImage: $(images)/zImage.chrp \ $(images)/zImage.chrp-rs6k @echo ' kernel: $@ is ready ($<)' @@ -96,7 +99,7 @@ TFTPIMAGE := /tftpboot/zImage -.PHONY: znetboot znetboot.initrd +PHONY += znetboot znetboot.initrd znetboot: $(images)/zImage.chrp cp $(images)/zImage.chrp $(TFTPIMAGE).chrp$(END) @echo ' kernel: $@ is ready ($<)' diff -urN oldtree/arch/ppc/configs/common_defconfig newtree/arch/ppc/configs/common_defconfig --- oldtree/arch/ppc/configs/common_defconfig 2006-04-01 04:48:27.000000000 -0500 +++ newtree/arch/ppc/configs/common_defconfig 2006-04-01 08:52:30.717671500 -0500 @@ -139,10 +139,10 @@ CONFIG_PPCBUG_NVRAM=y # CONFIG_SMP is not set # CONFIG_HIGHMEM is not set -# CONFIG_HZ_100 is not set -CONFIG_HZ_250=y +CONFIG_HZ_100=y +# CONFIG_HZ_250 is not set # CONFIG_HZ_1000 is not set -CONFIG_HZ=250 +CONFIG_HZ=100 CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set diff -urN oldtree/arch/ppc/configs/pmac_defconfig newtree/arch/ppc/configs/pmac_defconfig --- oldtree/arch/ppc/configs/pmac_defconfig 2006-04-01 
04:48:27.000000000 -0500 +++ newtree/arch/ppc/configs/pmac_defconfig 2006-04-01 08:52:30.729672250 -0500 @@ -139,10 +139,10 @@ CONFIG_PPCBUG_NVRAM=y # CONFIG_SMP is not set # CONFIG_HIGHMEM is not set -# CONFIG_HZ_100 is not set -CONFIG_HZ_250=y +CONFIG_HZ_100=y +# CONFIG_HZ_250 is not set # CONFIG_HZ_1000 is not set -CONFIG_HZ=250 +CONFIG_HZ=100 CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set diff -urN oldtree/arch/sh/Makefile newtree/arch/sh/Makefile --- oldtree/arch/sh/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/arch/sh/Makefile 2006-04-01 08:53:50.626665500 -0500 @@ -172,7 +172,7 @@ archprepare: maketools include/asm-sh/.cpu include/asm-sh/.mach -.PHONY: maketools FORCE +PHONY += maketools FORCE maketools: include/linux/version.h FORCE $(Q)$(MAKE) $(build)=arch/sh/tools include/asm-sh/machtypes.h diff -urN oldtree/arch/um/Makefile newtree/arch/um/Makefile --- oldtree/arch/um/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/arch/um/Makefile 2006-04-01 08:53:50.630665750 -0500 @@ -1,4 +1,7 @@ -# +# +# This file is included by the global makefile so that you can add your own +# architecture-specific flags and dependencies. 
+# # Copyright (C) 2002 Jeff Dike (jdike@karaya.com) # Licensed under the GPL # @@ -88,7 +91,7 @@ SIZE = (($(CONFIG_NEST_LEVEL) + $(CONFIG_KERNEL_HALF_GIGS)) * 0x20000000) -.PHONY: linux +PHONY += linux all: linux diff -urN oldtree/arch/x86_64/Makefile newtree/arch/x86_64/Makefile --- oldtree/arch/x86_64/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/arch/x86_64/Makefile 2006-04-01 08:53:50.630665750 -0500 @@ -67,8 +67,8 @@ boot := arch/x86_64/boot -.PHONY: bzImage bzlilo install archmrproper \ - fdimage fdimage144 fdimage288 archclean +PHONY += bzImage bzlilo install archmrproper \ + fdimage fdimage144 fdimage288 archclean #Default target when executing "make" all: bzImage diff -urN oldtree/block/Kconfig.iosched newtree/block/Kconfig.iosched --- oldtree/block/Kconfig.iosched 2006-04-01 04:48:27.000000000 -0500 +++ newtree/block/Kconfig.iosched 2006-04-01 08:53:42.926184250 -0500 @@ -40,7 +40,7 @@ choice prompt "Default I/O scheduler" - default DEFAULT_AS + default DEFAULT_DEADLINE help Select the I/O scheduler which will be used by default for all block devices. diff -urN oldtree/drivers/block/loop.c newtree/drivers/block/loop.c --- oldtree/drivers/block/loop.c 2006-04-01 04:48:27.000000000 -0500 +++ newtree/drivers/block/loop.c 2006-04-01 08:51:11.580725750 -0500 @@ -779,6 +779,12 @@ mapping = file->f_mapping; inode = mapping->host; + /* + * The upper layer should already do proper look-ahead, + * one more look-ahead here only ruins the cache hit rate. 
+ */ + file->f_ra.flags |= RA_FLAG_NO_LOOKAHEAD; + if (!(file->f_mode & FMODE_WRITE)) lo_flags |= LO_FLAGS_READ_ONLY; diff -urN oldtree/fs/mpage.c newtree/fs/mpage.c --- oldtree/fs/mpage.c 2006-04-01 04:48:27.000000000 -0500 +++ newtree/fs/mpage.c 2006-04-01 08:51:11.584726000 -0500 @@ -343,8 +343,10 @@ bio = do_mpage_readpage(bio, page, nr_pages - page_idx, &last_block_in_bio, get_block); - if (!pagevec_add(&lru_pvec, page)) + if (!pagevec_add(&lru_pvec, page)) { + cond_resched(); __pagevec_lru_add(&lru_pvec); + } } else { page_cache_release(page); } diff -urN oldtree/fs/nfsd/vfs.c newtree/fs/nfsd/vfs.c --- oldtree/fs/nfsd/vfs.c 2006-04-01 04:48:27.000000000 -0500 +++ newtree/fs/nfsd/vfs.c 2006-04-01 08:51:11.592726500 -0500 @@ -833,10 +833,14 @@ #endif /* Get readahead parameters */ - ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); + if (prefer_adaptive_readahead()) + ra = NULL; + else + ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); if (ra && ra->p_set) file->f_ra = ra->p_ra; + file->f_ra.flags |= RA_FLAG_NFSD; if (file->f_op->sendfile) { svc_pushback_unused_pages(rqstp); diff -urN oldtree/include/linux/fs.h newtree/include/linux/fs.h --- oldtree/include/linux/fs.h 2006-04-01 04:48:27.000000000 -0500 +++ newtree/include/linux/fs.h 2006-04-01 08:51:11.596726750 -0500 @@ -600,19 +600,40 @@ * Track a single file's readahead state */ struct file_ra_state { - unsigned long start; /* Current window */ - unsigned long size; - unsigned long flags; /* ra flags RA_FLAG_xxx*/ - unsigned long cache_hit; /* cache hit count*/ - unsigned long prev_page; /* Cache last read() position */ - unsigned long ahead_start; /* Ahead window */ - unsigned long ahead_size; - unsigned long ra_pages; /* Maximum readahead window */ - unsigned long mmap_hit; /* Cache hit stat for mmap accesses */ - unsigned long mmap_miss; /* Cache miss stat for mmap accesses */ + union { + struct { /* conventional read-ahead */ + unsigned long start; /* Current window */ + unsigned long 
size; + unsigned long ahead_start; /* Ahead window */ + unsigned long ahead_size; + unsigned long cache_hit; /* cache hit count */ + }; +#ifdef CONFIG_ADAPTIVE_READAHEAD + struct { /* adaptive read-ahead */ + pgoff_t la_index; + pgoff_t ra_index; + pgoff_t lookahead_index; + pgoff_t readahead_index; + unsigned long age; + uint64_t cache_hits; + }; +#endif + }; + + /* mmap read-around */ + unsigned long mmap_hit; /* Cache hit stat for mmap accesses */ + unsigned long mmap_miss; /* Cache miss stat for mmap accesses */ + + /* common ones */ + unsigned long flags; /* ra flags RA_FLAG_xxx*/ + unsigned long prev_page; /* Cache last read() position */ + unsigned long ra_pages; /* Maximum readahead window */ }; #define RA_FLAG_MISS 0x01 /* a cache miss occured against this file */ #define RA_FLAG_INCACHE 0x02 /* file is already in cache */ +#define RA_FLAG_MMAP (1UL<<31) /* mmaped page access */ +#define RA_FLAG_NO_LOOKAHEAD (1UL<<30) /* disable look-ahead */ +#define RA_FLAG_NFSD (1UL<<29) /* request from nfsd */ struct file { /* diff -urN oldtree/include/linux/ioprio.h newtree/include/linux/ioprio.h --- oldtree/include/linux/ioprio.h 2006-04-01 04:48:27.000000000 -0500 +++ newtree/include/linux/ioprio.h 2006-04-01 08:54:37.153573250 -0500 @@ -53,7 +53,13 @@ static inline int task_nice_ioprio(struct task_struct *task) { - return (task_nice(task) + 20) / 5; + int effective_nice = task_nice(task); + + if (idleprio_task(task)) + effective_nice = 19; + else if (rt_task(task) || iso_task(task)) + effective_nice = -20; + return (effective_nice + 20) / 5; } /* diff -urN oldtree/include/linux/mm.h newtree/include/linux/mm.h --- oldtree/include/linux/mm.h 2006-04-01 04:48:27.000000000 -0500 +++ newtree/include/linux/mm.h 2006-04-01 08:51:11.600727000 -0500 @@ -954,7 +954,11 @@ int write_one_page(struct page *page, int wait); /* readahead.c */ +#ifdef CONFIG_ADAPTIVE_READAHEAD +#define VM_MAX_READAHEAD 1024 /* kbytes */ +#else #define VM_MAX_READAHEAD 128 /* kbytes */ +#endif 
#define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ #define VM_MAX_CACHE_HIT 256 /* max pages in a row in cache before * turning readahead off */ @@ -971,6 +975,33 @@ void handle_ra_miss(struct address_space *mapping, struct file_ra_state *ra, pgoff_t offset); unsigned long max_sane_readahead(unsigned long nr); +unsigned long +page_cache_readahead_adaptive(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + struct page *prev_page, struct page *page, + pgoff_t first_index, pgoff_t index, pgoff_t last_index); + +#ifdef CONFIG_ADAPTIVE_READAHEAD +void fastcall readahead_cache_hit(struct file_ra_state *ra, struct page *page); +extern int readahead_ratio; +#else +#define readahead_cache_hit(ra, page) do { } while (0) +#define readahead_ratio 1 +#endif /* CONFIG_ADAPTIVE_READAHEAD */ + +static inline int prefer_adaptive_readahead(void) +{ + return readahead_ratio >= 10; +} + +DECLARE_PER_CPU(unsigned long, readahead_aging); +static inline void inc_readahead_aging(void) +{ + if (prefer_adaptive_readahead()) { + per_cpu(readahead_aging, get_cpu())++; + put_cpu(); + } +} /* Do stack extension */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); diff -urN oldtree/include/linux/mm_inline.h newtree/include/linux/mm_inline.h --- oldtree/include/linux/mm_inline.h 2006-04-01 04:48:27.000000000 -0500 +++ newtree/include/linux/mm_inline.h 2006-04-01 08:53:58.007126750 -0500 @@ -14,6 +14,13 @@ } static inline void +add_page_to_inactive_list_tail(struct zone *zone, struct page *page) +{ + list_add_tail(&page->lru, &zone->inactive_list); + zone->nr_inactive++; +} + +static inline void del_page_from_active_list(struct zone *zone, struct page *page) { list_del(&page->lru); diff -urN oldtree/include/linux/mmzone.h newtree/include/linux/mmzone.h --- oldtree/include/linux/mmzone.h 2006-04-01 04:48:27.000000000 -0500 +++ newtree/include/linux/mmzone.h 2006-04-01 08:54:45.766111500 -0500 @@ -120,7 +120,7 @@ struct zone { /* 
Fields commonly accessed by the page allocator */ unsigned long free_pages; - unsigned long pages_min, pages_low, pages_high; + unsigned long pages_min, pages_low, pages_high, pages_lots; /* * We don't know if the memory that we're going to allocate will be freeable * or/and it will be released eventually, so to avoid totally wasting several diff -urN oldtree/include/linux/page-flags.h newtree/include/linux/page-flags.h --- oldtree/include/linux/page-flags.h 2006-04-01 04:48:27.000000000 -0500 +++ newtree/include/linux/page-flags.h 2006-04-01 08:51:11.604727250 -0500 @@ -75,6 +75,7 @@ #define PG_reclaim 17 /* To be reclaimed asap */ #define PG_nosave_free 18 /* Free, should not be written */ #define PG_uncached 19 /* Page has been mapped as uncached */ +#define PG_readahead 20 /* Reminder to do readahead */ /* * Global page accounting. One instance per CPU. Only unsigned longs are @@ -344,6 +345,10 @@ #define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags) #define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags) +#define PageReadahead(page) test_bit(PG_readahead, &(page)->flags) +#define __SetPageReadahead(page) __set_bit(PG_readahead, &(page)->flags) +#define TestClearPageReadahead(page) test_and_clear_bit(PG_readahead, &(page)->flags) + struct page; /* forward declaration */ int test_clear_page_dirty(struct page *page); diff -urN oldtree/include/linux/radix-tree.h newtree/include/linux/radix-tree.h --- oldtree/include/linux/radix-tree.h 2006-04-01 04:48:27.000000000 -0500 +++ newtree/include/linux/radix-tree.h 2006-04-01 08:51:11.608727500 -0500 @@ -23,12 +23,24 @@ #include #include +#define RADIX_TREE_MAP_SHIFT 6 +#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) +#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) + struct radix_tree_root { unsigned int height; gfp_t gfp_mask; struct radix_tree_node *rnode; }; +/* + * Lookaside cache to support access patterns with strong locality. 
+ */ +struct radix_tree_cache { + unsigned long first_index; + struct radix_tree_node *tree_node; +}; + #define RADIX_TREE_INIT(mask) { \ .height = 0, \ .gfp_mask = (mask), \ @@ -46,9 +58,18 @@ } while (0) int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); -void *radix_tree_lookup(struct radix_tree_root *, unsigned long); -void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); +void *radix_tree_lookup_node(struct radix_tree_root *, unsigned long, + unsigned int); +void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long); void *radix_tree_delete(struct radix_tree_root *, unsigned long); +unsigned int radix_tree_cache_count(struct radix_tree_cache *cache); +void *radix_tree_cache_lookup_node(struct radix_tree_root *root, + struct radix_tree_cache *cache, + unsigned long index, unsigned int level); +unsigned long radix_tree_scan_hole_backward(struct radix_tree_root *root, + unsigned long index, unsigned long max_scan); +unsigned long radix_tree_scan_hole(struct radix_tree_root *root, + unsigned long index, unsigned long max_scan); unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items); @@ -70,4 +91,61 @@ preempt_enable(); } +/** + * radix_tree_lookup - perform lookup operation on a radix tree + * @root: radix tree root + * @index: index key + * + * Lookup the item at the position @index in the radix tree @root. + */ +static inline void *radix_tree_lookup(struct radix_tree_root *root, + unsigned long index) +{ + return radix_tree_lookup_node(root, index, 0); +} + +/** + * radix_tree_cache_init - init a look-aside cache + * @cache: look-aside cache + * + * Init the radix tree look-aside cache @cache. 
+ */ +static inline void radix_tree_cache_init(struct radix_tree_cache *cache) +{ + cache->first_index = RADIX_TREE_MAP_MASK; + cache->tree_node = NULL; +} + +/** + * radix_tree_cache_lookup - cached lookup on a radix tree + * @root: radix tree root + * @cache: look-aside cache + * @index: index key + * + * Lookup the item at the position @index in the radix tree @root, + * and make use of @cache to speedup the lookup process. + */ +static inline void *radix_tree_cache_lookup(struct radix_tree_root *root, + struct radix_tree_cache *cache, + unsigned long index) +{ + return radix_tree_cache_lookup_node(root, cache, index, 0); +} + +static inline unsigned int radix_tree_cache_size(struct radix_tree_cache *cache) +{ + return RADIX_TREE_MAP_SIZE; +} + +static inline int radix_tree_cache_full(struct radix_tree_cache *cache) +{ + return radix_tree_cache_count(cache) == radix_tree_cache_size(cache); +} + +static inline unsigned long +radix_tree_cache_first_index(struct radix_tree_cache *cache) +{ + return cache->first_index; +} + #endif /* _LINUX_RADIX_TREE_H */ diff -urN oldtree/include/linux/swap-prefetch.h newtree/include/linux/swap-prefetch.h --- oldtree/include/linux/swap-prefetch.h 1969-12-31 19:00:00.000000000 -0500 +++ newtree/include/linux/swap-prefetch.h 2006-04-01 08:53:58.011127000 -0500 @@ -0,0 +1,55 @@ +#ifndef SWAP_PREFETCH_H_INCLUDED +#define SWAP_PREFETCH_H_INCLUDED + +#ifdef CONFIG_SWAP_PREFETCH +/* mm/swap_prefetch.c */ +extern int swap_prefetch; +struct swapped_entry { + swp_entry_t swp_entry; /* The actual swap entry */ + struct list_head swapped_list; /* Linked list of entries */ +#if MAX_NUMNODES > 1 + int node; /* Node id */ +#endif +} __attribute__((packed)); + +static inline void store_swap_entry_node(struct swapped_entry *entry, + struct page *page) +{ +#if MAX_NUMNODES > 1 + entry->node = page_to_nid(page); +#endif +} + +static inline int get_swap_entry_node(struct swapped_entry *entry) +{ +#if MAX_NUMNODES > 1 + return entry->node; +#else + 
return 0; +#endif +} + +extern void add_to_swapped_list(struct page *page); +extern void remove_from_swapped_list(const unsigned long index); +extern void delay_swap_prefetch(void); +extern void prepare_swap_prefetch(void); + +#else /* CONFIG_SWAP_PREFETCH */ +static inline void add_to_swapped_list(struct page *__unused) +{ +} + +static inline void prepare_swap_prefetch(void) +{ +} + +static inline void remove_from_swapped_list(const unsigned long __unused) +{ +} + +static inline void delay_swap_prefetch(void) +{ +} +#endif /* CONFIG_SWAP_PREFETCH */ + +#endif /* SWAP_PREFETCH_H_INCLUDED */ diff -urN oldtree/include/linux/swap.h newtree/include/linux/swap.h --- oldtree/include/linux/swap.h 2006-04-01 04:48:27.000000000 -0500 +++ newtree/include/linux/swap.h 2006-04-01 08:54:48.062255000 -0500 @@ -164,6 +164,7 @@ /* linux/mm/swap.c */ extern void FASTCALL(lru_cache_add(struct page *)); extern void FASTCALL(lru_cache_add_active(struct page *)); +extern void FASTCALL(lru_cache_add_tail(struct page *)); extern void FASTCALL(activate_page(struct page *)); extern void FASTCALL(mark_page_accessed(struct page *)); extern void lru_add_drain(void); @@ -174,7 +175,8 @@ /* linux/mm/vmscan.c */ extern int try_to_free_pages(struct zone **, gfp_t); extern int shrink_all_memory(int); -extern int vm_swappiness; +extern int vm_mapped; +extern int vm_hardmaplimit; #ifdef CONFIG_NUMA extern int zone_reclaim_mode; @@ -235,6 +237,7 @@ extern struct page * lookup_swap_cache(swp_entry_t); extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, unsigned long addr); +extern int add_to_swap_cache(struct page *page, swp_entry_t entry); /* linux/mm/swapfile.c */ extern long total_swap_pages; extern unsigned int nr_swapfiles; diff -urN oldtree/include/linux/sysctl.h newtree/include/linux/sysctl.h --- oldtree/include/linux/sysctl.h 2006-04-01 04:48:27.000000000 -0500 +++ newtree/include/linux/sysctl.h 2006-04-01 08:55:15.339959750 -0500 @@ -172,7 +172,7 @@ 
VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */ VM_PAGEBUF=17, /* struct: Control pagebuf parameters */ VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */ - VM_SWAPPINESS=19, /* Tendency to steal mapped memory */ + VM_MAPPED=19, /* percent mapped min while evicting cache */ VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */ VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ @@ -186,6 +186,10 @@ VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */ VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */ VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */ + VM_SWAP_PREFETCH=33, /* swap prefetch */ + VM_HARDMAPLIMIT=34, /* Make mapped a hard limit */ + VM_READAHEAD_RATIO=35, /* percent of read-ahead size to thrashing-threshold */ + VM_READAHEAD_HIT_RATE=36, /* one accessed page legitimizes so many read-ahead pages */ }; diff -urN oldtree/include/linux/writeback.h newtree/include/linux/writeback.h --- oldtree/include/linux/writeback.h 2006-04-01 04:48:27.000000000 -0500 +++ newtree/include/linux/writeback.h 2006-04-01 08:51:11.624728500 -0500 @@ -85,6 +85,12 @@ void laptop_sync_completion(void); void throttle_vm_writeout(void); +extern struct timer_list laptop_mode_wb_timer; +static inline int laptop_spinned_down(void) +{ + return !timer_pending(&laptop_mode_wb_timer); +} + /* These are exported to sysctl. */ extern int dirty_background_ratio; extern int vm_dirty_ratio; diff -urN oldtree/init/Kconfig newtree/init/Kconfig --- oldtree/init/Kconfig 2006-04-01 04:48:27.000000000 -0500 +++ newtree/init/Kconfig 2006-04-01 08:53:57.979125000 -0500 @@ -92,6 +92,28 @@ used to provide more virtual memory than the actual RAM present in your computer. If unsure say Y. 
+config SWAP_PREFETCH + bool "Support for prefetching swapped memory" + depends on SWAP + default y + ---help--- + This option will allow the kernel to prefetch swapped memory pages + when idle. The pages will be kept on both swap and in swap_cache + thus avoiding the need for further I/O if either ram or swap space + is required. + + What this will do on workstations is slowly bring back applications + that have swapped out after memory intensive workloads back into + physical ram if you have free ram at a later stage and the machine + is relatively idle. This means that when you come back to your + computer after leaving it idle for a while, applications will come + to life faster. Note that your swap usage will appear to increase + but these are cached pages, can be dropped freely by the vm, and it + should stabilise around 50% swap usage maximum. + + Workstations and multiuser workstation servers will most likely want + to say Y. + config SYSVIPC bool "System V IPC" ---help--- diff -urN oldtree/kernel/Kconfig.hz newtree/kernel/Kconfig.hz --- oldtree/kernel/Kconfig.hz 2006-04-01 04:48:27.000000000 -0500 +++ newtree/kernel/Kconfig.hz 2006-04-01 08:53:08.900057750 -0500 @@ -4,7 +4,7 @@ choice prompt "Timer frequency" - default HZ_250 + default HZ_1000 help Allows the configuration of the timer frequency. It is customary to have the timer interrupt run at 1000 HZ but 100 HZ may be more @@ -21,14 +21,17 @@ help 100 HZ is a typical choice for servers, SMP and NUMA systems with lots of processors that may show reduced performance if - too many timer interrupts are occurring. + too many timer interrupts are occurring. Laptops may also show + improved battery life. - config HZ_250 + config HZ_250_NODEFAULT bool "250 HZ" help - 250 HZ is a good compromise choice allowing server performance - while also showing good interactive responsiveness even - on SMP and NUMA systems. 
+ 250 HZ is a lousy compromise choice allowing server interactivity + while also showing desktop throughput and no extra power saving on + laptops. Good for when you can't make up your mind. + + Recommend 100 or 1000 instead. config HZ_1000 bool "1000 HZ" @@ -41,6 +44,6 @@ config HZ int default 100 if HZ_100 - default 250 if HZ_250 + default 250 if HZ_250_NODEFAULT default 1000 if HZ_1000 diff -urN oldtree/kernel/sysctl.c newtree/kernel/sysctl.c --- oldtree/kernel/sysctl.c 2006-04-01 04:48:27.000000000 -0500 +++ newtree/kernel/sysctl.c 2006-04-01 08:54:48.082256250 -0500 @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -73,6 +74,12 @@ extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; +#if defined(CONFIG_ADAPTIVE_READAHEAD) +extern int readahead_ratio; +extern int readahead_hit_rate; +static int one = 1; +#endif + #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) int unknown_nmi_panic; extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *, @@ -683,6 +690,28 @@ .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_ADAPTIVE_READAHEAD + { + .ctl_name = VM_READAHEAD_RATIO, + .procname = "readahead_ratio", + .data = &readahead_ratio, + .maxlen = sizeof(readahead_ratio), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, + { + .ctl_name = VM_READAHEAD_HIT_RATE, + .procname = "readahead_hit_rate", + .data = &readahead_hit_rate, + .maxlen = sizeof(readahead_hit_rate), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &one, + }, +#endif { .ctl_name = 0 } }; @@ -764,16 +793,24 @@ .proc_handler = &proc_dointvec, }, { - .ctl_name = VM_SWAPPINESS, - .procname = "swappiness", - .data = &vm_swappiness, - .maxlen = sizeof(vm_swappiness), + .ctl_name = VM_MAPPED, + .procname = "mapped", + .data = &vm_mapped, + .maxlen = sizeof(vm_mapped), .mode = 0644, .proc_handler = &proc_dointvec_minmax, .strategy =
&sysctl_intvec, .extra1 = &zero, .extra2 = &one_hundred, }, + { + .ctl_name = VM_HARDMAPLIMIT, + .procname = "hardmaplimit", + .data = &vm_hardmaplimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #ifdef CONFIG_HUGETLB_PAGE { .ctl_name = VM_HUGETLB_PAGES, @@ -916,6 +953,16 @@ .strategy = &sysctl_jiffies, }, #endif +#ifdef CONFIG_SWAP_PREFETCH + { + .ctl_name = VM_SWAP_PREFETCH, + .procname = "swap_prefetch", + .data = &swap_prefetch, + .maxlen = sizeof(swap_prefetch), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = 0 } }; diff -urN oldtree/lib/radix-tree.c newtree/lib/radix-tree.c --- oldtree/lib/radix-tree.c 2006-04-01 04:48:27.000000000 -0500 +++ newtree/lib/radix-tree.c 2006-04-01 08:51:11.640729500 -0500 @@ -32,16 +32,7 @@ #include -#ifdef __KERNEL__ -#define RADIX_TREE_MAP_SHIFT 6 -#else -#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ -#endif #define RADIX_TREE_TAGS 2 - -#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) -#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) - #define RADIX_TREE_TAG_LONGS \ ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) @@ -286,32 +277,89 @@ } EXPORT_SYMBOL(radix_tree_insert); -static inline void **__lookup_slot(struct radix_tree_root *root, - unsigned long index) +/** + * radix_tree_lookup_node - low level lookup routine + * @root: radix tree root + * @index: index key + * @level: stop at that many levels from the tree leaf + * + * Lookup the item at the position @index in the radix tree @root. + * The return value is: + * @level == 0: page at @index; + * @level == 1: the corresponding bottom level tree node; + * @level < height: (@level-1)th parent node of the bottom node + * that contains @index; + * @level >= height: the root node. 
+ */ +void *radix_tree_lookup_node(struct radix_tree_root *root, + unsigned long index, unsigned int level) { unsigned int height, shift; - struct radix_tree_node **slot; + struct radix_tree_node *slot; height = root->height; if (index > radix_tree_maxindex(height)) return NULL; shift = (height-1) * RADIX_TREE_MAP_SHIFT; - slot = &root->rnode; + slot = root->rnode; - while (height > 0) { - if (*slot == NULL) + while (height > level) { + if (slot == NULL) return NULL; - slot = (struct radix_tree_node **) - ((*slot)->slots + - ((index >> shift) & RADIX_TREE_MAP_MASK)); + slot = slot->slots[(index >> shift) & RADIX_TREE_MAP_MASK]; shift -= RADIX_TREE_MAP_SHIFT; height--; } - return (void **)slot; + return slot; +} +EXPORT_SYMBOL(radix_tree_lookup_node); + +/** + * radix_tree_cache_lookup_node - cached lookup node + * @root: radix tree root + * @cache: look-aside cache + * @index: index key + * + * Lookup the item at the position @index in the radix tree @root, + * and return the node @level levels from the bottom in the search path. + * + * @cache stores the last accessed upper level tree node by this + * function, and is always checked first before searching in the tree. + * It can improve speed for access patterns with strong locality. + * + * NOTE: + * - The cache becomes invalid on leaving the lock; + * - Do not intermix calls with different @level. 
+ */ +void *radix_tree_cache_lookup_node(struct radix_tree_root *root, + struct radix_tree_cache *cache, + unsigned long index, unsigned int level) +{ + struct radix_tree_node *node; + unsigned long i; + unsigned long mask; + + if (level >= root->height) + return root->rnode; + + i = ((index >> (level * RADIX_TREE_MAP_SHIFT)) & RADIX_TREE_MAP_MASK); + mask = ~((RADIX_TREE_MAP_SIZE << (level * RADIX_TREE_MAP_SHIFT)) - 1); + + if ((index & mask) == cache->first_index) + return cache->tree_node->slots[i]; + + node = radix_tree_lookup_node(root, index, level + 1); + if (!node) + return NULL; + + cache->tree_node = node; + cache->first_index = (index & mask); + return node->slots[i]; } +EXPORT_SYMBOL(radix_tree_cache_lookup_node); /** * radix_tree_lookup_slot - lookup a slot in a radix tree @@ -323,25 +371,131 @@ */ void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) { - return __lookup_slot(root, index); + struct radix_tree_node *node; + + node = radix_tree_lookup_node(root, index, 1); + return node ? node->slots + (index & RADIX_TREE_MAP_MASK) : NULL; } EXPORT_SYMBOL(radix_tree_lookup_slot); /** - * radix_tree_lookup - perform lookup operation on a radix tree + * radix_tree_cache_count - items in the cached node + * @cache: radix tree look-aside cache + * + * Query the number of items contained in the cached node. + */ +unsigned int radix_tree_cache_count(struct radix_tree_cache *cache) +{ + if (!(cache->first_index & RADIX_TREE_MAP_MASK)) + return cache->tree_node->count; + else + return 0; +} +EXPORT_SYMBOL(radix_tree_cache_count); + +/** + * radix_tree_scan_hole_backward - scan backward for hole * @root: radix tree root * @index: index key + * @max_scan: advice on max items to scan (it may scan a little more) * - * Lookup the item at the position @index in the radix tree @root. + * Scan backward from @index for a hole/empty item, stop when + * - hit hole + * - @max_scan or more items scanned + * - hit index 0 + * + * Return the corresponding index.
*/ -void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) +unsigned long radix_tree_scan_hole_backward(struct radix_tree_root *root, + unsigned long index, unsigned long max_scan) { - void **slot; + struct radix_tree_cache cache; + struct radix_tree_node *node; + unsigned long origin; + int i; + + origin = index; + radix_tree_cache_init(&cache); + + while (origin - index < max_scan) { + node = radix_tree_cache_lookup_node(root, &cache, index, 1); + if (!node) + break; + + if (node->count == RADIX_TREE_MAP_SIZE) { + index = (index - RADIX_TREE_MAP_SIZE) | + RADIX_TREE_MAP_MASK; + goto check_underflow; + } + + for (i = index & RADIX_TREE_MAP_MASK; i >= 0; i--, index--) { + if (!node->slots[i]) + goto out; + } + +check_underflow: + if (unlikely(index == ULONG_MAX)) { + index = 0; + break; + } + } + +out: + return index; +} +EXPORT_SYMBOL(radix_tree_scan_hole_backward); - slot = __lookup_slot(root, index); - return slot != NULL ? *slot : NULL; +/** + * radix_tree_scan_hole - scan for hole + * @root: radix tree root + * @index: index key + * @max_scan: advice on max items to scan (it may scan a little more) + * + * Scan forward from @index for a hole/empty item, stop when + * - hit hole + * - hit EOF + * - hit index ULONG_MAX + * - @max_scan or more items scanned + * + * Return the corresponding index.
+ */ +unsigned long radix_tree_scan_hole(struct radix_tree_root *root, + unsigned long index, unsigned long max_scan) +{ + struct radix_tree_cache cache; + struct radix_tree_node *node; + unsigned long origin; + int i; + + origin = index; + radix_tree_cache_init(&cache); + + while (index - origin < max_scan) { + node = radix_tree_cache_lookup_node(root, &cache, index, 1); + if (!node) + break; + + if (node->count == RADIX_TREE_MAP_SIZE) { + index = (index | RADIX_TREE_MAP_MASK) + 1; + goto check_overflow; + } + + for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; + i++, index++) { + if (!node->slots[i]) + goto out; + } + +check_overflow: + if (unlikely(!index)) { + index = ULONG_MAX; + break; + } + } +out: + return index; } -EXPORT_SYMBOL(radix_tree_lookup); +EXPORT_SYMBOL(radix_tree_scan_hole); /** * radix_tree_tag_set - set a tag on a radix tree node diff -urN oldtree/mm/Kconfig newtree/mm/Kconfig --- oldtree/mm/Kconfig 2006-04-01 04:48:27.000000000 -0500 +++ newtree/mm/Kconfig 2006-04-01 08:51:11.644729750 -0500 @@ -139,3 +139,58 @@ config MIGRATION def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM depends on SWAP + +# +# Adaptive file readahead +# +config ADAPTIVE_READAHEAD + bool "Adaptive file readahead (EXPERIMENTAL)" + default n + depends on EXPERIMENTAL + help + Readahead is a technique employed by the kernel in an attempt + to improve file reading performance. If the kernel has reason + to believe that a particular file is being read sequentially, + it will attempt to read blocks from the file into memory before + the application requests them. When readahead works, it speeds + up the system's throughput, since the reading application does + not have to wait for its requests. When readahead fails, instead, + it generates useless I/O and occupies memory pages which are + needed for some other purpose. + + Normally, the kernel uses a stock readahead logic that is well + understood and well tuned.
This option enables a much more complex and + feature rich one. It is more aggressive and memory efficient in + doing readahead, and supports some less-common access patterns such + as reading backward and reading sparsely. However, due to the great + diversity of real world applications, it might not fit everyone. + + Please refer to Documentation/sysctl/vm.txt for tunable parameters. + + Say Y here if you are building a kernel for file servers. + Say N if you are unsure. + +config DEBUG_READAHEAD + bool "Readahead debug and accounting" + default n + depends on ADAPTIVE_READAHEAD + select DEBUG_FS + help + This option injects extra code to dump detailed debug traces and do + readahead events accounting. + + To actually get the data: + + mkdir /debug + mount -t debugfs none /debug + + After that you can do the following: + + echo > /debug/readahead/events # reset the counters + cat /debug/readahead/events # check the counters + + echo 1 > /debug/readahead/debug_level # show printk traces + echo 2 > /debug/readahead/debug_level # show verbose printk traces + echo 0 > /debug/readahead/debug_level # stop filling my kern.log + + Say N, unless you have readahead performance problems.
diff -urN oldtree/mm/Makefile newtree/mm/Makefile --- oldtree/mm/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/mm/Makefile 2006-04-01 08:53:57.987125500 -0500 @@ -13,6 +13,7 @@ prio_tree.o util.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o +obj-$(CONFIG_SWAP_PREFETCH) += swap_prefetch.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o diff -urN oldtree/mm/filemap.c newtree/mm/filemap.c --- oldtree/mm/filemap.c 2006-04-01 04:48:27.000000000 -0500 +++ newtree/mm/filemap.c 2006-04-01 08:51:11.648730000 -0500 @@ -42,6 +42,12 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs); +#ifdef CONFIG_DEBUG_READAHEAD +extern u32 readahead_debug_level; +#else +#define readahead_debug_level 0 +#endif /* CONFIG_DEBUG_READAHEAD */ + /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. @@ -746,10 +752,12 @@ unsigned long prev_index; loff_t isize; struct page *cached_page; + struct page *prev_page; int error; struct file_ra_state ra = *_ra; cached_page = NULL; + prev_page = NULL; index = *ppos >> PAGE_CACHE_SHIFT; next_index = index; prev_index = ra.prev_page; @@ -760,6 +768,10 @@ if (!isize) goto out; + if (readahead_debug_level >= 5) + printk(KERN_DEBUG "read-file(ino=%lu, req=%lu+%lu)\n", + inode->i_ino, index, last_index - index); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; for (;;) { struct page *page; @@ -778,16 +790,45 @@ nr = nr - offset; cond_resched(); - if (index == next_index) + + if (!prefer_adaptive_readahead() && index == next_index) next_index = page_cache_readahead(mapping, &ra, filp, index, last_index - index); find_page: page = find_get_page(mapping, index); + if (prefer_adaptive_readahead()) { + if (unlikely(page == NULL)) { + ra.prev_page = prev_index; + page_cache_readahead_adaptive(mapping, &ra, + filp, prev_page, NULL, + *ppos >> PAGE_CACHE_SHIFT, + index, 
last_index); + page = find_get_page(mapping, index); + } else if (PageReadahead(page)) { + ra.prev_page = prev_index; + page_cache_readahead_adaptive(mapping, &ra, + filp, prev_page, page, + *ppos >> PAGE_CACHE_SHIFT, + index, last_index); + } + } if (unlikely(page == NULL)) { - handle_ra_miss(mapping, &ra, index); + if (!prefer_adaptive_readahead()) + handle_ra_miss(mapping, &ra, index); goto no_cached_page; } + + if (prev_page) + page_cache_release(prev_page); + prev_page = page; + + readahead_cache_hit(&ra, page); + if (readahead_debug_level >= 7) + printk(KERN_DEBUG "read-page(ino=%lu, idx=%lu, io=%s)\n", + inode->i_ino, index, + PageUptodate(page) ? "hit" : "miss"); + if (!PageUptodate(page)) goto page_not_up_to_date; page_ok: @@ -822,7 +863,6 @@ index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; - page_cache_release(page); if (ret == nr && desc->count) continue; goto out; @@ -834,7 +874,6 @@ /* Did it get unhashed before we got the lock? */ if (!page->mapping) { unlock_page(page); - page_cache_release(page); continue; } @@ -864,7 +903,6 @@ * invalidate_inode_pages got it */ unlock_page(page); - page_cache_release(page); goto find_page; } unlock_page(page); @@ -885,7 +923,6 @@ isize = i_size_read(inode); end_index = (isize - 1) >> PAGE_CACHE_SHIFT; if (unlikely(!isize || index > end_index)) { - page_cache_release(page); goto out; } @@ -894,7 +931,6 @@ if (index == end_index) { nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; if (nr <= offset) { - page_cache_release(page); goto out; } } @@ -904,7 +940,6 @@ readpage_error: /* UHHUH! A synchronous read error occurred. 
Report it */ desc->error = error; - page_cache_release(page); goto out; no_cached_page: @@ -929,15 +964,22 @@ } page = cached_page; cached_page = NULL; + if (prev_page) + page_cache_release(prev_page); + prev_page = page; goto readpage; } out: *_ra = ra; + if (prefer_adaptive_readahead()) + _ra->prev_page = prev_index; *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; if (cached_page) page_cache_release(cached_page); + if (prev_page) + page_cache_release(prev_page); if (filp) file_accessed(filp); } @@ -1216,6 +1258,7 @@ unsigned long size, pgoff; int did_readaround = 0, majmin = VM_FAULT_MINOR; + ra->flags |= RA_FLAG_MMAP; pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; retry_all: @@ -1233,7 +1276,7 @@ * * For sequential accesses, we use the generic readahead logic. */ - if (VM_SequentialReadHint(area)) + if (!prefer_adaptive_readahead() && VM_SequentialReadHint(area)) page_cache_readahead(mapping, ra, file, pgoff, 1); /* @@ -1241,11 +1284,24 @@ */ retry_find: page = find_get_page(mapping, pgoff); + if (prefer_adaptive_readahead() && VM_SequentialReadHint(area)) { + if (!page) { + page_cache_readahead_adaptive(mapping, ra, + file, NULL, NULL, + pgoff, pgoff, pgoff + 1); + page = find_get_page(mapping, pgoff); + } else if (PageReadahead(page)) { + page_cache_readahead_adaptive(mapping, ra, + file, NULL, page, + pgoff, pgoff, pgoff + 1); + } + } if (!page) { unsigned long ra_pages; if (VM_SequentialReadHint(area)) { - handle_ra_miss(mapping, ra, pgoff); + if (!prefer_adaptive_readahead()) + handle_ra_miss(mapping, ra, pgoff); goto no_cached_page; } ra->mmap_miss++; @@ -1282,6 +1338,14 @@ if (!did_readaround) ra->mmap_hit++; + readahead_cache_hit(ra, page); + if (readahead_debug_level >= 6) + printk(KERN_DEBUG "read-mmap(ino=%lu, idx=%lu, hint=%s, io=%s)\n", + inode->i_ino, pgoff, + VM_RandomReadHint(area) ? "random" : + (VM_SequentialReadHint(area) ? "sequential" : "none"), + PageUptodate(page) ? 
"hit" : "miss"); + /* * Ok, found a page in the page cache, now we need to check * that it's up-to-date. @@ -1296,6 +1360,8 @@ mark_page_accessed(page); if (type) *type = majmin; + if (prefer_adaptive_readahead()) + ra->prev_page = page->index; return page; outside_data_content: diff -urN oldtree/mm/memory.c newtree/mm/memory.c --- oldtree/mm/memory.c 2006-04-01 04:48:27.000000000 -0500 +++ newtree/mm/memory.c 2006-04-01 08:51:11.652730250 -0500 @@ -1993,6 +1993,7 @@ page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_none(*page_table)) goto release; + inc_readahead_aging(); inc_mm_counter(mm, anon_rss); lru_cache_add_active(page); page_add_new_anon_rmap(page, vma, address); diff -urN oldtree/mm/page-writeback.c newtree/mm/page-writeback.c --- oldtree/mm/page-writeback.c 2006-04-01 04:48:27.000000000 -0500 +++ newtree/mm/page-writeback.c 2006-04-01 08:54:33.701357500 -0500 @@ -69,18 +69,18 @@ /* * The generator of dirty data starts writeback at this percentage */ -int vm_dirty_ratio = 40; +int vm_dirty_ratio = 33; /* * The interval between `kupdate'-style writebacks, in centiseconds * (hundredths of a second) */ -int dirty_writeback_centisecs = 5 * 100; +int dirty_writeback_centisecs = 3 * 100; /* * The longest number of centiseconds for which data is allowed to remain dirty */ -int dirty_expire_centisecs = 30 * 100; +int dirty_expire_centisecs = 15 * 100; /* * Flag that makes the machine dump writes/reads and block dirtyings. @@ -370,7 +370,7 @@ static void laptop_timer_fn(unsigned long unused); static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0); -static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); +DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); /* * Periodic writeback of "old" data. 
diff -urN oldtree/mm/page_alloc.c newtree/mm/page_alloc.c --- oldtree/mm/page_alloc.c 2006-04-01 04:48:27.000000000 -0500 +++ newtree/mm/page_alloc.c 2006-04-01 08:54:45.770111750 -0500 @@ -532,7 +532,7 @@ if (PageReserved(page)) return 1; - page->flags &= ~(1 << PG_uptodate | 1 << PG_error | + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked | 1 << PG_mappedtodisk); set_page_private(page, 0); @@ -1436,6 +1436,7 @@ " min:%lukB" " low:%lukB" " high:%lukB" + " lots:%lukB" " active:%lukB" " inactive:%lukB" " present:%lukB" @@ -1447,6 +1448,7 @@ K(zone->pages_min), K(zone->pages_low), K(zone->pages_high), + K(zone->pages_lots), K(zone->nr_active), K(zone->nr_inactive), K(zone->present_pages), @@ -2227,6 +2229,7 @@ "\n min %lu" "\n low %lu" "\n high %lu" + "\n lots %lu" "\n active %lu" "\n inactive %lu" "\n scanned %lu (a: %lu i: %lu)" @@ -2236,6 +2239,7 @@ zone->pages_min, zone->pages_low, zone->pages_high, + zone->pages_lots, zone->nr_active, zone->nr_inactive, zone->pages_scanned, @@ -2538,6 +2542,7 @@ zone->pages_low = zone->pages_min + tmp / 4; zone->pages_high = zone->pages_min + tmp / 2; + zone->pages_lots = zone->pages_min + tmp; spin_unlock_irqrestore(&zone->lru_lock, flags); } } diff -urN oldtree/mm/readahead.c newtree/mm/readahead.c --- oldtree/mm/readahead.c 2006-04-01 04:48:27.000000000 -0500 +++ newtree/mm/readahead.c 2006-04-01 08:51:11.664731000 -0500 @@ -14,6 +14,300 @@ #include #include #include +#include +#include +#include + +/* The default max/min read-ahead pages. 
*/ +#define KB(size) (((size)*1024 + PAGE_CACHE_SIZE-1) / PAGE_CACHE_SIZE) +#define MAX_RA_PAGES KB(VM_MAX_READAHEAD) +#define MIN_RA_PAGES KB(VM_MIN_READAHEAD) +#define MIN_NFSD_PAGES KB(NFSSVC_MAXBLKSIZE/1024) + +#define next_page(pg) (list_entry((pg)->lru.prev, struct page, lru)) +#define prev_page(pg) (list_entry((pg)->lru.next, struct page, lru)) + +#ifdef CONFIG_ADAPTIVE_READAHEAD +/* + * Adaptive read-ahead parameters. + */ + +/* In laptop mode, poll delayed look-ahead on every ## pages read. */ +#define LAPTOP_POLL_INTERVAL 16 + +/* Set look-ahead size to 1/# of the thrashing-threshold. */ +#define LOOKAHEAD_RATIO 8 + +/* Set read-ahead size to ##% of the thrashing-threshold. */ +int readahead_ratio = 50; +EXPORT_SYMBOL(readahead_ratio); + +/* Readahead as long as cache hit ratio keeps above 1/##. */ +int readahead_hit_rate = 2; +EXPORT_SYMBOL(readahead_hit_rate); + +/* + * Measures the aging process of cold pages. + * Mainly increased on fresh page references to make it smooth. + */ +DEFINE_PER_CPU(unsigned long, readahead_aging); +EXPORT_PER_CPU_SYMBOL(readahead_aging); + +/* + * Detailed classification of read-ahead behaviors. + */ +#define RA_CLASS_SHIFT 4 +#define RA_CLASS_MASK ((1 << RA_CLASS_SHIFT) - 1) +enum ra_class { + RA_CLASS_ALL, + RA_CLASS_NEWFILE, + RA_CLASS_STATE, + RA_CLASS_CONTEXT, + RA_CLASS_CONTEXT_AGGRESSIVE, + RA_CLASS_BACKWARD, + RA_CLASS_THRASHING, + RA_CLASS_SEEK, + RA_CLASS_END, +}; +#endif /* CONFIG_ADAPTIVE_READAHEAD */ + +/* + * Read-ahead events accounting. + */ +#ifdef CONFIG_DEBUG_READAHEAD +#include +#include +#include +#include + +#define DEBUG_READAHEAD_RADIXTREE + +/* Read-ahead events to be accounted. 
*/ +enum ra_event { + RA_EVENT_CACHE_MISS, /* read cache misses */ + RA_EVENT_READRANDOM, /* random reads */ + RA_EVENT_IO_CONGESTION, /* io congestion */ + RA_EVENT_IO_CACHE_HIT, /* canceled io due to cache hit */ + RA_EVENT_IO_BLOCK, /* read on locked page */ + + RA_EVENT_READAHEAD, /* read-ahead issued */ + RA_EVENT_READAHEAD_HIT, /* read-ahead page hit */ + RA_EVENT_LOOKAHEAD, /* look-ahead issued */ + RA_EVENT_LOOKAHEAD_HIT, /* look-ahead mark hit */ + RA_EVENT_LOOKAHEAD_NOACTION, /* look-ahead mark ignored */ + RA_EVENT_READAHEAD_MMAP, /* read-ahead for memory mapped file */ + RA_EVENT_READAHEAD_EOF, /* read-ahead reaches EOF */ + RA_EVENT_READAHEAD_SHRINK, /* ra_size under previous la_size */ + RA_EVENT_READAHEAD_THRASHING, /* read-ahead thrashing happened */ + RA_EVENT_READAHEAD_MUTILATE, /* read-ahead request mutilated */ + RA_EVENT_READAHEAD_RESCUE, /* read-ahead rescued */ + + RA_EVENT_END +}; + +static const char * const ra_event_name[] = { + "cache_miss", + "read_random", + "io_congestion", + "io_cache_hit", + "io_block", + "readahead", + "readahead_hit", + "lookahead", + "lookahead_hit", + "lookahead_ignore", + "readahead_mmap", + "readahead_eof", + "readahead_shrink", + "readahead_thrash", + "readahead_mutilt", + "readahead_rescue", +}; + +static const char * const ra_class_name[] = { + "total", + "newfile", + "state", + "context", + "contexta", + "backward", + "onthrash", + "onraseek", + "none", +}; + +static unsigned long ra_events[RA_CLASS_END+1][RA_EVENT_END+1][2]; + +static inline void ra_account(struct file_ra_state *ra, + enum ra_event e, int pages) +{ + enum ra_class c; + + if (e == RA_EVENT_READAHEAD_HIT && pages < 0) { + c = (ra->flags >> RA_CLASS_SHIFT) & RA_CLASS_MASK; + pages = -pages; + } else if (ra) + c = ra->flags & RA_CLASS_MASK; + else + c = RA_CLASS_END; + + if (!c) + c = RA_CLASS_END; + + ra_events[c][e][0] += 1; + ra_events[c][e][1] += pages; + + if (e == RA_EVENT_READAHEAD) + ra_events[c][RA_EVENT_END][1] += pages * pages; +} + 
+static int ra_events_show(struct seq_file *s, void *_) +{ + int i; + int c; + int e; + static const char event_fmt[] = "%-16s"; + static const char class_fmt[] = "%10s"; + static const char item_fmt[] = "%10lu"; + static const char percent_format[] = "%9lu%%"; + static const char * const table_name[] = { + "[table requests]", + "[table pages]", + "[table summary]"}; + + for (i = 0; i <= 1; i++) { + for (e = 0; e <= RA_EVENT_END; e++) { + ra_events[0][e][i] = 0; + for (c = 1; c < RA_CLASS_END; c++) + ra_events[0][e][i] += ra_events[c][e][i]; + } + + seq_printf(s, event_fmt, table_name[i]); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, class_fmt, ra_class_name[c]); + seq_puts(s, "\n"); + + for (e = 0; e < RA_EVENT_END; e++) { + if (e == RA_EVENT_READAHEAD_HIT && i == 0) + continue; + if (e == RA_EVENT_IO_BLOCK && i == 1) + continue; + + seq_printf(s, event_fmt, ra_event_name[e]); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, item_fmt, ra_events[c][e][i]); + seq_puts(s, "\n"); + } + seq_puts(s, "\n"); + } + + seq_printf(s, event_fmt, table_name[2]); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, class_fmt, ra_class_name[c]); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "random_rate"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, percent_format, + (ra_events[c][RA_EVENT_READRANDOM][0] * 100) / + ((ra_events[c][RA_EVENT_READRANDOM][0] + + ra_events[c][RA_EVENT_READAHEAD][0]) | 1)); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "ra_hit_rate"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, percent_format, + (ra_events[c][RA_EVENT_READAHEAD_HIT][1] * 100) / + (ra_events[c][RA_EVENT_READAHEAD][1] | 1)); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "la_hit_rate"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, percent_format, + (ra_events[c][RA_EVENT_LOOKAHEAD_HIT][0] * 100) / + (ra_events[c][RA_EVENT_LOOKAHEAD][0] | 1)); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "var_ra_size"); + for (c = 0; c <= 
RA_CLASS_END; c++) + seq_printf(s, item_fmt, + (ra_events[c][RA_EVENT_END][1] - + ra_events[c][RA_EVENT_READAHEAD][1] * + (ra_events[c][RA_EVENT_READAHEAD][1] / + (ra_events[c][RA_EVENT_READAHEAD][0] | 1))) / + (ra_events[c][RA_EVENT_READAHEAD][0] | 1)); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "avg_ra_size"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, item_fmt, + (ra_events[c][RA_EVENT_READAHEAD][1] + + ra_events[c][RA_EVENT_READAHEAD][0] / 2) / + (ra_events[c][RA_EVENT_READAHEAD][0] | 1)); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "avg_la_size"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, item_fmt, + (ra_events[c][RA_EVENT_LOOKAHEAD][1] + + ra_events[c][RA_EVENT_LOOKAHEAD][0] / 2) / + (ra_events[c][RA_EVENT_LOOKAHEAD][0] | 1)); + seq_puts(s, "\n"); + + return 0; +} + +static int ra_events_open(struct inode *inode, struct file *file) +{ + return single_open(file, ra_events_show, NULL); +} + +static ssize_t ra_events_write(struct file *file, const char __user *buf, + size_t size, loff_t *offset) +{ + memset(ra_events, 0, sizeof(ra_events)); + return 1; +} + +struct file_operations ra_events_fops = { + .owner = THIS_MODULE, + .open = ra_events_open, + .write = ra_events_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +u32 readahead_debug_level = 0; +u32 disable_stateful_method = 0; + +static int __init readahead_init(void) +{ + struct dentry *root; + + root = debugfs_create_dir("readahead", NULL); + + debugfs_create_file("events", 0644, root, NULL, &ra_events_fops); + + debugfs_create_u32("debug_level", 0644, root, &readahead_debug_level); + debugfs_create_bool("disable_stateful_method", 0644, root, + &disable_stateful_method); + + return 0; +} + +module_init(readahead_init) +#else +#define ra_account(ra, e, pages) do { } while (0) +#define readahead_debug_level (0) +#define disable_stateful_method (0) +#endif /* CONFIG_DEBUG_READAHEAD */ + +#define dprintk(args...) 
\ + do { if (readahead_debug_level >= 1) printk(KERN_DEBUG args); } while(0) +#define ddprintk(args...) \ + do { if (readahead_debug_level >= 2) printk(KERN_DEBUG args); } while(0) + void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { @@ -21,7 +315,7 @@ EXPORT_SYMBOL(default_unplug_io_fn); struct backing_dev_info default_backing_dev_info = { - .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE, + .ra_pages = MAX_RA_PAGES, .state = 0, .capabilities = BDI_CAP_MAP_COPY, .unplug_io_fn = default_unplug_io_fn, @@ -49,7 +343,7 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra) { - return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; + return MIN_RA_PAGES; } static inline void ra_off(struct file_ra_state *ra) @@ -134,8 +428,10 @@ continue; } ret = filler(data, page); - if (!pagevec_add(&lru_pvec, page)) + if (!pagevec_add(&lru_pvec, page)) { + cond_resched(); __pagevec_lru_add(&lru_pvec); + } if (ret) { while (!list_empty(pages)) { struct page *victim; @@ -173,8 +469,10 @@ page->index, GFP_KERNEL)) { ret = mapping->a_ops->readpage(filp, page); if (ret != AOP_TRUNCATED_PAGE) { - if (!pagevec_add(&lru_pvec, page)) + if (!pagevec_add(&lru_pvec, page)) { + cond_resched(); __pagevec_lru_add(&lru_pvec); + } continue; } /* else fall through to release */ } @@ -257,7 +555,8 @@ */ static int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) + pgoff_t offset, unsigned long nr_to_read, + unsigned long lookahead_size) { struct inode *inode = mapping->host; struct page *page; @@ -270,7 +569,7 @@ if (isize == 0) goto out; - end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); + end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); /* * Preallocate as many pages as we will need. 
@@ -287,12 +586,15 @@ continue; read_unlock_irq(&mapping->tree_lock); + cond_resched(); page = page_cache_alloc_cold(mapping); read_lock_irq(&mapping->tree_lock); if (!page) break; page->index = page_offset; list_add(&page->lru, &page_pool); + if (page_idx == nr_to_read - lookahead_size) + __SetPageReadahead(page); ret++; } read_unlock_irq(&mapping->tree_lock); @@ -329,7 +631,7 @@ if (this_chunk > nr_to_read) this_chunk = nr_to_read; err = __do_page_cache_readahead(mapping, filp, - offset, this_chunk); + offset, this_chunk, 0); if (err < 0) { ret = err; break; @@ -338,6 +640,9 @@ offset += this_chunk; nr_to_read -= this_chunk; } + + ra_account(NULL, RA_EVENT_READAHEAD, ret); + return ret; } @@ -373,10 +678,16 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read) { + unsigned long ret; + if (bdi_read_congested(mapping->backing_dev_info)) return -1; - return __do_page_cache_readahead(mapping, filp, offset, nr_to_read); + ret = __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); + + ra_account(NULL, RA_EVENT_READAHEAD, ret); + + return ret; } /* @@ -396,7 +707,11 @@ if (!block && bdi_read_congested(mapping->backing_dev_info)) return 0; - actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read); + actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); + + ra_account(NULL, RA_EVENT_READAHEAD, actual); + dprintk("blockable-readahead(ino=%lu, ra=%lu+%lu) = %d\n", + mapping->host->i_ino, offset, nr_to_read, actual); return check_ra_success(ra, nr_to_read, actual); } @@ -442,7 +757,7 @@ * @req_size: hint: total size of the read which the caller is performing in * PAGE_CACHE_SIZE units * - * page_cache_readahead() is the main function. If performs the adaptive + * page_cache_readahead() is the main function. It performs the adaptive * readahead window size management and submits the readahead I/O. 
* * Note that @filp is purely used for passing on to the ->readpage[s]() @@ -572,3 +887,1187 @@ __get_zone_counts(&active, &inactive, &free, NODE_DATA(numa_node_id())); return min(nr, (inactive + free) / 2); } + +/* + * Adaptive read-ahead. + * + * Good read patterns are compact both in space and time. The read-ahead logic + * tries to grant larger read-ahead size to better readers under the constraint + * of system memory and load pressure. + * + * It employs two methods to estimate the max thrashing safe read-ahead size: + * 1. state based - the default one + * 2. context based - the failsafe one + * The integration of the dual methods has the merit of being agile and robust. + * It makes the overall design clean: special cases are handled in general by + * the stateless method, leaving the stateful one simple and fast. + * + * To improve throughput and decrease read delay, the logic 'looks ahead'. + * In most read-ahead chunks, one page will be selected and tagged with + * PG_readahead. Later when the page with PG_readahead is read, the logic + * will be notified to submit the next read-ahead chunk in advance. + * + * a read-ahead chunk + * +-----------------------------------------+ + * | # PG_readahead | + * +-----------------------------------------+ + * ^ When this page is read, notify me for the next read-ahead. + * + * + * Here are some variable names used frequently: + * + * |<------- la_size ------>| + * +-----------------------------------------+ + * | # | + * +-----------------------------------------+ + * ra_index -->|<---------------- ra_size -------------->| + * + */ + +#ifdef CONFIG_ADAPTIVE_READAHEAD + +/* + * The nature of read-ahead allows false tests to occur occasionally. + * Here we just do not bother to call get_page(), it's meaningless anyway. 
+ */ +static inline struct page *__find_page(struct address_space *mapping, + pgoff_t offset) +{ + return radix_tree_lookup(&mapping->page_tree, offset); +} + +static inline struct page *find_page(struct address_space *mapping, + pgoff_t offset) +{ + struct page *page; + + read_lock_irq(&mapping->tree_lock); + page = __find_page(mapping, offset); + read_unlock_irq(&mapping->tree_lock); + return page; +} + +/* + * Move pages in danger (of thrashing) to the head of inactive_list. + * Not expected to happen frequently. + */ +static unsigned long rescue_pages(struct page *page, unsigned long nr_pages) +{ + int pgrescue; + pgoff_t index; + struct zone *zone; + struct address_space *mapping; + + BUG_ON(!nr_pages || !page); + pgrescue = 0; + index = page_index(page); + mapping = page_mapping(page); + + dprintk("rescue_pages(ino=%lu, index=%lu nr=%lu)\n", + mapping->host->i_ino, index, nr_pages); + + for(;;) { + zone = page_zone(page); + spin_lock_irq(&zone->lru_lock); + + if (!PageLRU(page)) + goto out_unlock; + + while (page_mapping(page) == mapping && + page_index(page) == index) { + struct page *the_page = page; + page = next_page(page); + if (!PageActive(the_page) && + !PageLocked(the_page) && + page_count(the_page) == 1) { + list_move(&the_page->lru, &zone->inactive_list); + pgrescue++; + } + index++; + if (!--nr_pages) + goto out_unlock; + } + + spin_unlock_irq(&zone->lru_lock); + + cond_resched(); + page = find_page(mapping, index); + if (!page) + goto out; + } +out_unlock: + spin_unlock_irq(&zone->lru_lock); +out: + ra_account(NULL, RA_EVENT_READAHEAD_RESCUE, pgrescue); + return nr_pages; +} + +/* + * Set a new look-ahead mark at @new_index. + * Return 0 if the new mark is successfully set. 
+ */ +static inline int renew_lookahead(struct address_space *mapping, + struct file_ra_state *ra, + pgoff_t index, pgoff_t new_index) +{ + struct page *page; + + if (index == ra->lookahead_index && + new_index >= ra->readahead_index) + return 1; + + page = find_page(mapping, new_index); + if (!page) + return 1; + + __SetPageReadahead(page); + if (ra->lookahead_index == index) + ra->lookahead_index = new_index; + + return 0; +} + +/* + * State based calculation of read-ahead request. + * + * This figure shows the meaning of file_ra_state members: + * + * chunk A chunk B + * +---------------------------+-------------------------------------------+ + * | # | # | + * +---------------------------+-------------------------------------------+ + * ^ ^ ^ ^ + * la_index ra_index lookahead_index readahead_index + */ + +/* + * The node's effective length of inactive_list(s). + */ +static unsigned long node_free_and_cold_pages(void) +{ + unsigned int i; + unsigned long sum = 0; + struct zone *zones = NODE_DATA(numa_node_id())->node_zones; + + for (i = 0; i < MAX_NR_ZONES; i++) + sum += zones[i].nr_inactive + + zones[i].free_pages - zones[i].pages_low; + + return sum; +} + +/* + * The node's accumulated aging activities. + */ +static unsigned long node_readahead_aging(void) +{ + unsigned long cpu; + unsigned long sum = 0; + cpumask_t mask = node_to_cpumask(numa_node_id()); + + for_each_cpu_mask(cpu, mask) + sum += per_cpu(readahead_aging, cpu); + + return sum; +} + +/* + * The 64bit cache_hits stores three accumulated values and a counter value. 
+ * MSB LSB + * 3333333333333333 : 2222222222222222 : 1111111111111111 : 0000000000000000 + */ +static inline int ra_cache_hit(struct file_ra_state *ra, int nr) +{ + return (ra->cache_hits >> (nr * 16)) & 0xFFFF; +} + +/* + * Conceptual code: + * ra_cache_hit(ra, 1) += ra_cache_hit(ra, 0); + * ra_cache_hit(ra, 0) = 0; + */ +static inline void ra_addup_cache_hit(struct file_ra_state *ra) +{ + int n; + + n = ra_cache_hit(ra, 0); + ra->cache_hits -= n; + n <<= 16; + ra->cache_hits += n; +} + +/* + * The read-ahead is deemed success if cache-hit-rate >= 1/readahead_hit_rate. + */ +static inline int ra_cache_hit_ok(struct file_ra_state *ra) +{ + return ra_cache_hit(ra, 0) * readahead_hit_rate >= + (ra->lookahead_index - ra->la_index); +} + +/* + * Check if @index falls in the @ra request. + */ +static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) +{ + if (index < ra->la_index || index >= ra->readahead_index) + return 0; + + if (index >= ra->ra_index) + return 1; + else + return -1; +} + +/* + * Which method is issuing this read-ahead? + */ +static inline void ra_set_class(struct file_ra_state *ra, + enum ra_class ra_class) +{ + unsigned long flags_mask; + unsigned long flags; + unsigned long old_ra_class; + + flags_mask = ~(RA_CLASS_MASK | (RA_CLASS_MASK << RA_CLASS_SHIFT)); + flags = ra->flags & flags_mask; + + old_ra_class = (ra->flags & RA_CLASS_MASK) << RA_CLASS_SHIFT; + + ra->flags = flags | old_ra_class | ra_class; + + ra_addup_cache_hit(ra); + if (ra_class != RA_CLASS_STATE) + ra->cache_hits <<= 16; + + ra->age = node_readahead_aging(); +} + +/* + * Where is the old read-ahead and look-ahead? + */ +static inline void ra_set_index(struct file_ra_state *ra, + pgoff_t la_index, pgoff_t ra_index) +{ + ra->la_index = la_index; + ra->ra_index = ra_index; +} + +/* + * Where is the new read-ahead and look-ahead? 
+ */ +static inline void ra_set_size(struct file_ra_state *ra, + unsigned long ra_size, unsigned long la_size) +{ + /* Disable look-ahead for loopback file. */ + if (unlikely(ra->flags & RA_FLAG_NO_LOOKAHEAD)) + la_size = 0; + + ra->readahead_index = ra->ra_index + ra_size; + ra->lookahead_index = ra->readahead_index - la_size; +} + +/* + * Submit IO for the read-ahead request in file_ra_state. + */ +static int ra_dispatch(struct file_ra_state *ra, + struct address_space *mapping, struct file *filp) +{ + pgoff_t eof_index; + unsigned long ra_size; + unsigned long la_size; + int actual; + enum ra_class ra_class; + + ra_class = (ra->flags & RA_CLASS_MASK); + BUG_ON(ra_class == 0 || ra_class > RA_CLASS_END); + + eof_index = ((i_size_read(mapping->host) - 1) >> PAGE_CACHE_SHIFT) + 1; + ra_size = ra->readahead_index - ra->ra_index; + la_size = ra->readahead_index - ra->lookahead_index; + + /* Snap to EOF. */ + if (unlikely(ra->ra_index >= eof_index)) + return 0; + if (ra->readahead_index + ra_size / 2 > eof_index) { + if (ra_class == RA_CLASS_CONTEXT_AGGRESSIVE && + eof_index > ra->lookahead_index + 1) + la_size = eof_index - ra->lookahead_index; + else + la_size = 0; + ra_size = eof_index - ra->ra_index; + ra_set_size(ra, ra_size, la_size); + } + + actual = __do_page_cache_readahead(mapping, filp, + ra->ra_index, ra_size, la_size); + +#ifdef CONFIG_DEBUG_READAHEAD + if (ra->flags & RA_FLAG_MMAP) + ra_account(ra, RA_EVENT_READAHEAD_MMAP, actual); + if (ra->readahead_index == eof_index) + ra_account(ra, RA_EVENT_READAHEAD_EOF, actual); + if (la_size) + ra_account(ra, RA_EVENT_LOOKAHEAD, la_size); + if (ra_size > actual) + ra_account(ra, RA_EVENT_IO_CACHE_HIT, ra_size - actual); + ra_account(ra, RA_EVENT_READAHEAD, actual); + + if (!ra->ra_index && filp->f_dentry->d_inode) { + char *fn; + static char path[1024]; + unsigned long size; + + size = (i_size_read(filp->f_dentry->d_inode)+1023)/1024; + fn = d_path(filp->f_dentry, filp->f_vfsmnt, path, 1000); + if (!IS_ERR(fn)) + 
ddprintk("ino %lu is %s size %luK by %s(%d)\n", + filp->f_dentry->d_inode->i_ino, + fn, size, + current->comm, current->pid); + } + + dprintk("readahead-%s(ino=%lu, index=%lu, ra=%lu+%lu-%lu) = %d\n", + ra_class_name[ra_class], + mapping->host->i_ino, ra->la_index, + ra->ra_index, ra_size, la_size, actual); +#endif /* CONFIG_DEBUG_READAHEAD */ + + return actual; +} + +/* + * Determine the ra request from primitive values. + * + * It applies the following rules: + * - Subtract the old look-ahead from ra_size to get the real safe read-ahead; + * - Set new la_size according to the (still large) ra_size; + * - Apply upper limits; + * - Make sure stream_shift is not too small. + * (So that the next global_shift will not be too small.) + * + * Input: + * ra_size stores the estimated thrashing-threshold. + * la_size stores the look-ahead size of previous request. + */ +static inline int adjust_rala(unsigned long ra_max, + unsigned long *ra_size, unsigned long *la_size) +{ + unsigned long stream_shift = *la_size; + + if (*ra_size > *la_size) + *ra_size -= *la_size; + else { + ra_account(NULL, RA_EVENT_READAHEAD_SHRINK, *ra_size); + return 0; + } + + *la_size = *ra_size / LOOKAHEAD_RATIO; + + if (*ra_size > ra_max) + *ra_size = ra_max; + if (*la_size > *ra_size) + *la_size = *ra_size; + + stream_shift += (*ra_size - *la_size); + if (stream_shift < *ra_size / 4) + *la_size -= (*ra_size / 4 - stream_shift); + + return 1; +} + +/* + * The function estimates two values: + * 1. thrashing-threshold for the current stream + * It is returned to make the next read-ahead request. + * 2. the remaining safe space for the current chunk + * It will be checked to ensure that the current chunk is safe. + * + * The computation will be pretty accurate under heavy load, and will fluctuate + * more on light load (with small global_shift), so the growth speed of ra_size + * must be limited, and a moderately large stream_shift must be ensured.
+ * + * This figure illustrates the formula used in the function: + * While the stream reads stream_shift pages inside the chunks, + * the chunks are shifted global_shift pages inside inactive_list. + * + * chunk A chunk B + * |<=============== global_shift ================| + * +-------------+ +-------------------+ | + * | # | | # | inactive_list | + * +-------------+ +-------------------+ head | + * |---->| |---------->| + * | | + * +-- stream_shift --+ + */ +static inline unsigned long compute_thrashing_threshold( + struct file_ra_state *ra, + unsigned long *remain) +{ + unsigned long global_size; + unsigned long global_shift; + unsigned long stream_shift; + unsigned long ra_size; + uint64_t ll; + + global_size = node_free_and_cold_pages(); + global_shift = node_readahead_aging() - ra->age; + global_shift |= 1UL; + stream_shift = ra_cache_hit(ra, 0); + + ll = (uint64_t) stream_shift * (global_size >> 9) * readahead_ratio * 5; + do_div(ll, global_shift); + ra_size = ll; + + if (global_size > global_shift) { + ll = (uint64_t) stream_shift * (global_size - global_shift); + do_div(ll, global_shift); + *remain = ll; + } else + *remain = 0; + + ddprintk("compute_thrashing_threshold: " + "at %lu ra %lu=%lu*%lu/%lu, remain %lu for %lu\n", + ra->readahead_index, ra_size, + stream_shift, global_size, global_shift, + *remain, ra->readahead_index - ra->lookahead_index); + + return ra_size; +} + +/* + * Main function for file_ra_state based read-ahead.
+ */ +static inline unsigned long +state_based_readahead(struct address_space *mapping, struct file *filp, + struct file_ra_state *ra, + struct page *page, pgoff_t index, + unsigned long ra_size, unsigned long ra_max) +{ + unsigned long ra_old; + unsigned long la_size; + unsigned long remain_space; + unsigned long growth_limit; + + la_size = ra->readahead_index - index; + ra_old = ra->readahead_index - ra->ra_index; + growth_limit = ra_size + ra_max / 16 + + (2 + readahead_ratio / 64) * ra_old; + ra_size = compute_thrashing_threshold(ra, &remain_space); + + if (page && remain_space <= la_size && la_size > 1) { + rescue_pages(page, la_size); + return 0; + } + + if (!adjust_rala(min(ra_max, growth_limit), &ra_size, &la_size)) + return 0; + + ra_set_class(ra, RA_CLASS_STATE); + ra_set_index(ra, index, ra->readahead_index); + ra_set_size(ra, ra_size, la_size); + + return ra_dispatch(ra, mapping, filp); +} + +/* + * Page cache context based estimation of read-ahead/look-ahead size/index. + * + * The logic first looks around to find the start point of next read-ahead, + * and then, if necessary, looks backward in the inactive_list to get an + * estimation of the thrashing-threshold. + * + * The estimation theory can be illustrated with this figure: + * + * chunk A chunk B chunk C head + * + * l01 l11 l12 l21 l22 + *| |-->|-->| |------>|-->| |------>| + *| +-------+ +-----------+ +-------------+ | + *| | # | | # | | # | | + *| +-------+ +-----------+ +-------------+ | + *| |<==============|<===========================|<============================| + * L0 L1 L2 + * + * Let f(l) = L be a map from + * l: the number of pages read by the stream + * to + * L: the number of pages pushed into inactive_list in the mean time + * then + * f(l01) <= L0 + * f(l11 + l12) = L1 + * f(l21 + l22) = L2 + * ... + * f(l01 + l11 + ...) <= Sum(L0 + L1 + ...)
+ * <= Length(inactive_list) = f(thrashing-threshold) + * + * So the count of continuous history pages left in the inactive_list is always + * a lower-bound estimate of the true thrashing-threshold. + */ + +#define PAGE_REFCNT_0 0 +#define PAGE_REFCNT_1 (1 << PG_referenced) +#define PAGE_REFCNT_2 (1 << PG_active) +#define PAGE_REFCNT_3 ((1 << PG_active) | (1 << PG_referenced)) +#define PAGE_REFCNT_MASK PAGE_REFCNT_3 + +/* + * STATUS REFERENCE COUNT + * __ 0 + * _R PAGE_REFCNT_1 + * A_ PAGE_REFCNT_2 + * AR PAGE_REFCNT_3 + * + * A/R: Active / Referenced + */ +static inline unsigned long page_refcnt(struct page *page) +{ + return page->flags & PAGE_REFCNT_MASK; +} + +/* + * STATUS REFERENCE COUNT TYPE + * __ 0 fresh + * _R PAGE_REFCNT_1 stale + * A_ PAGE_REFCNT_2 disturbed once + * AR PAGE_REFCNT_3 disturbed twice + * + * A/R: Active / Referenced + */ +static inline unsigned long cold_page_refcnt(struct page *page) +{ + if (!page || PageActive(page)) + return 0; + + return page_refcnt(page); +} + +static inline char page_refcnt_symbol(struct page *page) +{ + if (!page) + return 'X'; + + switch (page_refcnt(page)) { + case 0: + return '_'; + case PAGE_REFCNT_1: + return '-'; + case PAGE_REFCNT_2: + return '='; + case PAGE_REFCNT_3: + return '#'; + default: + return '?'; + } +} + +/* + * Count/estimate cache hits in range [first_index, last_index]. + * The estimation is simple and optimistic. + */ +static int count_cache_hit(struct address_space *mapping, + pgoff_t first_index, pgoff_t last_index) +{ + struct page *page; + int size = last_index - first_index + 1; + int count = 0; + int i; + + cond_resched(); + read_lock_irq(&mapping->tree_lock); + + /* + * The first page may well be the chunk head and have been accessed, + * so it is index 0 that makes the estimation optimistic. This + * behavior guarantees a readahead when (size < ra_max) and + * (readahead_hit_rate >= 16).
+ */ + for (i = 0; i < 16;) { + page = __find_page(mapping, first_index + + size * ((i++ * 29) & 15) / 16); + if (cold_page_refcnt(page) >= PAGE_REFCNT_1 && ++count >= 2) + break; + } + + read_unlock_irq(&mapping->tree_lock); + + return size * count / i; +} + +/* + * Look back and check history pages to estimate thrashing-threshold. + */ +static unsigned long query_page_cache_segment(struct address_space *mapping, + struct file_ra_state *ra, + unsigned long *remain, pgoff_t offset, + unsigned long ra_min, unsigned long ra_max) +{ + pgoff_t index; + unsigned long count; + unsigned long nr_lookback; + struct radix_tree_cache cache; + + /* + * Scan backward and check the near @ra_max pages. + * The count here determines ra_size. + */ + cond_resched(); + read_lock_irq(&mapping->tree_lock); + index = radix_tree_scan_hole_backward(&mapping->page_tree, + offset, ra_max); +#ifdef DEBUG_READAHEAD_RADIXTREE + WARN_ON(index > offset); + if (index != offset) + WARN_ON(!__find_page(mapping, index + 1)); + if (index && offset - index < ra_max) + WARN_ON(__find_page(mapping, index)); +#endif + read_unlock_irq(&mapping->tree_lock); + + *remain = offset - index; + + if (offset == ra->readahead_index && ra_cache_hit_ok(ra)) + count = *remain; + else if (count_cache_hit(mapping, index + 1, offset) * + readahead_hit_rate >= *remain) + count = *remain; + else + count = ra_min; + + /* + * Unnecessary to count more? + */ + if (count < ra_max) + goto out; + + if (unlikely(ra->flags & RA_FLAG_NO_LOOKAHEAD)) + goto out; + + /* + * Check the far pages coarsely. + * The big count here helps increase la_size.
+ */ + nr_lookback = ra_max * (LOOKAHEAD_RATIO + 1) * + 100 / (readahead_ratio + 1); + + cond_resched(); + radix_tree_cache_init(&cache); + read_lock_irq(&mapping->tree_lock); + for (count += ra_max; count < nr_lookback; count += ra_max) { + struct radix_tree_node *node; + node = radix_tree_cache_lookup_node(&mapping->page_tree, + &cache, offset - count, 1); +#ifdef DEBUG_READAHEAD_RADIXTREE + if (node != radix_tree_lookup_node(&mapping->page_tree, + offset - count, 1)) + BUG(); +#endif + if (!node) + break; + } + read_unlock_irq(&mapping->tree_lock); + +out: + /* + * For sequential read that extends from index 0, the counted value + * may well be far under the true threshold, so return it unmodified + * for further process in adjust_rala_aggressive(). + */ + if (count >= offset) + count = offset; + else + count = max(ra_min, count * readahead_ratio / 100); + + ddprintk("query_page_cache_segment: " + "ino=%lu, idx=%lu, count=%lu, remain=%lu\n", + mapping->host->i_ino, offset, count, *remain); + + return count; +} + +/* + * Find past-the-end index of the segment before @index. + */ +static inline pgoff_t find_segtail_backward(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + struct radix_tree_cache cache; + struct page *page; + pgoff_t origin; + + origin = index; + if (max_scan > index) + max_scan = index; + + cond_resched(); + radix_tree_cache_init(&cache); + read_lock_irq(&mapping->tree_lock); + for (; origin - index < max_scan;) { + page = radix_tree_cache_lookup(&mapping->page_tree, + &cache, --index); + if (page) { + read_unlock_irq(&mapping->tree_lock); + return index + 1; + } + } + read_unlock_irq(&mapping->tree_lock); + + return 0; +} + +/* + * Find past-the-end index of the segment at @index. 
+ */ +static inline pgoff_t find_segtail(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + pgoff_t ra_index; + + cond_resched(); + read_lock_irq(&mapping->tree_lock); + ra_index = radix_tree_scan_hole(&mapping->page_tree, index, max_scan); +#ifdef DEBUG_READAHEAD_RADIXTREE + BUG_ON(!__find_page(mapping, index)); + WARN_ON(ra_index < index); + if (ra_index != index && !__find_page(mapping, ra_index - 1)) + printk(KERN_ERR "radix_tree_scan_hole(index=%lu ra_index=%lu " + "max_scan=%lu nrpages=%lu) fooled!\n", + index, ra_index, max_scan, mapping->nrpages); + if (ra_index != ~0UL && ra_index - index < max_scan) + WARN_ON(__find_page(mapping, ra_index)); +#endif + read_unlock_irq(&mapping->tree_lock); + + if (ra_index <= index + max_scan) + return ra_index; + else + return 0; +} + +/* + * Determine the request parameters for context based read-ahead that extends + * from start of file. + * + * The major weakness of stateless method is perhaps the slow grow up speed of + * ra_size. The logic tries to make up for this in the important case of + * sequential reads that extend from start of file. In this case, the ra_size + * is not chosen to make the whole next chunk safe (as in normal ones). Only + * half of which is safe. The added 'unsafe' half is the look-ahead part. It + * is expected to be safeguarded by rescue_pages() when the previous chunks are + * lost. + */ +static inline int adjust_rala_aggressive(unsigned long ra_max, + unsigned long *ra_size, unsigned long *la_size) +{ + pgoff_t index = *ra_size; + + *ra_size -= min(*ra_size, *la_size); + *ra_size = *ra_size * readahead_ratio / 100; + *la_size = index * readahead_ratio / 100; + *ra_size += *la_size; + + if (*ra_size > ra_max) + *ra_size = ra_max; + if (*la_size > *ra_size) + *la_size = *ra_size; + + return 1; +} + +/* + * Main function for page context based read-ahead. 
+ */ +static inline int +try_context_based_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct page *prev_page, + struct page *page, pgoff_t index, + unsigned long ra_min, unsigned long ra_max) +{ + pgoff_t ra_index; + unsigned long ra_size; + unsigned long la_size; + unsigned long remain_pages; + + /* Where to start read-ahead? + * NFSv3 daemons may process adjacent requests in parallel, + * leading to many locally disordered, globally sequential reads. + * So do not require nearby history pages to be present or accessed. + */ + if (page) { + ra_index = find_segtail(mapping, index, ra_max * 5 / 4); + if (!ra_index) + return -1; + } else if (prev_page || find_page(mapping, index - 1)) { + ra_index = index; + } else if (readahead_hit_rate > 1) { + ra_index = find_segtail_backward(mapping, index, + readahead_hit_rate + ra_min); + if (!ra_index) + return 0; + ra_min += 2 * (index - ra_index); + index = ra_index; /* pretend the request starts here */ + } else + return 0; + + ra_size = query_page_cache_segment(mapping, ra, &remain_pages, + index, ra_min, ra_max); + + la_size = ra_index - index; + if (page && remain_pages <= la_size && + remain_pages < index && la_size > 1) { + rescue_pages(page, la_size); + return -1; + } + + if (ra_size == index) { + if (!adjust_rala_aggressive(ra_max, &ra_size, &la_size)) + return -1; + ra_set_class(ra, RA_CLASS_CONTEXT_AGGRESSIVE); + } else { + if (!adjust_rala(ra_max, &ra_size, &la_size)) + return -1; + ra_set_class(ra, RA_CLASS_CONTEXT); + } + + ra_set_index(ra, index, ra_index); + ra_set_size(ra, ra_size, la_size); + + return 1; +} + +/* + * Read-ahead on start of file. + * + * The strategies here are most important for small files. + * 1. Set a moderately large read-ahead size; + * 2. Issue the next read-ahead request as soon as possible. + * + * But be careful, there are some applications that dip into only the very head + * of a file. 
The most important thing is to prevent them from triggering the + * next (much larger) read-ahead request, which leads to lots of cache misses. + * Two pages should be enough for them, correct me if I'm wrong. + */ +static inline unsigned long +newfile_readahead(struct address_space *mapping, + struct file *filp, struct file_ra_state *ra, + unsigned long req_size, unsigned long ra_min) +{ + unsigned long ra_size; + unsigned long la_size; + + if (req_size > ra_min) /* larger value risks thrashing */ + req_size = ra_min; + + if (unlikely(ra->flags & RA_FLAG_NFSD)) { + ra_size = MIN_NFSD_PAGES; + la_size = 0; + } else { + ra_size = 4 * req_size; + la_size = 2 * req_size; + } + + ra_set_class(ra, RA_CLASS_NEWFILE); + ra_set_index(ra, 0, 0); + ra_set_size(ra, ra_size, la_size); + + return ra_dispatch(ra, mapping, filp); +} + +/* + * Backward prefetching. + * No look ahead and thrashing threshold estimation for stepping backward + * pattern: should be unnecessary. + */ +static inline int +try_read_backward(struct file_ra_state *ra, pgoff_t begin_index, + unsigned long ra_size, unsigned long ra_max) +{ + pgoff_t end_index; + + /* Are we reading backward? */ + if (begin_index > ra->prev_page) + return 0; + + if ((ra->flags & RA_CLASS_MASK) == RA_CLASS_BACKWARD && + ra_has_index(ra, ra->prev_page)) { + ra_size += 2 * ra_cache_hit(ra, 0); + end_index = ra->la_index; + } else { + ra_size += ra_size + ra_size * (readahead_hit_rate - 1) / 2; + end_index = ra->prev_page; + } + + if (ra_size > ra_max) + ra_size = ra_max; + + /* Read traces close enough to be covered by the prefetching? */ + if (end_index > begin_index + ra_size) + return 0; + + begin_index = end_index - ra_size; + + ra_set_class(ra, RA_CLASS_BACKWARD); + ra_set_index(ra, begin_index, begin_index); + ra_set_size(ra, ra_size, 0); + + return 1; +} + +/* + * Readahead thrashing recovery. 
+ */ +static inline unsigned long +thrashing_recovery_readahead(struct address_space *mapping, + struct file *filp, struct file_ra_state *ra, + pgoff_t index, unsigned long ra_max) +{ + unsigned long ra_size; + + if (readahead_debug_level && find_page(mapping, index - 1)) + ra_account(ra, RA_EVENT_READAHEAD_MUTILATE, + ra->readahead_index - index); + ra_account(ra, RA_EVENT_READAHEAD_THRASHING, + ra->readahead_index - index); + + /* + * Some thrashing occurs in [la_index, ra_index), in which case the + * old read-ahead chunk is lost soon after the new one is allocated. + * Ensure that we recover all needed pages in the old chunk. + */ + if (index < ra->ra_index) + ra_size = ra->ra_index - index; + else { + /* After thrashing, we know the exact thrashing-threshold. */ + ra_size = ra_cache_hit(ra, 0); + + /* And we'd better be a bit conservative. */ + ra_size = ra_size * 3 / 4; + } + + if (ra_size > ra_max) + ra_size = ra_max; + + ra_set_class(ra, RA_CLASS_THRASHING); + ra_set_index(ra, index, index); + ra_set_size(ra, ra_size, ra_size / LOOKAHEAD_RATIO); + + return ra_dispatch(ra, mapping, filp); +} + +/* + * If there is a previous sequential read, it is likely to be another + * sequential read at the new position. + * Databases are known to have this seek-and-read-one-block pattern. + */ +static inline int +try_readahead_on_seek(struct file_ra_state *ra, pgoff_t index, + unsigned long ra_size, unsigned long ra_max) +{ + unsigned long hit0 = ra_cache_hit(ra, 0); + unsigned long hit1 = ra_cache_hit(ra, 1) + hit0; + unsigned long hit2 = ra_cache_hit(ra, 2); + unsigned long hit3 = ra_cache_hit(ra, 3); + + /* There's a previous read-ahead request? */ + if (!ra_has_index(ra, ra->prev_page)) + return 0; + + /* The previous read-ahead sequences have similar sizes? */ + if (!(ra_size < hit1 && hit1 > hit2 / 2 && + hit2 > hit3 / 2 && + hit3 > hit1 / 2)) + return 0; + + hit1 = max(hit1, hit2); + + /* Follow the same prefetching direction.
*/ + if ((ra->flags & RA_CLASS_MASK) == RA_CLASS_BACKWARD) + index = ((index > hit1 - ra_size) ? index - hit1 + ra_size : 0); + + ra_size = min(hit1, ra_max); + + ra_set_class(ra, RA_CLASS_SEEK); + ra_set_index(ra, index, index); + ra_set_size(ra, ra_size, 0); + + return 1; +} + +/* + * ra_min is mainly determined by the size of cache memory. + * Table of concrete numbers for 4KB page size: + * inactive + free (MB): 4 8 16 32 64 128 256 512 1024 + * ra_min (KB): 16 16 16 16 20 24 32 48 64 + */ +static inline void get_readahead_bounds(struct file_ra_state *ra, + unsigned long *ra_min, + unsigned long *ra_max) +{ + unsigned long pages; + + pages = max_sane_readahead(KB(1024*1024)); + *ra_max = min(min(pages, 0xFFFFUL), ra->ra_pages); + *ra_min = min(min(MIN_RA_PAGES + (pages>>13), KB(128)), *ra_max/2); +} + +/** + * page_cache_readahead_adaptive - adaptive read-ahead main function + * @mapping, @ra, @filp: the same as page_cache_readahead() + * @prev_page: the page at @index-1, may be NULL to let the function find it + * @page: the page at @index, or NULL if non-present + * @begin_index, @index, @end_index: offsets into @mapping + * [@begin_index, @end_index) is the read the caller is performing + * @index indicates the page to be read now + * + * page_cache_readahead_adaptive() is the entry point of the adaptive + * read-ahead logic. It tries a set of methods in turn to determine the + * appropriate readahead action and submits the readahead I/O. + * + * The caller is expected to point ra->prev_page to the previously accessed + * page, and to call it on two conditions: + * 1. @page == NULL + * A cache miss happened, some pages have to be read in + * 2. @page != NULL && PageReadahead(@page) + * A look-ahead mark encountered, this is set by a previous read-ahead + * invocation to instruct the caller to give the function a chance to + * check up and do next read-ahead in advance. 
+ */ +unsigned long +page_cache_readahead_adaptive(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + struct page *prev_page, struct page *page, + pgoff_t begin_index, pgoff_t index, pgoff_t end_index) +{ + unsigned long size; + unsigned long ra_min; + unsigned long ra_max; + int ret; + + might_sleep(); + + if (page) { + if(!TestClearPageReadahead(page)) + return 0; + if (bdi_read_congested(mapping->backing_dev_info)) { + ra_account(ra, RA_EVENT_IO_CONGESTION, + end_index - index); + return 0; + } + if (laptop_mode && laptop_spinned_down()) { + if (!renew_lookahead(mapping, ra, index, + index + LAPTOP_POLL_INTERVAL)) + return 0; + } + } + + if (page) + ra_account(ra, RA_EVENT_LOOKAHEAD_HIT, + ra->readahead_index - ra->lookahead_index); + else if (index) + ra_account(ra, RA_EVENT_CACHE_MISS, end_index - begin_index); + + size = end_index - index; + get_readahead_bounds(ra, &ra_min, &ra_max); + + /* readahead disabled? */ + if (unlikely(!ra_max || !readahead_ratio)) { + size = max_sane_readahead(size); + goto readit; + } + + /* + * Start of file. + */ + if (index == 0) + return newfile_readahead(mapping, filp, ra, end_index, ra_min); + + /* + * State based sequential read-ahead. + */ + if (!disable_stateful_method && + index == ra->lookahead_index && ra_cache_hit_ok(ra)) + return state_based_readahead(mapping, filp, ra, page, + index, size, ra_max); + + /* + * Recover from possible thrashing. + */ + if (!page && index == ra->prev_page + 1 && ra_has_index(ra, index)) + return thrashing_recovery_readahead(mapping, filp, ra, + index, ra_max); + + /* + * Backward read-ahead. + */ + if (!page && begin_index == index && + try_read_backward(ra, index, size, ra_max)) + return ra_dispatch(ra, mapping, filp); + + /* + * Context based sequential read-ahead. 
+ */ + ret = try_context_based_readahead(mapping, ra, prev_page, page, + index, ra_min, ra_max); + if (ret > 0) + return ra_dispatch(ra, mapping, filp); + if (ret < 0) + return 0; + + /* No action on look ahead time? */ + if (page) { + ra_account(ra, RA_EVENT_LOOKAHEAD_NOACTION, + ra->readahead_index - index); + return 0; + } + + /* + * Random read that follows a sequential one. + */ + if (try_readahead_on_seek(ra, index, size, ra_max)) + return ra_dispatch(ra, mapping, filp); + + /* + * Random read. + */ + if (size > ra_max) + size = ra_max; + +readit: + size = __do_page_cache_readahead(mapping, filp, index, size, 0); + + ra_account(ra, RA_EVENT_READRANDOM, size); + dprintk("readrandom(ino=%lu, pages=%lu, index=%lu-%lu-%lu) = %lu\n", + mapping->host->i_ino, mapping->nrpages, + begin_index, index, end_index, size); + + return size; +} + +/** + * readahead_cache_hit - adaptive read-ahead feedback function + * @ra: file_ra_state which holds the readahead state + * @page: the page just accessed + * + * readahead_cache_hit() is the feedback route of the adaptive read-ahead + * logic. It must be called on every access on the read-ahead pages. 
+ */ +void fastcall readahead_cache_hit(struct file_ra_state *ra, struct page *page) +{ + if (PageActive(page) || PageReferenced(page)) + return; + + if (!PageUptodate(page)) + ra_account(ra, RA_EVENT_IO_BLOCK, 1); + + if (!ra_has_index(ra, page->index)) + return; + + ra->cache_hits++; + + if (page->index >= ra->ra_index) + ra_account(ra, RA_EVENT_READAHEAD_HIT, 1); + else + ra_account(ra, RA_EVENT_READAHEAD_HIT, -1); +} + +#endif /* CONFIG_ADAPTIVE_READAHEAD */ diff -urN oldtree/mm/swap.c newtree/mm/swap.c --- oldtree/mm/swap.c 2006-04-01 04:48:27.000000000 -0500 +++ newtree/mm/swap.c 2006-04-01 08:53:57.995126000 -0500 @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -127,6 +128,8 @@ ClearPageReferenced(page); } else if (!PageReferenced(page)) { SetPageReferenced(page); + if (PageLRU(page)) + inc_readahead_aging(); } } @@ -382,6 +385,46 @@ pagevec_reinit(pvec); } +static inline void __pagevec_lru_add_tail(struct pagevec *pvec) +{ + int i; + struct zone *zone = NULL; + + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + struct zone *pagezone = page_zone(page); + + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + BUG_ON(PageLRU(page)); + SetPageLRU(page); + add_page_to_inactive_list_tail(zone, page); + } + if (zone) + spin_unlock_irq(&zone->lru_lock); + release_pages(pvec->pages, pvec->nr, pvec->cold); + pagevec_reinit(pvec); +} + +/* + * Function used uniquely to put pages back to the lru at the end of the + * inactive list to preserve the lru order. Currently only used by swap + * prefetch. 
+ */ +void fastcall lru_cache_add_tail(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); + + page_cache_get(page); + if (!pagevec_add(pvec, page)) + __pagevec_lru_add_tail(pvec); + put_cpu_var(lru_add_pvecs); +} + /* * Try to drop buffers from the pages in a pagevec */ @@ -536,5 +579,8 @@ * Right now other parts of the system means that we * _really_ don't want to cluster much more */ + + prepare_swap_prefetch(); + hotcpu_notifier(cpu_swap_callback, 0); } diff -urN oldtree/mm/swap_prefetch.c newtree/mm/swap_prefetch.c --- oldtree/mm/swap_prefetch.c 1969-12-31 19:00:00.000000000 -0500 +++ newtree/mm/swap_prefetch.c 2006-04-01 08:54:41.745860250 -0500 @@ -0,0 +1,617 @@ +/* + * linux/mm/swap_prefetch.c + * + * Copyright (C) 2005-2006 Con Kolivas + * + * Written by Con Kolivas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Time to delay prefetching if vm is busy or prefetching unsuccessful. There + * needs to be at least this duration of idle time meaning in practice it can + * be much longer + */ +#define PREFETCH_DELAY (HZ * 5) + +#define PREFETCH_NORMAL (1 << 0) +#define PREFETCH_AGGRESSIVE (1 << 1) +/* + * sysctl - enable/disable swap prefetching bits + * This is composed of the bitflags PREFETCH_NORMAL and PREFETCH_AGGRESSIVE. + * Once PREFETCH_AGGRESSIVE is set, swap prefetching will be performed as much + * as possible irrespective of load conditions and then the + * PREFETCH_AGGRESSIVE bit will be unset. 
+ */ +int swap_prefetch __read_mostly = PREFETCH_NORMAL; + +#define aggressive_prefetch (unlikely(swap_prefetch & PREFETCH_AGGRESSIVE)) + +struct swapped_root { + unsigned long busy; /* vm busy */ + spinlock_t lock; /* protects all data */ + struct list_head list; /* MRU list of swapped pages */ + struct radix_tree_root swap_tree; /* Lookup tree of pages */ + unsigned int count; /* Number of entries */ + unsigned int maxcount; /* Maximum entries allowed */ + kmem_cache_t *cache; /* Of struct swapped_entry */ +}; + +static struct swapped_root swapped = { + .lock = SPIN_LOCK_UNLOCKED, + .list = LIST_HEAD_INIT(swapped.list), + .swap_tree = RADIX_TREE_INIT(GFP_ATOMIC), +}; + +static task_t *kprefetchd_task; + +/* + * We check to see no part of the vm is busy. If it is this will interrupt + * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy. + */ +inline void delay_swap_prefetch(void) +{ + if (!test_bit(0, &swapped.busy)) + __set_bit(0, &swapped.busy); +} + +/* + * Drop behind accounting which keeps a list of the most recently used swap + * entries. + */ +void add_to_swapped_list(struct page *page) +{ + struct swapped_entry *entry; + unsigned long index; + int wakeup; + + if (!swap_prefetch) + return; + + wakeup = 0; + + spin_lock(&swapped.lock); + if (swapped.count >= swapped.maxcount) { + /* + * We limit the number of entries to 2/3 of physical ram. + * Once the number of entries exceeds this we start removing + * the least recently used entries. 
+ */ + entry = list_entry(swapped.list.next, + struct swapped_entry, swapped_list); + radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val); + list_del(&entry->swapped_list); + swapped.count--; + } else { + entry = kmem_cache_alloc(swapped.cache, GFP_ATOMIC); + if (unlikely(!entry)) + /* bad, can't allocate more mem */ + goto out_locked; + } + + index = page_private(page); + entry->swp_entry.val = index; + /* + * On numa we need to store the node id to ensure that we prefetch to + * the same node it came from. + */ + store_swap_entry_node(entry, page); + + if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) { + /* + * If this is the first entry, kprefetchd needs to be + * (re)started. + */ + if (!swapped.count) + wakeup = 1; + list_add(&entry->swapped_list, &swapped.list); + swapped.count++; + } + +out_locked: + spin_unlock(&swapped.lock); + + /* Do the wakeup outside the lock to shorten lock hold time. */ + if (wakeup) + wake_up_process(kprefetchd_task); + + return; +} + +/* + * Removes entries from the swapped_list. The radix tree allows us to quickly + * look up the entry from the index without having to iterate over the whole + * list. 
+ */ +void remove_from_swapped_list(const unsigned long index) +{ + struct swapped_entry *entry; + unsigned long flags; + + if (list_empty(&swapped.list)) + return; + + spin_lock_irqsave(&swapped.lock, flags); + entry = radix_tree_delete(&swapped.swap_tree, index); + if (likely(entry)) { + list_del_init(&entry->swapped_list); + swapped.count--; + kmem_cache_free(swapped.cache, entry); + } + spin_unlock_irqrestore(&swapped.lock, flags); +} + +enum trickle_return { + TRICKLE_SUCCESS, + TRICKLE_FAILED, + TRICKLE_DELAY, +}; + +struct node_stats { + unsigned long last_free; + /* Free ram after a cycle of prefetching */ + unsigned long current_free; + /* Free ram on this cycle of checking prefetch_suitable */ + unsigned long prefetch_watermark; + /* Maximum amount we will prefetch to */ + unsigned long highfree[MAX_NR_ZONES]; + /* The amount of free ram before we start prefetching */ + unsigned long lowfree[MAX_NR_ZONES]; + /* The amount of free ram where we will stop prefetching */ + unsigned long *pointfree[MAX_NR_ZONES]; + /* highfree or lowfree depending on whether we've hit a watermark */ +}; + +/* + * prefetch_stats stores the free ram data of each node and this is used to + * determine if a node is suitable for prefetching into. + */ +struct prefetch_stats { + nodemask_t prefetch_nodes; + /* Which nodes are currently suited to prefetching */ + unsigned long prefetched_pages; + /* Total pages we've prefetched on this wakeup of kprefetchd */ + struct node_stats node[MAX_NUMNODES]; +}; + +static struct prefetch_stats sp_stat; + +/* + * This tries to read a swp_entry_t into swap cache for swap prefetching. + * If it returns TRICKLE_DELAY we should delay further prefetching. 
+ */ +static enum trickle_return trickle_swap_cache_async(const swp_entry_t entry, + const int node) +{ + enum trickle_return ret = TRICKLE_FAILED; + struct page *page; + + read_lock_irq(&swapper_space.tree_lock); + /* Entry may already exist */ + page = radix_tree_lookup(&swapper_space.page_tree, entry.val); + read_unlock_irq(&swapper_space.tree_lock); + if (page) { + remove_from_swapped_list(entry.val); + goto out; + } + + /* + * Get a new page to read from swap. We have already checked the + * watermarks so __alloc_pages will not call on reclaim. + */ + page = alloc_pages_node(node, GFP_HIGHUSER & ~__GFP_WAIT, 0); + if (unlikely(!page)) { + ret = TRICKLE_DELAY; + goto out; + } + + if (add_to_swap_cache(page, entry)) { + /* Failed to add to swap cache */ + goto out_release; + } + + /* Add them to the tail of the inactive list to preserve LRU order */ + lru_cache_add_tail(page); + if (unlikely(swap_readpage(NULL, page))) { + ret = TRICKLE_DELAY; + goto out_release; + } + + sp_stat.prefetched_pages++; + sp_stat.node[node].last_free--; + + ret = TRICKLE_SUCCESS; +out_release: + page_cache_release(page); +out: + return ret; +} + +static void clear_last_prefetch_free(void) +{ + int node; + + /* + * Reset the nodes suitable for prefetching to all nodes. We could + * update the data to take into account memory hotplug if desired.. + */ + sp_stat.prefetch_nodes = node_online_map; + for_each_node_mask(node, sp_stat.prefetch_nodes) { + struct node_stats *ns = &sp_stat.node[node]; + + ns->last_free = 0; + } +} + +static void clear_current_prefetch_free(void) +{ + int node; + + sp_stat.prefetch_nodes = node_online_map; + for_each_node_mask(node, sp_stat.prefetch_nodes) { + struct node_stats *ns = &sp_stat.node[node]; + + ns->current_free = 0; + } +} + +/* + * This updates the high and low watermarks of amount of free ram in each + * node used to start and stop prefetching. We prefetch from pages_high * 4 + * down to pages_high * 3. 
+ */ +static void examine_free_limits(void) +{ + struct zone *z; + + for_each_zone(z) { + struct node_stats *ns; + int idx; + + if (!populated_zone(z)) + continue; + + ns = &sp_stat.node[z->zone_pgdat->node_id]; + idx = zone_idx(z); + ns->lowfree[idx] = z->pages_high * 3 + z->lowmem_reserve[idx]; + ns->highfree[idx] = ns->lowfree[idx] + z->pages_high; + + if (z->free_pages > ns->highfree[idx]) { + /* + * We've gotten above the high watermark of free pages + * so we can start prefetching till we get to the low + * watermark. + */ + ns->pointfree[idx] = &ns->lowfree[idx]; + } + } +} + +/* + * Have some hysteresis between where page reclaiming and prefetching + * will occur to prevent ping-ponging between them. + */ +static void set_suitable_nodes(void) +{ + struct zone *z; + + for_each_zone(z) { + struct node_stats *ns; + unsigned long free; + int node, idx; + + if (!populated_zone(z)) + continue; + + node = z->zone_pgdat->node_id; + ns = &sp_stat.node[node]; + idx = zone_idx(z); + + free = z->free_pages; + if (free < *ns->pointfree[idx]) { + /* + * Free pages have dropped below the low watermark so + * we won't start prefetching again till we hit the + * high watermark of free pages. + */ + ns->pointfree[idx] = &ns->highfree[idx]; + node_clear(node, sp_stat.prefetch_nodes); + continue; + } + ns->current_free += free; + } +} + +/* + * We want to be absolutely certain it's ok to start prefetching. + */ +static int prefetch_suitable(void) +{ + unsigned long limit; + int node, ret = 0, test_pagestate = 0; + + if (aggressive_prefetch) { + clear_current_prefetch_free(); + set_suitable_nodes(); + if (!nodes_empty(sp_stat.prefetch_nodes)) + ret = 1; + goto out; + } + + /* Purposefully racy */ + if (test_bit(0, &swapped.busy)) { + __clear_bit(0, &swapped.busy); + goto out; + } + + /* + * get_page_state and above_background_load are expensive so we only + * perform them every SWAP_CLUSTER_MAX prefetched_pages. 
+ * We test to see if we're above_background_load as disk activity + * even at low priority can cause interrupt induced scheduling + * latencies. + */ + if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) { + if (above_background_load()) + goto out; + test_pagestate = 1; + } + + clear_current_prefetch_free(); + set_suitable_nodes(); + + /* + * We iterate over each node testing to see if it is suitable for + * prefetching and clear the nodemask if it is not. + */ + for_each_node_mask(node, sp_stat.prefetch_nodes) { + struct node_stats *ns = &sp_stat.node[node]; + struct page_state ps; + + /* + * We check to see that pages are not being allocated + * elsewhere at any significant rate implying any + * degree of memory pressure (eg during file reads) + */ + if (ns->last_free) { + if (ns->current_free + SWAP_CLUSTER_MAX < + ns->last_free) { + ns->last_free = ns->current_free; + node_clear(node, + sp_stat.prefetch_nodes); + continue; + } + } else + ns->last_free = ns->current_free; + + if (!test_pagestate) + continue; + + get_page_state_node(&ps, node); + + /* We shouldn't prefetch when we are doing writeback */ + if (ps.nr_writeback) { + node_clear(node, sp_stat.prefetch_nodes); + continue; + } + + /* + * >2/3 of the ram on this node is mapped, slab, swapcache or + * dirty, we need to leave some free for pagecache. + * Note that currently nr_slab is inaccurate on numa because + * nr_slab is incremented on the node doing the accounting + * even if the slab is being allocated on a remote node. This + * would be expensive to fix and not of great significance. + */ + limit = ps.nr_mapped + ps.nr_slab + ps.nr_dirty + + ps.nr_unstable + total_swapcache_pages; + if (limit > ns->prefetch_watermark) { + node_clear(node, sp_stat.prefetch_nodes); + continue; + } + } + + if (nodes_empty(sp_stat.prefetch_nodes)) + goto out; + + /* Survived all that? Hooray we can prefetch! */ + ret = 1; +out: + return ret; +} + +/* + * Get previous swapped entry when iterating over all entries. 
swapped.lock + * should be held and we should already ensure that entry exists. + */ +static inline struct swapped_entry *prev_swapped_entry + (struct swapped_entry *entry) +{ + return list_entry(entry->swapped_list.prev->prev, + struct swapped_entry, swapped_list); +} + +static unsigned long pages_prefetched(void) +{ + unsigned long pages = sp_stat.prefetched_pages; + + if (pages) { + lru_add_drain(); + sp_stat.prefetched_pages = 0; + } + return pages; +} + +/* + * trickle_swap is the main function that initiates the swap prefetching. It + * first checks to see if the busy flag is set, and does not prefetch if it + * is, as the flag implied we are low on memory or swapping in currently. + * Otherwise it runs until prefetch_suitable fails which occurs when the + * vm is busy, we prefetch to the watermark, or the list is empty or we have + * iterated over all entries + */ +static enum trickle_return trickle_swap(void) +{ + enum trickle_return ret = TRICKLE_DELAY; + struct swapped_entry *entry; + + /* + * If laptop_mode is enabled don't prefetch to avoid hard drives + * doing unnecessary spin-ups + */ + if (!swap_prefetch || (laptop_mode && !aggressive_prefetch)) + return ret; + + examine_free_limits(); + entry = NULL; + + for ( ; ; ) { + swp_entry_t swp_entry; + int node; + + if (!prefetch_suitable()) + break; + + spin_lock(&swapped.lock); + if (list_empty(&swapped.list)) { + ret = TRICKLE_FAILED; + spin_unlock(&swapped.lock); + break; + } + + if (!entry) { + /* + * This sets the entry for the first iteration. It + * also is a safeguard against the entry disappearing + * while the lock is not held. + */ + entry = list_entry(swapped.list.prev, + struct swapped_entry, swapped_list); + } else if (entry->swapped_list.prev == swapped.list.next) { + /* + * If we have iterated over all entries and there are + * still entries that weren't swapped out there may + * be a reason we could not swap them back in so + * delay attempting further prefetching. 
+ */ + spin_unlock(&swapped.lock); + if (aggressive_prefetch) { + /* + * If we're prefetching aggressively and + * making progress then don't give up. + */ + if (pages_prefetched()) + continue; + } + break; + } + + node = get_swap_entry_node(entry); + if (!node_isset(node, sp_stat.prefetch_nodes)) { + /* + * We found an entry that belongs to a node that is + * not suitable for prefetching so skip it. + */ + entry = prev_swapped_entry(entry); + spin_unlock(&swapped.lock); + continue; + } + swp_entry = entry->swp_entry; + entry = prev_swapped_entry(entry); + spin_unlock(&swapped.lock); + + if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY && + !aggressive_prefetch) + break; + } + + /* Return value of pages_prefetched irrelevant here */ + pages_prefetched(); + if (aggressive_prefetch) + swap_prefetch &= ~PREFETCH_AGGRESSIVE; + return ret; +} + +static int kprefetchd(void *__unused) +{ + set_user_nice(current, 19); + /* Set ioprio to lowest if supported by i/o scheduler */ + sys_ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_CLASS_IDLE); + + do { + try_to_freeze(); + + /* + * TRICKLE_FAILED implies no entries left - we do not schedule + * a wakeup, and further delay the next one. + */ + if (trickle_swap() == TRICKLE_FAILED) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } + clear_last_prefetch_free(); + schedule_timeout_interruptible(PREFETCH_DELAY); + } while (!kthread_should_stop()); + + return 0; +} + +/* + * Create kmem cache for swapped entries + */ +void __init prepare_swap_prefetch(void) +{ + struct zone *zone; + + swapped.cache = kmem_cache_create("swapped_entry", + sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL); + + /* + * Set max number of entries to 2/3 the size of physical ram as we + * only ever prefetch to consume 2/3 of the ram. 
+ */ + swapped.maxcount = nr_free_pagecache_pages() / 3 * 2; + + for_each_zone(zone) { + unsigned long present; + struct node_stats *ns; + int idx; + + present = zone->present_pages; + if (!present) + continue; + + ns = &sp_stat.node[zone->zone_pgdat->node_id]; + ns->prefetch_watermark += present / 3 * 2; + idx = zone_idx(zone); + ns->pointfree[idx] = &ns->highfree[idx]; + } +} + +static int __init kprefetchd_init(void) +{ + kprefetchd_task = kthread_run(kprefetchd, NULL, "kprefetchd"); + + return 0; +} + +static void __exit kprefetchd_exit(void) +{ + kthread_stop(kprefetchd_task); +} + +module_init(kprefetchd_init); +module_exit(kprefetchd_exit); diff -urN oldtree/mm/swap_state.c newtree/mm/swap_state.c --- oldtree/mm/swap_state.c 2006-04-01 04:48:27.000000000 -0500 +++ newtree/mm/swap_state.c 2006-04-01 08:53:57.999126250 -0500 @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -81,6 +82,7 @@ error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); if (!error) { + remove_from_swapped_list(entry.val); page_cache_get(page); SetPageLocked(page); SetPageSwapCache(page); @@ -94,11 +96,12 @@ return error; } -static int add_to_swap_cache(struct page *page, swp_entry_t entry) +int add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; if (!swap_duplicate(entry)) { + remove_from_swapped_list(entry.val); INC_CACHE_INFO(noent_race); return -ENOENT; } @@ -147,6 +150,9 @@ swp_entry_t entry; int err; + /* Swap prefetching is delayed if we're swapping pages */ + delay_swap_prefetch(); + if (!PageLocked(page)) BUG(); @@ -320,6 +326,9 @@ struct page *found_page, *new_page = NULL; int err; + /* Swap prefetching is delayed if we're already reading from swap */ + delay_swap_prefetch(); + do { /* * First check the swap cache. 
Since this is normally diff -urN oldtree/mm/vmscan.c newtree/mm/vmscan.c --- oldtree/mm/vmscan.c 2006-04-01 04:48:27.000000000 -0500 +++ newtree/mm/vmscan.c 2006-04-01 08:54:48.086256500 -0500 @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -123,10 +124,11 @@ #endif /* - * From 0 .. 100. Higher means more swappy. + * From 0 .. 100. Lower means more swappy. */ -int vm_swappiness = 60; -static long total_memory; +int vm_mapped __read_mostly = 66; +int vm_hardmaplimit __read_mostly = 1; +static long total_memory __read_mostly; static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); @@ -396,6 +398,7 @@ if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; + add_to_swapped_list(page); __delete_from_swap_cache(page); write_unlock_irq(&mapping->tree_lock); swap_free(swap); @@ -454,6 +457,9 @@ if (PageWriteback(page)) goto keep_locked; + if (!PageReferenced(page)) + inc_readahead_aging(); + referenced = page_referenced(page, 1); /* In active use or really unfreeable? Activate it. */ if (referenced && page_mapping_inuse(page)) @@ -1230,10 +1236,14 @@ * The distress ratio is important - we don't want to start * going oom. * - * A 100% value of vm_swappiness overrides this algorithm - * altogether. + * This distress value is ignored if we apply a hardmaplimit except + * in extreme distress. + * + * A 0% value of vm_mapped overrides this algorithm altogether. 
*/ - swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; + swap_tendency = mapped_ratio * 100 / (vm_mapped + 1); + if (!vm_hardmaplimit || distress == 100) + swap_tendency += distress; /* * Now use this metric to decide whether to start moving mapped @@ -1442,6 +1452,8 @@ sc.may_writepage = !laptop_mode; sc.may_swap = 1; + delay_swap_prefetch(); + inc_page_state(allocstall); for (i = 0; zones[i] != NULL; i++) { @@ -1571,6 +1583,7 @@ */ for (i = pgdat->nr_zones - 1; i >= 0; i--) { struct zone *zone = pgdat->node_zones + i; + unsigned long watermark; if (!populated_zone(zone)) continue; @@ -1579,8 +1592,17 @@ priority != DEF_PRIORITY) continue; + /* + * The watermark is relaxed depending on the + * level of "priority" till it drops to + * pages_high. + */ + watermark = zone->pages_high + + (zone->pages_high * priority / + DEF_PRIORITY); + if (!zone_watermark_ok(zone, order, - zone->pages_high, 0, 0)) { + watermark, 0, 0)) { end_zone = i; goto scan; } @@ -1616,8 +1638,11 @@ continue; if (nr_pages == 0) { /* Not software suspend */ + unsigned long watermark = zone->pages_high + + (zone->pages_high * priority / + DEF_PRIORITY); if (!zone_watermark_ok(zone, order, - zone->pages_high, end_zone, 0)) + watermark, end_zone, 0)) all_zones_ok = 0; } zone->temp_priority = priority; @@ -1788,6 +1813,8 @@ .reclaimed_slab = 0, }; + delay_swap_prefetch(); + current->reclaim_state = &reclaim_state; for_each_pgdat(pgdat) { int freed; diff -urN oldtree/scripts/Kbuild.include newtree/scripts/Kbuild.include --- oldtree/scripts/Kbuild.include 2006-04-01 04:48:27.000000000 -0500 +++ newtree/scripts/Kbuild.include 2006-04-01 08:53:50.634666000 -0500 @@ -78,8 +78,9 @@ # function to only execute the passed command if necessary # >'< substitution is for echo to work, >$< substitution to preserve $ when reloading .cmd file # note: when using inline perl scripts [perl -e '...$$t=1;...'] in $(cmd_xxx) double $$ your perl vars -# -if_changed = $(if $(strip $? 
$(call arg-check, $(cmd_$(1)), $(cmd_$@)) ), \ +if_changed = $(if $(strip $(filter-out $(PHONY),$?) \ + $(call arg-check, $(cmd_$(1)), $(cmd_$@)) ), \ @set -e; \ $(echo-cmd) \ $(cmd_$(1)); \ @@ -87,8 +88,9 @@ # execute the command and also postprocess generated .d dependencies # file -if_changed_dep = $(if $(strip $? $(filter-out FORCE $(wildcard $^),$^)\ - $(call arg-check, $(cmd_$(1)), $(cmd_$@)) ), \ +if_changed_dep = $(if $(strip $(filter-out $(PHONY),$?) \ + $(filter-out FORCE $(wildcard $^),$^) \ + $(call arg-check, $(cmd_$(1)), $(cmd_$@)) ), \ @set -e; \ $(echo-cmd) \ $(cmd_$(1)); \ @@ -99,6 +101,7 @@ # Usage: $(call if_changed_rule,foo) # will check if $(cmd_foo) changed, or any of the prequisites changed, # and if so will execute $(rule_foo) -if_changed_rule = $(if $(strip $? $(call arg-check, $(cmd_$(1)), $(cmd_$@)) ),\ +if_changed_rule = $(if $(strip $(filter-out $(PHONY),$?) \ + $(call arg-check, $(cmd_$(1)), $(cmd_$@)) ),\ @set -e; \ $(rule_$(1))) diff -urN oldtree/scripts/Makefile.build newtree/scripts/Makefile.build --- oldtree/scripts/Makefile.build 2006-04-01 04:48:27.000000000 -0500 +++ newtree/scripts/Makefile.build 2006-04-01 08:53:50.638666250 -0500 @@ -4,7 +4,7 @@ src := $(obj) -.PHONY: __build +PHONY := __build __build: # Read .config if it exist, otherwise ignore @@ -309,14 +309,14 @@ # Descending # --------------------------------------------------------------------------- -.PHONY: $(subdir-ym) +PHONY += $(subdir-ym) $(subdir-ym): $(Q)$(MAKE) $(build)=$@ # Add FORCE to the prequisites of a target to force it to be always rebuilt. # --------------------------------------------------------------------------- -.PHONY: FORCE +PHONY += FORCE FORCE: @@ -331,3 +331,9 @@ ifneq ($(cmd_files),) include $(cmd_files) endif + + +# Declare the contents of the PHONY variable as phony. We keep that +# information in a variable so we can use it in if_changed and friends. 
+ +.PHONY: $(PHONY) diff -urN oldtree/scripts/Makefile.clean newtree/scripts/Makefile.clean --- oldtree/scripts/Makefile.clean 2006-04-01 04:48:27.000000000 -0500 +++ newtree/scripts/Makefile.clean 2006-04-01 08:53:50.642666500 -0500 @@ -4,7 +4,7 @@ src := $(obj) -.PHONY: __clean +PHONY := __clean __clean: # Shorthand for $(Q)$(MAKE) scripts/Makefile.clean obj=dir @@ -87,10 +87,16 @@ # Descending # --------------------------------------------------------------------------- -.PHONY: $(subdir-ymn) +PHONY += $(subdir-ymn) $(subdir-ymn): $(Q)$(MAKE) $(clean)=$@ # If quiet is set, only print short version of command cmd = @$(if $($(quiet)cmd_$(1)),echo ' $($(quiet)cmd_$(1))' &&) $(cmd_$(1)) + + +# Declare the contents of the PHONY variable as phony. We keep that +# information in a variable so we can use it in if_changed and friends. + +.PHONY: $(PHONY) diff -urN oldtree/scripts/Makefile.modinst newtree/scripts/Makefile.modinst --- oldtree/scripts/Makefile.modinst 2006-04-01 04:48:27.000000000 -0500 +++ newtree/scripts/Makefile.modinst 2006-04-01 08:53:50.642666500 -0500 @@ -2,7 +2,7 @@ # Installing modules # ========================================================================== -.PHONY: __modinst +PHONY := __modinst __modinst: include scripts/Kbuild.include @@ -12,7 +12,7 @@ __modules := $(sort $(shell grep -h '\.ko' /dev/null $(wildcard $(MODVERDIR)/*.mod))) modules := $(patsubst %.o,%.ko,$(wildcard $(__modules:.ko=.o))) -.PHONY: $(modules) +PHONY += $(modules) __modinst: $(modules) @: @@ -27,3 +27,9 @@ $(modules): $(call cmd,modules_install,$(MODLIB)/$(modinst_dir)) + + +# Declare the contents of the PHONY variable as phony. We keep that +# information in a variable so we can use it in if_changed and friends. 
+ +.PHONY: $(PHONY) diff -urN oldtree/scripts/Makefile.modpost newtree/scripts/Makefile.modpost --- oldtree/scripts/Makefile.modpost 2006-04-01 04:48:27.000000000 -0500 +++ newtree/scripts/Makefile.modpost 2006-04-01 08:53:50.646666750 -0500 @@ -32,7 +32,7 @@ # Step 4 is solely used to allow module versioning in external modules, # where the CRC of each module is retrieved from the Module.symers file. -.PHONY: _modpost +PHONY := _modpost _modpost: __modpost include .config @@ -57,7 +57,7 @@ $(if $(KBUILD_EXTMOD),-i,-o) $(symverfile) \ $(filter-out FORCE,$^) -.PHONY: __modpost +PHONY += __modpost __modpost: $(wildcard vmlinux) $(modules:.ko=.o) FORCE $(call cmd,modpost) @@ -94,7 +94,7 @@ # Add FORCE to the prequisites of a target to force it to be always rebuilt. # --------------------------------------------------------------------------- -.PHONY: FORCE +PHONY += FORCE FORCE: @@ -109,3 +109,9 @@ ifneq ($(cmd_files),) include $(cmd_files) endif + + +# Declare the contents of the PHONY variable as phony. We keep that +# information in a variable so we can use it in if_changed and friends. 
+ +.PHONY: $(PHONY) diff -urN oldtree/scripts/kconfig/Makefile newtree/scripts/kconfig/Makefile --- oldtree/scripts/kconfig/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/scripts/kconfig/Makefile 2006-04-01 08:53:50.650667000 -0500 @@ -2,7 +2,7 @@ # Kernel configuration targets # These targets are used from top-level makefile -.PHONY: oldconfig xconfig gconfig menuconfig config silentoldconfig update-po-config +PHONY += oldconfig xconfig gconfig menuconfig config silentoldconfig update-po-config xconfig: $(obj)/qconf $< arch/$(ARCH)/Kconfig @@ -42,7 +42,7 @@ $(Q)rm -f arch/um/Kconfig_arch $(Q)rm -f scripts/kconfig/linux_*.pot scripts/kconfig/config.pot -.PHONY: randconfig allyesconfig allnoconfig allmodconfig defconfig +PHONY += randconfig allyesconfig allnoconfig allmodconfig defconfig randconfig: $(obj)/conf $< -r arch/$(ARCH)/Kconfig diff -urN oldtree/scripts/kconfig/lxdialog/Makefile newtree/scripts/kconfig/lxdialog/Makefile --- oldtree/scripts/kconfig/lxdialog/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/scripts/kconfig/lxdialog/Makefile 2006-04-01 08:53:50.654667250 -0500 @@ -7,10 +7,10 @@ # we really need to do so. 
(Do not call gcc as part of make mrproper) HOST_EXTRACFLAGS = $(shell $(CONFIG_SHELL) $(check-lxdialog) -ccflags) HOST_LOADLIBES = $(shell $(CONFIG_SHELL) $(check-lxdialog) -ldflags $(HOSTCC)) - -HOST_EXTRACFLAGS += -DLOCALE -.PHONY: dochecklxdialog +HOST_EXTRACFLAGS += -DLOCALE + +PHONY += dochecklxdialog $(obj)/dochecklxdialog: $(Q)$(CONFIG_SHELL) $(check-lxdialog) -check $(HOSTCC) $(HOST_LOADLIBES) diff -urN oldtree/scripts/package/Makefile newtree/scripts/package/Makefile --- oldtree/scripts/package/Makefile 2006-04-01 04:48:27.000000000 -0500 +++ newtree/scripts/package/Makefile 2006-04-01 08:53:50.654667250 -0500 @@ -32,7 +32,7 @@ PREV := set -e; cd ..; # rpm-pkg -.PHONY: rpm-pkg rpm +PHONY += rpm-pkg rpm $(objtree)/kernel.spec: $(MKSPEC) $(srctree)/Makefile $(CONFIG_SHELL) $(MKSPEC) > $@ @@ -54,10 +54,10 @@ clean-files := $(objtree)/kernel.spec # binrpm-pkg -.PHONY: binrpm-pkg +PHONY += binrpm-pkg $(objtree)/binkernel.spec: $(MKSPEC) $(srctree)/Makefile $(CONFIG_SHELL) $(MKSPEC) prebuilt > $@ - + binrpm-pkg: $(objtree)/binkernel.spec $(MAKE) KBUILD_SRC= set -e; \ @@ -72,7 +72,7 @@ # Deb target # --------------------------------------------------------------------------- # -.PHONY: deb-pkg +PHONY += deb-pkg deb-pkg: $(MAKE) KBUILD_SRC= $(CONFIG_SHELL) $(srctree)/scripts/package/builddeb @@ -82,7 +82,7 @@ # tarball targets # --------------------------------------------------------------------------- -.PHONY: tar%pkg +PHONY += tar%pkg tar%pkg: $(MAKE) KBUILD_SRC= $(CONFIG_SHELL) $(srctree)/scripts/package/buildtar $@