diff -urN oldtree/Documentation/kernel-parameters.txt newtree/Documentation/kernel-parameters.txt --- oldtree/Documentation/kernel-parameters.txt 2006-02-18 15:18:20.484203176 +0000 +++ newtree/Documentation/kernel-parameters.txt 2006-02-18 15:24:31.288832304 +0000 @@ -71,6 +71,7 @@ SERIAL Serial support is enabled. SMP The kernel is an SMP kernel. SPARC Sparc architecture is enabled. + SUSPEND2 Suspend2 is enabled. SWSUSP Software suspend is enabled. TS Appropriate touchscreen support is enabled. USB USB support is enabled. @@ -1039,6 +1040,8 @@ noresume [SWSUSP] Disables resume and restores original swap space. + noresume2 [SUSPEND2] Disables resuming and restores original swap signature. + no-scroll [VGA] Disables scrollback. This is required for the Braillex ib80-piezo Braille reader made by F.H. Papenmeier (Germany). @@ -1296,6 +1299,11 @@ resume= [SWSUSP] Specify the partition device for software suspend + resume2= [SUSPEND2] Specify the storage device for Suspend2. + Format: :. + See Documentation/power/suspend2.txt for details of the + formats for available image writers. + rhash_entries= [KNL,NET] Set number of hash buckets for route cache diff -urN oldtree/Documentation/kernel-parameters.txt.orig newtree/Documentation/kernel-parameters.txt.orig --- oldtree/Documentation/kernel-parameters.txt.orig 1970-01-01 00:00:00.000000000 +0000 +++ newtree/Documentation/kernel-parameters.txt.orig 2006-02-18 15:18:20.000000000 +0000 @@ -0,0 +1,1664 @@ +February 2003 Kernel Parameters v2.5.59 + ~~~~~~~~~~~~~~~~~ + +The following is a consolidated list of the kernel parameters as implemented +(mostly) by the __setup() macro and sorted into English Dictionary order +(defined as ignoring all punctuation and sorting digits before letters in a +case insensitive manner), and with descriptions where known. 
+ +Module parameters for loadable modules are specified only as the +parameter name with optional '=' and value as appropriate, such as: + + modprobe usbcore blinkenlights=1 + +Module parameters for modules that are built into the kernel image +are specified on the kernel command line with the module name plus +'.' plus parameter name, with '=' and value if appropriate, such as: + + usbcore.blinkenlights=1 + +The text in square brackets at the beginning of the description states the +restrictions on the kernel for the said kernel parameter to be valid. The +restrictions referred to are that the relevant option is valid if: + + ACPI ACPI support is enabled. + ALSA ALSA sound support is enabled. + APIC APIC support is enabled. + APM Advanced Power Management support is enabled. + AX25 Appropriate AX.25 support is enabled. + CD Appropriate CD support is enabled. + DEVFS devfs support is enabled. + DRM Direct Rendering Management support is enabled. + EDD BIOS Enhanced Disk Drive Services (EDD) is enabled + EFI EFI Partitioning (GPT) is enabled + EIDE EIDE/ATAPI support is enabled. + FB The frame buffer device is enabled. + HW Appropriate hardware is enabled. + IA-32 IA-32 aka i386 architecture is enabled. + IA-64 IA-64 architecture is enabled. + IOSCHED More than one I/O scheduler is enabled. + IP_PNP IP DHCP, BOOTP, or RARP is enabled. + ISAPNP ISA PnP code is enabled. + ISDN Appropriate ISDN support is enabled. + JOY Appropriate joystick support is enabled. + LP Printer support is enabled. + LOOP Loopback device support is enabled. + M68k M68k architecture is enabled. + These options have more detailed description inside of + Documentation/m68k/kernel-options.txt. + MCA MCA bus support is enabled. + MDA MDA console support is enabled. + MOUSE Appropriate mouse support is enabled. + MTD MTD support is enabled. + NET Appropriate network support is enabled. + NUMA NUMA support is enabled. + NFS Appropriate NFS support is enabled. + OSS OSS sound support is enabled. 
+ PARIDE The ParIDE subsystem is enabled. + PARISC The PA-RISC architecture is enabled. + PCI PCI bus support is enabled. + PCMCIA The PCMCIA subsystem is enabled. + PNP Plug & Play support is enabled. + PPC PowerPC architecture is enabled. + PPT Parallel port support is enabled. + PS2 Appropriate PS/2 support is enabled. + RAM RAM disk support is enabled. + S390 S390 architecture is enabled. + SCSI Appropriate SCSI support is enabled. + A lot of drivers have their options described inside of + Documentation/scsi/. + SELINUX SELinux support is enabled. + SERIAL Serial support is enabled. + SMP The kernel is an SMP kernel. + SPARC Sparc architecture is enabled. + SWSUSP Software suspend is enabled. + TS Appropriate touchscreen support is enabled. + USB USB support is enabled. + USBHID USB Human Interface Device support is enabled. + V4L Video For Linux support is enabled. + VGA The VGA console has been enabled. + VT Virtual terminal support is enabled. + WDT Watchdog support is enabled. + XT IBM PC/XT MFM hard disk support is enabled. + X86-64 X86-64 architecture is enabled. + More X86-64 boot options can be found in + Documentation/x86_64/boot-options.txt . + +In addition, the following text indicates that the option: + + BUGS= Relates to possible processor bugs on the said processor. + KNL Is a kernel start-up parameter. + BOOT Is a boot loader parameter. + +Parameters denoted with BOOT are actually interpreted by the boot +loader, and have no meaning to the kernel directly. +Do not modify the syntax of boot loader parameters without extreme +need or coordination with <Documentation/i386/boot.txt>. + +Note that ALL kernel parameters listed below are CASE SENSITIVE, and that +a trailing = on the name of any parameter states that that parameter will +be entered as an environment variable, whereas its absence indicates that +it will appear as a kernel argument readable via /proc/cmdline by programs +running once the system is up. 
+ + 53c7xx= [HW,SCSI] Amiga SCSI controllers + See header of drivers/scsi/53c7xx.c. + See also Documentation/scsi/ncr53c7xx.txt. + + acpi= [HW,ACPI] Advanced Configuration and Power Interface + Format: { force | off | ht | strict | noirq } + force -- enable ACPI if default was off + off -- disable ACPI if default was on + noirq -- do not use ACPI for IRQ routing + ht -- run only enough ACPI to enable Hyper Threading + strict -- Be less tolerant of platforms that are not + strictly ACPI specification compliant. + + See also Documentation/pm.txt, pci=noacpi + + acpi_sleep= [HW,ACPI] Sleep options + Format: { s3_bios, s3_mode } + See Documentation/power/video.txt + + acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode + Format: { level | edge | high | low } + + acpi_irq_balance [HW,ACPI] + ACPI will balance active IRQs + default in APIC mode + + acpi_irq_nobalance [HW,ACPI] + ACPI will not move active IRQs (default) + default in PIC mode + + acpi_irq_pci= [HW,ACPI] If irq_balance, clear listed IRQs for + use by PCI + Format: ,... + + acpi_irq_isa= [HW,ACPI] If irq_balance, mark listed IRQs used by ISA + Format: ,... + + acpi_osi= [HW,ACPI] empty param disables _OSI + + acpi_serialize [HW,ACPI] force serialization of AML methods + + acpi_skip_timer_override [HW,ACPI] + Recognize and ignore IRQ0/pin2 Interrupt Override. + For broken nForce2 BIOS resulting in XT-PIC timer. + + acpi_dbg_layer= [HW,ACPI] + Format: + Each bit of the indicates an ACPI debug layer, + 1: enable, 0: disable. It is useful for boot time + debugging. After system has booted up, it can be set + via /proc/acpi/debug_layer. + + acpi_dbg_level= [HW,ACPI] + Format: + Each bit of the indicates an ACPI debug level, + 1: enable, 0: disable. It is useful for boot time + debugging. After system has booted up, it can be set + via /proc/acpi/debug_level. 
+ + acpi_fake_ecdt [HW,ACPI] Workaround failure due to BIOS lacking ECDT + + acpi_generic_hotkey [HW,ACPI] + Allow consolidated generic hotkey driver to + override platform specific driver. + See also Documentation/acpi-hotkey.txt. + + enable_timer_pin_1 [i386,x86-64] + Enable PIN 1 of APIC timer + Can be useful to work around chipset bugs + (in particular on some ATI chipsets). + The kernel tries to set a reasonable default. + + disable_timer_pin_1 [i386,x86-64] + Disable PIN 1 of APIC timer + Can be useful to work around chipset bugs. + + ad1816= [HW,OSS] + Format: ,,, + See also Documentation/sound/oss/AD1816. + + ad1848= [HW,OSS] + Format: ,,,, + + adlib= [HW,OSS] + Format: + + advansys= [HW,SCSI] + See header of drivers/scsi/advansys.c. + + advwdt= [HW,WDT] Advantech WDT + Format: , + + aedsp16= [HW,OSS] Audio Excel DSP 16 + Format: ,,,,, + See also header of sound/oss/aedsp16.c. + + aha152x= [HW,SCSI] + See Documentation/scsi/aha152x.txt. + + aha1542= [HW,SCSI] + Format: [,,[,]] + + aic7xxx= [HW,SCSI] + See Documentation/scsi/aic7xxx.txt. + + aic79xx= [HW,SCSI] + See Documentation/scsi/aic79xx.txt. + + amijoy.map= [HW,JOY] Amiga joystick support + Map of devices attached to JOY0DAT and JOY1DAT + Format: , + See also Documentation/kernel/input/joystick.txt + + analog.map= [HW,JOY] Analog joystick and gamepad support + Specifies type or capabilities of an analog joystick + connected to one of 16 gameports + Format: ,,.. + + apc= [HW,SPARC] + Power management functions (SPARCstation-4/5 + deriv.) + Format: noidle + Disable APC CPU standby support. SPARCstation-Fox does + not play well with APC CPU idle - disable it if you have + APC and your system crashes randomly. + + apic= [APIC,i386] Change the output verbosity whilst booting + Format: { quiet (default) | verbose | debug } + Change the amount of debugging information output + when initialising the APIC and IO-APIC components. + + apm= [APM] Advanced Power Management + See header of arch/i386/kernel/apm.c. 
+ + applicom= [HW] + Format: , + + arcrimi= [HW,NET] ARCnet - "RIM I" (entirely mem-mapped) cards + Format: ,, + + ataflop= [HW,M68k] + + atarimouse= [HW,MOUSE] Atari Mouse + + atascsi= [HW,SCSI] Atari SCSI + + atkbd.extra= [HW] Enable extra LEDs and keys on IBM RapidAccess, + EzKey and similar keyboards + + atkbd.reset= [HW] Reset keyboard during initialization + + atkbd.set= [HW] Select keyboard code set + Format: (2 = AT (default), 3 = PS/2) + + atkbd.scroll= [HW] Enable scroll wheel on MS Office and similar + keyboards + + atkbd.softraw= [HW] Choose between synthetic and real raw mode + Format: (0 = real, 1 = synthetic (default)) + + atkbd.softrepeat= [HW] + Use software keyboard repeat + + autotest [IA64] + + awe= [HW,OSS] AWE32/SB32/AWE64 wave table synth + Format: ,, + + aztcd= [HW,CD] Aztech CD268 CDROM driver + Format: ,0x79 (?) + + baycom_epp= [HW,AX25] + Format: , + + baycom_par= [HW,AX25] BayCom Parallel Port AX.25 Modem + Format: , + See header of drivers/net/hamradio/baycom_par.c. + + baycom_ser_fdx= [HW,AX25] + BayCom Serial Port AX.25 Modem (Full Duplex Mode) + Format: ,,[,] + See header of drivers/net/hamradio/baycom_ser_fdx.c. + + baycom_ser_hdx= [HW,AX25] + BayCom Serial Port AX.25 Modem (Half Duplex Mode) + Format: ,, + See header of drivers/net/hamradio/baycom_ser_hdx.c. + + blkmtd_device= [HW,MTD] + blkmtd_erasesz= + blkmtd_ro= + blkmtd_bs= + blkmtd_count= + + bttv.card= [HW,V4L] bttv (bt848 + bt878 based grabber cards) + bttv.radio= Most important insmod options are available as + kernel args too. + bttv.pll= See Documentation/video4linux/bttv/Insmod-options + bttv.tuner= and Documentation/video4linux/bttv/CARDLIST + + BusLogic= [HW,SCSI] + See drivers/scsi/BusLogic.c, comment before function + BusLogic_ParseDriverOptions(). + + c101= [NET] Moxa C101 synchronous serial card + + cachesize= [BUGS=IA-32] Override level 2 CPU cache size detection. + Sometimes CPU hardware bugs make them report the cache + size incorrectly. 
The kernel will attempt work arounds + to fix known problems, but for some CPUs it is not + possible to determine what the correct size should be. + This option provides an override for these situations. + + cdu31a= [HW,CD] + Format: ,[,PAS] + See header of drivers/cdrom/cdu31a.c. + + chandev= [HW,NET] Generic channel device initialisation + + checkreqprot [SELINUX] Set initial checkreqprot flag value. + Format: { "0" | "1" } + See security/selinux/Kconfig help text. + 0 -- check protection applied by kernel (includes + any implied execute protection). + 1 -- check protection requested by application. + Default value is set via a kernel config option. + Value can be changed at runtime via + /selinux/checkreqprot. + + clock= [BUGS=IA-32,HW] gettimeofday timesource override. + Forces specified timesource (if available) to be used + when calculating gettimeofday(). If specified + timesource is not available, it defaults to PIT. + Format: { pit | tsc | cyclone | pmtmr } + + hpet= [IA-32,HPET] option to disable HPET and use PIT. + Format: disable + + cm206= [HW,CD] + Format: { auto | [,][] } + + com20020= [HW,NET] ARCnet - COM20020 chipset + Format: + [,[,[,[,[,]]]]] + + com90io= [HW,NET] ARCnet - COM90xx chipset (IO-mapped buffers) + Format: [,] + + com90xx= [HW,NET] + ARCnet - COM90xx chipset (memory-mapped buffers) + Format: [,[,]] + + condev= [HW,S390] console device + conmode= + + console= [KNL] Output console device and options. + + tty Use the virtual console device . + + ttyS[,options] + Use the specified serial port. The options are of + the form "bbbbpn", where "bbbb" is the baud rate, + "p" is parity ("n", "o", or "e"), and "n" is bits. + Default is "9600n8". + + See also Documentation/serial-console.txt. + + uart,io,[,options] + uart,mmio,[,options] + Start an early, polled-mode console on the 8250/16550 + UART at the specified I/O port or MMIO address, + switching to the matching ttyS device later. The + options are the same as for ttyS, above. 
+ + cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver + Format: + ,,,[,] + + cpia_pp= [HW,PPT] + Format: { parport | auto | none } + + crashkernel=nn[KMG]@ss[KMG] + [KNL] Reserve a chunk of physical memory to + hold a kernel to switch to with kexec on panic. + + cs4232= [HW,OSS] + Format: ,,,,, + + cs89x0_dma= [HW,NET] + Format: + + cs89x0_media= [HW,NET] + Format: { rj45 | aui | bnc } + + cyclades= [HW,SERIAL] Cyclades multi-serial port adapter. + + dasd= [HW,NET] + See header of drivers/s390/block/dasd_devmap.c. + + db9.dev[2|3]= [HW,JOY] Multisystem joystick support via parallel port + (one device per port) + Format: , + See also Documentation/input/joystick-parport.txt + + debug [KNL] Enable kernel debugging (events log level). + + decnet= [HW,NET] + Format: [,] + See also Documentation/networking/decnet.txt. + + devfs= [DEVFS] + See Documentation/filesystems/devfs/boot-options. + + dhash_entries= [KNL] + Set number of hash buckets for dentry cache. + + digi= [HW,SERIAL] + IO parameters + enable/disable command. + + digiepca= [HW,SERIAL] + See drivers/char/README.epca and + Documentation/digiepca.txt. + + dmascc= [HW,AX25,SERIAL] AX.25 Z80SCC driver with DMA + support available. + Format: [,[,..]] + + dmasound= [HW,OSS] Sound subsystem buffers + + dscc4.setup= [NET] + + dtc3181e= [HW,SCSI] + + earlyprintk= [IA-32,X86-64] + earlyprintk=vga + earlyprintk=serial[,ttySn[,baudrate]] + + Append ",keep" to not disable it when the real console + takes over. + + Only vga or serial at a time, not both. + + Currently only ttyS0 and ttyS1 are supported. + + Interaction with the standard serial driver is not + very good. + + The VGA output is eventually overwritten by the real + console. 
+ + eata= [HW,SCSI] + + ec_intr= [HW,ACPI] ACPI Embedded Controller interrupt mode + Format: + 0: polling mode + non-0: interrupt mode (default) + + eda= [HW,PS2] + + edb= [HW,PS2] + + edd= [EDD] + Format: {"of[f]" | "sk[ipmbr]"} + See comment in arch/i386/boot/edd.S + + eicon= [HW,ISDN] + Format: ,, + + eisa_irq_edge= [PARISC,HW] + See header of drivers/parisc/eisa.c. + + elanfreq= [IA-32] + See comment before function elanfreq_setup() in + arch/i386/kernel/cpu/cpufreq/elanfreq.c. + + elevator= [IOSCHED] + Format: {"anticipatory" | "cfq" | "deadline" | "noop"} + See Documentation/block/as-iosched.txt and + Documentation/block/deadline-iosched.txt for details. + + elfcorehdr= [IA-32, X86_64] + Specifies physical address of start of kernel core + image elf header. Generally kexec loader will + pass this option to capture kernel. + See Documentation/kdump/kdump.txt for details. + + enforcing [SELINUX] Set initial enforcing status. + Format: {"0" | "1"} + See security/selinux/Kconfig help text. + 0 -- permissive (log only, no denials). + 1 -- enforcing (deny and log). + Default value is 0. + Value can be changed at runtime via /selinux/enforce. + + es1370= [HW,OSS] + Format: [,] + See also header of sound/oss/es1370.c. + + es1371= [HW,OSS] + Format: ,[,[]] + See also header of sound/oss/es1371.c. + + ether= [HW,NET] Ethernet cards parameters + This option is obsoleted by the "netdev=" option, which + has equivalent usage. See its documentation for details. + + eurwdt= [HW,WDT] Eurotech CPU-1220/1410 onboard watchdog. + Format: [,] + + fd_mcs= [HW,SCSI] + See header of drivers/scsi/fd_mcs.c. + + fdomain= [HW,SCSI] + See header of drivers/scsi/fdomain.c. + + floppy= [HW] + See Documentation/floppy.txt. + + ftape= [HW] Floppy Tape subsystem debugging options. + See Documentation/ftape.txt. 
+ + gamecon.map[2|3]= + [HW,JOY] Multisystem joystick and NES/SNES/PSX pad + support via parallel port (up to 5 devices per port) + Format: ,,,,, + See also Documentation/input/joystick-parport.txt + + gamma= [HW,DRM] + + gdth= [HW,SCSI] + See header of drivers/scsi/gdth.c. + + gpt [EFI] Forces disk with valid GPT signature but + invalid Protective MBR to be treated as GPT. + + gscd= [HW,CD] + Format: + + gt96100eth= [NET] MIPS GT96100 Advanced Communication Controller + + gus= [HW,OSS] + Format: ,,, + + gvp11= [HW,SCSI] + + hashdist= [KNL,NUMA] Large hashes allocated during boot + are distributed across NUMA nodes. Defaults on + for IA-64, off otherwise. + Format: 0 | 1 (for off | on) + + hcl= [IA-64] SGI's Hardware Graph compatibility layer + + hd= [EIDE] (E)IDE hard drive subsystem geometry + Format: ,, + + hd?= [HW] (E)IDE subsystem + hd?lun= See Documentation/ide.txt. + + highmem=nn[KMG] [KNL,BOOT] forces the highmem zone to have an exact + size of . This works even on boxes that have no + highmem otherwise. This also works to reduce highmem + size on bigger boxes. + + hisax= [HW,ISDN] + See Documentation/isdn/README.HiSax. + + hugepages= [HW,IA-32,IA-64] Maximal number of HugeTLB pages. 
+ + noirqbalance [IA-32,SMP,KNL] Disable kernel irq balancing + + i8042.direct [HW] Put keyboard port into non-translated mode + i8042.dumbkbd [HW] Pretend that controller can only read data from + keyboard and can not control its state + (Don't attempt to blink the leds) + i8042.noaux [HW] Don't check for auxiliary (== mouse) port + i8042.nokbd [HW] Don't check/create keyboard port + i8042.nomux [HW] Don't check presence of an active multiplexing + controller + i8042.nopnp [HW] Don't use ACPIPnP / PnPBIOS to discover KBD/AUX + controllers + i8042.panicblink= + [HW] Frequency with which keyboard LEDs should blink + when kernel panics (default is 0.5 sec) + i8042.reset [HW] Reset the controller during init and cleanup + i8042.unlock [HW] Unlock (ignore) the keylock + + i810= [HW,DRM] + + i8k.ignore_dmi [HW] Continue probing hardware even if DMI data + indicates that the driver is running on unsupported + hardware. + i8k.force [HW] Activate i8k driver even if SMM BIOS signature + does not match list of supported models. + i8k.power_status + [HW] Report power status in /proc/i8k + (disabled by default) + i8k.restricted [HW] Allow controlling fans only if SYS_ADMIN + capability is set. + + ibmmcascsi= [HW,MCA,SCSI] IBM MicroChannel SCSI adapter + See Documentation/mca.txt. + + icn= [HW,ISDN] + Format: [,[,[,]]] + + ide= [HW] (E)IDE subsystem + Format: ide=nodma or ide=doubler or ide=reverse + See Documentation/ide.txt. + + ide?= [HW] (E)IDE subsystem + Format: ide?=noprobe or chipset specific parameters. + See Documentation/ide.txt. + + idebus= [HW] (E)IDE subsystem - VLB/PCI bus speed + See Documentation/ide.txt. + + idle= [HW] + Format: idle=poll or idle=halt + + ihash_entries= [KNL] + Set number of hash buckets for inode cache. + + in2000= [HW,SCSI] + See header of drivers/scsi/in2000.c. + + init= [KNL] + Format: + Run specified binary instead of /sbin/init as init + process. + + initcall_debug [KNL] Trace initcalls as they are executed. 
Useful + for working out where the kernel is dying during + startup. + + initrd= [BOOT] Specify the location of the initial ramdisk + + inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver + Format: + + combined_mode= [HW] control which driver uses IDE ports in combined + mode: legacy IDE driver, libata, or both + (in the libata case, libata.atapi_enabled=1 may be + useful as well). Note that using the ide or libata + options may affect your device naming (e.g. by + changing hdc to sdb). + Format: combined (default), ide, or libata + + inttest= [IA64] + + io7= [HW] IO7 for Marvel based alpha systems + See comment before marvel_specify_io7 in + arch/alpha/kernel/core_marvel.c. + + ip= [IP_PNP] + See Documentation/nfsroot.txt. + + ip2= [HW] Set IO/IRQ pairs for up to 4 IntelliPort boards + See comment before ip2_setup() in drivers/char/ip2.c. + + ips= [HW,SCSI] Adaptec / IBM ServeRAID controller + See header of drivers/scsi/ips.c. + + irqfixup [HW] + When an interrupt is not handled search all handlers + for it. Intended to get systems with badly broken + firmware running. + + irqpoll [HW] + When an interrupt is not handled search all handlers + for it. Also check all handlers each timer + interrupt. Intended to get systems with badly broken + firmware running. + + isapnp= [ISAPNP] + Format: ,,, + + isolcpus= [KNL,SMP] Isolate CPUs from the general scheduler. + Format: ,..., + This option can be used to specify one or more CPUs + to isolate from the general SMP balancing and scheduling + algorithms. The only way to move a process onto or off + an "isolated" CPU is via the CPU affinity syscalls. + begins at 0 and the maximum value is + "number of CPUs in system - 1". + + This option is the preferred way to isolate CPUs. The + alternative -- manually setting the CPU mask of all + tasks in the system -- can cause problems and + suboptimal load balancer performance. 
+ + isp16= [HW,CD] + Format: ,,, + + iucv= [HW,NET] + + js= [HW,JOY] Analog joystick + See Documentation/input/joystick.txt. + + keepinitrd [HW,ARM] + + kstack=N [IA-32,X86-64] Print N words from the kernel stack + in oops dumps. + + l2cr= [PPC] + + lapic [IA-32,APIC] Enable the local APIC even if BIOS + disabled it. + + lasi= [HW,SCSI] PARISC LASI driver for the 53c700 chip + Format: addr:,irq: + + llsc*= [IA64] See function print_params() in + arch/ia64/sn/kernel/llsc4.c. + + load_ramdisk= [RAM] List of ramdisks to load from floppy + See Documentation/ramdisk.txt. + + lockd.nlm_grace_period=P [NFS] Assign grace period. + Format: + + lockd.nlm_tcpport=N [NFS] Assign TCP port. + Format: + + lockd.nlm_timeout=T [NFS] Assign timeout value. + Format: + + lockd.nlm_udpport=M [NFS] Assign UDP port. + Format: + + logibm.irq= [HW,MOUSE] Logitech Bus Mouse Driver + Format: + + loglevel= All Kernel Messages with a loglevel smaller than the + console loglevel will be printed to the console. It can + also be changed with klogd or other programs. The + loglevels are defined as follows: + + 0 (KERN_EMERG) system is unusable + 1 (KERN_ALERT) action must be taken immediately + 2 (KERN_CRIT) critical conditions + 3 (KERN_ERR) error conditions + 4 (KERN_WARNING) warning conditions + 5 (KERN_NOTICE) normal but significant condition + 6 (KERN_INFO) informational + 7 (KERN_DEBUG) debug-level messages + + log_buf_len=n Sets the size of the printk ring buffer, in bytes. + Format: { n | nk | nM } + n must be a power of two. The default size + is set in the kernel config file. + + lp=0 [LP] Specify parallel ports to use, e.g, + lp=port[,port...] lp=none,parport0 (lp0 not configured, lp1 uses + lp=reset first parallel port). 'lp=0' disables the + lp=auto printer driver. 'lp=reset' (which can be + specified in addition to the ports) causes + attached printers to be reset. Using + lp=port1,port2,... specifies the parallel ports + to associate lp devices with, starting with + lp0. 
A port specification may be 'none' to skip + that lp device, or a parport name such as + 'parport0'. Specifying 'lp=auto' instead of a + port specification list means that device IDs + from each port should be examined, to see if + an IEEE 1284-compliant printer is attached; if + so, the driver will manage that printer. + See also header of drivers/char/lp.c. + + lpj=n [KNL] + Sets loops_per_jiffy to given constant, thus avoiding + time-consuming boot-time autodetection (up to 250 ms per + CPU). 0 enables autodetection (default). To determine + the correct value for your kernel, boot with normal + autodetection and see what value is printed. Note that + on SMP systems the preset will be applied to all CPUs, + which is likely to cause problems if your CPUs need + significantly divergent settings. An incorrect value + will cause delays in the kernel to be wrong, leading to + unpredictable I/O errors and other breakage. Although + unlikely, in the extreme case this might damage your + hardware. + + ltpc= [NET] + Format: ,, + + mac5380= [HW,SCSI] Format: + ,,,, + + mac53c9x= [HW,SCSI] Format: + ,,,,,,, + + machvec= [IA64] Force the use of a particular machine-vector + (machvec) in a generic kernel. + Example: machvec=hpzx1_swiotlb + + mad16= [HW,OSS] Format: + ,,,,,, + + maui= [HW,OSS] + Format: , + + max_loop= [LOOP] Maximum number of loopback devices that can + be mounted + Format: <1-256> + + maxcpus= [SMP] Maximum number of processors that an SMP kernel + should make use of + + max_addr=[KMG] [KNL,BOOT,ia64] All physical memory greater than or + equal to this physical address is ignored. + + max_luns= [SCSI] Maximum number of LUNs to probe. + Should be between 1 and 2^32-1. + + max_report_luns= + [SCSI] Maximum number of LUNs received. + Should be between 1 and 16384. 
+ + mca-pentium [BUGS=IA-32] + + mcatest= [IA-64] + + mcd= [HW,CD] + Format: ,, + + mcdx= [HW,CD] + + mce [IA-32] Machine Check Exception + + md= [HW] RAID subsystems devices and level + See Documentation/md.txt. + + mdacon= [MDA] + Format: , + Specifies range of consoles to be captured by the MDA. + + mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory + Amount of memory to be used when the kernel is not able + to see the whole system memory or for test. + [IA-32] Use together with memmap= to avoid physical + address space collisions. Without memmap= PCI devices + could be placed at addresses belonging to unused RAM. + + mem=nopentium [BUGS=IA-32] Disable usage of 4MB pages for kernel + memory. + + memmap=exactmap [KNL,IA-32,X86_64] Enable setting of an exact + E820 memory map, as specified by the user. + Such memmap=exactmap lines can be constructed based on + BIOS output or other requirements. See the memmap=nn@ss + option description. + + memmap=nn[KMG]@ss[KMG] + [KNL] Force usage of a specific region of memory + Region of memory to be used, from ss to ss+nn. + + memmap=nn[KMG]#ss[KMG] + [KNL,ACPI] Mark specific memory as ACPI data. + Region of memory to be used, from ss to ss+nn. + + memmap=nn[KMG]$ss[KMG] + [KNL,ACPI] Mark specific memory as reserved. + Region of memory to be used, from ss to ss+nn. + + meye.*= [HW] Set MotionEye Camera parameters + See Documentation/video4linux/meye.txt. + + mga= [HW,DRM] + + migration_cost= + [KNL,SMP] debug: override scheduler migration costs + Format: ,,... + This debugging option can be used to override the + default scheduler migration cost matrix. The numbers + are indexed by 'CPU domain distance'. + E.g. migration_cost=1000,2000,3000 on an SMT NUMA + box will set up an intra-core migration cost of + 1 msec, an inter-core migration cost of 2 msecs, + and an inter-node migration cost of 3 msecs. 
+ + WARNING: using the wrong values here can break + scheduler performance, so it's only for scheduler + development purposes, not production environments. + + migration_debug= + [KNL,SMP] migration cost auto-detect verbosity + Format=<0|1|2> + If a system's migration matrix reported at bootup + seems erroneous then this option can be used to + increase verbosity of the detection process. + We default to 0 (no extra messages), 1 will print + some more information, and 2 will be really + verbose (probably only useful if you also have a + serial console attached to the system). + + migration_factor= + [KNL,SMP] multiply/divide migration costs by a factor + Format= + This debug option can be used to proportionally + increase or decrease the auto-detected migration + costs for all entries of the migration matrix. + E.g. migration_factor=150 will increase migration + costs by 50%. (and thus the scheduler will be less + eager migrating cache-hot tasks) + migration_factor=80 will decrease migration costs + by 20%. (thus the scheduler will be more eager to + migrate tasks) + + WARNING: using the wrong values here can break + scheduler performance, so it's only for scheduler + development purposes, not production environments. + + mousedev.tap_time= + [MOUSE] Maximum time between finger touching and + leaving touchpad surface for touch to be considered + a tap and be reported as a left button click (for + touchpads working in absolute mode only). + Format: + mousedev.xres= [MOUSE] Horizontal screen resolution, used for devices + reporting absolute coordinates, such as tablets + mousedev.yres= [MOUSE] Vertical screen resolution, used for devices + reporting absolute coordinates, such as tablets + + mpu401= [HW,OSS] + Format: , + + MTD_Partition= [MTD] + Format: ,,, + + MTD_Region= [MTD] Format: + ,[,,,,] + + mtdparts= [MTD] + See drivers/mtd/cmdline.c. 
+ + mtouchusb.raw_coordinates= + [HW] Make the MicroTouch USB driver use raw coordinates + ('y', default) or cooked coordinates ('n') + + n2= [NET] SDL Inc. RISCom/N2 synchronous serial card + + NCR_D700= [HW,SCSI] + See header of drivers/scsi/NCR_D700.c. + + ncr5380= [HW,SCSI] + + ncr53c400= [HW,SCSI] + + ncr53c400a= [HW,SCSI] + + ncr53c406a= [HW,SCSI] + + ncr53c8xx= [HW,SCSI] + + netdev= [NET] Network devices parameters + Format: ,,,, + Note that mem_start is often overloaded to mean + something different and driver-specific. + This usage is only documented in each driver source + file if at all. + + nfsaddrs= [NFS] + See Documentation/nfsroot.txt. + + nfsroot= [NFS] nfs root filesystem for disk-less boxes. + See Documentation/nfsroot.txt. + + nfs.callback_tcpport= + [NFS] set the TCP port on which the NFSv4 callback + channel should listen. + + nfs.idmap_cache_timeout= + [NFS] set the maximum lifetime for idmapper cache + entries. + + nmi_watchdog= [KNL,BUGS=IA-32] Debugging features for SMP kernels + + no387 [BUGS=IA-32] Tells the kernel to use the 387 maths + emulation library even if a 387 maths coprocessor + is present. + + noalign [KNL,ARM] + + noapic [SMP,APIC] Tells the kernel to not make use of any + IOAPICs that may be present in the system. + + noasync [HW,M68K] Disables async and sync negotiation for + all devices. + + nobats [PPC] Do not use BATs for mapping kernel lowmem + on "Classic" PPC cores. + + nocache [ARM] + + nodisconnect [HW,SCSI,M68K] Disables SCSI disconnects. + + noexec [IA-64] + + noexec [IA-32,X86-64] + noexec=on: enable non-executable mappings (default) + noexec=off: disable non-executable mappings + + nofxsr [BUGS=IA-32] + + nohlt [BUGS=ARM] + + no-hlt [BUGS=IA-32] Tells the kernel that the hlt + instruction doesn't work correctly and not to + use it. + + nohalt [IA-64] Tells the kernel not to use the power saving + function PAL_HALT_LIGHT when idle. This increases + power-consumption. 
On the positive side, it reduces + interrupt wake-up latency, which may improve performance + in certain environments such as networked servers or + real-time systems. + + noirqdebug [IA-32] Disables the code which attempts to detect and + disable unhandled interrupt sources. + + noisapnp [ISAPNP] Disables ISA PnP code. + + noinitrd [RAM] Tells the kernel not to load any configured + initial RAM disk. + + nointroute [IA-64] + + nolapic [IA-32,APIC] Do not enable or use the local APIC. + + noltlbs [PPC] Do not use large page/tlb entries for kernel + lowmem mapping on PPC40x. + + nomce [IA-32] Machine Check Exception + + noresidual [PPC] Don't use residual data on PReP machines. + + noresume [SWSUSP] Disables resume and restores original swap + space. + + no-scroll [VGA] Disables scrollback. + This is required for the Braillex ib80-piezo Braille + reader made by F.H. Papenmeier (Germany). + + nosbagart [IA-64] + + nosmp [SMP] Tells an SMP kernel to act as a UP kernel. + + nosync [HW,M68K] Disables sync negotiation for all devices. + + notsc [BUGS=IA-32] Disable Time Stamp Counter + + nousb [USB] Disable the USB subsystem + + nowb [ARM] + + nr_uarts= [SERIAL] maximum number of UARTs to be registered. + + opl3= [HW,OSS] + Format: + + opl3sa= [HW,OSS] + Format: ,,,,, + + opl3sa2= [HW,OSS] Format: + ,,,,,,,[,, + + osst= [HW,SCSI] SCSI Tape Driver + Format: , + See also Documentation/scsi/st.txt. + + panic= [KNL] Kernel behaviour on panic + Format: + + parkbd.port= [HW] Parallel port number the keyboard adapter is + connected to, default is 0. + Format: + parkbd.mode= [HW] Parallel port keyboard adapter mode of operation, + 0 for XT, 1 for AT (default is AT). + Format: + + parport= [HW,PPT] Specify parallel ports. 0 disables. + Format: { 0 | auto | 0xBBB[,IRQ[,DMA]] } + Use 'auto' to force the driver to use any + IRQ/DMA settings detected (the default is to + ignore detected IRQ/DMA settings because of + possible conflicts). 
You can specify the base + address, IRQ, and DMA settings; IRQ and DMA + should be numbers, or 'auto' (for using detected + settings on that particular port), or 'nofifo' + (to avoid using a FIFO even if it is detected). + Parallel ports are assigned in the order they + are specified on the command line, starting + with parport0. + + parport_init_mode= [HW,PPT] + Configure VIA parallel port to operate in + a specific mode. This is necessary on Pegasos + computer where firmware has no options for setting + up parallel port mode and sets it to spp. + Currently this function knows 686a and 8231 chips. + Format: [spp|ps2|epp|ecp|ecpepp] + + pas2= [HW,OSS] Format: + ,,,,,,, + + pas16= [HW,SCSI] + See header of drivers/scsi/pas16.c. + + pcbit= [HW,ISDN] + + pcd. [PARIDE] + See header of drivers/block/paride/pcd.c. + See also Documentation/paride.txt. + + pci=option[,option...] [PCI] various PCI subsystem options: + off [IA-32] don't probe for the PCI bus + bios [IA-32] force use of PCI BIOS, don't access + the hardware directly. Use this if your machine + has a non-standard PCI host bridge. + nobios [IA-32] disallow use of PCI BIOS, only direct + hardware access methods are allowed. Use this + if you experience crashes upon bootup and you + suspect they are caused by the BIOS. + conf1 [IA-32] Force use of PCI Configuration + Mechanism 1. + conf2 [IA-32] Force use of PCI Configuration + Mechanism 2. + nommconf [IA-32,X86_64] Disable use of MMCONFIG for PCI + Configuration + nosort [IA-32] Don't sort PCI devices according to + order given by the PCI BIOS. This sorting is + done to get a device order compatible with + older kernels. + biosirq [IA-32] Use PCI BIOS calls to get the interrupt + routing table. These calls are known to be buggy + on several machines and they hang the machine + when used, but on other computers it's the only + way to get the interrupt routing table. 
Try + this option if the kernel is unable to allocate + IRQs or discover secondary PCI buses on your + motherboard. + rom [IA-32] Assign address space to expansion ROMs. + Use with caution as certain devices share + address decoders between ROMs and other + resources. + irqmask=0xMMMM [IA-32] Set a bit mask of IRQs allowed to be + assigned automatically to PCI devices. You can + make the kernel exclude IRQs of your ISA cards + this way. + pirqaddr=0xAAAAA [IA-32] Specify the physical address + of the PIRQ table (normally generated + by the BIOS) if it is outside the + F0000h-100000h range. + lastbus=N [IA-32] Scan all buses thru bus #N. Can be + useful if the kernel is unable to find your + secondary buses and you want to tell it + explicitly which ones they are. + assign-busses [IA-32] Always assign all PCI bus + numbers ourselves, overriding + whatever the firmware may have done. + usepirqmask [IA-32] Honor the possible IRQ mask stored + in the BIOS $PIR table. This is needed on + some systems with broken BIOSes, notably + some HP Pavilion N5400 and Omnibook XE3 + notebooks. This will have no effect if ACPI + IRQ routing is enabled. + noacpi [IA-32] Do not use ACPI for IRQ routing + or for PCI scanning. + routeirq Do IRQ routing for all PCI devices. + This is normally done in pci_enable_device(), + so this option is a temporary workaround + for broken drivers that don't call it. + firmware [ARM] Do not re-enumerate the bus but instead + just use the configuration from the + bootloader. This is currently used on + IXP2000 systems where the bus has to be + configured a certain way for adjunct CPUs. + + pcmv= [HW,PCMCIA] BadgePAD 4 + + pd. [PARIDE] + See Documentation/paride.txt. + + pdcchassis= [PARISC,HW] Disable/Enable PDC Chassis Status codes at + boot time. + Format: { 0 | 1 } + See arch/parisc/kernel/pdc_chassis.c + + pf. [PARIDE] + See Documentation/paride.txt. + + pg. [PARIDE] + See Documentation/paride.txt. 
+ + pirq= [SMP,APIC] Manual mp-table setup + See Documentation/i386/IO-APIC.txt. + + plip= [PPT,NET] Parallel port network link + Format: { parport | timid | 0 } + See also Documentation/parport.txt. + + pnpacpi= [ACPI] + { off } + + pnpbios= [ISAPNP] + { on | off | curr | res | no-curr | no-res } + + pnp_reserve_irq= + [ISAPNP] Exclude IRQs for the autoconfiguration + + pnp_reserve_dma= + [ISAPNP] Exclude DMAs for the autoconfiguration + + pnp_reserve_io= [ISAPNP] Exclude I/O ports for the autoconfiguration + Ranges are in pairs (I/O port base and size). + + pnp_reserve_mem= + [ISAPNP] Exclude memory regions for the + autoconfiguration. + Ranges are in pairs (memory base and size). + + profile= [KNL] Enable kernel profiling via /proc/profile + Format: [schedule,] + Param: "schedule" - profile schedule points. + Param: - step/bucket size as a power of 2 for + statistical time based profiling. + + processor.max_cstate= [HW,ACPI] + Limit processor to maximum C-state + max_cstate=9 overrides any DMI blacklist limit. + + processor.nocst [HW,ACPI] + Ignore the _CST method to determine C-states, + instead using the legacy FADT method + + prompt_ramdisk= [RAM] List of RAM disks to prompt for floppy disk + before loading. + See Documentation/ramdisk.txt. + + psmouse.proto= [HW,MOUSE] Highest PS2 mouse protocol extension to + probe for; one of (bare|imps|exps|lifebook|any). + psmouse.rate= [HW,MOUSE] Set desired mouse report rate, in reports + per second. + psmouse.resetafter= [HW,MOUSE] + Try to reset the device after so many bad packets + (0 = never). + psmouse.resolution= + [HW,MOUSE] Set desired mouse resolution, in dpi. + psmouse.smartscroll= + [HW,MOUSE] Controls Logitech smartscroll autorepeat. + 0 = disabled, 1 = enabled (default). + + pss= [HW,OSS] Personal Sound System (ECHO ESC614) + Format: + ,,,,, + + pt. [PARIDE] + See Documentation/paride.txt. + + quiet= [KNL] Disable log messages + + r128= [HW,DRM] + + raid= [HW,RAID] + See Documentation/md.txt. 
+ + ramdisk= [RAM] Sizes of RAM disks in kilobytes [deprecated] + See Documentation/ramdisk.txt. + + ramdisk_blocksize= [RAM] + See Documentation/ramdisk.txt. + + ramdisk_size= [RAM] Sizes of RAM disks in kilobytes + New name for the ramdisk parameter. + See Documentation/ramdisk.txt. + + rdinit= [KNL] + Format: + Run specified binary instead of /init from the ramdisk, + used for early userspace startup. See initrd. + + reboot= [BUGS=IA-32,BUGS=ARM,BUGS=IA-64] Rebooting mode + Format: [,[,...]] + See arch/*/kernel/reboot.c. + + reserve= [KNL,BUGS] Force the kernel to ignore some iomem area + + resume= [SWSUSP] + Specify the partition device for software suspend + + rhash_entries= [KNL,NET] + Set number of hash buckets for route cache + + riscom8= [HW,SERIAL] + Format: [,[,...]] + + ro [KNL] Mount root device read-only on boot + + root= [KNL] Root filesystem + + rootdelay= [KNL] Delay (in seconds) to pause before attempting to + mount the root filesystem + + rootflags= [KNL] Set root filesystem mount option string + + rootfstype= [KNL] Set root filesystem type + + rw [KNL] Mount root device read-write on boot + + S [KNL] Run init in single mode + + sa1100ir [NET] + See drivers/net/irda/sa1100_ir.c. + + sb= [HW,OSS] + Format: ,,, + + sbni= [NET] Granch SBNI12 leased line adapter + + sbpcd= [HW,CD] Soundblaster CD adapter + Format: , + See a comment before function sbpcd_setup() in + drivers/cdrom/sbpcd.c. + + sc1200wdt= [HW,WDT] SC1200 WDT (watchdog) driver + Format: [,[,]] + + scsi_debug_*= [SCSI] + See drivers/scsi/scsi_debug.c. + + scsi_default_dev_flags= + [SCSI] SCSI default device flags + Format: + + scsi_dev_flags= [SCSI] Black/white list entry for vendor and model + Format: :: + (flags are integer value) + + scsi_logging= [SCSI] + + selinux [SELINUX] Disable or enable SELinux at boot time. + Format: { "0" | "1" } + See security/selinux/Kconfig help text. + 0 -- disable. + 1 -- enable. + Default value is set via kernel config option. 
+ If enabled at boot time, /selinux/disable can be used + later to disable prior to initial policy load. + + serialnumber [BUGS=IA-32] + + sg_def_reserved_size= [SCSI] + + sgalaxy= [HW,OSS] + Format: ,,,, + + shapers= [NET] + Maximal number of shapers. + + sim710= [SCSI,HW] + See header of drivers/scsi/sim710.c. + + simeth= [IA-64] + simscsi= + + sjcd= [HW,CD] + Format: ,, + See header of drivers/cdrom/sjcd.c. + + slram= [HW,MTD] + + smart2= [HW] + Format: [,[,...,]] + + snd-ad1816a= [HW,ALSA] + + snd-ad1848= [HW,ALSA] + + snd-ali5451= [HW,ALSA] + + snd-als100= [HW,ALSA] + + snd-als4000= [HW,ALSA] + + snd-azt2320= [HW,ALSA] + + snd-cmi8330= [HW,ALSA] + + snd-cmipci= [HW,ALSA] + + snd-cs4231= [HW,ALSA] + + snd-cs4232= [HW,ALSA] + + snd-cs4236= [HW,ALSA] + + snd-cs4281= [HW,ALSA] + + snd-cs46xx= [HW,ALSA] + + snd-dt019x= [HW,ALSA] + + snd-dummy= [HW,ALSA] + + snd-emu10k1= [HW,ALSA] + + snd-ens1370= [HW,ALSA] + + snd-ens1371= [HW,ALSA] + + snd-es968= [HW,ALSA] + + snd-es1688= [HW,ALSA] + + snd-es18xx= [HW,ALSA] + + snd-es1938= [HW,ALSA] + + snd-es1968= [HW,ALSA] + + snd-fm801= [HW,ALSA] + + snd-gusclassic= [HW,ALSA] + + snd-gusextreme= [HW,ALSA] + + snd-gusmax= [HW,ALSA] + + snd-hdsp= [HW,ALSA] + + snd-ice1712= [HW,ALSA] + + snd-intel8x0= [HW,ALSA] + + snd-interwave= [HW,ALSA] + + snd-interwave-stb= + [HW,ALSA] + + snd-korg1212= [HW,ALSA] + + snd-maestro3= [HW,ALSA] + + snd-mpu401= [HW,ALSA] + + snd-mtpav= [HW,ALSA] + + snd-nm256= [HW,ALSA] + + snd-opl3sa2= [HW,ALSA] + + snd-opti92x-ad1848= + [HW,ALSA] + + snd-opti92x-cs4231= + [HW,ALSA] + + snd-opti93x= [HW,ALSA] + + snd-pmac= [HW,ALSA] + + snd-rme32= [HW,ALSA] + + snd-rme96= [HW,ALSA] + + snd-rme9652= [HW,ALSA] + + snd-sb8= [HW,ALSA] + + snd-sb16= [HW,ALSA] + + snd-sbawe= [HW,ALSA] + + snd-serial= [HW,ALSA] + + snd-sgalaxy= [HW,ALSA] + + snd-sonicvibes= [HW,ALSA] + + snd-sun-amd7930= + [HW,ALSA] + + snd-sun-cs4231= [HW,ALSA] + + snd-trident= [HW,ALSA] + + snd-usb-audio= [HW,ALSA,USB] + + snd-via82xx= [HW,ALSA] + + 
snd-virmidi= [HW,ALSA] + + snd-wavefront= [HW,ALSA] + + snd-ymfpci= [HW,ALSA] + + sonicvibes= [HW,OSS] + Format: + + sonycd535= [HW,CD] + Format: [,] + + sonypi.*= [HW] Sony Programmable I/O Control Device driver + See Documentation/sonypi.txt + + specialix= [HW,SERIAL] Specialix multi-serial port adapter + See Documentation/specialix.txt. + + spia_io_base= [HW,MTD] + spia_fio_base= + spia_pedr= + spia_peddr= + + sscape= [HW,OSS] + Format: ,,,, + + st= [HW,SCSI] SCSI tape parameters (buffers, etc.) + See Documentation/scsi/st.txt. + + st0x= [HW,SCSI] + See header of drivers/scsi/seagate.c. + + sti= [PARISC,HW] + Format: + Set the STI (builtin display/keyboard on the HP-PARISC + machines) console (graphic card) which should be used + as the initial boot-console. + See also comment in drivers/video/console/sticore.c. + + sti_font= [HW] + See comment in drivers/video/console/sticore.c. + + stifb= [HW] + Format: bpp:[:[:...]] + + swiotlb= [IA-64] Number of I/O TLB slabs + + switches= [HW,M68k] + + sym53c416= [HW,SCSI] + See header of drivers/scsi/sym53c416.c. + + t128= [HW,SCSI] + See header of drivers/scsi/t128.c. + + tdfx= [HW,DRM] + + thash_entries= [KNL,NET] + Set number of hash buckets for TCP connection + + time Show timing data prefixed to each printk message line + + tipar.timeout= [HW,PPT] + Set communications timeout in tenths of a second + (default 15). + + tipar.delay= [HW,PPT] + Set inter-bit delay in microseconds (default 10). + + tmc8xx= [HW,SCSI] + See header of drivers/scsi/seagate.c. + + tmscsim= [HW,SCSI] + See comment before function dc390_setup() in + drivers/scsi/tmscsim.c. + + tp720= [HW,PS2] + + trix= [HW,OSS] MediaTrix AudioTrix Pro + Format: + ,,,,,,,, + + tsdev.xres= [TS] Horizontal screen resolution. + tsdev.yres= [TS] Vertical screen resolution. 
+ + turbografx.map[2|3]= [HW,JOY] + TurboGraFX parallel port interface + Format: + ,,,,,,, + See also Documentation/input/joystick-parport.txt + + u14-34f= [HW,SCSI] UltraStor 14F/34F SCSI host adapter + See header of drivers/scsi/u14-34f.c. + + uart401= [HW,OSS] + Format: , + + uart6850= [HW,OSS] + Format: , + + usbhid.mousepoll= + [USBHID] The interval which mice are to be polled at. + + video= [FB] Frame buffer configuration + See Documentation/fb/modedb.txt. + + vga= [BOOT,IA-32] Select a particular video mode + See Documentation/i386/boot.txt and + Documentation/svga.txt. + Use vga=ask for menu. + This is actually a boot loader parameter; the value is + passed to the kernel using a special protocol. + + vmalloc=nn[KMG] [KNL,BOOT] Forces the vmalloc area to have an exact + size of . This can be used to increase the + minimum size (128MB on x86). It can also be used to + decrease the size and leave more room for directly + mapped kernel RAM. + + vmhalt= [KNL,S390] + + vmpoff= [KNL,S390] + + waveartist= [HW,OSS] + Format: ,,, + + wd33c93= [HW,SCSI] + See header of drivers/scsi/wd33c93.c. + + wd7000= [HW,SCSI] + See header of drivers/scsi/wd7000.c. + + wdt= [WDT] Watchdog + See Documentation/watchdog/watchdog.txt. + + xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks. + xd_geo= See header of drivers/block/xd.c. + + xirc2ps_cs= [NET,PCMCIA] + Format: + ,,,,,[,[,[,]]] + + norandmaps Don't use address space randomization + Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space + + +______________________________________________________________________ +Changelog: + +2000-06-?? Mr. Unknown + The last known update (for 2.4.0) - the changelog was not kept before. + +2002-11-24 Petr Baudis + Randy Dunlap + Update for 2.5.49, description for most of the options introduced, + references to other documentation (C files, READMEs, ..), added S390, + PPC, SPARC, MTD, ALSA and OSS category. Minor corrections and + reformatting. 
+ +2005-10-19 Randy Dunlap + Lots of typos, whitespace, some reformatting. + +TODO: + + Add documentation for ALSA options. + Add more DRM drivers. diff -urN oldtree/Documentation/power/internals.txt newtree/Documentation/power/internals.txt --- oldtree/Documentation/power/internals.txt 1970-01-01 00:00:00.000000000 +0000 +++ newtree/Documentation/power/internals.txt 2006-02-18 15:24:31.290832000 +0000 @@ -0,0 +1,360 @@ + Software Suspend 2.2 Internal Documentation. + Version 1 + +1. Introduction. + + Software Suspend 2.2 is an addition to the Linux Kernel, designed to + allow the user to quickly shut down and quickly boot a computer, without + needing to close documents or programs. It is equivalent to the + hibernate facility in some laptops. This implementation, however, + requires no special BIOS or hardware support. + + The code in these files is based upon the original implementation + prepared by Gabor Kuti and additional work by Pavel Machek and a + host of others. This code has been substantially reworked by Nigel + Cunningham, again with the help and testing of many others, not the + least of whom is Michael Frank. At its heart, however, the operation is + essentially the same as Gabor's version. + +2. Overview of operation. + + The basic sequence of operations is as follows: + + a. Quiesce all other activity. + b. Ensure enough memory and storage space are available, and attempt + to free memory/storage if necessary. + c. Allocate the required memory and storage space. + d. Write the image. + e. Power down. + + There are a number of complicating factors which mean that things are + not as simple as the above would imply, however... + + o The activity of each process must be stopped at a point where it will + not be holding locks necessary for saving the image, or unexpectedly + restart operations due to something like a timeout and thereby make + our image inconsistent.
+ + o It is desirable that we sync outstanding I/O to disk before calculating + image statistics. This reduces corruption if one should suspend but + then not resume, and also makes later parts of the operation safer (see + below). + + o We need to get as close as we can to an atomic copy of the data. + Inconsistencies in the image will result in inconsistent memory contents at + resume time, and thus in instability of the system and/or file system + corruption. This would appear to imply a maximum image size of one half of + the amount of RAM, but we have a solution... (again, below). + + o In 2.6, we must play nicely with the other suspend-to-disk + implementations. + +3. Detailed description of internals. + + a. Quiescing activity. + + Safely quiescing the system is achieved using two methods. + + First, we note that the vast majority of processes don't need to run during + suspend. They can be 'frozen'. We therefore implement a refrigerator + routine, which processes enter and in which they remain until the cycle is + complete. In the vanilla kernel, processes enter the refrigerator via + try_to_freeze() invocations at appropriate places. A process cannot be + frozen in any old place. It must not be holding locks that will be needed + for writing the image or freezing other processes. For this reason, + userspace processes generally enter the refrigerator via the signal handling + code, and kernel threads at the place in their event loops where they drop + locks and yield to other processes or sleep. + + In this revision of Suspend2, Christoph Lameter's todo list concept is + utilised to do the freezing. This means that we replace direct invocation of + the refrigerator function with a notifier list implementation, allowing + other applications of the hooks. + + The second part of our method for quiescing the system involves freezing + the filesystems.
We use the standard freeze_bdev and thaw_bdev functions to + ensure that all of the user's data is synced to disk before we begin to + write the image. + + Quiescing the system works most quickly and reliably when we add one more + element to the algorithm: separating the freezing of userspace processes + from the freezing of kernel space processes, and doing the filesystem freeze + in between. The filesystem freeze needs to be done while kernel threads such + as kjournald can still run. At the same time, though, everything will be less + racy and run more quickly if we stop userspace submitting more I/O work + while we're trying to quiesce. + + Quiescing the system is therefore done in three steps: + - Freeze userspace + - Freeze filesystems + - Freeze kernel threads + + If we need to free memory, we thaw kernel threads and filesystems, but not + userspace. We can then free caches without worrying about deadlocks due to + swap files being on frozen filesystems or such like. + + b. Ensure enough memory & storage are available. + + We have a number of constraints to meet to be able to successfully suspend + and resume. + + First, the image will be written in two parts, described below. One of these + parts needs to have an atomic copy made, which of course implies a maximum + size of one half of the amount of system memory. The other part ('pageset') + is not atomically copied, and can therefore be as large or small as desired. + + Second, we have constraints on the amount of storage available. In these + calculations, we may also consider any compression that will be done. The + cryptoapi plugin allows the user to configure an expected compression ratio. + + Third, the user can specify an arbitrary limit on the image size, in + megabytes. This limit is treated as a soft limit, so that we don't fail the + attempt to suspend if we cannot meet this constraint. + + c. Allocate the required memory and storage space.
+ + Having done the initial freeze, we determine whether the above constraints + are met, and seek to allocate the metadata for the image. If the constraints + are not met, or we fail to allocate the required space for the metadata, we + seek to free the amount of memory that we calculate is needed and try again. + We allow up to four iterations of this loop before aborting the cycle. If we + do fail, it should only be because of a bug in Suspend's calculations. + + These steps are merged together in the prepare_image function, found in + prepare_image.c. The functions are merged because of the cyclical nature + of the problem of calculating how much memory and storage is needed. Since + the data structures containing the information about the image must + themselves take memory and use storage, the amount of memory and storage + required changes as we prepare the image. Since the changes are not large, + only one or two iterations will be required to achieve a solution. + + d. Write the image. + + We previously mentioned the need to create an atomic copy of the data, and + the half-of-memory limitation that is implied in this. This limitation is + circumvented by dividing the memory to be saved into two parts, called + pagesets. + + Pageset2 contains the page cache - the pages on the active and inactive + lists. These pages are saved first and reloaded last. While saving these + pages, the swapwriter plugin carefully ensures that the work of writing + the pages doesn't make the image inconsistent. Pages added to the LRU + lists are immediately shot down, and careful accounting for available + memory aids debugging. No atomic copy of these pages needs to be made. + + Writing the image requires memory, of course, and at this point we have + also not yet suspended the drivers. To avoid the possibility of remaining + activity corrupting the image, we allocate a special memory pool. 
Calls + to __alloc_pages and __free_pages_ok are then diverted to use our memory + pool. Pages in the memory pool are saved as part of pageset1 regardless of + whether or not they are used. + + Once pageset2 has been saved, we suspend the drivers and save the CPU + context before making an atomic copy of pageset1, resuming the drivers + and saving the atomic copy. After saving the two pagesets, we just need to + save our metadata before powering down. + + Having saved pageset2 pages, we can safely overwrite their contents with + the atomic copy of pageset1. This is how we manage to overcome the half of + memory limitation. Pageset2 is normally far larger than pageset1, and + pageset1 is normally much smaller than half of the memory, with the result + that pageset2 pages can be safely overwritten with the atomic copy of + pageset1. This is where we need to be careful about syncing, however. + Pageset2 will probably contain filesystem meta data. If this is overwritten + with pageset1 and then a sync occurs, the filesystem will be corrupted - + at least until resume time and another sync of the restored data. Since + there is a possibility that the user might not resume or (may it never be!) + that suspend might oops, we do our utmost to avoid syncing filesystems after + copying pageset1. + + e. Power down. + + Powering down uses standard kernel routines. Prior to this, however, we + suspend drivers again, ensuring that write caches are flushed. + +4. The method of writing the image. + + Suspend2 contains an internal API which is designed to simplify the + implementation of new methods of transforming the image to be written and + writing the image itself. In early versions of Suspend2, compression support + was inlined in the image writing code, and the data structures and code for + managing swap were intertwined with the rest of the code. A number of people + had expressed interest in implementing image encryption, and alternative + methods of storing the image. 
This internal API makes that possible by + implementing 'plugins'. + + A plugin is a single file which encapsulates the functionality needed + to transform a pageset of data (encryption or compression, for example), + or to write the pageset to a device. The former type of plugin is called + a 'page-transformer', the latter a 'writer'. + + Plugins are linked together in pipeline fashion. There may be zero or more + page transformers in a pipeline, and there is always exactly one writer. + The pipeline follows this pattern: + + --------------------------------- + | Suspend2 Core | + --------------------------------- + | + | + --------------------------------- + | Page transformer 1 | + --------------------------------- + | + | + --------------------------------- + | Page transformer 2 | + --------------------------------- + | + | + --------------------------------- + | Writer | + --------------------------------- + + During the writing of an image, the core code feeds pages one at a time + to the first plugin. This plugin performs whatever transformations it + implements on the incoming data, completely consuming the incoming data and + feeding output in a similar manner to the next plugin. A plugin may buffer + its output. + + During reading, the pipeline works in the reverse direction. The core code + calls the first plugin with the address of a buffer which should be filled. + (Note that the buffer size is always PAGE_SIZE at this time). This plugin + will in turn request data from the next plugin and so on down until the + writer is made to read from the stored image.
+ + Part of the definition of the structure of a plugin thus looks like this: + + /* Writing the image proper */ + int (*write_init) (int stream_number); + int (*write_chunk) (char *buffer_start); + int (*write_cleanup) (void); + + /* Reading the image proper */ + int (*read_init) (int stream_number); + int (*read_chunk) (char *buffer_start, int sync); + int (*read_cleanup) (void); + + It should be noted that the _cleanup routines may be called before the + full stream of data has been read or written. While writing the image, + the user may (depending upon settings) choose to abort suspending, and + if we are in the midst of writing the last portion of the image, a portion + of the second pageset may be reread. + + In addition to the above routines for writing the data, all plugins have a + number of other routines: + + TYPE indicates whether the plugin is a page transformer or a writer. + #define TRANSFORMER_PLUGIN 1 + #define WRITER_PLUGIN 2 + + NAME is the name of the plugin, used in generic messages. + + PLUGIN_LIST is used to link the plugin into the list of all plugins. + + MEMORY_NEEDED returns the number of pages of memory required by the plugin + to do its work. + + STORAGE_NEEDED returns the number of pages in the suspend header required + to store the plugin's configuration data. + + PRINT_DEBUG_INFO fills a buffer with information to be displayed about the + operation or settings of the plugin. + + SAVE_CONFIG_INFO returns a buffer of PAGE_SIZE or smaller (the size is the + return code), containing the plugin's configuration info. This information + will be written in the image header and restored at resume time. Since this + buffer is allocated after the atomic copy of the kernel is made, you don't + need to worry about the buffer being freed. + + LOAD_CONFIG_INFO gives the plugin a pointer to the configuration info + which was saved during suspending. Once again, the plugin doesn't need to + worry about freeing the buffer.
The kernel will be overwritten with the + original kernel, so no memory leak will occur. + + OPS contains the operations specific to transformers and writers. These are + described below. + + The complete definition of struct suspend_plugin_ops is: + + struct suspend_plugin_ops { + /* Functions common to transformers and writers */ + int type; + char *name; + struct list_head plugin_list; + unsigned long (*memory_needed) (void); + unsigned long (*storage_needed) (void); + int (*print_debug_info) (char *buffer, int size); + int (*save_config_info) (char *buffer); + void (*load_config_info) (char *buffer, int len); + + /* Writing the image proper */ + int (*write_init) (int stream_number); + int (*write_chunk) (char *buffer_start); + int (*write_cleanup) (void); + + /* Reading the image proper */ + int (*read_init) (int stream_number); + int (*read_chunk) (char *buffer_start, int sync); + int (*read_cleanup) (void); + + union { + struct suspend_transformer_ops transformer; + struct suspend_writer_ops writer; + } ops; + }; + + + The operations specific to transformers are few in number: + + struct suspend_transformer_ops { + int (*expected_compression) (void); + struct list_head transformer_list; + }; + + Expected compression returns the expected ratio between the amount of + data sent to this plugin and the amount of data it passes to the next + plugin. The value is used by the core code to calculate the amount of + space required to write the image. If the ratio is not achieved, the + writer will complain when it runs out of space with data still to + write, and the core code will abort the suspend. + + transformer_list links together page transformers, in the order in + which they register, which is in turn determined by order in the + Makefile. 
+ + There are many more operations specific to a writer: + + struct suspend_writer_ops { + + long (*storage_available) (void); + + unsigned long (*storage_allocated) (void); + + int (*release_storage) (void); + + long (*allocate_header_space) (unsigned long space_requested); + int (*allocate_storage) (unsigned long space_requested); + + int (*write_header_init) (void); + int (*write_header_chunk) (char *buffer_start, int buffer_size); + int (*write_header_cleanup) (void); + + int (*read_header_init) (void); + int (*read_header_chunk) (char *buffer_start, int buffer_size); + int (*read_header_cleanup) (void); + + int (*prepare_save) (void); + int (*post_load) (void); + + int (*parse_image_location) (char *buffer); + + int (*image_exists) (void); + + int (*invalidate_image) (void); + + int (*wait_on_io) (int flush_all); + + struct list_head writer_list; + }; + diff -urN oldtree/Documentation/power/kernel_threads.txt newtree/Documentation/power/kernel_threads.txt --- oldtree/Documentation/power/kernel_threads.txt 2006-01-03 03:21:10.000000000 +0000 +++ newtree/Documentation/power/kernel_threads.txt 2006-02-18 15:24:31.291831848 +0000 @@ -4,15 +4,15 @@ Freezer Upon entering a suspended state the system will freeze all -tasks. This is done by delivering pseudosignals. This affects -kernel threads, too. To successfully freeze a kernel thread -the thread has to check for the pseudosignal and enter the -refrigerator. Code to do this looks like this: +tasks. This is done by making all processes execute a notifier. +This affects kernel threads, too. To successfully freeze a kernel thread +the thread has to check for the notifications and call the notifier +chain for the process. 
Code to do this looks like this: do { hub_events(); wait_event_interruptible(khubd_wait, !list_empty(&hub_event_list)); - try_to_freeze(); + try_todo_list(); } while (!signal_pending(current)); from drivers/usb/core/hub.c::hub_thread() diff -urN oldtree/Documentation/power/suspend2.txt newtree/Documentation/power/suspend2.txt --- oldtree/Documentation/power/suspend2.txt 1970-01-01 00:00:00.000000000 +0000 +++ newtree/Documentation/power/suspend2.txt 2006-02-18 15:24:31.293831544 +0000 @@ -0,0 +1,631 @@ + --- Suspend2, version 2.1.9 --- + +1. What is it? +2. Why would you want it? +3. What do you need to use it? +4. How do you use it? +5. What do all those entries in /proc/suspend2 do? +6. How do you get support? +7. I think I've found a bug. What should I do? +8. When will XXX be supported? +9. How does it work? +10. Who wrote Suspend2? + +1. What is it? + + Imagine you're sitting at your computer, working away. For some reason, you + need to turn off your computer for a while - perhaps it's time to go home + for the day. When you come back to your computer next, you're going to want + to carry on where you left off. Now imagine that you could push a button and + have your computer store the contents of its memory to disk and power down. + Then, when you next start up your computer, it loads that image back into + memory and you can carry on from where you were, just as if you'd never + turned the computer off. Far less time to start up, no reopening + applications and finding what directory you put that file in yesterday. + That's what Suspend2 does. + +2. Why would you want it? + + Why wouldn't you want it? + + Being able to save the state of your system and quickly restore it improves + your productivity - you get a useful system in far less time than through + the normal boot process. + +3. What do you need to use it? + + a. Kernel Support. + + i) The Suspend2 patch. + + Suspend2 is part of the Linux Kernel. 
This version is not part of Linus's + 2.6 tree at the moment, so you will need to download the kernel source and + apply the latest patch. Having done that, enable the appropriate options in + make [menu|x]config (under General Setup), compile and install your kernel. + Suspend2 works with SMP, Highmem, preemption, x86-32, PPC and mac. + x86-64 support is coming. + + Suspend2 patches are available from http://suspend2.net. + + ii) Compression and encryption support. + + As of 2.1.9.2, compression and encryption support are implemented via the + cryptoapi. You will therefore want to select any Cryptoapi transforms that + you want to use on your image from the Cryptoapi menu while configuring + your kernel. + + You can also tell Suspend to write its image to an encrypted and/or + compressed filesystem/swap partition. In that case, you don't need to do + anything special for Suspend2 when it comes to kernel configuration. + + iii) Configuring other options. + + While you're configuring your kernel, try to configure as much as possible + to build as modules. We recommend this because there are a number of drivers + that are still in the process of implementing proper power management + support. In those cases, the best way to work around their current lack is + to build them as modules and remove the modules while suspending. You might + also bug the driver authors to get their support up to speed, or even help! + + b. Storage. + + i) Swap. + + Suspend2 can store the suspend image in your swap partition, a swap file or + a combination thereof. Whichever combination you choose, you will probably + want to create enough swap space to store the largest image you could have, + plus the space you'd normally use for swap. A good rule of thumb would be + to calculate the amount of swap you'd want without using Suspend2, and then + add the amount of memory you have. This swapspace can be arranged in any way + you'd like.
It can be in one partition or file, or spread over a number. The + only requirement is that they be active when you start a suspend cycle. + + There is one exception to this requirement. Suspend2 has the ability to turn + on one swap file or partition at the start of suspending and turn it back off + at the end. If you want to ensure you have enough memory to store an image + when your memory is fully used, you might want to make one swap partition or + file for 'normal' use, and another for Suspend2 to activate & deactivate + automatically. (Further details below). + + ii) Normal files. + + As of 2.1.8.5, Suspend2 includes a 'filewriter'. The filewriter can store + your image in a simple file. Since Linux has the idea of everything being + a file, this is more powerful than it initially sounds. If, for example, + you were to set up a network block device file, you could suspend to a + network server. This has been tested and works to a point, but nbd itself + isn't stateless enough for our purposes. + + Take extra care when setting up the filewriter. If you just type commands + without thinking and then try to suspend, you could cause irreversible + corruption on your filesystems! Make sure you have backups. Also, because + the filewriter is comparatively new, it's not as well tested as the + swapwriter. Be aware that there may be bugs that could cause damage to your + data even if you are careful! You have been warned! + + Most people will only want to suspend to a local file. To achieve that, do + something along the lines of: + + echo Suspend2 > /suspend-file + dd if=/dev/zero bs=1M count=512 >> /suspend-file + + This will create a 512MB file called /suspend-file.
To get Suspend2 to use + it: + + echo /suspend-file > /proc/suspend2/filewriter_target + + Then + + cat /proc/suspend2/resume2 + + Put the results of this into your bootloader's configuration (see also step + C, below): + + ---EXAMPLE-ONLY-DON'T-COPY-AND-PASTE--- + # cat /proc/suspend2/resume2 + file:/dev/hda2:0x1e001 + + In this example, we would edit the append= line of our lilo.conf|menu.lst + so that it included: + + resume2=file:/dev/hda2:0x1e001 + ---EXAMPLE-ONLY-DON'T-COPY-AND-PASTE--- + + For those who are thinking 'Could I make the file sparse?', the answer is + 'No!'. At the moment, there is no way for Suspend2 to fill in the holes in + a sparse file while suspending. In the longer term (post merge!), I'd like + to change things so that the file could be dynamically resized as needed. + Right now, however, that's not possible. + + c. Bootloader configuration. + + Using Suspend2 also requires that you add an extra parameter to + your lilo.conf or equivalent. Here's an example for a swap partition: + + append="resume2=swap:/dev/hda1" + + This would tell Suspend2 that /dev/hda1 is a swap partition you + have. Suspend2 will use the swap signature of this partition as a + pointer to your data when you suspend. This means that (in this example) + /dev/hda1 doesn't need to be _the_ swap partition where all of your data + is actually stored. It just needs to be a swap partition that has a + valid signature. + + You don't need to have a swap partition for this purpose. Suspend2 + can also use a swap file, but usage is a little more complex. Having made + your swap file, turn it on and do + + cat /proc/suspend2/headerlocations + + (this assumes you've already compiled your kernel with Suspend2 + support and booted it). The results of the cat command will tell you + what you need to put in lilo.conf: + + For swap partitions like /dev/hda1, simply use resume2=/dev/hda1. + For swapfile `swapfile`, use resume2=swap:/dev/hda2:0x242d@4096.
+ + If the swapfile changes for any reason (it is moved to a different + location, it is deleted and recreated, or the filesystem is + defragmented) then you will have to check + /proc/suspend2/headerlocations for a new resume_block value. + + Once you've compiled and installed the kernel, adjusted your lilo.conf + and rerun lilo, you should only need to reboot for the most basic part + of Suspend2 to be ready. + + If you only compile in the swapwriter, or only compile in the filewriter, + you don't need to add the "swap:" part of the resume2= parameters above. + resume2=/dev/hda2:0x242d@4096 will work just as well. + + d. The hibernate script. + + Since the driver model in 2.6 kernels is still being developed, you may need + to do more, however. Users of Suspend2 usually start the process via a script + which prepares for the suspend, tells the kernel to do its stuff and then + restores things afterwards. This script might involve: + + - Switching to a text console and back if X doesn't like the video card + status on resume. + - Un/reloading PCMCIA support since it doesn't play well with suspend. + + Note that you might not be able to unload some drivers if there are + processes using them. You might have to kill off processes that hold + devices open. Hint: if your X server accesses a USB mouse, doing a + 'chvt' to a text console releases the device and you can unload the + module. + + Check out the latest script (available on suspend2.net). + +4. How do you use it? + + Once your script is properly set up, you should just be able to start it + and everything should go like clockwork. Of course things aren't always + that easy out of the box. + + Check out (in the kernel source tree) include/linux/suspend2.h for + settings you can use to get detailed information about what suspend is doing.
+ The kernel parameters suspend_act, suspend_dbg and suspend_lvl allow you to + set the action and debugging parameters prior to starting a suspend and/or + at the lilo prompt before resuming. There is also a nice little program that + should be available from suspend2.net which makes it easier to turn these + debugging settings on and off. Note that to get any debugging output, you + need to enable CONFIG_PM_DEBUG when compiling the kernel. + + A neat feature of Suspend2 is that you can press Escape at any time + during suspending, and the process will be aborted. + + Due to the way suspend works, this means you'll have your system back and + perfectly usable almost instantly. The only exception is when it's at + the very end of writing the image. Then it will need to reload a small + (usually 4-50MBs, depending upon the image characteristics) portion first. + + If you run into problems with resuming, adding the "noresume2" option to + the kernel command line will let you skip the resume step and recover your + system. + +5. What do all those entries in /proc/suspend2 do? + + /proc/suspend2 is the directory which contains files you can use to + tune and configure Suspend2 to your liking. The exact contents of + the directory will depend upon the version of Suspend2 you're + running and the options you selected at compile time. In the following + descriptions, names in brackets refer to compile time options. + (Note that they're all dependent upon you having selected CONFIG_SUSPEND2 + in the first place!) + + Since the values of these settings can open potential security risks, they + are usually accessible only to the root user. You can, however, enable a + compile time option which makes all of these files world-accessible. This + should only be done if you trust everyone with shell access to this + computer! + + - all_settings: + + This file provides a convenient way to save and restore all of the other + settings in one hit.
The contents include binary data, so you'll want to + redirect the output to a file: + + cat /proc/suspend2/all_settings > /etc/hibernate/all_settings.conf + + cat /etc/hibernate/all_settings.conf > /proc/suspend2/all_settings + + - debug_info: + + This file returns information about your configuration that may be helpful + in diagnosing problems with suspending. + + - debug_sections (CONFIG_PM_DEBUG): + + This value, together with the console log level, controls what debugging + information is displayed. The console log level determines the level of + detail, and this value determines what detail is displayed. This value is + a bit vector, and the meaning of the bits can be found in the kernel tree + in include/linux/suspend2.h. It can be overridden using the kernel's + command line option suspend_dbg. + + - default_console_level (CONFIG_PM_DEBUG): + + This determines the value of the console log level at the start of a + suspend cycle. If debugging is compiled in, the console log level can be + changed during a cycle by pressing the digit keys. Meanings are: + + 0: Nice display. + 1: Nice display plus numerical progress. + 2: Errors only. + 3: Low level debugging info. + 4: Medium level debugging info. + 5: High level debugging info. + 6: Verbose debugging info. + + This value can be overridden using the kernel command line option + suspend_lvl. + + - disable_* + + This option can be used to temporarily disable various parts of suspend. + Note that these flags can be set by restoring all_settings: If the saved + settings don't include any information about how a part of suspend should + be configured, that section will be disabled. + + - do_resume: + + When anything is written to this file suspend will attempt to read and + restore an image. If there is no image, it will return almost immediately. + If an image exists, the echo > will never return. Instead, the original + kernel context will be restored and the original echo > do_suspend will + return. 
+ + - do_suspend: + + When anything is written to this file, the kernel side of Suspend2 will + begin to attempt to write an image to disk and power down. You'll normally + want to run the hibernate script instead, to get modules unloaded first. + + - enable_escape: + + Setting this to "1" will enable you to abort a suspend by + pressing escape, "0" (default) disables this feature. Note that enabling + this option means that you cannot initiate a suspend and then walk away + from your computer, expecting it to be secure. With this feature disabled, + you can validly have this expectation once Suspend begins to write the + image to disk. (Prior to this point, it is possible that Suspend might + abort because of failure to freeze all processes or because constraints + on its ability to save the image are not met). + + - expected_compression: + + These values allow you to set an expected compression ratio, which Software + Suspend will use in calculating whether it meets constraints on the image + size. If this expected compression ratio is not attained, the suspend will + abort, so it is wise to allow some spare. You can see what compression + ratio is achieved in the logs after suspending. + + - filewriter_target: + + Read this value to get the current setting. Write to it to point Suspend + at a new storage location for the filewriter. See above for details of how + to set up the filewriter. + + - headerlocations: + + This option tells you the resume2= options to use for swap devices you + currently have activated. It is particularly useful when you only want to + use a swap file to store your image. See above for further details. + + - image_exists: + + Can be used in a script to determine whether a valid image exists at the + location currently pointed to by resume2=. Echoing anything to this entry + removes any current image. + + - image_size_limit: + + The maximum size of suspend image written to disk, measured in megabytes + (1024*1024).
+ + - interface_version: + + The value returned by this file can be used by scripts and configuration + tools to determine what entries should be looked for. The value is + incremented whenever an entry in /proc/suspend2 is obsoleted or + added. + + - last_result: + + The result of the last suspend, as defined in + include/linux/suspend-debug.h with the values SUSPEND_ABORTED to + SUSPEND_KEPT_IMAGE. This is a bitmask. + + - log_everything (CONFIG_PM_DEBUG): + + Setting this option results in all messages printed being logged. Normally, + only a subset are logged, so as to not slow the process and not clutter the + logs. Useful for debugging. It can be toggled during a cycle by pressing + 'L'. + + - pause_between_steps (CONFIG_PM_DEBUG): + + This option is used during debugging, to make Suspend2 pause between + each step of the process. It is ignored when the nice display is on. + + - powerdown_method: + + Used to select a method by which Suspend2 should powerdown after writing the + image. Currently: + + 3: Attempt to enter Suspend-to-ram. + 4: Attempt to enter ACPI S4 mode. + 5: Normal power down. + + Note that these options are highly dependent upon your hardware & software. + + - progressbar_granularity_limit: + + This option can be used to limit the granularity of the progress bar + displayed with a bootsplash screen. The value is the maximum number of + steps. That is, 10 will make the progress bar jump in 10% increments. + + - reboot: + + This option causes Suspend2 to reboot rather than powering down + at the end of saving an image. It can be toggled during a cycle by pressing + 'R'. + + - resume_commandline: + + This entry can be read after resuming to see the commandline that was used + when resuming began. You might use this to set up two bootloader entries + that are the same apart from the fact that one includes an extra append= + argument "at_work=1".
You could then grep resume_commandline in your + post-resume scripts and configure networking (for example) differently + depending upon whether you're at home or work. resume_commandline can be + set to arbitrary text if you wish to remove sensitive contents. + + - swapfile: + + This entry is used to specify the swapfile or partition that + Suspend2 will attempt to swapon/swapoff automatically. Thus, if + I normally use /dev/hda1 for swap, and want to use /dev/hda2 specifically + for my suspend image, I would + + echo /dev/hda2 > /proc/suspend2/swapfile + + /dev/hda2 would then be automatically swapon'd and swapoff'd. Note that the + swapon and swapoff occur while other processes are frozen (including kswapd) + so this swap file will not be used up when attempting to free memory. The + partition/file is also given the highest priority, so other swapfiles/partitions + will only be used to save the image when this one is filled. + + The value of this file is used by headerlocations along with any currently + activated swapfiles/partitions. + + - toggle_process_nofreeze + + This entry can be used to toggle the NOFREEZE flag on a process, to allow it + to run during Suspending. It should be used with extreme caution. There are + strict limitations on what a process running during suspend can do. This is + really only intended for use by Suspend's helpers (userui in particular). + + - userui_program + + This entry is used to tell Suspend what userspace program to use for + providing a user interface while suspending. The program uses a netlink + socket to pass messages back and forward to the kernel, allowing all of the + functions formerly implemented in the kernel user interface components. + + - version: + + The version of suspend you have compiled into the currently running kernel. + +6. How do you get support? + + Glad you asked.
Suspend2 is being actively maintained and supported + by Nigel (the guy doing most of the kernel coding at the moment), Bernard + (who maintains the hibernate script and userspace user interface components) + and its users. + + Resources available include HowTos, FAQs and a Wiki, all available via + suspend2.net. You can find the mailing lists there. + +7. I think I've found a bug. What should I do? + + By far and away, the most common problems people have with suspend2 + are related to drivers not having adequate power management support. In this + case, it is not a bug with suspend2, but we can still help you. As we + mentioned above, such issues can usually be worked around by building the + functionality as modules and unloading them while suspending. Please visit + the Wiki for up-to-date lists of known issues and work arounds. + + If this information doesn't help, try running: + + hibernate --bug-report + + ..and sending the output to the users mailing list. + + Good information on how to provide us with useful information from an + oops is found in the file REPORTING-BUGS, in the top level directory + of the kernel tree. If you get an oops, please especially note the + information about running what is printed on the screen through ksymoops. + The raw information is useless. + +8. When will XXX be supported? + + Suspend2 currently lacks support for x86-64. It is work in progress, but + hasn't been made a great priority because debugging is difficult (Nigel + doesn't have access to the hardware). 64GB Highmem and discontig-mem are + also not supported at the moment. + + Patches for the other items (and anything that's been missed) are welcome. + Please send to the list. + +9. How does it work? + + Suspend2 does its work in a number of steps. + + a. Freezing system activity. + + The first main stage in suspending is to stop all other activity. This is + achieved in stages.
Processes are considered in four groups, which we will + describe in reverse order for clarity's sake: Threads with the PF_NOFREEZE + flag, kernel threads without this flag, userspace processes with the + PF_SYNCTHREAD flag and all other processes. The first set (PF_NOFREEZE) are + untouched by the refrigerator code. They are allowed to run during suspending + and resuming, and are used to support user interaction, storage access or the + like. Other kernel threads (those unneeded while suspending) are frozen last. + This leaves us with userspace processes that need to be frozen. When a + process enters one of the *_sync system calls, we set a PF_SYNCTHREAD flag on + that process for the duration of that call. Processes that have this flag are + frozen after processes without it, so that we can seek to ensure that dirty + data is synced to disk as quickly as possible in a situation where other + processes may be submitting writes at the same time. Freezing the processes + that are submitting data stops new I/O from being submitted. Syncthreads can + then cleanly finish their work. So the order is: + + - Userspace processes without PF_SYNCTHREAD or PF_NOFREEZE; + - Userspace processes with PF_SYNCTHREAD (they won't have NOFREEZE); + - Kernel processes without PF_NOFREEZE. + + b. Eating memory. + + For a successful suspend, you need to have enough disk space to store the + image and enough memory for the various limitations of Suspend2's + algorithm. You can also specify a maximum image size. In order to attain + to those constraints, Suspend2 may 'eat' memory. If, after freezing + processes, the constraints aren't met, Suspend2 will thaw all the + other processes and begin to eat memory until its calculations indicate + the constraints are met. It will then freeze processes again and recheck + its calculations. + + c. Allocation of storage. + + Next, Suspend2 allocates the storage that will be used to save + the image.
+ + The core of Suspend2 knows nothing about how or where pages are stored. We + therefore request the active writer (remember you might have compiled in + more than one!) to allocate enough storage for our expected image size. If + this request cannot be fulfilled, we eat more memory and try again. If it + is fulfilled, we seek to allocate additional storage, just in case our + expected compression ratio (if any) isn't achieved. This time, however, we + just continue if we can't allocate enough storage. + + If these calls to our writer change the characteristics of the image such + that we haven't allocated enough memory, we also loop. (The writer may well + need to allocate space for its storage information). + + d. Write the first part of the image. + + Suspend2 stores the image in two sets of pages called 'pagesets'. + Pageset 2 contains pages on the active and inactive lists; essentially + the page cache. Pageset 1 contains all other pages, including the kernel. + We use two pagesets for one important reason: We need to make an atomic copy + of the kernel to ensure consistency of the image. Without a second pageset, + that would limit us to an image that was at most half the amount of memory + available. Using two pagesets allows us to store a full image. Since pageset + 2 pages won't be needed in saving pageset 1, we first save pageset 2 pages. + We can then make our atomic copy of the remaining pages using both pageset 2 + pages and any other pages that are free. While saving both pagesets, we are + careful not to corrupt the image. Among other things, we use lowlevel block + I/O routines that don't change the pagecache contents. + + The next step, then, is writing pageset 2. + + e. Suspending drivers and storing processor context. + + Having written pageset2, Suspend2 calls the power management functions to + notify drivers of the suspend, and saves the processor state in preparation + for the atomic copy of memory we are about to make. + + f. Atomic copy.
+ + At this stage, everything else but the Suspend2 code is halted. Processes + are frozen or idling, drivers are quiesced and have stored (ideally and where + necessary) their configuration in memory we are about to atomically copy. + In our lowlevel architecture specific code, we have saved the CPU state. + We can therefore now do our atomic copy before resuming drivers etc. + + g. Save the atomic copy (pageset 1). + + Suspend can then write the atomic copy of the remaining pages. Since we + have copied the pages into other locations, we can continue to use the + normal block I/O routines without fear of corrupting our image. + + h. Save the suspend header. + + Nearly there! We save our settings and other parameters needed for + reloading pageset 1 in a 'suspend header'. We also tell our writer to + serialise its data at this stage, so that it can reread the image at resume + time. Note that the writer can write this data in any format - in the case + of the swapwriter, for example, it splits header pages in 4092 byte blocks, + using the last four bytes to link pages of data together. This is completely + transparent to the core. + + i. Set the image header. + + Finally, we edit the header at our resume2= location. The signature is + changed by the writer to reflect the fact that an image exists, and to point + to the start of that data if necessary (swapwriter). + + j. Power down. + + Or reboot if we're debugging and the appropriate option is selected. + + Whew! + + Reloading the image. + -------------------- + + Reloading the image is essentially the reverse of all the above. We load + our copy of pageset 1, being careful to choose locations that aren't going + to be overwritten as we copy it back (We start very early in the boot + process, so there are no other processes to quiesce here). We then copy + pageset 1 back to its original location in memory and restore the process + context. We are now running with the original kernel.
Next, we reload the + pageset 2 pages, free the memory and swap used by Suspend2, restore + the pageset header and restart processes. Sounds easy in comparison to + suspending, doesn't it! + + There is of course more to Suspend2 than this, but this explanation + should be a good start. If there's interest, I'll write further + documentation on range pages and the low level I/O. + +10. Who wrote Suspend2? + + (Answer based on the writings of Florent Chabaud, credits in files and + Nigel's limited knowledge; apologies to anyone missed out!) + + The main developers of Suspend2 have been... + + Gabor Kuti + Pavel Machek + Florent Chabaud + Bernard Blackham + Nigel Cunningham + + They have been aided in their efforts by a host of hundreds, if not thousands + of testers and people who have submitted bug fixes & suggestions. Of special + note are the efforts of Michael Frank, who had his computers repetitively + suspend and resume for literally tens of thousands of cycles and developed + scripts to stress the system and test Suspend2 far beyond the point + most of us (Nigel included!) would consider testing. His efforts have + contributed as much to Suspend2 as any of the names above. diff -urN oldtree/Documentation/power/swsusp.txt newtree/Documentation/power/swsusp.txt --- oldtree/Documentation/power/swsusp.txt 2006-02-18 15:18:20.508199528 +0000 +++ newtree/Documentation/power/swsusp.txt 2006-02-18 15:24:31.294831392 +0000 @@ -135,7 +135,8 @@ website, and not to the Linux Kernel Mailing List. We are working toward merging suspend2 into the mainline kernel. -Q: A kernel thread must voluntarily freeze itself (call 'refrigerator'). +Q: A kernel thread must work on the todo list (call 'run_todo_list') +to enter the refrigerator. I found some kernel threads that don't do it, and they don't freeze so the system can't sleep. Is this a known behavior? 
@@ -144,7 +145,7 @@ should be held at that point and it must be safe to sleep there), and add: - try_to_freeze(); + try_todo_list(); If the thread is needed for writing the image to storage, you should instead set the PF_NOFREEZE process flag when creating the thread (and diff -urN oldtree/arch/arm/mm/init.c newtree/arch/arm/mm/init.c --- oldtree/arch/arm/mm/init.c 2006-01-03 03:21:10.000000000 +0000 +++ newtree/arch/arm/mm/init.c 2006-02-18 15:24:31.295831240 +0000 @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -86,6 +87,11 @@ printk("%d pages swap cached\n", cached); } +int page_is_ram(int pfn) +{ + return pfn_valid(pfn); +} + static inline pmd_t *pmd_off(pgd_t *pgd, unsigned long virt) { return pmd_offset(pgd, virt); @@ -660,6 +666,15 @@ */ sysctl_overcommit_memory = OVERCOMMIT_ALWAYS; } +#ifdef CONFIG_SUSPEND2 + { + unsigned long addr; + for (addr = &__nosave_begin; addr < &__nosave_end; + addr += PAGE_SIZE) { + SetPageNosave(virt_to_page(addr)); + } + } +#endif } void free_initmem(void) diff -urN oldtree/arch/i386/kernel/time.c newtree/arch/i386/kernel/time.c --- oldtree/arch/i386/kernel/time.c 2006-02-18 15:18:21.711016672 +0000 +++ newtree/arch/i386/kernel/time.c 2006-02-18 15:24:31.296831088 +0000 @@ -378,7 +378,8 @@ mod_timer(&sync_cmos_timer, jiffies + 1); } -static long clock_cmos_diff, sleep_start; +static long clock_cmos_diff; +static unsigned long sleep_start; static struct timer_opts *last_timer; static int timer_suspend(struct sys_device *dev, pm_message_t state) @@ -386,9 +387,11 @@ /* * Estimate time zone so that set_time can update the clock */ - clock_cmos_diff = -get_cmos_time(); + long cmos_time = __get_cmos_time(); + + clock_cmos_diff = -cmos_time; clock_cmos_diff += get_seconds(); - sleep_start = get_cmos_time(); + sleep_start = cmos_time; last_timer = cur_timer; cur_timer = &timer_none; if (last_timer->suspend) @@ -401,14 +404,16 @@ unsigned long flags; unsigned long sec; unsigned long sleep_length; + unsigned 
long cmos_time; #ifdef CONFIG_HPET_TIMER if (is_hpet_enabled()) hpet_reenable(); #endif + cmos_time = get_cmos_time(); + sec = cmos_time + clock_cmos_diff; + sleep_length = (cmos_time - sleep_start) * HZ; setup_pit_timer(); - sec = get_cmos_time() + clock_cmos_diff; - sleep_length = (get_cmos_time() - sleep_start) * HZ; write_seqlock_irqsave(&xtime_lock, flags); xtime.tv_sec = sec; xtime.tv_nsec = 0; diff -urN oldtree/arch/i386/mm/init.c newtree/arch/i386/mm/init.c --- oldtree/arch/i386/mm/init.c 2006-02-18 15:18:21.723014848 +0000 +++ newtree/arch/i386/mm/init.c 2006-02-18 15:24:31.297830936 +0000 @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -48,6 +49,7 @@ unsigned long highstart_pfn, highend_pfn; static int noinline do_test_wp_bit(void); +int bad_ppro; /* * Creates a middle page table and puts a pointer to it in the @@ -279,9 +281,12 @@ { if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); + ClearPageNosave(page); free_new_highpage(page); - } else + } else { SetPageReserved(page); + SetPageNosave(page); + } } static int add_one_highpage_hotplug(struct page *page, unsigned long pfn) @@ -384,7 +389,7 @@ #endif } -#ifdef CONFIG_SOFTWARE_SUSPEND +#ifdef CONFIG_PM /* * Swap suspend & friends need this for resume because things like the intel-agp * driver might have split up a kernel 4MB mapping. 
@@ -570,7 +575,7 @@ extern int ppro_with_ram_bug(void); int codesize, reservedpages, datasize, initsize; int tmp; - int bad_ppro; + struct page *tmp_page; #ifdef CONFIG_FLATMEM if (!mem_map) @@ -601,12 +606,23 @@ totalram_pages += free_all_bootmem(); reservedpages = 0; - for (tmp = 0; tmp < max_low_pfn; tmp++) - /* - * Only count reserved RAM pages - */ - if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) - reservedpages++; + for (tmp = 0; tmp < max_low_pfn; tmp++) { + if (page_is_ram(tmp)) { + /* + * Only count reserved RAM pages + */ + if (PageReserved(pfn_to_page(tmp))) + reservedpages++; + } else + /* + * Non-RAM pages are always nosave + */ + SetPageNosave(pfn_to_page(tmp)); + } + + for (tmp_page = virt_to_page(&__nosave_begin); + tmp_page < virt_to_page(&__nosave_end); tmp_page++) + SetPageNosave(tmp_page); set_highmem_pages_init(bad_ppro); @@ -727,6 +743,7 @@ addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); + ClearPageNosave(virt_to_page(addr)); set_page_count(virt_to_page(addr), 1); memset((void *)addr, 0xcc, PAGE_SIZE); free_page(addr); @@ -766,6 +783,7 @@ printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); + ClearPageNosave(virt_to_page(start)); set_page_count(virt_to_page(start), 1); free_page(start); totalram_pages++; diff -urN oldtree/arch/ppc/mm/init.c newtree/arch/ppc/mm/init.c --- oldtree/arch/ppc/mm/init.c 2006-02-18 15:18:23.343768456 +0000 +++ newtree/arch/ppc/mm/init.c 2006-02-18 15:24:31.299830632 +0000 @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -140,6 +141,7 @@ while (start < end) { ClearPageReserved(virt_to_page(start)); + ClearPageNosave(virt_to_page(start)); set_page_count(virt_to_page(start), 1); free_page(start); cnt++; @@ -172,6 +174,7 @@ for (; start < end; start += PAGE_SIZE) { 
ClearPageReserved(virt_to_page(start)); + ClearPageNosave(virt_to_page(start)); set_page_count(virt_to_page(start), 1); free_page(start); totalram_pages++; @@ -407,8 +410,10 @@ /* if we are booted from BootX with an initial ramdisk, make sure the ramdisk pages aren't reserved. */ if (initrd_start) { - for (addr = initrd_start; addr < initrd_end; addr += PAGE_SIZE) + for (addr = initrd_start; addr < initrd_end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); + ClearPageNosave(virt_to_page(addr)); + } } #endif /* CONFIG_BLK_DEV_INITRD */ @@ -417,13 +422,21 @@ if ( rtas_data ) for (addr = (ulong)__va(rtas_data); addr < PAGE_ALIGN((ulong)__va(rtas_data)+rtas_size) ; - addr += PAGE_SIZE) + addr += PAGE_SIZE) { SetPageReserved(virt_to_page(addr)); + SetPageNosave(virt_to_page(addr)); + } #endif for (addr = PAGE_OFFSET; addr < (unsigned long)high_memory; addr += PAGE_SIZE) { if (!PageReserved(virt_to_page(addr))) continue; + /* + * Mark reserved pages that lie inside the nosave section + */ + if (addr >= (unsigned long)&__nosave_begin && addr < (unsigned long)&__nosave_end) + SetPageNosave(virt_to_page(addr)); + if (addr < (ulong) etext) codepages++; else if (addr >= (unsigned long)&__init_begin @@ -441,6 +454,7 @@ struct page *page = mem_map + pfn; ClearPageReserved(page); + ClearPageNosave(page); set_page_count(page, 1); __free_page(page); totalhigh_pages++; diff -urN oldtree/arch/x86_64/kernel/e820.c newtree/arch/x86_64/kernel/e820.c --- oldtree/arch/x86_64/kernel/e820.c 2006-02-18 15:18:23.852691088 +0000 +++ newtree/arch/x86_64/kernel/e820.c 2006-02-18 15:24:31.300830480 +0000 @@ -186,6 +186,23 @@ return end_pfn; } +int page_is_ram(unsigned long pagenr) +{ + unsigned long start = pagenr << PAGE_SHIFT; + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (ei->addr+ei->size <= start || + ei->addr >= (start + PAGE_SIZE)) + continue; + + return (ei->type == E820_RAM); + } + + return 0; +} + /* * Compute how much memory is missing in a range.
* Unlike the other functions in this file the arguments are in page numbers. diff -urN oldtree/arch/x86_64/kernel/suspend.c newtree/arch/x86_64/kernel/suspend.c --- oldtree/arch/x86_64/kernel/suspend.c 2006-02-18 15:18:23.891685160 +0000 +++ newtree/arch/x86_64/kernel/suspend.c 2006-02-18 15:24:31.301830328 +0000 @@ -13,6 +13,7 @@ #include #include #include +#include struct saved_context saved_context; @@ -22,6 +23,8 @@ unsigned long saved_context_r12, saved_context_r13, saved_context_r14, saved_context_r15; unsigned long saved_context_eflags; +void fix_processor_context(void); + void __save_processor_state(struct saved_context *ctxt) { kernel_fpu_begin(); @@ -141,7 +144,7 @@ } -#ifdef CONFIG_SOFTWARE_SUSPEND +#if defined(CONFIG_SOFTWARE_SUSPEND) /* Defined in arch/x86_64/kernel/suspend_asm.S */ extern int restore_image(void); @@ -220,4 +223,5 @@ restore_image(); return 0; } + #endif /* CONFIG_SOFTWARE_SUSPEND */ diff -urN oldtree/arch/x86_64/kernel/time.c newtree/arch/x86_64/kernel/time.c --- oldtree/arch/x86_64/kernel/time.c 2006-02-18 15:18:23.894684704 +0000 +++ newtree/arch/x86_64/kernel/time.c 2006-02-18 15:24:31.302830176 +0000 @@ -510,11 +510,10 @@ return cycles_2_ns(a); } -static unsigned long get_cmos_time(void) +static unsigned long __get_cmos_time(void) { unsigned int timeout = 1000000, year, mon, day, hour, min, sec; unsigned char uip = 0, this = 0; - unsigned long flags; /* * The Linux interpretation of the CMOS clock register contents: When the @@ -524,8 +523,6 @@ * standard 8.3 MHz ISA bus. */ - spin_lock_irqsave(&rtc_lock, flags); - while (timeout && (!uip || this)) { uip |= this; this = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP; @@ -543,8 +540,6 @@ mon = CMOS_READ(RTC_MONTH); year = CMOS_READ(RTC_YEAR); - spin_unlock_irqrestore(&rtc_lock, flags); - /* * We know that x86-64 always uses BCD format, no need to check the * config register. 
@@ -566,6 +561,20 @@ return mktime(year, mon, day, hour, min, sec); } +static unsigned long get_cmos_time(void) +{ + unsigned long flags; + unsigned long result; + + spin_lock_irqsave(&rtc_lock, flags); + + result = __get_cmos_time(); + + spin_unlock_irqrestore(&rtc_lock, flags); + + return result; +} + #ifdef CONFIG_CPU_FREQ /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency @@ -1029,7 +1038,7 @@ /* * Estimate time zone so that set_time can update the clock */ - long cmos_time = get_cmos_time(); + long cmos_time = __get_cmos_time(); clock_cmos_diff = -cmos_time; clock_cmos_diff += get_seconds(); diff -urN oldtree/arch/x86_64/mm/init.c newtree/arch/x86_64/mm/init.c --- oldtree/arch/x86_64/mm/init.c 2006-02-18 15:18:23.914681664 +0000 +++ newtree/arch/x86_64/mm/init.c 2006-02-18 15:24:31.303830024 +0000 @@ -592,6 +592,7 @@ addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); + ClearPageNosave(virt_to_page(addr)); set_page_count(virt_to_page(addr), 1); memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); free_page(addr); @@ -632,6 +633,7 @@ printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); + ClearPageNosave(virt_to_page(start)); set_page_count(virt_to_page(start), 1); free_page(start); totalram_pages++; @@ -743,3 +745,22 @@ { return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); } + +#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_SUSPEND2) +/* + * Software suspend & friends need this for resume because things like the intel-agp + * driver might have split up a kernel 4MB mapping. 
+ */ +char __nosavedata swsusp_pg_dir[PAGE_SIZE] + __attribute__ ((aligned (PAGE_SIZE))); + +static inline void save_pg_dir(void) +{ + memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); +} +#else +static inline void save_pg_dir(void) +{ +} +#endif + diff -urN oldtree/block/ll_rw_blk.c newtree/block/ll_rw_blk.c --- oldtree/block/ll_rw_blk.c 2006-02-18 15:18:23.959674824 +0000 +++ newtree/block/ll_rw_blk.c 2006-02-18 15:24:31.306829568 +0000 @@ -28,6 +28,9 @@ #include #include #include +#include +#include +#include /* * for max sense size @@ -3037,12 +3040,26 @@ else mod_page_state(pgpgin, count); + if (unlikely(( bio->bi_flags & (1 << BIO_SUSPEND2)) && + test_action_state(SUSPEND_TEST_BIO) && + (rw & WRITE))) { + char b[BDEVNAME_SIZE]; + printk("FAKEDWRITE: %s(%d): %s block %Lu on %s\n", + current->comm, current->pid, + (rw & WRITE) ? "WRITE" : "READ", + (unsigned long long)bio->bi_sector, + bdevname(bio->bi_bdev,b)); + bio_endio(bio, bio->bi_size, 0); /* report the whole bio as done, whatever its size */ + return; + } + if (unlikely(block_dump)) { char b[BDEVNAME_SIZE]; - printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", + printk(KERN_DEBUG "%s(%d): %s block %Lu size %u on %s\n", current->comm, current->pid, (rw & WRITE) ? "WRITE" : "READ", (unsigned long long)bio->bi_sector, + bio->bi_size, bdevname(bio->bi_bdev,b)); } @@ -3434,7 +3451,7 @@ { int i; - kblockd_workqueue = create_workqueue("kblockd"); + kblockd_workqueue = create_nofreeze_workqueue("kblockd"); if (!kblockd_workqueue) panic("Failed to create kblockd\n"); diff -urN oldtree/crypto/Kconfig newtree/crypto/Kconfig --- oldtree/crypto/Kconfig 2006-02-18 15:18:23.961674520 +0000 +++ newtree/crypto/Kconfig 2006-02-18 15:24:31.307829416 +0000 @@ -316,6 +316,13 @@ You will most probably want this if using IPSec. +config CRYPTO_LZF + tristate "LZF compression algorithm" + depends on CRYPTO + help + This is the LZF algorithm. It is especially useful for Suspend2, + because it achieves good compression quickly.
+ config CRYPTO_MICHAEL_MIC tristate "Michael MIC keyed digest algorithm" depends on CRYPTO diff -urN oldtree/crypto/Makefile newtree/crypto/Makefile --- oldtree/crypto/Makefile 2006-01-03 03:21:10.000000000 +0000 +++ newtree/crypto/Makefile 2006-02-18 15:24:31.308829264 +0000 @@ -30,5 +30,6 @@ obj-$(CONFIG_CRYPTO_DEFLATE) += deflate.o obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o +obj-$(CONFIG_CRYPTO_LZF) += lzf.o obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o diff -urN oldtree/crypto/deflate.c newtree/crypto/deflate.c --- oldtree/crypto/deflate.c 2006-01-03 03:21:10.000000000 +0000 +++ newtree/crypto/deflate.c 2006-02-18 15:24:31.308829264 +0000 @@ -143,8 +143,15 @@ ret = zlib_deflate(stream, Z_FINISH); if (ret != Z_STREAM_END) { - ret = -EINVAL; - goto out; + if (!(ret == Z_OK && !stream->avail_in && !stream->avail_out)) { + ret = -EINVAL; + goto out; + } else { + u8 zerostuff = 0; + stream->next_out = &zerostuff; + stream->avail_out = 1; + ret = zlib_deflate(stream, Z_FINISH); + } } ret = 0; *dlen = stream->total_out; diff -urN oldtree/crypto/lzf.c newtree/crypto/lzf.c --- oldtree/crypto/lzf.c 1970-01-01 00:00:00.000000000 +0000 +++ newtree/crypto/lzf.c 2006-02-18 15:24:31.310828960 +0000 @@ -0,0 +1,335 @@ +/* + * Cryptoapi LZF compression module. + * + * Copyright (c) 2004-2005 Nigel Cunningham + * + * based on the deflate.c file: + * + * Copyright (c) 2003 James Morris + * + * and upon the LZF compression module donated to the Suspend2 project with + * the following copyright: + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * Copyright (c) 2000-2003 Marc Alexander Lehmann + * + * Redistribution and use in source and binary forms, with or without modifica- + * tion, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER- + * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE- + * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH- + * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Alternatively, the contents of this file may be used under the terms of + * the GNU General Public License version 2 (the "GPL"), in which case the + * provisions of the GPL are applicable instead of the above. 
If you wish to + * allow the use of your version of this file only under the terms of the + * GPL and not to allow others to use your version of this file under the + * BSD license, indicate your decision by deleting the provisions above and + * replace them with the notice and other provisions required by the GPL. If + * you do not delete the provisions above, a recipient may use your version + * of this file under either the BSD or the GPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct lzf_ctx { + void *hbuf; + unsigned int bufofs; +}; + +/* + * size of hashtable is (1 << hlog) * sizeof (char *) + * decompression is independent of the hash table size + * the difference between 15 and 14 is very small + * for small blocks (and 14 is also faster). + * For a low-memory configuration, use hlog == 13; + * For best compression, use 15 or 16. + */ +static const int hlog = 14; + +/* + * don't play with this unless you benchmark! + * decompression is not dependent on the hash function + * the hashing function might seem strange, just believe me + * it works ;) + */ +static inline u16 first(const u8 *p) +{ + return ((p[0]) << 8) + p[1]; +} + +static inline u16 next(u8 v, const u8 *p) +{ + return ((v) << 8) + p[2]; +} + +static inline u32 idx(unsigned int h) +{ + return (((h ^ (h << 5)) >> (3*8 - hlog)) + h*3) & ((1 << hlog) - 1); +} + +/* + * IDX works because it is very similar to a multiplicative hash, e.g. 
+ * (h * 57321 >> (3*8 - hlog)) + * the next one is also quite good, albeit slow ;) + * (int)(cos(h & 0xffffff) * 1e6) + */ + +static const int max_lit = (1 << 5); +static const int max_off = (1 << 13); +static const int max_ref = ((1 << 8) + (1 << 3)); + +/* + * compressed format + * + * 000LLLLL ; literal + * LLLOOOOO oooooooo ; backref L + * 111OOOOO LLLLLLLL oooooooo ; backref L+7 + * + */ + +static void lzf_compress_exit(void *context) +{ + struct lzf_ctx *ctx = (struct lzf_ctx *)context; + + if (ctx->hbuf) { + vfree(ctx->hbuf); + ctx->hbuf = NULL; + } +} + +static int lzf_compress_init(void *context) +{ + struct lzf_ctx *ctx = (struct lzf_ctx *)context; + + /* Get LZF ready to go */ + ctx->hbuf = vmalloc_32((1 << hlog) * sizeof(char *)); + if (!ctx->hbuf) { + printk(KERN_WARNING + "Failed to allocate %ld bytes for lzf workspace\n", + (long) ((1 << hlog) * sizeof(char *))); + return -ENOMEM; + } + return 0; +} + +static int lzf_compress(void *context, const u8 *in_data, unsigned int in_len, + u8 *out_data, unsigned int *out_len) +{ + struct lzf_ctx *ctx = (struct lzf_ctx *)context; + const u8 **htab = ctx->hbuf; + const u8 **hslot; + const u8 *ip = in_data; + u8 *op = out_data; + const u8 *in_end = ip + in_len; + u8 *out_end = op + *out_len - 3; + const u8 *ref; + + unsigned int hval = first(ip); + unsigned long off; + int lit = 0; + + memset(htab, 0, (1 << hlog) * sizeof(*htab)); /* clear every slot; sizeof(htab) is only the size of the pointer */ + + for (;;) { + if (ip < in_end - 2) { + hval = next(hval, ip); + hslot = htab + idx(hval); + ref = *hslot; + *hslot = ip; + + if ((off = ip - ref - 1) < max_off + && ip + 4 < in_end && ref > in_data + && *(u16 *) ref == *(u16 *) ip && ref[2] == ip[2] + ) { + /* match found at *ref++ */ + unsigned int len = 2; + unsigned int maxlen = in_end - ip - len; + maxlen = maxlen > max_ref ?
max_ref : maxlen; + + do + len++; + while (len < maxlen && ref[len] == ip[len]); + + if (op + lit + 1 + 3 >= out_end) { + *out_len = PAGE_SIZE; + return 0; + } + + if (lit) { + *op++ = lit - 1; + lit = -lit; + do + *op++ = ip[lit]; + while (++lit); + } + + len -= 2; + ip++; + + if (len < 7) { + *op++ = (off >> 8) + (len << 5); + } else { + *op++ = (off >> 8) + (7 << 5); + *op++ = len - 7; + } + + *op++ = off; + + ip += len; + hval = first(ip); + hval = next(hval, ip); + htab[idx(hval)] = ip; + ip++; + continue; + } + } else if (ip == in_end) + break; + + /* one more literal byte we must copy */ + lit++; + ip++; + + if (lit == max_lit) { + if (op + 1 + max_lit >= out_end) { + *out_len = PAGE_SIZE; + return 0; + } + + *op++ = max_lit - 1; + memcpy(op, ip - max_lit, max_lit); + op += max_lit; + lit = 0; + } + } + + if (lit) { + if (op + lit + 1 >= out_end) { + *out_len = PAGE_SIZE; + return 0; + } + + *op++ = lit - 1; + lit = -lit; + do + *op++ = ip[lit]; + while (++lit); + } + + *out_len = op - out_data; + return 0; +} + +static int lzf_decompress(void *context, const u8 *src, unsigned int slen, + u8 *dst, unsigned int *dlen) +{ + u8 const *ip = src; + u8 *op = dst; + u8 const *const in_end = ip + slen; + u8 *const out_end = op + *dlen; + + do { + unsigned int ctrl = *ip++; + + if (ctrl < (1 << 5)) { /* literal run */ + ctrl++; + + if (op + ctrl > out_end) { + *dlen = PAGE_SIZE; + return 0; + } + memcpy(op, ip, ctrl); + op += ctrl; + ip += ctrl; + } else { /* back reference */ + + unsigned int len = ctrl >> 5; + + u8 *ref = op - ((ctrl & 0x1f) << 8) - 1; + + if (len == 7) + len += *ip++; + + ref -= *ip++; + + if (op + len + 2 > out_end) { + *dlen = PAGE_SIZE; + return 0; + } + + if (ref < (u8 *) dst) { + *dlen = PAGE_SIZE; + return 0; + } + + *op++ = *ref++; + *op++ = *ref++; + + do + *op++ = *ref++; + while (--len); + } + } + while (op < out_end && ip < in_end); + + *dlen = op - (u8 *) dst; + return 0; +} + +static struct crypto_alg alg = { + .cra_name = "lzf", + 
.cra_flags = CRYPTO_ALG_TYPE_COMPRESS, + .cra_ctxsize = sizeof(struct lzf_ctx), /* room for the struct lzf_ctx the callbacks cast their context to */ + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(alg.cra_list), + .cra_u = {.compress = { + .coa_init = lzf_compress_init, + .coa_exit = lzf_compress_exit, + .coa_compress = lzf_compress, + .coa_decompress = lzf_decompress}} +}; + +static int __init init(void) +{ + return crypto_register_alg(&alg); +} + +static void __exit fini(void) +{ + crypto_unregister_alg(&alg); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZF Compression Algorithm"); +MODULE_AUTHOR("Marc Alexander Lehmann & Nigel Cunningham"); diff -urN oldtree/drivers/acpi/osl.c newtree/drivers/acpi/osl.c --- oldtree/drivers/acpi/osl.c 2006-02-18 15:18:24.099653544 +0000 +++ newtree/drivers/acpi/osl.c 2006-02-18 15:24:31.311828808 +0000 @@ -91,7 +91,7 @@ "Access to PCI configuration space unavailable\n"); return AE_NULL_ENTRY; } - kacpid_wq = create_singlethread_workqueue("kacpid"); + kacpid_wq = create_nofreeze_singlethread_workqueue("kacpid"); BUG_ON(!kacpid_wq); return AE_OK; diff -urN oldtree/drivers/acpi/sleep/proc.c newtree/drivers/acpi/sleep/proc.c --- oldtree/drivers/acpi/sleep/proc.c 2006-01-03 03:21:10.000000000 +0000 +++ newtree/drivers/acpi/sleep/proc.c 2006-02-18 15:24:31.312828656 +0000 @@ -58,6 +58,15 @@ goto Done; } state = simple_strtoul(str, NULL, 0); + + /* + * I used to put this after the CONFIG_SOFTWARE_SUSPEND + * test, but people who compile in suspend2 usually want + * to use it instead of swsusp. --NC + */ + if (may_try_suspend2(state)) + goto Done; + #ifdef CONFIG_SOFTWARE_SUSPEND if (state == 4) { error = software_suspend(); diff -urN oldtree/drivers/base/sys.c newtree/drivers/base/sys.c --- oldtree/drivers/base/sys.c 2006-02-18 15:18:24.249630744 +0000 +++ newtree/drivers/base/sys.c 2006-02-18 15:24:31.313828504 +0000 @@ -302,16 +302,14 @@ cls->resume(dev); /* Call auxillary drivers next.
*/ - list_for_each_entry(drv, &cls->drivers, entry) { + list_for_each_entry(drv, &cls->drivers, entry) if (drv->resume) drv->resume(dev); - } /* Call global drivers. */ - list_for_each_entry(drv, &sysdev_drivers, entry) { + list_for_each_entry(drv, &sysdev_drivers, entry) if (drv->resume) drv->resume(dev); - } } /** diff -urN oldtree/drivers/char/hvc_console.c newtree/drivers/char/hvc_console.c --- oldtree/drivers/char/hvc_console.c 2006-02-18 15:18:24.472596848 +0000 +++ newtree/drivers/char/hvc_console.c 2006-02-18 15:24:31.314828352 +0000 @@ -839,7 +839,7 @@ /* Always start the kthread because there can be hotplug vty adapters * added later. */ - hvc_task = kthread_run(khvcd, NULL, "khvcd"); + hvc_task = kthread_nofreeze_run(khvcd, NULL, "khvcd"); if (IS_ERR(hvc_task)) { panic("Couldn't create kthread for console.\n"); put_tty_driver(hvc_driver); diff -urN oldtree/drivers/char/hvcs.c newtree/drivers/char/hvcs.c --- oldtree/drivers/char/hvcs.c 2006-02-18 15:18:24.474596544 +0000 +++ newtree/drivers/char/hvcs.c 2006-02-18 15:24:31.316828048 +0000 @@ -1404,7 +1404,7 @@ return -ENOMEM; } - hvcs_task = kthread_run(khvcsd, NULL, "khvcsd"); + hvcs_task = kthread_nofreeze_run(khvcsd, NULL, "khvcsd"); if (IS_ERR(hvcs_task)) { printk(KERN_ERR "HVCS: khvcsd creation failed. 
Driver not loaded.\n"); kfree(hvcs_pi_buff); diff -urN oldtree/drivers/input/serio/serio.c newtree/drivers/input/serio/serio.c --- oldtree/drivers/input/serio/serio.c 2006-02-18 15:18:25.080504432 +0000 +++ newtree/drivers/input/serio/serio.c 2006-02-18 15:24:31.317827896 +0000 @@ -901,7 +901,7 @@ static int __init serio_init(void) { - serio_task = kthread_run(serio_thread, NULL, "kseriod"); + serio_task = kthread_nofreeze_run(serio_thread, NULL, "kseriod"); if (IS_ERR(serio_task)) { printk(KERN_ERR "serio: Failed to start kseriod\n"); return PTR_ERR(serio_task); diff -urN oldtree/drivers/macintosh/Kconfig newtree/drivers/macintosh/Kconfig --- oldtree/drivers/macintosh/Kconfig 2006-02-18 15:18:25.159492424 +0000 +++ newtree/drivers/macintosh/Kconfig 2006-02-18 15:24:31.318827744 +0000 @@ -200,4 +200,8 @@ tristate "Support for ANS LCD display" depends on ADB_CUDA && PPC_PMAC +config SOFTWARE_REPLACE_SLEEP + bool "Use Software suspend to replace the broken sleep function" + depends on SUSPEND2 + endmenu diff -urN oldtree/drivers/macintosh/via-pmu.c newtree/drivers/macintosh/via-pmu.c --- oldtree/drivers/macintosh/via-pmu.c 2006-02-18 15:18:25.197486648 +0000 +++ newtree/drivers/macintosh/via-pmu.c 2006-02-18 15:24:31.320827440 +0000 @@ -2653,6 +2653,13 @@ return -EACCES; if (sleep_in_progress) return -EBUSY; +#ifdef CONFIG_SOFTWARE_REPLACE_SLEEP + { + extern void software_suspend_pending(void); + software_suspend_pending(); + return (0); + } +#endif sleep_in_progress = 1; switch (pmu_kind) { case PMU_OHARE_BASED: diff -urN oldtree/drivers/md/dm-crypt.c newtree/drivers/md/dm-crypt.c --- oldtree/drivers/md/dm-crypt.c 2006-02-18 15:18:25.219483304 +0000 +++ newtree/drivers/md/dm-crypt.c 2006-02-18 15:24:31.321827288 +0000 @@ -928,7 +928,7 @@ if (!_crypt_io_pool) return -ENOMEM; - _kcryptd_workqueue = create_workqueue("kcryptd"); + _kcryptd_workqueue = create_nofreeze_workqueue("kcryptd"); if (!_kcryptd_workqueue) { r = -ENOMEM; DMERR(PFX "couldn't create kcryptd"); diff -urN
oldtree/drivers/md/md.c newtree/drivers/md/md.c --- oldtree/drivers/md/md.c 2006-02-18 15:18:25.238480416 +0000 +++ newtree/drivers/md/md.c 2006-02-18 15:24:31.325826680 +0000 @@ -41,7 +41,6 @@ #include #include #include /* for invalidate_bdev */ -#include #include #include @@ -3972,7 +3971,8 @@ thread->run = run; thread->mddev = mddev; thread->timeout = MAX_SCHEDULE_TIMEOUT; - thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); + thread->tsk = kthread_nofreeze_run(md_thread, thread, + name, mdname(thread->mddev)); if (IS_ERR(thread->tsk)) { kfree(thread); return NULL; diff -urN oldtree/drivers/net/irda/sir_kthread.c newtree/drivers/net/irda/sir_kthread.c --- oldtree/drivers/net/irda/sir_kthread.c 2006-02-18 15:18:28.421996448 +0000 +++ newtree/drivers/net/irda/sir_kthread.c 2006-02-18 15:24:31.326826528 +0000 @@ -112,6 +112,7 @@ DECLARE_WAITQUEUE(wait, current); daemonize("kIrDAd"); + current->flags |= PF_NOFREEZE; irda_rq_queue.thread = current; @@ -134,9 +135,6 @@ __set_task_state(current, TASK_RUNNING); remove_wait_queue(&irda_rq_queue.kick, &wait); - /* make swsusp happy with our thread */ - try_to_freeze(); - run_irda_queue(); } diff -urN oldtree/drivers/scsi/hosts.c newtree/drivers/scsi/hosts.c --- oldtree/drivers/scsi/hosts.c 2006-02-18 15:18:28.787940816 +0000 +++ newtree/drivers/scsi/hosts.c 2006-02-18 15:24:31.327826376 +0000 @@ -227,7 +227,7 @@ if (shost->transportt->create_work_queue) { snprintf(shost->work_q_name, KOBJ_NAME_LEN, "scsi_wq_%d", shost->host_no); - shost->work_q = create_singlethread_workqueue( + shost->work_q = create_nofreeze_singlethread_workqueue( shost->work_q_name); if (!shost->work_q) goto out_free_shost_data; diff -urN oldtree/drivers/scsi/lpfc/lpfc_init.c newtree/drivers/scsi/lpfc/lpfc_init.c --- oldtree/drivers/scsi/lpfc/lpfc_init.c 2006-02-18 15:18:28.820935800 +0000 +++ newtree/drivers/scsi/lpfc/lpfc_init.c 2006-02-18 15:24:31.343823944 +0000 @@ -1521,7 +1521,7 @@ phba->work_ha_mask |= (HA_RXMASK << 
(LPFC_ELS_RING * 4)); /* Startup the kernel thread for this host adapter. */ - phba->worker_thread = kthread_run(lpfc_do_work, phba, + phba->worker_thread = kthread_nofreeze_run(lpfc_do_work, phba, "lpfc_worker_%d", phba->brd_no); if (IS_ERR(phba->worker_thread)) { error = PTR_ERR(phba->worker_thread); diff -urN oldtree/drivers/usb/net/pegasus.c newtree/drivers/usb/net/pegasus.c --- oldtree/drivers/usb/net/pegasus.c 2006-02-18 15:18:29.111891568 +0000 +++ newtree/drivers/usb/net/pegasus.c 2006-02-18 15:24:31.344823792 +0000 @@ -1451,7 +1451,7 @@ pr_info("%s: %s, " DRIVER_DESC "\n", driver_name, DRIVER_VERSION); if (devid) parse_id(devid); - pegasus_workqueue = create_singlethread_workqueue("pegasus"); + pegasus_workqueue = create_nofreeze_singlethread_workqueue("pegasus"); if (!pegasus_workqueue) return -ENOMEM; return usb_register(&pegasus_driver); diff -urN oldtree/include/asm-arm/hw_irq.h newtree/include/asm-arm/hw_irq.h --- oldtree/include/asm-arm/hw_irq.h 1970-01-01 00:00:00.000000000 +0000 +++ newtree/include/asm-arm/hw_irq.h 2006-02-18 15:24:31.345823640 +0000 @@ -0,0 +1,4 @@ +#ifndef __ASM_HW_IRQ_H +#define __ASM_HW_IRQ_H +#include +#endif /* __ASM_HW_IRQ_H */ diff -urN oldtree/include/asm-arm/suspend2.h newtree/include/asm-arm/suspend2.h --- oldtree/include/asm-arm/suspend2.h 1970-01-01 00:00:00.000000000 +0000 +++ newtree/include/asm-arm/suspend2.h 2006-02-18 15:24:31.346823488 +0000 @@ -0,0 +1,136 @@ +#ifndef _ASMARM_SUSPEND2_H +#define _ASMARM_SUSPEND2_H +/* + * Based on code + * Copyright 2005 Sony Corporation + * Copyright 2003-2004 Nigel Cunningham + * Copyright 2001-2002 Pavel Machek + * Copyright 2001 Patrick Mochel + */ + +/* image of the saved processor state */ +struct suspend2_saved_context { + /* general registers */ + __u32 r[15]; + + /* coprocessor 15 registers */ +/* __u32 ID_code; read only reg */ +/* __u32 cache_type; read only reg */ +/* __u32 TCM_stat; read only reg */ + __u32 CR; + __u32 TTBR; + __u32 DACR; + __u32 D_FSR; + __u32 I_FSR; + __u32 FAR; +/*
__u32 COR; write only reg */ +/* __u32 TLBOR; write only reg */ + __u32 D_CLR; + __u32 I_CLR; + __u32 D_TCMRR; + __u32 I_TCMRR; + __u32 TLBLR; + __u32 FCSE; + __u32 CID; +} __attribute__((packed)); +typedef struct suspend2_saved_context suspend2_saved_context_t; + +/* temporary storage */ +extern struct suspend2_saved_context suspend2_saved_context; + +static inline void suspend2_arch_save_processor_context(void) +{ + /* save general registers */ + asm volatile ("stmia %0, {r4-r14}" + :: "r" (suspend2_saved_context.r)); + /* save coprocessor 15 registers */ + asm volatile ("mrc p15, 0, %0, c1, c0, 0" + : "=r" (suspend2_saved_context.CR)); + asm volatile ("mrc p15, 0, %0, c3, c0, 0" + : "=r" (suspend2_saved_context.DACR)); + asm volatile ("mrc p15, 0, %0, c5, c0, 0" + : "=r" (suspend2_saved_context.D_FSR)); + asm volatile ("mrc p15, 0, %0, c5, c0, 1" + : "=r" (suspend2_saved_context.I_FSR)); + asm volatile ("mrc p15, 0, %0, c6, c0, 0" + : "=r" (suspend2_saved_context.FAR)); + asm volatile ("mrc p15, 0, %0, c9, c0, 0" + : "=r" (suspend2_saved_context.D_CLR)); + asm volatile ("mrc p15, 0, %0, c9, c0, 1" + : "=r" (suspend2_saved_context.I_CLR)); + asm volatile ("mrc p15, 0, %0, c9, c1, 0" + : "=r" (suspend2_saved_context.D_TCMRR)); + asm volatile ("mrc p15, 0, %0, c9, c1, 1" + : "=r" (suspend2_saved_context.I_TCMRR)); + asm volatile ("mrc p15, 0, %0, c10, c0, 0" + : "=r" (suspend2_saved_context.TLBLR)); + asm volatile ("mrc p15, 0, %0, c13, c0, 0" + : "=r" (suspend2_saved_context.FCSE)); + asm volatile ("mrc p15, 0, %0, c13, c0, 1" + : "=r" (suspend2_saved_context.CID)); + asm volatile ("mrc p15, 0, %0, c2, c0, 0" + : "=r" (suspend2_saved_context.TTBR)); +} + +static inline void suspend2_arch_restore_processor_context(void) +{ + /* restore coprocessor 15 registers */ + asm volatile ("mcr p15, 0, %0, c2, c0, 0" + :: "r" (suspend2_saved_context.TTBR)); + asm volatile ("mcr p15, 0, %0, c13, c0, 1" + :: "r" (suspend2_saved_context.CID)); + asm volatile ("mcr p15, 0, %0, 
c13, c0, 0" + :: "r" (suspend2_saved_context.FCSE)); + asm volatile ("mcr p15, 0, %0, c10, c0, 0" + :: "r" (suspend2_saved_context.TLBLR)); + asm volatile ("mcr p15, 0, %0, c9, c1, 1" + :: "r" (suspend2_saved_context.I_TCMRR)); + asm volatile ("mcr p15, 0, %0, c9, c1, 0" + :: "r" (suspend2_saved_context.D_TCMRR)); + asm volatile ("mcr p15, 0, %0, c9, c0, 1" + :: "r" (suspend2_saved_context.I_CLR)); + asm volatile ("mcr p15, 0, %0, c9, c0, 0" + :: "r" (suspend2_saved_context.D_CLR)); + asm volatile ("mcr p15, 0, %0, c6, c0, 0" + :: "r" (suspend2_saved_context.FAR)); + asm volatile ("mcr p15, 0, %0, c5, c0, 1" + :: "r" (suspend2_saved_context.I_FSR)); + asm volatile ("mcr p15, 0, %0, c5, c0, 0" + :: "r" (suspend2_saved_context.D_FSR)); + asm volatile ("mcr p15, 0, %0, c3, c0, 0" + :: "r" (suspend2_saved_context.DACR)); + asm volatile ("mcr p15, 0, %0, c1, c0, 0" + :: "r" (suspend2_saved_context.CR)); + + /* restore general registers r4-r14 from the array the save path stored them in */ + asm volatile ("ldmia %0, {r4-r14}" :: "r" (suspend2_saved_context.r)); +} + +static inline void save_context(void) +{ +} + +static inline void restore_context(void) +{ +} + +static inline void suspend2_arch_pre_copy(void) +{ +} + +static inline void suspend2_arch_post_copy(void) +{ +} + +static inline void suspend2_arch_pre_copyback(void) +{ +} + +static inline void suspend2_arch_post_copyback(void) +{ +} + +static inline void suspend2_arch_flush_caches(void) +{ +} +#endif diff -urN oldtree/include/asm-i386/mach-default/mach_time.h newtree/include/asm-i386/mach-default/mach_time.h --- oldtree/include/asm-i386/mach-default/mach_time.h 2006-01-03 03:21:10.000000000 +0000 +++ newtree/include/asm-i386/mach-default/mach_time.h 2006-02-18 15:24:31.348823184 +0000 @@ -79,24 +79,19 @@ return retval; } -static inline unsigned long mach_get_cmos_time(void) +/* __get_cmos_time + * + * Separated out from mach_get_cmos_time so that we can + * quickly get the cmos time when we don't care about + * whether the second has just started.
+ * + * Used from suspend and resume sysdev calls. + */ +static inline unsigned long __get_cmos_time(void) { unsigned int year, mon, day, hour, min, sec; - int i; - /* The Linux interpretation of the CMOS clock register contents: - * When the Update-In-Progress (UIP) flag goes from 1 to 0, the - * RTC registers show the second which has precisely just started. - * Let's hope other operating systems interpret the RTC the same way. - */ - /* read RTC exactly on falling edge of update flag */ - for (i = 0 ; i < 1000000 ; i++) /* may take up to 1 second... */ - if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) - break; - for (i = 0 ; i < 1000000 ; i++) /* must try at least 2.228 ms */ - if (!(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)) - break; - do { /* Isn't this overkill ? UIP above should guarantee consistency */ + do { sec = CMOS_READ(RTC_SECONDS); min = CMOS_READ(RTC_MINUTES); hour = CMOS_READ(RTC_HOURS); @@ -104,6 +99,7 @@ mon = CMOS_READ(RTC_MONTH); year = CMOS_READ(RTC_YEAR); } while (sec != CMOS_READ(RTC_SECONDS)); + if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { BCD_TO_BIN(sec); @@ -119,4 +115,24 @@ return mktime(year, mon, day, hour, min, sec); } +static inline unsigned long mach_get_cmos_time(void) +{ + int i; + + /* The Linux interpretation of the CMOS clock register contents: + * When the Update-In-Progress (UIP) flag goes from 1 to 0, the + * RTC registers show the second which has precisely just started. + * Let's hope other operating systems interpret the RTC the same way. + */ + /* read RTC exactly on falling edge of update flag */ + for (i = 0 ; i < 1000000 ; i++) /* may take up to 1 second... 
*/ + if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) + break; + for (i = 0 ; i < 1000000 ; i++) /* must try at least 2.228 ms */ + if (!(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)) + break; + + return __get_cmos_time(); +} + #endif /* !_MACH_TIME_H */ diff -urN oldtree/include/asm-i386/suspend2.h newtree/include/asm-i386/suspend2.h --- oldtree/include/asm-i386/suspend2.h 1970-01-01 00:00:00.000000000 +0000 +++ newtree/include/asm-i386/suspend2.h 2006-02-18 15:24:31.349823032 +0000 @@ -0,0 +1,288 @@ + /* + * Copyright 2003-2005 Nigel Cunningham + * Based on code + * Copyright 2001-2002 Pavel Machek + * Based on code + * Copyright 2001 Patrick Mochel + */ +#include +#include +#include +#include +#include +#include + +/* image of the saved processor states */ +struct suspend2_saved_context { + u32 eax, ebx, ecx, edx; + u32 esp, ebp, esi, edi; + u16 es, fs, gs, ss; + u32 cr0, cr2, cr3, cr4; + u16 gdt_pad; + u16 gdt_limit; + u32 gdt_base; + u16 idt_pad; + u16 idt_limit; + u32 idt_base; + u16 ldt; + u16 tss; + u32 tr; + u32 safety; + u32 return_address; + u32 eflags; +} __attribute__((packed)); +typedef struct suspend2_saved_context suspend2_saved_context_t; + +/* temporary storage */ +extern struct suspend2_saved_context suspend2_saved_context; + +/* + * save_processor_context + * + * Save the state of the processor before we go to sleep. + * + * return_stack is the value of the stack pointer (%esp) as the caller sees it. + * A good way could not be found to obtain it from here (don't want to make + * _too_ many assumptions about the layout of the stack this far down.) Also, + * the handy little __builtin_frame_pointer(level) where level > 0, is blatantly + * buggy - it returns the value of the stack at the proper location, not the + * location, like it should (as of gcc 2.91.66) + * + * Note that the context and timing of this function is pretty critical. + * With a minimal amount of things going on in the caller and in here, gcc + * does a good job of being just a dumb compiler. 
Watch the assembly output + * if anything changes, though, and make sure everything is going in the right + * place. + */ +static inline void suspend2_arch_save_processor_context(void) +{ + kernel_fpu_begin(); + + /* + * descriptor tables + */ + asm volatile ("sgdt (%0)" : "=m" (suspend2_saved_context.gdt_limit)); + asm volatile ("sidt (%0)" : "=m" (suspend2_saved_context.idt_limit)); + asm volatile ("sldt (%0)" : "=m" (suspend2_saved_context.ldt)); + asm volatile ("str (%0)" : "=m" (suspend2_saved_context.tr)); + + /* + * save the general registers. + * note that gcc has constructs to specify output of certain registers, + * but they're not used here, because it assumes that you want to modify + * those registers, so it tries to be smart and save them beforehand. + * It's really not necessary, and kinda fishy (check the assembly output), + * so it's avoided. + */ + asm volatile ("movl %%esp, (%0)" : "=m" (suspend2_saved_context.esp)); + asm volatile ("movl %%eax, (%0)" : "=m" (suspend2_saved_context.eax)); + asm volatile ("movl %%ebx, (%0)" : "=m" (suspend2_saved_context.ebx)); + asm volatile ("movl %%ecx, (%0)" : "=m" (suspend2_saved_context.ecx)); + asm volatile ("movl %%edx, (%0)" : "=m" (suspend2_saved_context.edx)); + asm volatile ("movl %%ebp, (%0)" : "=m" (suspend2_saved_context.ebp)); + asm volatile ("movl %%esi, (%0)" : "=m" (suspend2_saved_context.esi)); + asm volatile ("movl %%edi, (%0)" : "=m" (suspend2_saved_context.edi)); + + /* + * segment registers + */ + asm volatile ("movw %%es, %0" : "=r" (suspend2_saved_context.es)); + asm volatile ("movw %%fs, %0" : "=r" (suspend2_saved_context.fs)); + asm volatile ("movw %%gs, %0" : "=r" (suspend2_saved_context.gs)); + asm volatile ("movw %%ss, %0" : "=r" (suspend2_saved_context.ss)); + + /* + * control registers + */ + asm volatile ("movl %%cr0, %0" : "=r" (suspend2_saved_context.cr0)); + asm volatile ("movl %%cr2, %0" : "=r" (suspend2_saved_context.cr2)); + asm volatile ("movl %%cr3, %0" : "=r" 
(suspend2_saved_context.cr3));
+	asm volatile ("movl %%cr4, %0" : "=r" (suspend2_saved_context.cr4));
+
+	/*
+	 * eflags
+	 */
+	asm volatile ("pushfl ; popl (%0)" : "=m" (suspend2_saved_context.eflags));
+}
+/* Rewrite CPU 0's TSS descriptor, clear bit 9 of its .b word, then reload TR, the LDT and (if debugreg[7] is set) the debug registers. */
+static void fix_processor_context(void)
+{
+	struct tss_struct *t = &per_cpu(init_tss,0);
+
+	/* This just modifies memory; should not be necessary. But... This is
+	 * necessary, because 386 hardware has concept of busy TSS or some
+	 * similar stupidity. */
+	set_tss_desc(0,t);
+	per_cpu(cpu_gdt_table,0)[GDT_ENTRY_TSS].b &= 0xfffffdff;
+
+	load_TR_desc();
+
+	load_LDT(&current->active_mm->context);	/* This does lldt */
+
+	/*
+	 * Now maybe reload the debug registers (set_debugreg() takes the value to load, not its address)
+	 */
+	if (current->thread.debugreg[7]){
+		set_debugreg(current->thread.debugreg[0], 0);
+		set_debugreg(current->thread.debugreg[1], 1);
+		set_debugreg(current->thread.debugreg[2], 2);
+		set_debugreg(current->thread.debugreg[3], 3);
+		/* no 4 and 5 */
+		set_debugreg(current->thread.debugreg[6], 6);
+		set_debugreg(current->thread.debugreg[7], 7);
+	}
+
+}
+
+static void do_fpu_end(void)
+{
+	/* restore FPU regs if necessary */
+	/* Do it out of line so that gcc does not move cr0 load to some stupid
+	 * place */
+	kernel_fpu_end();
+}
+
+#if defined(CONFIG_SUSPEND2) || defined(CONFIG_SMP)
+static unsigned long c_loops_per_jiffy_ref __nosavedata;
+#endif
+
+#ifdef CONFIG_SUSPEND2
+#ifndef CONFIG_SMP
+extern unsigned long loops_per_jiffy;
+volatile static unsigned long cpu_khz_ref __nosavedata = 0;
+#endif
+
+static inline void suspend2_arch_pre_copy(void) { }
+static inline void suspend2_arch_post_copy(void) { }
+
+static inline void suspend2_arch_pre_copyback(void)
+{
+	/* We want to run from swsusp_pg_dir, since swsusp_pg_dir is stored in
+	 * constant place in memory.
+ */ + + __asm__( "movl %%ecx,%%cr3\n" ::"c"(__pa(swsusp_pg_dir))); + + c_loops_per_jiffy_ref = + current_cpu_data.loops_per_jiffy; +#ifndef CONFIG_SMP + cpu_khz_ref = cpu_khz; + c_loops_per_jiffy_ref = loops_per_jiffy; +#endif + +} + +/* + * restore_processor_context + * + * Restore the processor context as it was before we went to sleep + * - descriptor tables + * - control registers + * - segment registers + * - flags + * + * Note that it is critical that this function is declared inline. + * It was separated out from restore_state to make that function + * a little clearer, but it needs to be inlined because we won't have a + * stack when we get here (so we can't push a return address). + */ +static inline void suspend2_arch_restore_processor_context(void) +{ + /* + * first restore %ds, so we can access our data properly + */ + asm volatile (".align 4"); + asm volatile ("movw %0, %%ds" :: "r" ((u16)__KERNEL_DS)); + + + /* + * control registers + */ + asm volatile ("movl %0, %%cr4" :: "r" (suspend2_saved_context.cr4)); + asm volatile ("movl %0, %%cr3" :: "r" (suspend2_saved_context.cr3)); + asm volatile ("movl %0, %%cr2" :: "r" (suspend2_saved_context.cr2)); + asm volatile ("movl %0, %%cr0" :: "r" (suspend2_saved_context.cr0)); + + /* + * segment registers + */ + asm volatile ("movw %0, %%es" :: "r" (suspend2_saved_context.es)); + asm volatile ("movw %0, %%fs" :: "r" (suspend2_saved_context.fs)); + asm volatile ("movw %0, %%gs" :: "r" (suspend2_saved_context.gs)); + asm volatile ("movw %0, %%ss" :: "r" (suspend2_saved_context.ss)); + + /* + * the other general registers + * + * note that even though gcc has constructs to specify memory + * input into certain registers, it will try to be too smart + * and save them at the beginning of the function. This is esp. + * bad since we don't have a stack set up when we enter, and we + * want to preserve the values on exit. So, we set them manually. 
+ */ + asm volatile ("movl %0, %%esp" :: "m" (suspend2_saved_context.esp)); + asm volatile ("movl %0, %%ebp" :: "m" (suspend2_saved_context.ebp)); + asm volatile ("movl %0, %%eax" :: "m" (suspend2_saved_context.eax)); + asm volatile ("movl %0, %%ebx" :: "m" (suspend2_saved_context.ebx)); + asm volatile ("movl %0, %%ecx" :: "m" (suspend2_saved_context.ecx)); + asm volatile ("movl %0, %%edx" :: "m" (suspend2_saved_context.edx)); + asm volatile ("movl %0, %%esi" :: "m" (suspend2_saved_context.esi)); + asm volatile ("movl %0, %%edi" :: "m" (suspend2_saved_context.edi)); + + /* + * now restore the descriptor tables to their proper values + * ltr is done in fix_processor_context(). + */ + + asm volatile ("lgdt (%0)" :: "m" (suspend2_saved_context.gdt_limit)); + asm volatile ("lidt (%0)" :: "m" (suspend2_saved_context.idt_limit)); + asm volatile ("lldt (%0)" :: "m" (suspend2_saved_context.ldt)); + + /* tell gcc that we clobbered all the registers... + * otherwise it might keep some addresses there. + * Unfortunately gcc 4 thinks it's smart and will + * error out if we tell it we're clobbering ebp as + * well. So we have to lie. 
+ */ + asm volatile ("" : : : "esp", "eax", "ebx", "ecx", "edx", "esi", "edi"); + + if (boot_cpu_has(X86_FEATURE_SEP)) + enable_sep_cpu(); + + fix_processor_context(); + + /* + * the flags + */ + asm volatile ("pushl %0 ; popfl" :: "m" (suspend2_saved_context.eflags)); + + do_fpu_end(); + + mtrr_ap_init(); + mcheck_init(&boot_cpu_data); +} + +static inline void suspend2_arch_flush_caches(void) +{ +#ifdef CONFIG_SMP + cpu_clear(0, per_cpu(cpu_tlbstate, + 0).active_mm->cpu_vm_mask); +#endif + wbinvd(); + __flush_tlb_all(); + +} + +static inline void suspend2_arch_post_copyback(void) +{ + BUG_ON(!irqs_disabled()); + + current_cpu_data.loops_per_jiffy = + c_loops_per_jiffy_ref; +#ifndef CONFIG_SMP + loops_per_jiffy = c_loops_per_jiffy_ref; + cpu_khz = cpu_khz_ref; +#endif +} + +#endif diff -urN oldtree/include/asm-ppc/cpu_context.h newtree/include/asm-ppc/cpu_context.h --- oldtree/include/asm-ppc/cpu_context.h 1970-01-01 00:00:00.000000000 +0000 +++ newtree/include/asm-ppc/cpu_context.h 2006-02-18 15:24:31.350822880 +0000 @@ -0,0 +1,110 @@ +/* + * Written by Hu Gang (hugang@soulinfo.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */
+
+#include
+#include
+#include
+
+/* image of the saved processor states, filled by __save_processor_state() */
+struct saved_context {
+	u32 lr, cr, sp, r2;
+	u32 r[20]; /* r12 - r31 */
+	u32 sprg[4];
+	u32 msr, sdr1, tb1, tb2;
+} __attribute__((packed));
+/* Store CR, r1, r2, r12-r31, MSR, SDR1, the timebase and SPRG0-3 into *s (LR is not saved). */
+inline static void __save_processor_state(struct saved_context *s)
+{
+	/*asm volatile ("mflr 0; stw 0,%0" : "=m" (s->lr));*/
+	asm volatile ("mfcr 0; stw 0,%0" : "=m" (s->cr) : : "r0");
+	asm volatile ("stw 1,%0" : "=m" (s->sp));
+	asm volatile ("stw 2,%0" : "=m" (s->r2));
+	asm volatile ("stmw 12,%0" : "=m" (s->r));
+
+	/* Save MSR & SDR1 (r4 is scratch, so it is listed as clobbered) */
+	asm volatile ("mfmsr 4; stw 4,%0" : "=m" (s->msr) : : "r4");
+	asm volatile ("mfsdr1 4; stw 4,%0": "=m" (s->sdr1) : : "r4");
+
+	/* Get a stable timebase and save it: retry until the upper half is unchanged */
+	asm volatile ("1:\n"
+		"mftbu 4;stw 4,%0\n"
+		"mftb 5;stw 5,%1\n"
+		"mftbu 3\n"
+		"cmpw 3,4;\n"
+		"bne 1b" :
+		"=m" (s->tb1),
+		"=m" (s->tb2) : : "r3", "r4", "r5", "cr0");
+
+	/* Save SPRGs */
+	asm volatile ("mfsprg 4,0; stw 4,%0 " : "=m" (s->sprg[0]) : : "r4");
+	asm volatile ("mfsprg 4,1; stw 4,%0 " : "=m" (s->sprg[1]) : : "r4");
+	asm volatile ("mfsprg 4,2; stw 4,%0 " : "=m" (s->sprg[2]) : : "r4");
+	asm volatile ("mfsprg 4,3; stw 4,%0 " : "=m" (s->sprg[3]) : : "r4");
+}
+/* Reload SDR1, SPRG0-3, the timebase, r12-r31, r2, r1 and CR from *s; every operand is read, hence "m" inputs. */
+inline static void __restore_processor_state(struct saved_context *s)
+{
+	/* Restore SDR1 (no BAT register is written here) */
+	asm volatile ("lwz 4,%0; mtsdr1 4" : : "m" (s->sdr1) : "r4");
+	/* asm volatile ("lwz 3,%0" : "=m" (saved_context.msr)); */
+
+	asm volatile ("lwz 4,%0; mtsprg 0,4": : "m" (s->sprg[0]) : "r4");
+	asm volatile ("lwz 4,%0; mtsprg 1,4": : "m" (s->sprg[1]) : "r4");
+	asm volatile ("lwz 4,%0; mtsprg 2,4": : "m" (s->sprg[2]) : "r4");
+	asm volatile ("lwz 4,%0; mtsprg 3,4": : "m" (s->sprg[3]) : "r4");
+
+	/* Restore TB: zero the lower half first, then write upper and lower */
+	asm volatile ("li 3,0; mttbl 3; \n"
+		"lwz 3,%0\n; lwz 4,%1\n"
+		"mttbu 3; mttbl 4" : :
+		"m" (s->tb1),
+		"m" (s->tb2) : "r3", "r4");
+
+	/* Restore the callee-saved registers and return */
+	asm volatile ("lmw 12,%0" : : "m" (s->r));
+	asm volatile ("lwz 2,%0" : : "m" (s->r2));
+	asm volatile ("lwz 1,%0" : : "m" (s->sp));
+	asm volatile ("lwz 0,%0; mtcr 0" : : "m" (s->cr) : "r0");
+
+	/* tell gcc that we clobbered all the registers...
+	 * otherwise it might keep some addresses there. */
+	asm volatile ("" : : : "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31");
+	/*asm volatile ("lwz 0,%0; mtlr 0" : "=m" (s->lr));*/
+}
+/* Call pmu_suspend() when CONFIG_ADB_PMU is set; otherwise a no-op. */
+static inline void save_context(void)
+{
+#ifdef CONFIG_ADB_PMU
+	printk("pmu suspend\n");
+	pmu_suspend();
+#endif
+}
+
+extern void enable_kernel_altivec(void);
+/* Call set_context() for current's active_mm, then pmu_resume() and enable_kernel_altivec()/enable_kernel_fp() as configured. */
+static inline void restore_context(void)
+{
+	printk("set context: <%p>\n", current);
+	set_context(current->active_mm->context,
+			current->active_mm->pgd);
+
+#ifdef CONFIG_ADB_PMU
+	printk("pmu_resume\n");
+	pmu_resume();
+#endif
+
+#ifdef CONFIG_ALTIVEC
+	if (cur_cpu_spec->cpu_features & CPU_FTR_ALTIVEC) {
+		printk("enable altivec\n");
+		enable_kernel_altivec();
+	}
+#endif
+	printk("enable fp\n");
+	enable_kernel_fp();
+}
diff -urN oldtree/include/asm-ppc/suspend2.h newtree/include/asm-ppc/suspend2.h
--- oldtree/include/asm-ppc/suspend2.h 1970-01-01 00:00:00.000000000 +0000
+++ newtree/include/asm-ppc/suspend2.h 2006-02-18 15:24:31.351822728 +0000
@@ -0,0 +1,47 @@
+/*
+ * Written by Hu Gang (hugang@soulinfo.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */ + +#include "asm/cpu_context.h" + +typedef struct saved_context suspend2_saved_context_t; + +extern struct saved_context suspend2_saved_context; + +static inline void suspend2_arch_save_processor_context(void) +{ + __save_processor_state(&suspend2_saved_context); +} + +static inline void suspend2_arch_restore_processor_context(void) +{ + __restore_processor_state(&suspend2_saved_context); + + restore_context(); +} + +static inline void suspend2_arch_pre_copy(void) +{ +} + +static inline void suspend2_arch_post_copy(void) +{ +} + +static inline void suspend2_arch_pre_copyback(void) +{ + save_context(); +} + +static inline void suspend2_arch_post_copyback(void) +{ +} + +static inline void suspend2_arch_flush_caches(void) +{ +} diff -urN oldtree/include/asm-x86_64/page.h newtree/include/asm-x86_64/page.h --- oldtree/include/asm-x86_64/page.h 2006-02-18 15:18:29.866776808 +0000 +++ newtree/include/asm-x86_64/page.h 2006-02-18 15:24:31.352822576 +0000 @@ -105,6 +105,8 @@ #include +extern int page_is_ram(unsigned long pagenr); + #endif /* __ASSEMBLY__ */ #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) diff -urN oldtree/include/asm-x86_64/suspend.h newtree/include/asm-x86_64/suspend.h --- oldtree/include/asm-x86_64/suspend.h 2006-01-03 03:21:10.000000000 +0000 +++ newtree/include/asm-x86_64/suspend.h 2006-02-18 15:24:31.353822424 +0000 @@ -43,8 +43,6 @@ : /* no output */ \ :"r" ((thread)->debugreg##register)) -extern void fix_processor_context(void); - #ifdef CONFIG_ACPI_SLEEP extern unsigned long saved_eip; extern unsigned long saved_esp; diff -urN oldtree/include/asm-x86_64/suspend2.h newtree/include/asm-x86_64/suspend2.h --- oldtree/include/asm-x86_64/suspend2.h 1970-01-01 00:00:00.000000000 +0000 +++ newtree/include/asm-x86_64/suspend2.h 2006-02-18 15:24:31.354822272 +0000 @@ -0,0 +1,427 @@ + /* + * Copyright 2005 Nigel Cunningham + * Based on code + * Copyright 2001-2002 Pavel Machek + * Based on code + * Copyright 2001 Patrick Mochel + */ + +#include 
+#include +#include +#include +#include +#include +#include + +static pgd_t *temp_level4_pgt; +extern int suspend2_mapping_prepare(void); + +/* image of the saved processor states */ +struct suspend2_saved_context { + unsigned long eax, ebx, ecx, edx; + unsigned long esp, ebp, esi, edi; + unsigned long r8, r9, r10, r11; + unsigned long r12, r13, r14, r15; + + u16 ds, es, fs, gs, ss; + unsigned long gs_base, gs_kernel_base, fs_base; + unsigned long cr0, cr2, cr3, cr4, cr8; + u16 gdt_pad; + u16 gdt_limit; + unsigned long gdt_base; + u16 idt_pad; + u16 idt_limit; + unsigned long idt_base; + u16 ldt; + u16 tss; + unsigned long tr; + unsigned long safety; + unsigned long return_address; + unsigned long eflags; +} __attribute__((packed)); + +typedef struct suspend2_saved_context suspend2_saved_context_t; + +/* temporary storage */ +extern struct suspend2_saved_context suspend2_saved_context; + +static inline void suspend2_arch_flush_caches(void) +{ +#ifdef CONFIG_SMP + clear_bit(0, &read_pda(active_mm)->cpu_vm_mask); +#endif + wbinvd(); + __flush_tlb_all(); + +} + +/* + * save_processor_context + * + * Save the state of the processor before we go to sleep. + * + * return_stack is the value of the stack pointer (%esp) as the caller sees it. + * A good way could not be found to obtain it from here (don't want to make _too_ + * many assumptions about the layout of the stack this far down.) Also, the + * handy little __builtin_frame_pointer(level) where level > 0, is blatantly + * buggy - it returns the value of the stack at the proper location, not the + * location, like it should (as of gcc 2.91.66) + * + * Note that the context and timing of this function is pretty critical. + * With a minimal amount of things going on in the caller and in here, gcc + * does a good job of being just a dumb compiler. Watch the assembly output + * if anything changes, though, and make sure everything is going in the right + * place. 
+ */ +static inline void suspend2_arch_save_processor_context(void) +{ + kernel_fpu_begin(); + + /* + * descriptor tables + */ + asm volatile ("sgdt %0" : "=m" (suspend2_saved_context.gdt_limit)); + asm volatile ("sidt %0" : "=m" (suspend2_saved_context.idt_limit)); + asm volatile ("str %0" : "=m" (suspend2_saved_context.tr)); + + /* + * segment registers + */ + asm volatile ("movw %%ds, %0" : "=r" (suspend2_saved_context.ds)); + asm volatile ("movw %%es, %0" : "=r" (suspend2_saved_context.es)); + asm volatile ("movw %%fs, %0" : "=r" (suspend2_saved_context.fs)); + asm volatile ("movw %%gs, %0" : "=r" (suspend2_saved_context.gs)); + asm volatile ("movw %%ss, %0" : "=r" (suspend2_saved_context.ss)); + + rdmsrl(MSR_FS_BASE, suspend2_saved_context.fs_base); + rdmsrl(MSR_GS_BASE, suspend2_saved_context.gs_base); + rdmsrl(MSR_KERNEL_GS_BASE, suspend2_saved_context.gs_kernel_base); + + /* + * control registers + */ + asm volatile ("movq %%cr0, %0" : "=r" (suspend2_saved_context.cr0)); + asm volatile ("movq %%cr2, %0" : "=r" (suspend2_saved_context.cr2)); + asm volatile ("movq %%cr3, %0" : "=r" (suspend2_saved_context.cr3)); + asm volatile ("movq %%cr4, %0" : "=r" (suspend2_saved_context.cr4)); + asm volatile ("movq %%cr8, %0" : "=r" (suspend2_saved_context.cr8)); + + /* + * save the general registers. + * note that gcc has constructs to specify output of certain registers, + * but they're not used here, because it assumes that you want to modify + * those registers, so it tries to be smart and save them beforehand. + * It's really not necessary, and kinda fishy (check the assembly output), + * so it's avoided. 
+ */ + + asm volatile ("movq %%rsp, %0" : "=m" (suspend2_saved_context.esp)); + + asm volatile ("movq %%rax, %0" : "=m" (suspend2_saved_context.eax)); + asm volatile ("movq %%rbx, %0" : "=m" (suspend2_saved_context.ebx)); + asm volatile ("movq %%rcx, %0" : "=m" (suspend2_saved_context.ecx)); + asm volatile ("movq %%rdx, %0" : "=m" (suspend2_saved_context.edx)); + asm volatile ("movq %%rbp, %0" : "=m" (suspend2_saved_context.ebp)); + asm volatile ("movq %%rsi, %0" : "=m" (suspend2_saved_context.esi)); + asm volatile ("movq %%rdi, %0" : "=m" (suspend2_saved_context.edi)); + asm volatile ("movq %%r8, %0" : "=m" (suspend2_saved_context.r8)); + asm volatile ("movq %%r9, %0" : "=m" (suspend2_saved_context.r9)); + asm volatile ("movq %%r10, %0" : "=m" (suspend2_saved_context.r10)); + asm volatile ("movq %%r11, %0" : "=m" (suspend2_saved_context.r11)); + asm volatile ("movq %%r12, %0" : "=m" (suspend2_saved_context.r12)); + asm volatile ("movq %%r13, %0" : "=m" (suspend2_saved_context.r13)); + asm volatile ("movq %%r14, %0" : "=m" (suspend2_saved_context.r14)); + asm volatile ("movq %%r15, %0" : "=m" (suspend2_saved_context.r15)); + + /* + * eflags + */ + asm volatile ("pushfq ; popq %0" : "=m" (suspend2_saved_context.eflags)); + +} + +static void fix_processor_context(void) +{ + struct tss_struct * t = &per_cpu(init_tss,0); + + set_tss_desc(0,t); /* This just modifies memory; should not be neccessary. But... This is neccessary, because 386 hardware has concept of busy tsc or some similar stupidity. 
*/
+	cpu_gdt(0)[GDT_ENTRY_TSS].type = 9;
+
+	syscall_init();				/* This sets MSR_*STAR and related */
+	load_TR_desc();
+	load_LDT(&current->active_mm->context);	/* This does lldt */
+
+	/*
+	 * Now maybe reload the debug registers
+	 */
+	if (current->thread.debugreg7){
+		loaddebug(&current->thread, 0);
+		loaddebug(&current->thread, 1);
+		loaddebug(&current->thread, 2);
+		loaddebug(&current->thread, 3);
+		/* no 4 and 5 */
+		loaddebug(&current->thread, 6);
+		loaddebug(&current->thread, 7);
+	}
+}
+/* Calls kernel_fpu_end() and mxcsr_feature_mask_init(); deliberately not declared inline (see below). */
+static void do_fpu_end(void)
+{
+	/* restore FPU regs if necessary */
+	/* Do it out of line so that gcc does not move cr0 load to some stupid place */
+	kernel_fpu_end();
+	mxcsr_feature_mask_init();
+}
+
+/*
+ * restore_processor_context
+ *
+ * Restore the processor context as it was before we went to sleep
+ * - descriptor tables
+ * - control registers
+ * - segment registers
+ * - flags
+ *
+ * Note that it is critical that this function is declared inline.
+ * It was separated out from restore_state to make that function
+ * a little clearer, but it needs to be inlined because we won't have a
+ * stack when we get here (so we can't push a return address).
+ */
+static inline void restore_processor_context(void)
+{
+	/*
+	 * Credit for this goes to the swsusp code. Restoring the
+	 * CPU context is the one thing we still do in the same
+	 * way, and swsusp did it right first.
+	 *
+	 * 0xffffffff80000000UL is __START_KERNEL_map.
+ */ + + __asm__ __volatile__( + "leaq init_level4_pgt(%rip), %rax; \n" + "subq $0xffffffff80000000, %rax; \n" + "movq %rax, %cr3; \n" + "movq mmu_cr4_features(%rip), %rax; \n" + "movq %rax, %rdx; \n" + "andq $~(1<<7), %rdx; # PGE \n" + "movq %rdx, %cr4; # turn off PGE \n" + "movq %cr3, %rcx; # flush TLB \n" + "movq %rcx, %cr3; \n" + "movq %rax, %cr4; # turn PGE back on; \n" + + "movl $24, %eax; \n" + "movl %eax, %ds \n"); + /* + * the other general registers + * + * note that even though gcc has constructs to specify memory + * input into certain registers, it will try to be too smart + * and save them at the beginning of the function. This is esp. + * bad since we don't have a stack set up when we enter, and we + * want to preserve the values on exit. So, we set them manually. + */ + asm volatile ("movq %0, %%rsp" :: "m" (suspend2_saved_context.esp)); + asm volatile ("movq %0, %%rbp" :: "m" (suspend2_saved_context.ebp)); + asm volatile ("movq %0, %%rbx" :: "m" (suspend2_saved_context.ebx)); + asm volatile ("movq %0, %%rcx" :: "m" (suspend2_saved_context.ecx)); + asm volatile ("movq %0, %%rdx" :: "m" (suspend2_saved_context.edx)); + asm volatile ("movq %0, %%rsi" :: "m" (suspend2_saved_context.esi)); + asm volatile ("movq %0, %%rdi" :: "m" (suspend2_saved_context.edi)); + asm volatile ("movq %0, %%r8" :: "m" (suspend2_saved_context.r8)); + asm volatile ("movq %0, %%r9" :: "m" (suspend2_saved_context.r9)); + asm volatile ("movq %0, %%r10" :: "m" (suspend2_saved_context.r10)); + asm volatile ("movq %0, %%r11" :: "m" (suspend2_saved_context.r11)); + asm volatile ("movq %0, %%r12" :: "m" (suspend2_saved_context.r12)); + asm volatile ("movq %0, %%r13" :: "m" (suspend2_saved_context.r13)); + asm volatile ("movq %0, %%r14" :: "m" (suspend2_saved_context.r14)); + asm volatile ("movq %0, %%r15" :: "m" (suspend2_saved_context.r15)); + + /* + * the flags + */ + asm volatile ("pushq %0 ; popfq" :: "m" (suspend2_saved_context.eflags)); + + asm volatile ("xorq %rax, %rax"); + + 
/* + * control registers + */ + asm volatile ("movq %0, %%cr8" :: "r" (suspend2_saved_context.cr8)); + asm volatile ("movq %0, %%cr4" :: "r" (suspend2_saved_context.cr4)); + asm volatile ("movq %0, %%cr3" :: "r" (suspend2_saved_context.cr3)); + asm volatile ("movq %0, %%cr2" :: "r" (suspend2_saved_context.cr2)); + asm volatile ("movq %0, %%cr0" :: "r" (suspend2_saved_context.cr0)); + + /* + * now restore the descriptor tables to their proper values + * ltr is done in fix_processor_context(). + */ + + asm volatile ("lgdt %0" :: "m" (suspend2_saved_context.gdt_limit)); + asm volatile ("lidt %0" :: "m" (suspend2_saved_context.idt_limit)); + + /* + * segment registers + */ + asm volatile ("movw %0, %%ds" :: "r" (suspend2_saved_context.ds)); + asm volatile ("movw %0, %%es" :: "r" (suspend2_saved_context.es)); + asm volatile ("movw %0, %%fs" :: "r" (suspend2_saved_context.fs)); + load_gs_index(suspend2_saved_context.gs); + asm volatile ("movw %0, %%ss" :: "r" (suspend2_saved_context.ss)); + + wrmsrl(MSR_FS_BASE, suspend2_saved_context.fs_base); + wrmsrl(MSR_GS_BASE, suspend2_saved_context.gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, suspend2_saved_context.gs_kernel_base); + + /* tell gcc that we clobbered all the registers... + * otherwise it might keep some addresses there. 
*/ + asm volatile ("" : : : "rsp", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"); + + fix_processor_context(); + + do_fpu_end(); + + suspend2_arch_flush_caches(); + + mtrr_ap_init(); + mcheck_init(&boot_cpu_data); +} + +#if defined(CONFIG_SUSPEND2) || defined(CONFIG_SMP) +extern unsigned char * my_saved_context __nosavedata; +static unsigned long c_loops_per_jiffy_ref[NR_CPUS] __nosavedata; +#endif + +#ifdef CONFIG_SUSPEND2 +#ifndef CONFIG_SMP +extern unsigned long loops_per_jiffy; +volatile static unsigned long cpu_khz_ref __nosavedata = 0; +#endif + +/* + * APIC support: These routines save the APIC + * configuration for the CPU on which they are + * being executed + */ +extern void suspend_apic_save_state(void); +extern void suspend_apic_reload_state(void); + +static inline void suspend2_arch_pre_copy(void) +{ +} + +static inline void suspend2_arch_post_copy(void) +{ +} + +/* Based on the version from swsusp */ +static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) +{ + long i, j; + + i = pud_index(address); + pud = pud + i; + for (; i < PTRS_PER_PUD; pud++, i++) { + unsigned long paddr; + pmd_t *pmd; + + paddr = address + i*PUD_SIZE; + if (paddr >= end) + break; + + pmd = (pmd_t *)suspend_get_nonconflicting_pages(0); + if (!pmd) + return -ENOMEM; + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { + unsigned long pe; + + if (paddr >= end) + break; + pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | paddr; + pe &= __supported_pte_mask; + set_pmd(pmd, __pmd(pe)); + } + } + return 0; +} + +static int set_up_temporary_mappings_suspend2(void) +{ + unsigned long start, end, next; + int error; + + temp_level4_pgt = (pgd_t *)suspend_get_nonconflicting_pages(0); + if (!temp_level4_pgt) + return -ENOMEM; + + /* It is safe to reuse the original kernel mapping */ + set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), + 
init_level4_pgt[pgd_index(__START_KERNEL_map)]);
+
+	/* Set up the direct mapping from scratch */
+	start = (unsigned long)pfn_to_kaddr(0);
+	end = (unsigned long)pfn_to_kaddr(end_pfn);
+
+	for (; start < end; start = next) {
+		pud_t *pud = (pud_t *)suspend_get_nonconflicting_pages(0);
+		if (!pud)
+			return -ENOMEM;
+		next = start + PGDIR_SIZE;
+		if (next > end)
+			next = end;
+		if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
+			return error;
+		set_pgd(temp_level4_pgt + pgd_index(start),
+			mk_kernel_pgd(__pa(pud)));
+	}
+	return 0;
+}
+/* Build and switch to temporary page tables, then stash loops_per_jiffy (and cpu_khz on !SMP) in __nosavedata storage. */
+static inline void suspend2_arch_pre_copyback(void)
+{
+	/* We want to run from temp_level4_pgt: page tables built just below
+	 * from pages returned by suspend_get_nonconflicting_pages().
+	 */
+
+	if (set_up_temporary_mappings_suspend2())
+		BUG();	/* setup failed: cr3 must not be loaded from missing or partial tables */
+	asm volatile ("movq $0xffff810000000000, %rdx");
+	asm volatile ("movq temp_level4_pgt(%rip), %rax");
+	asm volatile ("subq %rdx, %rax");
+	asm volatile ("movq %rax, %cr3");
+
+	wbinvd();
+	__flush_tlb_all();
+
+	c_loops_per_jiffy_ref[0] =
+		current_cpu_data.loops_per_jiffy;
+#ifndef CONFIG_SMP
+	cpu_khz_ref = cpu_khz;
+	c_loops_per_jiffy_ref[0] = loops_per_jiffy;
+#endif
+
+}
+
+static inline void suspend2_arch_restore_processor_context(void)
+{
+	restore_processor_context();
+}
+
+static inline void suspend2_arch_post_copyback(void)
+{
+	/* Get other CPUs to restore their contexts and flush their tlbs.
*/ + clear_suspend_state(SUSPEND_FREEZE_SMP); + + BUG_ON(!irqs_disabled()); + + current_cpu_data.loops_per_jiffy = + c_loops_per_jiffy_ref[0]; +#ifndef CONFIG_SMP + loops_per_jiffy = c_loops_per_jiffy_ref[0]; + cpu_khz = cpu_khz_ref; +#endif +} + +#endif diff -urN oldtree/include/linux/bio.h newtree/include/linux/bio.h --- oldtree/include/linux/bio.h 2006-02-18 15:18:29.878774984 +0000 +++ newtree/include/linux/bio.h 2006-02-18 15:24:31.355822120 +0000 @@ -124,6 +124,7 @@ #define BIO_BOUNCED 5 /* bio is a bounce bio */ #define BIO_USER_MAPPED 6 /* contains user pages */ #define BIO_EOPNOTSUPP 7 /* not supported */ +#define BIO_SUSPEND2 8 /* Suspend2 bio - for corruption checking */ #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) /* diff -urN oldtree/include/linux/dyn_pageflags.h newtree/include/linux/dyn_pageflags.h --- oldtree/include/linux/dyn_pageflags.h 1970-01-01 00:00:00.000000000 +0000 +++ newtree/include/linux/dyn_pageflags.h 2006-02-18 15:24:31.356821968 +0000 @@ -0,0 +1,66 @@ +/* + * include/linux/dyn_pageflags.h + * + * Copyright (C) 2004-2006 Nigel Cunningham + * + * This file is released under the GPLv2. + * + * It implements support for dynamically allocated bitmaps that are + * used for temporary or infrequently used pageflags, in lieu of + * bits in the struct page flags entry. + */ + +#ifndef DYN_PAGEFLAGS_H +#define DYN_PAGEFLAGS_H + +#include + +typedef unsigned long *** dyn_pageflags_t; + +#define BITNUMBER(page) (page_to_pfn(page)) + +#if BITS_PER_LONG == 32 +#define UL_SHIFT 5 +#else +#if BITS_PER_LONG == 64 +#define UL_SHIFT 6 +#else +#error Bits per long not 32 or 64? 
+#endif
+#endif
+/* A bit offset splits into: bitmap page (PAGENUMBER), word in that page (PAGEINDEX), bit in that word (PAGEBIT). */
+#define BIT_NUM_MASK (sizeof(unsigned long) * 8 - 1)
+#define PAGE_NUM_MASK (~((1UL << (PAGE_SHIFT + 3)) - 1))
+#define UL_NUM_MASK (~(BIT_NUM_MASK | PAGE_NUM_MASK))
+
+#define BITS_PER_PAGE (PAGE_SIZE << 3)
+#define PAGENUMBER(zone_offset) ((zone_offset) >> (PAGE_SHIFT + 3))
+#define PAGEINDEX(zone_offset) (((zone_offset) & UL_NUM_MASK) >> UL_SHIFT)
+#define PAGEBIT(zone_offset) ((zone_offset) & BIT_NUM_MASK)
+/* Address of the unsigned long holding zone_pfn's bit; arguments are parenthesized so expressions may be passed. */
+#define PAGE_UL_PTR(bitmap, zone_num, zone_pfn) \
+	(((bitmap)[zone_num][PAGENUMBER(zone_pfn)]) + PAGEINDEX(zone_pfn))
+
+/* With the above macros defined, you can do...
+
+#define PagePageset1(page) (test_dynpageflag(&pageset1_map, page))
+#define SetPagePageset1(page) (set_dynpageflag(&pageset1_map, page))
+#define ClearPagePageset1(page) (clear_dynpageflag(&pageset1_map, page))
+*/
+/* Loop header: counter takes each value returned by get_next_bit_on(), starting from -1, while it is below max_pfn. */
+#define BITMAP_FOR_EACH_SET(bitmap, counter) \
+	for ((counter) = get_next_bit_on((bitmap), -1); (counter) < max_pfn; \
+		(counter) = get_next_bit_on((bitmap), (counter)))
+
+extern void clear_dyn_pageflags(dyn_pageflags_t pagemap);
+extern int allocate_dyn_pageflags(dyn_pageflags_t *pagemap);
+extern void free_dyn_pageflags(dyn_pageflags_t *pagemap);
+extern int dyn_pageflags_pages_per_bitmap(void);
+extern int get_next_bit_on(dyn_pageflags_t bitmap, int counter);
+extern unsigned long *dyn_pageflags_ul_ptr(dyn_pageflags_t *bitmap,
+		struct page *pg);
+
+extern int test_dynpageflag(dyn_pageflags_t *bitmap, struct page *page);
+extern void set_dynpageflag(dyn_pageflags_t *bitmap, struct page *page);
+extern void clear_dynpageflag(dyn_pageflags_t *bitmap, struct page *page);
+#endif
diff -urN oldtree/include/linux/freezer.h newtree/include/linux/freezer.h
--- oldtree/include/linux/freezer.h 1970-01-01 00:00:00.000000000 +0000
+++ newtree/include/linux/freezer.h 2006-02-18 15:24:31.357821816 +0000
@@ -0,0 +1,28 @@
+/* Freezer declarations */
+
+#define FREEZER_ON 0
+#define ABORT_FREEZING 1
+
+#define FREEZER_KERNEL_THREADS 0
+#define FREEZER_ALL_THREADS 1
+
+#ifdef CONFIG_PM
+extern unsigned long freezer_state; + +#define test_freezer_state(bit) test_bit(bit, &freezer_state) +#define set_freezer_state(bit) set_bit(bit, &freezer_state) +#define clear_freezer_state(bit) clear_bit(bit, &freezer_state) + +#define freezer_is_on() (test_freezer_state(FREEZER_ON)) + +extern void do_freeze_process(struct notifier_block *nl); + +#else + +#define test_freezer_state(bit) (0) +#define set_freezer_state(bit) do { } while(0) +#define clear_freezer_state(bit) do { } while(0) + +#define freezer_is_on() (0) + +#endif diff -urN oldtree/include/linux/kernel.h newtree/include/linux/kernel.h --- oldtree/include/linux/kernel.h 2006-02-18 15:18:29.909770272 +0000 +++ newtree/include/linux/kernel.h 2006-02-18 15:24:31.358821664 +0000 @@ -105,6 +105,8 @@ __attribute__ ((format (printf, 2, 0))); extern int snprintf(char * buf, size_t size, const char * fmt, ...) __attribute__ ((format (printf, 3, 4))); +extern int snprintf_used(char *buffer, int buffer_size, + const char *fmt, ...); extern int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) __attribute__ ((format (printf, 3, 0))); extern int scnprintf(char * buf, size_t size, const char * fmt, ...) diff -urN oldtree/include/linux/kthread.h newtree/include/linux/kthread.h --- oldtree/include/linux/kthread.h 2006-01-03 03:21:10.000000000 +0000 +++ newtree/include/linux/kthread.h 2006-02-18 15:24:31.359821512 +0000 @@ -23,10 +23,20 @@ * * Returns a task_struct or ERR_PTR(-ENOMEM). */ +struct task_struct *__kthread_create(int (*threadfn)(void *data), + void *data, + unsigned long freezer_flags, + const char namefmt[], + va_list * args); + struct task_struct *kthread_create(int (*threadfn)(void *data), void *data, const char namefmt[], ...); +struct task_struct *kthread_nofreeze_create(int (*threadfn)(void *data), + void *data, + const char namefmt[], ...); + /** * kthread_run: create and wake a thread. * @threadfn: the function to run until signal_pending(current). 
@@ -35,14 +45,15 @@ * * Description: Convenient wrapper for kthread_create() followed by * wake_up_process(). Returns the kthread, or ERR_PTR(-ENOMEM). */ -#define kthread_run(threadfn, data, namefmt, ...) \ -({ \ - struct task_struct *__k \ - = kthread_create(threadfn, data, namefmt, ## __VA_ARGS__); \ - if (!IS_ERR(__k)) \ - wake_up_process(__k); \ - __k; \ -}) + +extern struct task_struct * kthread_run(int (*threadfn)(void *data), + void *data, + const char namefmt[], ...); + +extern struct task_struct * kthread_nofreeze_run(int (*threadfn)(void *data), + void *data, + const char namefmt[], ...); + /** * kthread_bind: bind a just-created kthread to a cpu. diff -urN oldtree/include/linux/netlink.h newtree/include/linux/netlink.h --- oldtree/include/linux/netlink.h 2006-02-18 15:18:29.937766016 +0000 +++ newtree/include/linux/netlink.h 2006-02-18 15:24:31.360821360 +0000 @@ -21,6 +21,8 @@ #define NETLINK_DNRTMSG 14 /* DECnet routing messages */ #define NETLINK_KOBJECT_UEVENT 15 /* Kernel messages to userspace */ #define NETLINK_GENERIC 16 +#define NETLINK_SUSPEND2_USERUI 17 /* For suspend2's userui */ +#define NETLINK_SUSPEND2_USM 18 /* For suspend2's usm (userspace storage manager) */ #define MAX_LINKS 32 diff -urN oldtree/include/linux/sched.h newtree/include/linux/sched.h --- oldtree/include/linux/sched.h 2006-02-18 15:18:29.959762672 +0000 +++ newtree/include/linux/sched.h 2006-02-18 15:24:31.363820904 +0000 @@ -1436,7 +1436,7 @@ extern void refrigerator(void); extern int freeze_processes(void); -extern void thaw_processes(void); +extern void thaw_processes(int which_threads); static inline int try_to_freeze(void) { @@ -1455,7 +1455,7 @@ static inline void refrigerator(void) {} static inline int freeze_processes(void) { BUG(); return 0; } -static inline void thaw_processes(void) {} +static inline void thaw_processes(int which_threads) {} static inline int try_to_freeze(void) { return 0; } diff -urN oldtree/include/linux/sched.h.orig newtree/include/linux/sched.h.orig --- 
oldtree/include/linux/sched.h.orig 1970-01-01 00:00:00.000000000 +0000 +++ newtree/include/linux/sched.h.orig 2006-02-18 15:18:29.000000000 +0000 @@ -0,0 +1,1465 @@ +#ifndef _LINUX_SCHED_H +#define _LINUX_SCHED_H + +#include /* for HZ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* For AT_VECTOR_SIZE */ + +struct exec_domain; + +/* + * cloning flags: + */ +#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ +#define CLONE_VM 0x00000100 /* set if VM shared between processes */ +#define CLONE_FS 0x00000200 /* set if fs info shared between processes */ +#define CLONE_FILES 0x00000400 /* set if open files shared between processes */ +#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ +#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ +#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ +#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ +#define CLONE_THREAD 0x00010000 /* Same thread group? */ +#define CLONE_NEWNS 0x00020000 /* New namespace group? 
*/ +#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ +#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ +#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ +#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ +#define CLONE_DETACHED 0x00400000 /* Unused, ignored */ +#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ +#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ +#define CLONE_STOPPED 0x02000000 /* Start in stopped state */ + +/* + * List of flags we want to share for kernel threads, + * if only because they are not used by them anyway. + */ +#define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) + +/* + * These are the constant used to fake the fixed-point load-average + * counting. Some notes: + * - 11 bit fractions expand to 22 bits by the multiplies: this gives + * a load-average precision of 10 bits integer + 11 bits fractional + * - if you want to count load-averages more often, you need more + * precision, or rounding will get you. With 2-second counting freq, + * the EXP_n values would be 1981, 2034 and 2043 if still using only + * 11 bit fractions. + */ +extern unsigned long avenrun[]; /* Load averages */ + +#define FSHIFT 11 /* nr of bits of precision */ +#define FIXED_1 (1<>= FSHIFT; + +extern unsigned long total_forks; +extern int nr_threads; +extern int last_pid; +DECLARE_PER_CPU(unsigned long, process_counts); +extern int nr_processes(void); +extern unsigned long nr_running(void); +extern unsigned long nr_uninterruptible(void); +extern unsigned long nr_iowait(void); + +#include +#include +#include +#include +#include + +#include + +/* + * Task state bitmask. NOTE! These bits are also + * encoded in fs/proc/array.c: get_task_state(). + * + * We have two separate sets of flags: task->state + * is about runnability, while task->exit_state are + * about the task exiting. 
Confusing, but this way + * modifying one set can't modify the other one by + * mistake. + */ +#define TASK_RUNNING 0 +#define TASK_INTERRUPTIBLE 1 +#define TASK_UNINTERRUPTIBLE 2 +#define TASK_STOPPED 4 +#define TASK_TRACED 8 +/* in tsk->exit_state */ +#define EXIT_ZOMBIE 16 +#define EXIT_DEAD 32 +/* in tsk->state again */ +#define TASK_NONINTERACTIVE 64 + +#define __set_task_state(tsk, state_value) \ + do { (tsk)->state = (state_value); } while (0) +#define set_task_state(tsk, state_value) \ + set_mb((tsk)->state, (state_value)) + +/* + * set_current_state() includes a barrier so that the write of current->state + * is correctly serialised wrt the caller's subsequent test of whether to + * actually sleep: + * + * set_current_state(TASK_UNINTERRUPTIBLE); + * if (do_i_need_to_sleep()) + * schedule(); + * + * If the caller does not need such serialisation then use __set_current_state() + */ +#define __set_current_state(state_value) \ + do { current->state = (state_value); } while (0) +#define set_current_state(state_value) \ + set_mb(current->state, (state_value)) + +/* Task command name length */ +#define TASK_COMM_LEN 16 + +/* + * Scheduling policies + */ +#define SCHED_NORMAL 0 +#define SCHED_FIFO 1 +#define SCHED_RR 2 +#define SCHED_BATCH 3 + +struct sched_param { + int sched_priority; +}; + +#ifdef __KERNEL__ + +#include + +/* + * This serializes "schedule()" and also protects + * the run-queue from deletions/modifications (but + * _adding_ to the beginning of the run-queue has + * a separate lock). 
+ */ +extern rwlock_t tasklist_lock; +extern spinlock_t mmlist_lock; + +typedef struct task_struct task_t; + +extern void sched_init(void); +extern void sched_init_smp(void); +extern void init_idle(task_t *idle, int cpu); + +extern cpumask_t nohz_cpu_mask; + +extern void show_state(void); +extern void show_regs(struct pt_regs *); + +/* + * TASK is a pointer to the task whose backtrace we want to see (or NULL for current + * task), SP is the stack pointer of the first frame that should be shown in the back + * trace (or NULL if the entire call-chain of the task should be shown). + */ +extern void show_stack(struct task_struct *task, unsigned long *sp); + +void io_schedule(void); +long io_schedule_timeout(long timeout); + +extern void cpu_init (void); +extern void trap_init(void); +extern void update_process_times(int user); +extern void scheduler_tick(void); + +#ifdef CONFIG_DETECT_SOFTLOCKUP +extern void softlockup_tick(struct pt_regs *regs); +extern void spawn_softlockup_task(void); +extern void touch_softlockup_watchdog(void); +#else +static inline void softlockup_tick(struct pt_regs *regs) +{ +} +static inline void spawn_softlockup_task(void) +{ +} +static inline void touch_softlockup_watchdog(void) +{ +} +#endif + + +/* Attach to any functions which should be ignored in wchan output. */ +#define __sched __attribute__((__section__(".sched.text"))) +/* Is this address in the __sched functions? */ +extern int in_sched_functions(unsigned long addr); + +#define MAX_SCHEDULE_TIMEOUT LONG_MAX +extern signed long FASTCALL(schedule_timeout(signed long timeout)); +extern signed long schedule_timeout_interruptible(signed long timeout); +extern signed long schedule_timeout_uninterruptible(signed long timeout); +asmlinkage void schedule(void); + +struct namespace; + +/* Maximum number of active map areas.. 
This is a random (large) number */ +#define DEFAULT_MAX_MAP_COUNT 65536 + +extern int sysctl_max_map_count; + +#include + +extern unsigned long +arch_get_unmapped_area(struct file *, unsigned long, unsigned long, + unsigned long, unsigned long); +extern unsigned long +arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); +extern void arch_unmap_area(struct mm_struct *, unsigned long); +extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); + +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +/* + * The mm counters are not protected by its page_table_lock, + * so must be incremented atomically. + */ +#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value) +#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member)) +#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member) +#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member) +#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member) +typedef atomic_long_t mm_counter_t; + +#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ +/* + * The mm counters are protected by its page_table_lock, + * so can be incremented directly. 
+ */ +#define set_mm_counter(mm, member, value) (mm)->_##member = (value) +#define get_mm_counter(mm, member) ((mm)->_##member) +#define add_mm_counter(mm, member, value) (mm)->_##member += (value) +#define inc_mm_counter(mm, member) (mm)->_##member++ +#define dec_mm_counter(mm, member) (mm)->_##member-- +typedef unsigned long mm_counter_t; + +#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ + +#define get_mm_rss(mm) \ + (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) +#define update_hiwater_rss(mm) do { \ + unsigned long _rss = get_mm_rss(mm); \ + if ((mm)->hiwater_rss < _rss) \ + (mm)->hiwater_rss = _rss; \ +} while (0) +#define update_hiwater_vm(mm) do { \ + if ((mm)->hiwater_vm < (mm)->total_vm) \ + (mm)->hiwater_vm = (mm)->total_vm; \ +} while (0) + +struct mm_struct { + struct vm_area_struct * mmap; /* list of VMAs */ + struct rb_root mm_rb; + struct vm_area_struct * mmap_cache; /* last find_vma result */ + unsigned long (*get_unmapped_area) (struct file *filp, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags); + void (*unmap_area) (struct mm_struct *mm, unsigned long addr); + unsigned long mmap_base; /* base of mmap area */ + unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ + unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */ + pgd_t * pgd; + atomic_t mm_users; /* How many users with user space? */ + atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ + int map_count; /* number of VMAs */ + struct rw_semaphore mmap_sem; + spinlock_t page_table_lock; /* Protects page tables and some counters */ + + struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung + * together off init_mm.mmlist, and are protected + * by mmlist_lock + */ + + /* Special counters, in some configurations protected by the + * page_table_lock, in other configurations by being atomic. 
+ */ + mm_counter_t _file_rss; + mm_counter_t _anon_rss; + + unsigned long hiwater_rss; /* High-watermark of RSS usage */ + unsigned long hiwater_vm; /* High-water virtual memory usage */ + + unsigned long total_vm, locked_vm, shared_vm, exec_vm; + unsigned long stack_vm, reserved_vm, def_flags, nr_ptes; + unsigned long start_code, end_code, start_data, end_data; + unsigned long start_brk, brk, start_stack; + unsigned long arg_start, arg_end, env_start, env_end; + + unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ + + unsigned dumpable:2; + cpumask_t cpu_vm_mask; + + /* Architecture-specific MM context */ + mm_context_t context; + + /* Token based thrashing protection. */ + unsigned long swap_token_time; + char recent_pagein; + + /* coredumping support */ + int core_waiters; + struct completion *core_startup_done, core_done; + + /* aio bits */ + rwlock_t ioctx_list_lock; + struct kioctx *ioctx_list; +}; + +struct sighand_struct { + atomic_t count; + struct k_sigaction action[_NSIG]; + spinlock_t siglock; + struct rcu_head rcu; +}; + +extern void sighand_free_cb(struct rcu_head *rhp); + +static inline void sighand_free(struct sighand_struct *sp) +{ + call_rcu(&sp->rcu, sighand_free_cb); +} + +/* + * NOTE! "signal_struct" does not have it's own + * locking, because a shared signal_struct always + * implies a shared sighand_struct, so locking + * sighand_struct is always a proper superset of + * the locking of signal_struct. 
+ */ +struct signal_struct { + atomic_t count; + atomic_t live; + + wait_queue_head_t wait_chldexit; /* for wait4() */ + + /* current thread group signal load-balancing target: */ + task_t *curr_target; + + /* shared signal handling: */ + struct sigpending shared_pending; + + /* thread group exit support */ + int group_exit_code; + /* overloaded: + * - notify group_exit_task when ->count is equal to notify_count + * - everyone except group_exit_task is stopped during signal delivery + * of fatal signals, group_exit_task processes the signal. + */ + struct task_struct *group_exit_task; + int notify_count; + + /* thread group stop support, overloads group_exit_code too */ + int group_stop_count; + unsigned int flags; /* see SIGNAL_* flags below */ + + /* POSIX.1b Interval Timers */ + struct list_head posix_timers; + + /* ITIMER_REAL timer for the process */ + struct hrtimer real_timer; + ktime_t it_real_incr; + + /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */ + cputime_t it_prof_expires, it_virt_expires; + cputime_t it_prof_incr, it_virt_incr; + + /* job control IDs */ + pid_t pgrp; + pid_t tty_old_pgrp; + pid_t session; + /* boolean value for session group leader */ + int leader; + + struct tty_struct *tty; /* NULL if no tty */ + + /* + * Cumulative resource counters for dead threads in the group, + * and for reaped dead child processes forked by this group. + * Live threads maintain their own counters and add to these + * in __exit_signal, except for the group leader. + */ + cputime_t utime, stime, cutime, cstime; + unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; + unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; + + /* + * Cumulative ns of scheduled CPU time for dead threads in the + * group, not including a zombie group leader. (This only differs + * from jiffies_to_ns(utime + stime) if sched_clock uses something + * other than jiffies.) 
+ */ + unsigned long long sched_time; + + /* + * We don't bother to synchronize most readers of this at all, + * because there is no reader checking a limit that actually needs + * to get both rlim_cur and rlim_max atomically, and either one + * alone is a single word that can safely be read normally. + * getrlimit/setrlimit use task_lock(current->group_leader) to + * protect this instead of the siglock, because they really + * have no need to disable irqs. + */ + struct rlimit rlim[RLIM_NLIMITS]; + + struct list_head cpu_timers[3]; + + /* keep the process-shared keyrings here so that they do the right + * thing in threads created with CLONE_THREAD */ +#ifdef CONFIG_KEYS + struct key *session_keyring; /* keyring inherited over fork */ + struct key *process_keyring; /* keyring private to this process */ +#endif +}; + +/* Context switch must be unlocked if interrupts are to be enabled */ +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +# define __ARCH_WANT_UNLOCKED_CTXSW +#endif + +/* + * Bits in flags field of signal_struct. + */ +#define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ +#define SIGNAL_STOP_DEQUEUED 0x00000002 /* stop signal dequeued */ +#define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ +#define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ + + +/* + * Priority of a process goes from 0..MAX_PRIO-1, valid RT + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH + * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority + * values are inverted: lower p->prio value means higher priority. + * + * The MAX_USER_RT_PRIO value allows the actual maximum + * RT priority to be separate from the value exported to + * user-space. This allows kernel threads to set their + * priority to a value higher than any user task. Note: + * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. 
+ */ + +#define MAX_USER_RT_PRIO 100 +#define MAX_RT_PRIO MAX_USER_RT_PRIO + +#define MAX_PRIO (MAX_RT_PRIO + 40) + +#define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO)) + +/* + * Some day this will be a full-fledged user tracking system.. + */ +struct user_struct { + atomic_t __count; /* reference count */ + atomic_t processes; /* How many processes does this user have? */ + atomic_t files; /* How many open files does this user have? */ + atomic_t sigpending; /* How many pending signals does this user have? */ +#ifdef CONFIG_INOTIFY + atomic_t inotify_watches; /* How many inotify watches does this user have? */ + atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ +#endif + /* protected by mq_lock */ + unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ + unsigned long locked_shm; /* How many pages of mlocked shm ? */ + +#ifdef CONFIG_KEYS + struct key *uid_keyring; /* UID specific keyring */ + struct key *session_keyring; /* UID's default session keyring */ +#endif + + /* Hash table maintenance information */ + struct list_head uidhash_list; + uid_t uid; +}; + +extern struct user_struct *find_user(uid_t); + +extern struct user_struct root_user; +#define INIT_USER (&root_user) + +typedef struct prio_array prio_array_t; +struct backing_dev_info; +struct reclaim_state; + +#ifdef CONFIG_SCHEDSTATS +struct sched_info { + /* cumulative counters */ + unsigned long cpu_time, /* time spent on the cpu */ + run_delay, /* time spent waiting on a runqueue */ + pcnt; /* # of timeslices run on this cpu */ + + /* timestamps */ + unsigned long last_arrival, /* when we last ran on a cpu */ + last_queued; /* when we were last queued to run */ +}; + +extern struct file_operations proc_schedstat_operations; +#endif + +enum idle_type +{ + SCHED_IDLE, + NOT_IDLE, + NEWLY_IDLE, + MAX_IDLE_TYPES +}; + +/* + * sched-domains (multiprocessor balancing) declarations: + */ +#ifdef CONFIG_SMP +#define SCHED_LOAD_SCALE 128UL /* increase 
resolution of load */ + +#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ +#define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ +#define SD_BALANCE_EXEC 4 /* Balance on exec */ +#define SD_BALANCE_FORK 8 /* Balance on fork, clone */ +#define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */ +#define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ +#define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ +#define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + cpumask_t cpumask; + + /* + * CPU power of this group, SCHED_LOAD_SCALE being max power for a + * single CPU. This is read only (except for setup, hotplug CPU). + */ + unsigned long cpu_power; +}; + +struct sched_domain { + /* These fields must be setup */ + struct sched_domain *parent; /* top domain must be null terminated */ + struct sched_group *groups; /* the balancing groups of the domain */ + cpumask_t span; /* span of all CPUs in this domain */ + unsigned long min_interval; /* Minimum balance interval ms */ + unsigned long max_interval; /* Maximum balance interval ms */ + unsigned int busy_factor; /* less balancing by factor if busy */ + unsigned int imbalance_pct; /* No balance until over watermark */ + unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ + unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ + unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ + unsigned int busy_idx; + unsigned int idle_idx; + unsigned int newidle_idx; + unsigned int wake_idx; + unsigned int forkexec_idx; + int flags; /* See SD_* */ + + /* Runtime fields. */ + unsigned long last_balance; /* init to jiffies. units in jiffies */ + unsigned int balance_interval; /* initialise to 1. units in ms. 
*/ + unsigned int nr_balance_failed; /* initialise to 0 */ + +#ifdef CONFIG_SCHEDSTATS + /* load_balance() stats */ + unsigned long lb_cnt[MAX_IDLE_TYPES]; + unsigned long lb_failed[MAX_IDLE_TYPES]; + unsigned long lb_balanced[MAX_IDLE_TYPES]; + unsigned long lb_imbalance[MAX_IDLE_TYPES]; + unsigned long lb_gained[MAX_IDLE_TYPES]; + unsigned long lb_hot_gained[MAX_IDLE_TYPES]; + unsigned long lb_nobusyg[MAX_IDLE_TYPES]; + unsigned long lb_nobusyq[MAX_IDLE_TYPES]; + + /* Active load balancing */ + unsigned long alb_cnt; + unsigned long alb_failed; + unsigned long alb_pushed; + + /* SD_BALANCE_EXEC stats */ + unsigned long sbe_cnt; + unsigned long sbe_balanced; + unsigned long sbe_pushed; + + /* SD_BALANCE_FORK stats */ + unsigned long sbf_cnt; + unsigned long sbf_balanced; + unsigned long sbf_pushed; + + /* try_to_wake_up() stats */ + unsigned long ttwu_wake_remote; + unsigned long ttwu_move_affine; + unsigned long ttwu_move_balance; +#endif +}; + +extern void partition_sched_domains(cpumask_t *partition1, + cpumask_t *partition2); + +/* + * Maximum cache size the migration-costs auto-tuning code will + * search from: + */ +extern unsigned int max_cache_size; + +#endif /* CONFIG_SMP */ + + +struct io_context; /* See blkdev.h */ +void exit_io_context(void); +struct cpuset; + +#define NGROUPS_SMALL 32 +#define NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t))) +struct group_info { + int ngroups; + atomic_t usage; + gid_t small_block[NGROUPS_SMALL]; + int nblocks; + gid_t *blocks[0]; +}; + +/* + * get_group_info() must be called with the owning task locked (via task_lock()) + * when task != current. The reason being that the vast majority of callers are + * looking at current->group_info, which can not be changed except by the + * current task. Changing current->group_info requires the task lock, too. 
+ */ +#define get_group_info(group_info) do { \ + atomic_inc(&(group_info)->usage); \ +} while (0) + +#define put_group_info(group_info) do { \ + if (atomic_dec_and_test(&(group_info)->usage)) \ + groups_free(group_info); \ +} while (0) + +extern struct group_info *groups_alloc(int gidsetsize); +extern void groups_free(struct group_info *group_info); +extern int set_current_groups(struct group_info *group_info); +extern int groups_search(struct group_info *group_info, gid_t grp); +/* access the groups "array" with this macro */ +#define GROUP_AT(gi, i) \ + ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK]) + +#ifdef ARCH_HAS_PREFETCH_SWITCH_STACK +extern void prefetch_stack(struct task_struct*); +#else +static inline void prefetch_stack(struct task_struct *t) { } +#endif + +struct audit_context; /* See audit.c */ +struct mempolicy; + +struct task_struct { + volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ + struct thread_info *thread_info; + atomic_t usage; + unsigned long flags; /* per process flags, defined below */ + unsigned long ptrace; + + int lock_depth; /* BKL lock depth */ + +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + int oncpu; +#endif + int prio, static_prio; + struct list_head run_list; + prio_array_t *array; + + unsigned short ioprio; + + unsigned long sleep_avg; + unsigned long long timestamp, last_ran; + unsigned long long sched_time; /* sched_clock time spent running */ + int activated; + + unsigned long policy; + cpumask_t cpus_allowed; + unsigned int time_slice, first_time_slice; + +#ifdef CONFIG_SCHEDSTATS + struct sched_info sched_info; +#endif + + struct list_head tasks; + /* + * ptrace_list/ptrace_children forms the list of my children + * that were stolen by a ptracer. 
+ */ + struct list_head ptrace_children; + struct list_head ptrace_list; + + struct mm_struct *mm, *active_mm; + +/* task state */ + struct linux_binfmt *binfmt; + long exit_state; + int exit_code, exit_signal; + int pdeath_signal; /* The signal sent when the parent dies */ + /* ??? */ + unsigned long personality; + unsigned did_exec:1; + pid_t pid; + pid_t tgid; + /* + * pointers to (original) parent process, youngest child, younger sibling, + * older sibling, respectively. (p->father can be replaced with + * p->parent->pid) + */ + struct task_struct *real_parent; /* real parent process (when being debugged) */ + struct task_struct *parent; /* parent process */ + /* + * children/sibling forms the list of my children plus the + * tasks I'm ptracing. + */ + struct list_head children; /* list of my children */ + struct list_head sibling; /* linkage in my parent's children list */ + struct task_struct *group_leader; /* threadgroup leader */ + + /* PID/PID hash table linkage. */ + struct pid pids[PIDTYPE_MAX]; + + struct completion *vfork_done; /* for vfork() */ + int __user *set_child_tid; /* CLONE_CHILD_SETTID */ + int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ + + unsigned long rt_priority; + cputime_t utime, stime; + unsigned long nvcsw, nivcsw; /* context switch counts */ + struct timespec start_time; +/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ + unsigned long min_flt, maj_flt; + + cputime_t it_prof_expires, it_virt_expires; + unsigned long long it_sched_expires; + struct list_head cpu_timers[3]; + +/* process credentials */ + uid_t uid,euid,suid,fsuid; + gid_t gid,egid,sgid,fsgid; + struct group_info *group_info; + kernel_cap_t cap_effective, cap_inheritable, cap_permitted; + unsigned keep_capabilities:1; + struct user_struct *user; +#ifdef CONFIG_KEYS + struct key *request_key_auth; /* assumed request_key authority */ + struct key *thread_keyring; /* keyring private to this thread */ + unsigned char 
jit_keyring; /* default keyring to attach requested keys to */ +#endif + int oomkilladj; /* OOM kill score adjustment (bit shift). */ + char comm[TASK_COMM_LEN]; /* executable name excluding path + - access with [gs]et_task_comm (which lock + it with task_lock()) + - initialized normally by flush_old_exec */ +/* file system info */ + int link_count, total_link_count; +/* ipc stuff */ + struct sysv_sem sysvsem; +/* CPU-specific state of this task */ + struct thread_struct thread; +/* filesystem information */ + struct fs_struct *fs; +/* open file information */ + struct files_struct *files; +/* namespace */ + struct namespace *namespace; +/* signal handlers */ + struct signal_struct *signal; + struct sighand_struct *sighand; + + sigset_t blocked, real_blocked; + sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */ + struct sigpending pending; + + unsigned long sas_ss_sp; + size_t sas_ss_size; + int (*notifier)(void *priv); + void *notifier_data; + sigset_t *notifier_mask; + + void *security; + struct audit_context *audit_context; + seccomp_t seccomp; + +/* Thread group tracking */ + u32 parent_exec_id; + u32 self_exec_id; +/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ + spinlock_t alloc_lock; +/* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */ + spinlock_t proc_lock; + +#ifdef CONFIG_DEBUG_MUTEXES + /* mutex deadlock detection */ + struct mutex_waiter *blocked_on; +#endif + +/* journalling filesystem info */ + void *journal_info; + +/* VM state */ + struct reclaim_state *reclaim_state; + + struct dentry *proc_dentry; + struct backing_dev_info *backing_dev_info; + + struct io_context *io_context; + + unsigned long ptrace_message; + siginfo_t *last_siginfo; /* For ptrace use. */ +/* + * current io wait handle: wait queue entry to use for io waits + * If this thread is processing aio, this points at the waitqueue + * inside the currently handled kiocb. It may be NULL (i.e. 
default + * to a stack based synchronous wait) if its doing sync IO. + */ + wait_queue_t *io_wait; +/* i/o counters(bytes read/written, #syscalls */ + u64 rchar, wchar, syscr, syscw; +#if defined(CONFIG_BSD_PROCESS_ACCT) + u64 acct_rss_mem1; /* accumulated rss usage */ + u64 acct_vm_mem1; /* accumulated virtual memory usage */ + clock_t acct_stimexpd; /* clock_t-converted stime since last update */ +#endif +#ifdef CONFIG_NUMA + struct mempolicy *mempolicy; + short il_next; +#endif +#ifdef CONFIG_CPUSETS + struct cpuset *cpuset; + nodemask_t mems_allowed; + int cpuset_mems_generation; +#endif + atomic_t fs_excl; /* holding fs exclusive resources */ + struct rcu_head rcu; +}; + +static inline pid_t process_group(struct task_struct *tsk) +{ + return tsk->signal->pgrp; +} + +/** + * pid_alive - check that a task structure is not stale + * @p: Task structure to be checked. + * + * Test if a process is not yet dead (at most zombie state) + * If pid_alive fails, then pointers within the task structure + * can be stale and must not be dereferenced. 
+ */ +static inline int pid_alive(struct task_struct *p) +{ + return p->pids[PIDTYPE_PID].nr != 0; +} + +extern void free_task(struct task_struct *tsk); +extern void __put_task_struct(struct task_struct *tsk); +#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) + +extern void __put_task_struct_cb(struct rcu_head *rhp); + +static inline void put_task_struct(struct task_struct *t) +{ + if (atomic_dec_and_test(&t->usage)) + call_rcu(&t->rcu, __put_task_struct_cb); +} + +/* + * Per process flags + */ +#define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ + /* Not implemented yet, only for 486*/ +#define PF_STARTING 0x00000002 /* being created */ +#define PF_EXITING 0x00000004 /* getting shut down */ +#define PF_DEAD 0x00000008 /* Dead */ +#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ +#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ +#define PF_DUMPCORE 0x00000200 /* dumped core */ +#define PF_SIGNALED 0x00000400 /* killed by a signal */ +#define PF_MEMALLOC 0x00000800 /* Allocating memory */ +#define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ +#define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ +#define PF_FREEZE 0x00004000 /* this task is being frozen for suspend now */ +#define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ +#define PF_FROZEN 0x00010000 /* frozen for system suspend */ +#define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ +#define PF_KSWAPD 0x00040000 /* I am kswapd */ +#define PF_SWAPOFF 0x00080000 /* I am in swapoff */ +#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ +#define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ +#define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */ +#define PF_RANDOMIZE 0x00800000 /* randomize virtual address space */ +#define PF_SWAPWRITE 0x01000000 /* Allowed to write to swap */ + +/* + * Only the _current_ task can read/write 
to tsk->flags, but other + * tasks can access tsk->flags in readonly mode for example + * with tsk_used_math (like during threaded core dumping). + * There is however an exception to this rule during ptrace + * or during fork: the ptracer task is allowed to write to the + * child->flags of its traced child (same goes for fork, the parent + * can write to the child->flags), because we're guaranteed the + * child is not running and in turn not changing child->flags + * at the same time the parent does it. + */ +#define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) +#define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) +#define clear_used_math() clear_stopped_child_used_math(current) +#define set_used_math() set_stopped_child_used_math(current) +#define conditional_stopped_child_used_math(condition, child) \ + do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) +#define conditional_used_math(condition) \ + conditional_stopped_child_used_math(condition, current) +#define copy_to_stopped_child_used_math(child) \ + do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) +/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ +#define tsk_used_math(p) ((p)->flags & PF_USED_MATH) +#define used_math() tsk_used_math(current) + +#ifdef CONFIG_SMP +extern int set_cpus_allowed(task_t *p, cpumask_t new_mask); +#else +static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask) +{ + if (!cpu_isset(0, new_mask)) + return -EINVAL; + return 0; +} +#endif + +extern unsigned long long sched_clock(void); +extern unsigned long long current_sched_time(const task_t *current_task); + +/* sched_exec is called by processes performing an exec */ +#ifdef CONFIG_SMP +extern void sched_exec(void); +#else +#define sched_exec() {} +#endif + +#ifdef CONFIG_HOTPLUG_CPU +extern void idle_task_exit(void); +#else 
+static inline void idle_task_exit(void) {} +#endif + +extern void sched_idle_next(void); +extern void set_user_nice(task_t *p, long nice); +extern int task_prio(const task_t *p); +extern int task_nice(const task_t *p); +extern int can_nice(const task_t *p, const int nice); +extern int task_curr(const task_t *p); +extern int idle_cpu(int cpu); +extern int sched_setscheduler(struct task_struct *, int, struct sched_param *); +extern task_t *idle_task(int cpu); +extern task_t *curr_task(int cpu); +extern void set_curr_task(int cpu, task_t *p); + +void yield(void); + +/* + * The default (Linux) execution domain. + */ +extern struct exec_domain default_exec_domain; + +union thread_union { + struct thread_info thread_info; + unsigned long stack[THREAD_SIZE/sizeof(long)]; +}; + +#ifndef __HAVE_ARCH_KSTACK_END +static inline int kstack_end(void *addr) +{ + /* Reliable end of stack detection: + * Some APM bios versions misalign the stack + */ + return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); +} +#endif + +extern union thread_union init_thread_union; +extern struct task_struct init_task; + +extern struct mm_struct init_mm; + +#define find_task_by_pid(nr) find_task_by_pid_type(PIDTYPE_PID, nr) +extern struct task_struct *find_task_by_pid_type(int type, int pid); +extern void set_special_pids(pid_t session, pid_t pgrp); +extern void __set_special_pids(pid_t session, pid_t pgrp); + +/* per-UID process charging. 
*/ +extern struct user_struct * alloc_uid(uid_t); +static inline struct user_struct *get_uid(struct user_struct *u) +{ + atomic_inc(&u->__count); + return u; +} +extern void free_uid(struct user_struct *); +extern void switch_uid(struct user_struct *); + +#include + +extern void do_timer(struct pt_regs *); + +extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state)); +extern int FASTCALL(wake_up_process(struct task_struct * tsk)); +extern void FASTCALL(wake_up_new_task(struct task_struct * tsk, + unsigned long clone_flags)); +#ifdef CONFIG_SMP + extern void kick_process(struct task_struct *tsk); +#else + static inline void kick_process(struct task_struct *tsk) { } +#endif +extern void FASTCALL(sched_fork(task_t * p, int clone_flags)); +extern void FASTCALL(sched_exit(task_t * p)); + +extern int in_group_p(gid_t); +extern int in_egroup_p(gid_t); + +extern void proc_caches_init(void); +extern void flush_signals(struct task_struct *); +extern void flush_signal_handlers(struct task_struct *, int force_default); +extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); + +static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&tsk->sighand->siglock, flags); + ret = dequeue_signal(tsk, mask, info); + spin_unlock_irqrestore(&tsk->sighand->siglock, flags); + + return ret; +} + +extern void block_all_signals(int (*notifier)(void *priv), void *priv, + sigset_t *mask); +extern void unblock_all_signals(void); +extern void release_task(struct task_struct * p); +extern int send_sig_info(int, struct siginfo *, struct task_struct *); +extern int send_group_sig_info(int, struct siginfo *, struct task_struct *); +extern int force_sigsegv(int, struct task_struct *); +extern int force_sig_info(int, struct siginfo *, struct task_struct *); +extern int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp); +extern int 
kill_pg_info(int, struct siginfo *, pid_t); +extern int kill_proc_info(int, struct siginfo *, pid_t); +extern int kill_proc_info_as_uid(int, struct siginfo *, pid_t, uid_t, uid_t); +extern void do_notify_parent(struct task_struct *, int); +extern void force_sig(int, struct task_struct *); +extern void force_sig_specific(int, struct task_struct *); +extern int send_sig(int, struct task_struct *, int); +extern void zap_other_threads(struct task_struct *p); +extern int kill_pg(pid_t, int, int); +extern int kill_sl(pid_t, int, int); +extern int kill_proc(pid_t, int, int); +extern struct sigqueue *sigqueue_alloc(void); +extern void sigqueue_free(struct sigqueue *); +extern int send_sigqueue(int, struct sigqueue *, struct task_struct *); +extern int send_group_sigqueue(int, struct sigqueue *, struct task_struct *); +extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); +extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); + +/* These can be the second arg to send_sig_info/send_group_sig_info. */ +#define SEND_SIG_NOINFO ((struct siginfo *) 0) +#define SEND_SIG_PRIV ((struct siginfo *) 1) +#define SEND_SIG_FORCED ((struct siginfo *) 2) + +static inline int is_si_special(const struct siginfo *info) +{ + return info <= SEND_SIG_FORCED; +} + +/* True if we are on the alternate signal stack. */ + +static inline int on_sig_stack(unsigned long sp) +{ + return (sp - current->sas_ss_sp < current->sas_ss_size); +} + +static inline int sas_ss_flags(unsigned long sp) +{ + return (current->sas_ss_size == 0 ? SS_DISABLE + : on_sig_stack(sp) ? 
SS_ONSTACK : 0); +} + +/* + * Routines for handling mm_structs + */ +extern struct mm_struct * mm_alloc(void); + +/* mmdrop drops the mm and the page tables */ +extern void FASTCALL(__mmdrop(struct mm_struct *)); +static inline void mmdrop(struct mm_struct * mm) +{ + if (atomic_dec_and_test(&mm->mm_count)) + __mmdrop(mm); +} + +/* mmput gets rid of the mappings and all user-space */ +extern void mmput(struct mm_struct *); +/* Grab a reference to a task's mm, if it is not already going away */ +extern struct mm_struct *get_task_mm(struct task_struct *task); +/* Remove the current tasks stale references to the old mm_struct */ +extern void mm_release(struct task_struct *, struct mm_struct *); + +extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); +extern void flush_thread(void); +extern void exit_thread(void); + +extern void exit_files(struct task_struct *); +extern void exit_signal(struct task_struct *); +extern void __exit_signal(struct task_struct *); +extern void exit_sighand(struct task_struct *); +extern void __exit_sighand(struct task_struct *); +extern void exit_itimers(struct signal_struct *); + +extern NORET_TYPE void do_group_exit(int); + +extern void daemonize(const char *, ...); +extern int allow_signal(int); +extern int disallow_signal(int); +extern task_t *child_reaper; + +extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *); +extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); +task_t *fork_idle(int); + +extern void set_task_comm(struct task_struct *tsk, char *from); +extern void get_task_comm(char *to, struct task_struct *tsk); + +#ifdef CONFIG_SMP +extern void wait_task_inactive(task_t * p); +#else +#define wait_task_inactive(p) do { } while (0) +#endif + +#define remove_parent(p) list_del_init(&(p)->sibling) +#define add_parent(p, parent) list_add_tail(&(p)->sibling,&(parent)->children) 
+ +#define REMOVE_LINKS(p) do { \ + if (thread_group_leader(p)) \ + list_del_init(&(p)->tasks); \ + remove_parent(p); \ + } while (0) + +#define SET_LINKS(p) do { \ + if (thread_group_leader(p)) \ + list_add_tail(&(p)->tasks,&init_task.tasks); \ + add_parent(p, (p)->parent); \ + } while (0) + +#define next_task(p) list_entry((p)->tasks.next, struct task_struct, tasks) +#define prev_task(p) list_entry((p)->tasks.prev, struct task_struct, tasks) + +#define for_each_process(p) \ + for (p = &init_task ; (p = next_task(p)) != &init_task ; ) + +/* + * Careful: do_each_thread/while_each_thread is a double loop so + * 'break' will not work as expected - use goto instead. + */ +#define do_each_thread(g, t) \ + for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do + +#define while_each_thread(g, t) \ + while ((t = next_thread(t)) != g) + +extern task_t * FASTCALL(next_thread(const task_t *p)); + +#define thread_group_leader(p) (p->pid == p->tgid) + +static inline int thread_group_empty(task_t *p) +{ + return list_empty(&p->pids[PIDTYPE_TGID].pid_list); +} + +#define delay_group_leader(p) \ + (thread_group_leader(p) && !thread_group_empty(p)) + +extern void unhash_process(struct task_struct *p); + +/* + * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring + * subscriptions and synchronises with wait4(). Also used in procfs. Also + * pins the final release of task.io_context. Also protects ->cpuset. + * + * Nests both inside and outside of read_lock(&tasklist_lock). + * It must not be nested with write_lock_irq(&tasklist_lock), + * neither inside nor outside. 
+ */ +static inline void task_lock(struct task_struct *p) +{ + spin_lock(&p->alloc_lock); +} + +static inline void task_unlock(struct task_struct *p) +{ + spin_unlock(&p->alloc_lock); +} + +#ifndef __HAVE_THREAD_FUNCTIONS + +#define task_thread_info(task) (task)->thread_info +#define task_stack_page(task) ((void*)((task)->thread_info)) + +static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) +{ + *task_thread_info(p) = *task_thread_info(org); + task_thread_info(p)->task = p; +} + +static inline unsigned long *end_of_stack(struct task_struct *p) +{ + return (unsigned long *)(p->thread_info + 1); +} + +#endif + +/* set thread flags in other task's structures + * - see asm/thread_info.h for TIF_xxxx flags available + */ +static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) +{ + set_ti_thread_flag(task_thread_info(tsk), flag); +} + +static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) +{ + clear_ti_thread_flag(task_thread_info(tsk), flag); +} + +static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) +{ + return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); +} + +static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) +{ + return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); +} + +static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) +{ + return test_ti_thread_flag(task_thread_info(tsk), flag); +} + +static inline void set_tsk_need_resched(struct task_struct *tsk) +{ + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); +} + +static inline void clear_tsk_need_resched(struct task_struct *tsk) +{ + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); +} + +static inline int signal_pending(struct task_struct *p) +{ + return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); +} + +static inline int need_resched(void) +{ + return unlikely(test_thread_flag(TIF_NEED_RESCHED)); +} + +/* + * cond_resched() and 
cond_resched_lock(): latency reduction via + * explicit rescheduling in places that are safe. The return + * value indicates whether a reschedule was done in fact. + * cond_resched_lock() will drop the spinlock before scheduling, + * cond_resched_softirq() will enable bhs before scheduling. + */ +extern int cond_resched(void); +extern int cond_resched_lock(spinlock_t * lock); +extern int cond_resched_softirq(void); + +/* + * Does a critical section need to be broken due to another + * task waiting?: + */ +#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) +# define need_lockbreak(lock) ((lock)->break_lock) +#else +# define need_lockbreak(lock) 0 +#endif + +/* + * Does a critical section need to be broken due to another + * task waiting or preemption being signalled: + */ +static inline int lock_need_resched(spinlock_t *lock) +{ + if (need_lockbreak(lock) || need_resched()) + return 1; + return 0; +} + +/* Reevaluate whether the task has signals pending delivery. + This is required every time the blocked sigset_t changes. + callers must hold sighand->siglock. */ + +extern FASTCALL(void recalc_sigpending_tsk(struct task_struct *t)); +extern void recalc_sigpending(void); + +extern void signal_wake_up(struct task_struct *t, int resume_stopped); + +/* + * Wrappers for p->thread_info->cpu access. No-op on UP. 
+ */ +#ifdef CONFIG_SMP + +static inline unsigned int task_cpu(const struct task_struct *p) +{ + return task_thread_info(p)->cpu; +} + +static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + task_thread_info(p)->cpu = cpu; +} + +#else + +static inline unsigned int task_cpu(const struct task_struct *p) +{ + return 0; +} + +static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) +{ +} + +#endif /* CONFIG_SMP */ + +#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT +extern void arch_pick_mmap_layout(struct mm_struct *mm); +#else +static inline void arch_pick_mmap_layout(struct mm_struct *mm) +{ + mm->mmap_base = TASK_UNMAPPED_BASE; + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; +} +#endif + +extern long sched_setaffinity(pid_t pid, cpumask_t new_mask); +extern long sched_getaffinity(pid_t pid, cpumask_t *mask); + +extern void normalize_rt_tasks(void); + +#ifdef CONFIG_PM +/* + * Check if a process has been frozen + */ +static inline int frozen(struct task_struct *p) +{ + return p->flags & PF_FROZEN; +} + +/* + * Check if there is a request to freeze a process + */ +static inline int freezing(struct task_struct *p) +{ + return p->flags & PF_FREEZE; +} + +/* + * Request that a process be frozen + * FIXME: SMP problem. We may not modify other process' flags! 
+ */ +static inline void freeze(struct task_struct *p) +{ + p->flags |= PF_FREEZE; +} + +/* + * Wake up a frozen process + */ +static inline int thaw_process(struct task_struct *p) +{ + if (frozen(p)) { + p->flags &= ~PF_FROZEN; + wake_up_process(p); + return 1; + } + return 0; +} + +/* + * freezing is complete, mark process as frozen + */ +static inline void frozen_process(struct task_struct *p) +{ + p->flags = (p->flags & ~PF_FREEZE) | PF_FROZEN; +} + +extern void refrigerator(void); +extern int freeze_processes(void); +extern void thaw_processes(void); + +static inline int try_to_freeze(void) +{ + if (freezing(current)) { + refrigerator(); + return 1; + } else + return 0; +} +#else +static inline int frozen(struct task_struct *p) { return 0; } +static inline int freezing(struct task_struct *p) { return 0; } +static inline void freeze(struct task_struct *p) { BUG(); } +static inline int thaw_process(struct task_struct *p) { return 1; } +static inline void frozen_process(struct task_struct *p) { BUG(); } + +static inline void refrigerator(void) {} +static inline int freeze_processes(void) { BUG(); return 0; } +static inline void thaw_processes(void) {} + +static inline int try_to_freeze(void) { return 0; } + +#endif /* CONFIG_PM */ +#endif /* __KERNEL__ */ + +#endif diff -urN oldtree/include/linux/suspend.h newtree/include/linux/suspend.h --- oldtree/include/linux/suspend.h 2006-02-18 15:18:29.971760848 +0000 +++ newtree/include/linux/suspend.h 2006-02-18 15:24:31.365820600 +0000 @@ -9,6 +9,7 @@ #include #include #include +#include /* page backup entry */ typedef struct pbe { @@ -46,6 +47,8 @@ #if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) extern int pm_prepare_console(void); extern void pm_restore_console(void); +extern int freeze_processes(void); +extern void thaw_processes(int which_threads); #else static inline int pm_prepare_console(void) { return 0; } static inline void pm_restore_console(void) {} @@ -56,8 +59,12 @@ printk("Warning: fake suspend 
called\n"); return -EPERM; } +static inline int freeze_processes(void) { return 0; } +static inline void thaw_processes(int which_threads) { } #endif /* CONFIG_PM */ +extern char resume2_file[256]; + #ifdef CONFIG_SUSPEND_SMP extern void disable_nonboot_cpus(void); extern void enable_nonboot_cpus(void); @@ -69,8 +76,6 @@ void save_processor_state(void); void restore_processor_state(void); struct saved_context; -void __save_processor_state(struct saved_context *ctxt); -void __restore_processor_state(struct saved_context *ctxt); unsigned long get_safe_page(gfp_t gfp_mask); /* diff -urN oldtree/include/linux/suspend2.h newtree/include/linux/suspend2.h --- oldtree/include/linux/suspend2.h 1970-01-01 00:00:00.000000000 +0000 +++ newtree/include/linux/suspend2.h 2006-02-18 15:24:31.366820448 +0000 @@ -0,0 +1,231 @@ +#ifndef _LINUX_SUSPEND2_H +#define _LINUX_SUSPEND2_H + +#include +#include +#ifdef CONFIG_ACPI +#include +#include +#endif + +/* arch/i386/mm/init.c */ +extern char __nosave_begin, __nosave_end; + +extern char __nosavedata swsusp_pg_dir[PAGE_SIZE] + __attribute__ ((aligned (PAGE_SIZE))); + +#define SECTOR_SIZE 512 + +/* kernel/power/process.c */ + +/* kernel/power/main.c */ +extern unsigned long suspend_result; + +/* kernel/power/process.c */ +extern unsigned long suspend_debug_state; + +/* arch/i386/power/suspend2.c */ +extern unsigned long suspend_action; +extern int suspend_io_time[2][2]; + +extern dyn_pageflags_t pageset1_map; +extern dyn_pageflags_t pageset1_copy_map; + +#ifdef CONFIG_PM_DEBUG +#define test_debug_state(bit) (test_bit(bit, &suspend_debug_state)) +#else +#define test_debug_state(bit) (0) +#endif + +#define test_result_state(bit) (test_bit(bit, &suspend_result)) + +/* + * First status register - this is suspend's return code. 
+ * + * All the rest are in kernel/power/suspend2_common.h + */ +#define SUSPEND_ABORTED 0 + +/* Second status register - ditto */ +#define SUSPEND_RETRY_RESUME 0 + +/* Debug sections - if debugging compiled in */ +enum { + SUSPEND_ANY_SECTION, + SUSPEND_FREEZER, + SUSPEND_EAT_MEMORY, + SUSPEND_PAGESETS, + SUSPEND_IO, + SUSPEND_BMAP, + SUSPEND_HEADER, + SUSPEND_WRITER, + SUSPEND_MEMORY, + SUSPEND_EXTENTS, + SUSPEND_SPINLOCKS, + SUSPEND_MEM_POOL, + SUSPEND_RANGE_PARANOIA, + SUSPEND_NOSAVE, + SUSPEND_INTEGRITY +}; + +/* debugging levels. */ +#define SUSPEND_STATUS 0 +#define SUSPEND_ERROR 2 +#define SUSPEND_LOW 3 +#define SUSPEND_MEDIUM 4 +#define SUSPEND_HIGH 5 +#define SUSPEND_VERBOSE 6 + +/* second status register */ +enum { + SUSPEND_REBOOT, + SUSPEND_PAUSE, + SUSPEND_SLOW, + SUSPEND_NOPAGESET2, + SUSPEND_LOGALL, + SUSPEND_CAN_CANCEL, + SUSPEND_KEEP_IMAGE, + SUSPEND_FREEZER_TEST, + SUSPEND_FREEZER_TEST_SHOWALL, + SUSPEND_SINGLESTEP, + SUSPEND_PAUSE_NEAR_PAGESET_END, + SUSPEND_USE_ACPI_S4, + SUSPEND_TEST_FILTER_SPEED, + SUSPEND_FREEZE_TIMERS, + SUSPEND_DISABLE_SYSDEV_SUPPORT, + SUSPEND_VGA_POST, + SUSPEND_TEST_BIO, + SUSPEND_NO_PAGESET2, +}; + +#ifdef CONFIG_SUSPEND2 +#define test_action_state(bit) (test_bit(bit, &suspend_action)) +#define set_action_state(bit) (test_and_set_bit(bit, &suspend_action)) +#define clear_action_state(bit) (test_and_clear_bit(bit, &suspend_action)) +#else +#define test_action_state(bit) (0) +#endif + +extern void __suspend_message(unsigned long section, unsigned long level, int log_normally, + const char *fmt, ...); + +#ifdef CONFIG_PM_DEBUG +#define suspend_message(sn, lev, log, fmt, a...) \ +do { \ + if (test_debug_state(sn)) \ + __suspend_message(sn, lev, log, fmt, ##a); \ +} while(0) +#else /* CONFIG_PM_DEBUG */ +#define suspend_message(sn, lev, log, fmt, a...) 
\ +do { \ + if (lev == 0) \ + __suspend_message(sn, lev, log, fmt, ##a); \ +} while(0) +#endif /* CONFIG_PM_DEBUG */ + +/* Suspend 2 */ + +enum { + SUSPEND_DISABLED, + SUSPEND_RUNNING, + SUSPEND_RESUME_DEVICE_OK, + SUSPEND_NORESUME_SPECIFIED, + SUSPEND_COMMANDLINE_ERROR, + SUSPEND_IGNORE_IMAGE, + SUSPEND_SANITY_CHECK_PROMPT, + SUSPEND_FREEZER_ON, + SUSPEND_BLOCK_PAGE_ALLOCATIONS, + SUSPEND_USE_MEMORY_POOL, + SUSPEND_STAGE2_CONTINUE, + SUSPEND_FREEZE_SMP, + SUSPEND_PAGESET2_NOT_LOADED, + SUSPEND_CONTINUE_REQ, + SUSPEND_RESUMED_BEFORE, + SUSPEND_RUNNING_INITRD, + SUSPEND_RESUME_NOT_DONE, + SUSPEND_BOOT_TIME, + SUSPEND_NOW_RESUMING, + SUSPEND_SLAB_ALLOC_FALLBACK, + SUSPEND_IGNORE_LOGLEVEL, + SUSPEND_TIMER_FREEZER_ON, + SUSPEND_ACT_USED, + SUSPEND_DBG_USED, + SUSPEND_LVL_USED, + SUSPEND_TRYING_TO_RESUME, + SUSPEND_FORK_COPYBACK_THREAD, + SUSPEND_TRY_RESUME_RD, + SUSPEND_IGNORE_ROOTFS, +}; + +#define test_and_set_suspend_state(bit) \ + (test_and_set_bit(bit, &software_suspend_state)) + +#define get_suspend_state() (software_suspend_state) +#define restore_suspend_state(saved_state) \ + do { software_suspend_state = saved_state; } while(0) + +/* --------------------------------------------------------------------- */ +#ifdef CONFIG_SUSPEND2 + +/* Used in init dir files */ +extern unsigned long software_suspend_state; + +extern void suspend2_try_resume(void); +extern int suspend_early_boot_message + (int can_erase_image, int default_answer, char *warning_reason, ...); +extern void suspend_handle_keypress(unsigned int keycode, int source); +extern unsigned long suspend_update_status (unsigned long value, unsigned long maximum, + const char *fmt, ...); +extern void suspend_prepare_status (int clearbar, const char *fmt, ...); + +#define test_suspend_state(bit) \ + (test_bit(bit, &software_suspend_state)) + +#define clear_suspend_state(bit) \ + (clear_bit(bit, &software_suspend_state)) + +#define set_suspend_state(bit) \ + (set_bit(bit, &software_suspend_state)) + +extern 
inline void suspend_copyback_low(void); +extern inline void suspend_copyback_high(void); + +extern void suspend2_try_suspend(void); + +/* --------------------------------------------------------------------- */ +#else +/* --------------------------------------------------------------------- */ + +#define software_suspend_state (0) +#define clear_suspend_state(bit) do { } while (0) +#define test_suspend_state(bit) (0) +#define set_suspend_state(bit) do { } while(0) + +#define suspend2_try_resume() do { } while(0) +static inline int suspend_early_boot_message(int a, int b, char *c, ...) { return 0; } +#define suspend_handle_keypress(a, b) do { } while(0) +static inline unsigned long suspend_update_status(unsigned long value, unsigned long maximum, + const char *fmt, ...) +{ + return maximum; +} +#define suspend_prepare_status(a, ...) do { } while(0) + +#endif /* CONFIG_SUSPEND2 */ + +#if defined(CONFIG_SUSPEND2) && defined(CONFIG_ACPI) +static inline int may_try_suspend2(u32 state) +{ + if (state == ACPI_STATE_S4) { + suspend2_try_suspend(); + return 1; + } + return 0; +} +#else +static inline int may_try_suspend2(u32 state) +{ + return 0; +} +#endif +#endif /* _LINUX_SUSPEND2_H */ diff -urN oldtree/include/linux/workqueue.h newtree/include/linux/workqueue.h --- oldtree/include/linux/workqueue.h 2006-02-18 15:18:29.983759024 +0000 +++ newtree/include/linux/workqueue.h 2006-02-18 15:24:31.367820296 +0000 @@ -51,9 +51,12 @@ } while (0) extern struct workqueue_struct *__create_workqueue(const char *name, - int singlethread); -#define create_workqueue(name) __create_workqueue((name), 0) -#define create_singlethread_workqueue(name) __create_workqueue((name), 1) + int singlethread, + unsigned long freezer_flag); +#define create_workqueue(name) __create_workqueue((name), 0, 0) +#define create_nofreeze_workqueue(name) __create_workqueue((name), 0, PF_NOFREEZE) +#define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0) +#define 
create_nofreeze_singlethread_workqueue(name) __create_workqueue((name), 1, PF_NOFREEZE) extern void destroy_workqueue(struct workqueue_struct *wq); diff -urN oldtree/init/do_mounts.c newtree/init/do_mounts.c --- oldtree/init/do_mounts.c 2006-01-03 03:21:10.000000000 +0000 +++ newtree/init/do_mounts.c 2006-02-18 15:24:31.368820144 +0000 @@ -139,11 +139,16 @@ char s[32]; char *p; dev_t res = 0; - int part; + int part, mount_result; #ifdef CONFIG_SYSFS int mkdir_err = sys_mkdir("/sys", 0700); - if (sys_mount("sysfs", "/sys", "sysfs", 0, NULL) < 0) + /* + * When changing resume2 parameter for Software Suspend, sysfs may + * already be mounted. + */ + mount_result = sys_mount("sysfs", "/sys", "sysfs", 0, NULL); + if (mount_result < 0 && mount_result != -EBUSY) goto out; #endif @@ -195,7 +200,8 @@ res = try_name(s, part); done: #ifdef CONFIG_SYSFS - sys_umount("/sys", 0); + if (mount_result >= 0) + sys_umount("/sys", 0); out: if (!mkdir_err) sys_rmdir("/sys"); @@ -412,9 +418,25 @@ is_floppy = MAJOR(ROOT_DEV) == FLOPPY_MAJOR; + /* Suspend2: + * By this point, suspend_early_init has been called to initialise our + * proc interface. If modules are built in, they have registered (all + * of the above via late_initcalls). + * + * We have not yet looked to see if an image exists, however. If we + * have an initrd, it is expected that the user will have set it up + * to echo > /proc/suspend2/do_resume and thus initiate any + * resume. If they don't do that, we do it immediately after the initrd + * is finished (major issues if they mount filesystems rw from the + * initrd! - they are warned). If there's no usable initrd, we do our + * check next.
+ */ if (initrd_load()) goto out; + if (test_suspend_state(SUSPEND_RESUME_NOT_DONE)) + suspend2_try_resume(); + if (is_floppy && rd_doload && rd_load_disk(0)) ROOT_DEV = Root_RAM0; diff -urN oldtree/init/do_mounts_initrd.c newtree/init/do_mounts_initrd.c --- oldtree/init/do_mounts_initrd.c 2006-01-03 03:21:10.000000000 +0000 +++ newtree/init/do_mounts_initrd.c 2006-02-18 15:24:31.369819992 +0000 @@ -7,6 +7,7 @@ #include #include #include +#include #include "do_mounts.h" @@ -58,10 +59,16 @@ pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD); if (pid > 0) { - while (pid != sys_wait4(-1, NULL, 0, NULL)) + while (pid != sys_wait4(-1, NULL, 0, NULL)) { yield(); + try_to_freeze(); + } } + if (test_suspend_state(SUSPEND_RESUME_NOT_DONE)) + printk(KERN_ERR "Suspend2: Initrd lacks echo > /proc/suspend2/do_resume.\n"); + clear_suspend_state(SUSPEND_BOOT_TIME); + /* move initrd to rootfs' /old */ sys_fchdir(old_fd); sys_mount("/", ".", NULL, MS_MOVE, NULL); diff -urN oldtree/init/main.c newtree/init/main.c --- oldtree/init/main.c 2006-02-18 15:18:30.056747928 +0000 +++ newtree/init/main.c 2006-02-18 15:24:31.370819840 +0000 @@ -688,7 +688,9 @@ /* * check if there is an early userspace init. If yes, let it do all - * the work + * the work. For suspend2, we assume that it will do the right thing + * with regard to trying to resume at the right place. When that + * happens, the BOOT_TIME flag will be cleared. 
*/ if (!ramdisk_execute_command) diff -urN oldtree/kernel/audit.c newtree/kernel/audit.c --- oldtree/kernel/audit.c 2006-02-18 15:18:30.061747168 +0000 +++ newtree/kernel/audit.c 2006-02-18 15:24:31.371819688 +0000 @@ -288,6 +288,9 @@ } } else { DECLARE_WAITQUEUE(wait, current); + + try_to_freeze(); + set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&kauditd_wait, &wait); diff -urN oldtree/kernel/fork.c newtree/kernel/fork.c --- oldtree/kernel/fork.c 2006-02-18 15:18:30.074745192 +0000 +++ newtree/kernel/fork.c 2006-02-18 15:24:31.373819384 +0000 @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -166,7 +167,13 @@ if (!tsk) return NULL; - ti = alloc_thread_info(tsk); + if (test_suspend_state(SUSPEND_FORK_COPYBACK_THREAD)) { + extern void * suspend_get_nonconflicting_pages(int); + ti = suspend_get_nonconflicting_pages(get_order(THREAD_SIZE)); + printk("Starting a copyback thread %p\n", ti); + } else + ti = alloc_thread_info(tsk); + if (!ti) { free_task_struct(tsk); return NULL; diff -urN oldtree/kernel/kmod.c newtree/kernel/kmod.c --- oldtree/kernel/kmod.c 2006-01-03 03:21:10.000000000 +0000 +++ newtree/kernel/kmod.c 2006-02-18 15:24:31.374819232 +0000 @@ -36,6 +36,7 @@ #include #include #include +#include #include extern int max_threads; @@ -249,6 +250,9 @@ if (!khelper_wq) return -EBUSY; + if (freezer_is_on()) + return 0; + if (path[0] == '\0') return 0; diff -urN oldtree/kernel/kthread.c newtree/kernel/kthread.c --- oldtree/kernel/kthread.c 2006-01-03 03:21:10.000000000 +0000 +++ newtree/kernel/kthread.c 2006-02-18 15:24:31.375819080 +0000 @@ -25,6 +25,7 @@ /* Information passed to kthread() from keventd. */ int (*threadfn)(void *data); void *data; + unsigned long freezer_flags; struct completion started; /* Result passed back to kthread_create() from keventd. */ @@ -86,6 +87,10 @@ /* By default we can run anywhere, unlike keventd. 
*/ set_cpus_allowed(current, CPU_MASK_ALL); + /* Set our freezer flags */ + current->flags &= ~PF_NOFREEZE; + current->flags |= (create->freezer_flags & PF_NOFREEZE); + /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_INTERRUPTIBLE); complete(&create->started); @@ -119,16 +124,18 @@ complete(&create->done); } -struct task_struct *kthread_create(int (*threadfn)(void *data), +struct task_struct *__kthread_create(int (*threadfn)(void *data), void *data, + unsigned long freezer_flags, const char namefmt[], - ...) + va_list * args) { struct kthread_create_info create; DECLARE_WORK(work, keventd_create_kthread, &create); create.threadfn = threadfn; create.data = data; + create.freezer_flags = freezer_flags; init_completion(&create.started); init_completion(&create.done); @@ -141,18 +148,89 @@ queue_work(helper_wq, &work); wait_for_completion(&create.done); } - if (!IS_ERR(create.result)) { - va_list args; - va_start(args, namefmt); + if (!IS_ERR(create.result)) vsnprintf(create.result->comm, sizeof(create.result->comm), - namefmt, args); - va_end(args); - } + namefmt, *args); return create.result; } + +struct task_struct *kthread_create(int (*threadfn)(void *data), + void *data, + const char namefmt[], ...) +{ + struct task_struct * result; + + va_list args; + va_start(args, namefmt); + result = __kthread_create(threadfn, data, 0, namefmt, &args); + va_end(args); + return result; +} + EXPORT_SYMBOL(kthread_create); +struct task_struct *kthread_nofreeze_create(int (*threadfn)(void *data), + void *data, + const char namefmt[], ...) +{ + struct task_struct * result; + + va_list args; + va_start(args, namefmt); + result = __kthread_create(threadfn, data, PF_NOFREEZE, namefmt, &args); + va_end(args); + return result; +} + +EXPORT_SYMBOL(kthread_nofreeze_create); + +/** + * kthread_run: create and wake a thread. + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. 
+ * @namefmt: printf-style name for the thread. + * + * Description: Convenient wrapper for kthread_create() followed by + * wake_up_process(). Returns the kthread, or ERR_PTR(-ENOMEM). + **/ +struct task_struct * kthread_run(int (*threadfn)(void *data), + void *data, + const char namefmt[], ...) +{ + struct task_struct *__k; + va_list args; + + va_start(args, namefmt); + __k = __kthread_create(threadfn, data, 0, namefmt, &args); + va_end(args); + + if(!IS_ERR(__k)) + wake_up_process(__k); + + return __k; +} + +EXPORT_SYMBOL(kthread_run); + +struct task_struct * kthread_nofreeze_run(int (*threadfn)(void *data), + void *data, + const char namefmt[], ...) +{ + struct task_struct *__k; + va_list args; + + va_start(args, namefmt); + __k = __kthread_create(threadfn, data, PF_NOFREEZE, namefmt, &args); + va_end(args); + + if(!IS_ERR(__k)) + wake_up_process(__k); + + return __k; +} +EXPORT_SYMBOL(kthread_nofreeze_run); + void kthread_bind(struct task_struct *k, unsigned int cpu) { BUG_ON(k->state != TASK_INTERRUPTIBLE); diff -urN oldtree/kernel/power/Kconfig newtree/kernel/power/Kconfig --- oldtree/kernel/power/Kconfig 2006-02-18 15:18:30.086743368 +0000 +++ newtree/kernel/power/Kconfig 2006-02-18 15:24:31.376818928 +0000 @@ -98,3 +98,76 @@ bool depends on HOTPLUG_CPU && X86 && PM default y + +config SUSPEND_DEBUG_PAGEALLOC + bool + depends on DEBUG_PAGEALLOC && (SOFTWARE_SUSPEND || SUSPEND2) + default y + +config SUSPEND2_CRYPTO + bool + depends on SUSPEND2 && CRYPTO + default y + +menuconfig SUSPEND2 + bool "Suspend2" + select DYN_PAGEFLAGS + depends on PM + select HOTPLUG_CPU if SMP + ---help--- + Suspend2 is the 'new and improved' suspend support. + + See the Suspend2 home page (suspend2.net) + for FAQs, HOWTOs and other documentation. 
+ + comment 'Image Storage (you need at least one writer)' + depends on SUSPEND2 + + config SUSPEND2_FILEWRITER + bool ' File Writer' + depends on SUSPEND2 + ---help--- + This option enables support for storing an image in a + simple file. This should be possible, but we're still + testing it. + + config SUSPEND2_SWAPWRITER + bool ' Swap Writer' + depends on SUSPEND2 + select SWAP + ---help--- + This option enables support for storing an image in your + swap space. + + comment 'General Options' + depends on SUSPEND2 + + config SUSPEND2_DEFAULT_RESUME2 + string ' Default resume device name' + depends on SUSPEND2 + ---help--- + You normally need to add a resume2= parameter to your lilo.conf or + equivalent. With this option properly set, the kernel has a value + to default to. No damage will be done if the value is invalid. + + config SUSPEND2_CHECKSUMMING + bool ' Checksum images - developer option (SLOW!)' + depends on PM_DEBUG && SUSPEND2 + ---help--- + This option implements checksumming of images. It is not designed + for everyone to use, but as a development tool. + + config SUSPEND2_KEEP_IMAGE + bool ' Allow Keep Image Mode' + depends on SUSPEND2 + ---help--- + This option allows you to keep an image and reuse it. It is intended + __ONLY__ for use with systems where all filesystems are mounted read- + only (kiosks, for example). To use it, compile this option in and boot + normally. Set the KEEP_IMAGE flag in /proc/suspend2 and suspend. + When you resume, the image will not be removed. You will be unable to turn + off swap partitions (assuming you are using the swap writer), but future + suspends simply do a power-down. The image can be updated using the + kernel command line parameter suspend_act= to turn off the keep image + bit. Keep image mode is a little less user friendly on purpose - it + should not be used without thought!
diff -urN oldtree/kernel/power/Makefile newtree/kernel/power/Makefile --- oldtree/kernel/power/Makefile 2006-01-03 03:21:10.000000000 +0000 +++ newtree/kernel/power/Makefile 2006-02-18 15:24:31.377818776 +0000 @@ -3,10 +3,33 @@ EXTRA_CFLAGS += -DDEBUG endif +CFLAGS_atomic_copy.o := -O0 + obj-y := main.o process.o console.o obj-$(CONFIG_PM_LEGACY) += pm.o -obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o obj-$(CONFIG_SUSPEND_SMP) += smp.o +# Order is important for compression and encryption - we +# compress before encrypting. + +suspend_core-objs := io.o pagedir.o prepare_image.o \ + extent.o suspend.o modules.o \ + pageflags.o ui.o proc.o \ + power_off.o atomic_copy.o debug_pagealloc.o \ + netlink.o + +#ifdef CONFIG_NET +suspend_core-objs += storage.o +#endif +obj-$(CONFIG_SUSPEND2) += suspend_core.o +obj-$(CONFIG_SUSPEND2_CRYPTO) += compression.o encryption.o + +obj-$(CONFIG_SUSPEND2_SWAPWRITER) += suspend_block_io.o suspend_swap.o +obj-$(CONFIG_SUSPEND2_FILEWRITER) += suspend_block_io.o suspend_file.o + +obj-$(CONFIG_SUSPEND2_CHECKSUMMING) += suspend_checksums.o + +obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o + obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff -urN oldtree/kernel/power/atomic_copy.c newtree/kernel/power/atomic_copy.c --- oldtree/kernel/power/atomic_copy.c 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/atomic_copy.c 2006-02-18 15:24:31.378818624 +0000 @@ -0,0 +1,473 @@ +/* + */ + +#include +#include +#include +#include +#include +#include +#include "suspend2_common.h" +#include "io.h" +#include "power_off.h" +#include "version.h" +#include "ui.h" +#include "modules.h" +#include "atomic_copy.h" +#include "suspend2.h" +#include "checksum.h" +#include "pageflags.h" +#include "debug_pagealloc.h" +#include "storage.h" + +#include + +volatile static int state1 __nosavedata = 0; +volatile static int state2 __nosavedata = 0; +volatile static int state3 __nosavedata = 0; +volatile static int io_speed_save[2][2] 
__nosavedata; + +static dyn_pageflags_t __nosavedata origmap; +static dyn_pageflags_t __nosavedata copymap; +static unsigned long __nosavedata origoffset; +static unsigned long __nosavedata copyoffset; +static int __nosavedata loop; +static __nosavedata int o_zone_num, c_zone_num; +static __nosavedata int is_resuming; + +__nosavedata char resume_commandline[COMMAND_LINE_SIZE]; + +static atomic_t atomic_copy_hold; +static atomic_t restore_thread_ready; + +suspend2_saved_context_t suspend2_saved_context; /* temporary storage */ + +struct zone_data { + unsigned long start_pfn; + unsigned long end_pfn; + int is_highmem; +}; + +static __nosavedata struct zone_data *zone_nosave; +static __nosavedata int num_zones; + +/* + * Zone information might be overwritten during the copy back, + * so we copy the fields we need to a non-conflicting page and + * use it. + */ +static void init_nosave_zone_table(void) +{ + struct zone *zone; + + zone_nosave = (struct zone_data *) suspend_get_nonconflicting_pages(0); + + BUG_ON(!zone_nosave); + + for_each_zone(zone) { + if (zone->spanned_pages) { + zone_nosave[num_zones].start_pfn = zone->zone_start_pfn; + zone_nosave[num_zones].end_pfn = zone->zone_start_pfn + + zone->spanned_pages - 1; + zone_nosave[num_zones].is_highmem = is_highmem(zone); + } + num_zones++; + } +} + +/* For Suspend2, where this all has to be inlined */ +static unsigned long inline __get_next_bit_on(dyn_pageflags_t bitmap, int *zone_num, long counter) +{ + unsigned long *ul_ptr = NULL; + int reset_ul_ptr = 1; + BUG_ON(counter == max_pfn); + + if (counter == -1) { + *zone_num = 0; + + /* + * Test the end because the start can validly + * be zero. 
+ */ + while (!zone_nosave[*zone_num].end_pfn) + (*zone_num)++; + counter = zone_nosave[*zone_num].start_pfn - 1; + } + + do { + counter++; + if (counter > zone_nosave[*zone_num].end_pfn) { + (*zone_num)++; + while (!zone_nosave[*zone_num].end_pfn && *zone_num < num_zones) + (*zone_num)++; + + if (*zone_num == num_zones) + return max_pfn; + counter = zone_nosave[*zone_num].start_pfn; + reset_ul_ptr = 1; + } else + if (!(counter & BIT_NUM_MASK)) + reset_ul_ptr = 1; + if (reset_ul_ptr) { + reset_ul_ptr = 0; + ul_ptr = PAGE_UL_PTR(bitmap, *zone_num, + (counter - zone_nosave[*zone_num].start_pfn)); + if (!*ul_ptr) { + counter += BIT_NUM_MASK - 1; + continue; + } + } + } while((counter < max_pfn) && !test_bit(PAGEBIT(counter), ul_ptr)); + return counter; +} + +/** + * copyback_prepare + * Functionality : Preparatory steps for copying the original kernel back. + * Called From : do_suspend2_lowlevel + **/ + +static void copyback_prepare(void) +{ + int loop; + + state1 = suspend_action; + state2 = suspend_debug_state; + state3 = console_loglevel; + for (loop = 0; loop < 4; loop++) + io_speed_save[loop/2][loop%2] = + suspend_io_time[loop/2][loop%2]; + + init_nosave_zone_table(); + + memcpy(resume_commandline, saved_command_line, COMMAND_LINE_SIZE); + + suspend_map_atomic_copy_pages(); + + suspend_deactivate_storage(1); + + /* Arch specific preparation */ + suspend2_arch_pre_copyback(); + + device_suspend(PMSG_FREEZE); + local_irq_disable(); /* irqs might have been re-enabled on us by buggy drivers */ + + device_power_down(PMSG_FREEZE); + + barrier(); + mb(); +} + +/* + * copyback_post + * Functionality : Steps taken after copying back the original kernel at + * resume. + * Key Assumptions : Will be able to read back secondary pagedir (if + * applicable). 
+ * Called From : do_suspend2_lowlevel + */ + +static void copyback_post(void) +{ + int loop; + + /* Arch specific code */ + suspend2_arch_post_copyback(); + + suspend_action = state1; + suspend_debug_state = state2; + console_loglevel = state3; + + for (loop = 0; loop < 4; loop++) + suspend_io_time[loop/2][loop%2] = + io_speed_save[loop/2][loop%2]; + + set_suspend_state(SUSPEND_NOW_RESUMING); + set_suspend_state(SUSPEND_PAGESET2_NOT_LOADED); + + suspend_unmap_atomic_copy_pages(); + + local_irq_disable(); + device_power_up(); + local_irq_enable(); + + device_resume(); + + if (pm_ops && pm_ops->finish && suspend_powerdown_method > 3) + pm_ops->finish(suspend_powerdown_method); + + if (suspend_activate_storage(1)) + panic("Failed to reactivate our storage."); + + userui_redraw(); + + check_shift_keys(1, "About to reload secondary pagedir."); + + read_pageset2(0); + clear_suspend_state(SUSPEND_PAGESET2_NOT_LOADED); + + suspend_prepare_status(DONT_CLEAR_BAR, "Cleaning up..."); +} + + +/* + * suspend_pre_copy + * Functionality : Steps taken prior to saving CPU state and the image + * itself. + * Called From : do_suspend2_lowlevel + */ + +static void suspend_pre_copy(void) +{ + suspend2_arch_pre_copy(); + + device_suspend(PMSG_FREEZE); + + mb(); + barrier(); + + local_irq_disable(); + + device_power_down(PMSG_FREEZE); +} + +/* + * suspend_post_copy + * Functionality : Steps taken after saving CPU state to save the + * image and powerdown/reboot or recover on failure. + * Key Assumptions : save_image returns zero on success; otherwise we need to + * clean up and exit. The state on exiting this routine + * should be essentially the same as if we have suspended, + * resumed and reached the end of copyback_post. 
+ * Called From : do_suspend2_lowlevel + */ +extern void suspend_power_down(void); + +static void suspend_post_copy(void) +{ + suspend2_arch_post_copy(); + + if (!save_image_part1()) { + int temp_result; + + suspend_power_down(); + + temp_result = read_pageset2(1); + + /* If that failed, we're sunk. Panic! */ + if (temp_result) + panic("Attempt to reload pagedir 2 failed. Try rebooting."); + } + + if (!test_result_state(SUSPEND_ABORT_REQUESTED) && + !test_action_state(SUSPEND_TEST_FILTER_SPEED) && + !test_action_state(SUSPEND_TEST_BIO) && + suspend_powerdown_method != PM_SUSPEND_MEM) + printk(KERN_EMERG name_suspend + "Suspend failed, trying to recover...\n"); + barrier(); + mb(); +} + +/* + * copyback_low + */ + +static inline void copyback_low(void) +{ + unsigned long *origpage; + unsigned long *copypage; + + o_zone_num = 0; + c_zone_num = 0; + + origmap = pageset1_map; + copymap = pageset1_copy_map; + + origoffset = __get_next_bit_on(origmap, &o_zone_num, -1); + copyoffset = __get_next_bit_on(copymap, &c_zone_num, -1); + + while (origoffset < max_pfn) { + if (!zone_nosave[o_zone_num].is_highmem) { + origpage = (unsigned long *) __va(origoffset << PAGE_SHIFT); + copypage = (unsigned long *) __va(copyoffset << PAGE_SHIFT); + + loop = (PAGE_SIZE / sizeof(unsigned long)) - 1; + + while (loop >= 0) { + *(origpage + loop) = *(copypage + loop); + loop--; + } + } + + origoffset = __get_next_bit_on(origmap, &o_zone_num, origoffset); + copyoffset = __get_next_bit_on(copymap, &c_zone_num, copyoffset); + } +} + +/* + * copyback_high + */ +static void copyback_high(void) +{ + unsigned long *origpage; + unsigned long *copypage; + + origoffset = get_next_bit_on(origmap, -1); + copyoffset = get_next_bit_on(copymap, -1); + + while (origoffset < max_pfn) { + if (PageHighMem(pfn_to_page(origoffset))) { + origpage = (unsigned long *) kmap_atomic(pfn_to_page(origoffset), KM_USER1); + copypage = (unsigned long *) __va(copyoffset << PAGE_SHIFT); + + memcpy(origpage, copypage, 
PAGE_SIZE); + + kunmap_atomic(origpage, KM_USER1); + } + + origoffset = get_next_bit_on(origmap, origoffset); + copyoffset = get_next_bit_on(copymap, copyoffset); + } +} + +void do_suspend2_lowlevel(int resume) +{ + is_resuming = resume; + + if (resume) { + copyback_prepare(); + + suspend2_arch_save_processor_context(); + + copyback_low(); /* 0 = use logical addresses */ + + suspend2_arch_restore_processor_context(); + } else { + suspend_pre_copy(); + + suspend2_arch_save_processor_context(); + } + + if (is_resuming) { + suspend2_arch_flush_caches(); + + /* Now we are running with our old stack, and with registers copied + * from suspend time. Let's copy back those remaining highmem pages. */ + copyback_high(); + suspend2_arch_flush_caches(); + + touch_softlockup_watchdog(); + + suspend_checksum_print_differences(); + + copyback_post(); + + } else { + suspend_post_copy(); /* If everything goes okay, this function does not return */ + } +} + +/* suspend_copy_pageset1 + * + * Description: Make the atomic copy of pageset1. We can't use copy_page (as we + * once did) because we can't be sure what side effects it has. On + * my old Duron, with 3DNOW, kernel_fpu_begin increments preempt + * count, making our preempt count at resume time 4 instead of 3. + * + * We don't want to call kmap_atomic unconditionally because it has + * the side effect of incrementing the preempt count, which will + * leave it one too high post resume (the page containing the + * preempt count will be copied after its incremented. This is + * essentially the same problem. 
+ */ + +void suspend_copy_pageset1(void) +{ + unsigned long i, source_index, dest_index; + + source_index = get_next_bit_on(pageset1_map, -1); + dest_index = get_next_bit_on(pageset1_copy_map, -1); + + for (i = 0; i < pagedir1.pageset_size; i++) { + unsigned long *origvirt, *copyvirt; + struct page *origpage; + int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1; + + origpage = pfn_to_page(source_index); + + copyvirt = (unsigned long *) page_address(pfn_to_page(dest_index)); + + if (PageHighMem(origpage)) + origvirt = kmap_atomic(origpage, KM_USER1); + else + origvirt = page_address(origpage); + + while (loop >= 0) { + *(copyvirt + loop) = *(origvirt + loop); + loop--; + } + + if (PageHighMem(origpage)) + kunmap_atomic(origvirt, KM_USER1); + + source_index = get_next_bit_on(pageset1_map, source_index); + dest_index = get_next_bit_on(pageset1_copy_map, dest_index); + } +} + +int __suspend_atomic_restore(void *data) +{ + struct page *my_thread_info = virt_to_page(current->thread_info); + + BUG_ON(PagePageset1(my_thread_info)); + BUG_ON(THREAD_SIZE > PAGE_SIZE && PagePageset1(++my_thread_info)); + + atomic_set(&restore_thread_ready, 1); + + while atomic_read(&atomic_copy_hold) + yield(); + + suspend_prepare_status(DONT_CLEAR_BAR, "Copying original kernel back"); + + /* + * If you're hitting this BUG_ON, you have a process that's + * not freezing which is started prior to this. 
+ */ + BUG_ON(freeze_processes()); + + do_suspend2_lowlevel(1); + + printk("Returned from do_suspend2_lowlevel when resuming?!"); + BUG(); + + return 0; +} + +void suspend_atomic_restore(void) +{ + struct task_struct *work_thread; + + disable_nonboot_cpus(); + + yield(); + + set_suspend_state(SUSPEND_FORK_COPYBACK_THREAD); + BUG_ON(atomic_read(&restore_thread_ready)); + + atomic_set(&atomic_copy_hold, 1); + + /* Now start the new thread */ + work_thread = kthread_run(__suspend_atomic_restore, 0, "kcopyback"); + BUG_ON(IS_ERR(work_thread)); + + while (!atomic_read(&restore_thread_ready)) + yield(); + + atomic_set(&atomic_copy_hold, 0); + + while(1) { + try_to_freeze(); + yield(); + } +} diff -urN oldtree/kernel/power/atomic_copy.h newtree/kernel/power/atomic_copy.h --- oldtree/kernel/power/atomic_copy.h 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/atomic_copy.h 2006-02-18 15:24:31.379818472 +0000 @@ -0,0 +1,4 @@ +extern inline void move_stack_to_nonconflicing_area(void); +extern int save_image_part1(void); +extern void suspend_atomic_restore(void); + diff -urN oldtree/kernel/power/block_io.h newtree/kernel/power/block_io.h --- oldtree/kernel/power/block_io.h 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/block_io.h 2006-02-18 15:24:31.380818320 +0000 @@ -0,0 +1,76 @@ +/* + * block_io.h + * + * Copyright 2004-2005 Nigel Cunningham + * + * Distributed under GPLv2. + * + * This file contains declarations for functions exported from + * block_io.c, which contains low level io functions. + */ + +#include +#include "extent.h" + +/* + * submit_params + * + * The structure we use for tracking submitted I/O. 
+ */ +struct submit_params { + swp_entry_t swap_address; + struct page *page; + struct block_device *dev; + sector_t block[MAX_BUF_PER_PAGE]; + int readahead_index; + struct submit_params *next; + int printme; +}; + +struct suspend_bdev_info { + struct block_device *bdev; + dev_t dev_t; + int bmap_shift; + int blocks_per_page; +}; + +/* + * Our exported interface so the swapwriter and filewriter don't + * need these functions duplicated. + */ +struct suspend_bio_ops { + int (*submit_io) (int rw, + struct submit_params *submit_info, int syncio); + int (*bdev_page_io) (int rw, struct block_device *bdev, long pos, + struct page *page); + int (*rw_page) (int rw, struct page *page, int readahead_index, + int sync); + void (*wait_on_readahead) (int readahead_index); + void (*check_io_stats) (void); + void (*reset_io_stats) (void); + void (*finish_all_io) (void); + int (*prepare_readahead) (int index); + void (*cleanup_readahead) (int index); + struct page ** readahead_pages; + int (*readahead_ready) (int readahead_index); + int *need_extra_next; + int (*forward_one_page) (void); + void (*set_devinfo) (struct suspend_bdev_info *info); + int (*read_init) (int stream_number); + int (*read_chunk) (struct page *buffer_page, int sync); + int (*read_cleanup) (void); + int (*write_init) (int stream_number); + int (*write_chunk) (struct page *buffer_page); + int (*write_cleanup) (void); + int (*read_header_chunk) (char *buffer, int buffer_size); + int (*write_header_chunk) (char *buffer, int buffer_size); + int (*write_header_chunk_finish) (void); +}; + +extern struct suspend_bio_ops suspend_bio_ops; + +extern char *suspend_writer_buffer; +extern int suspend_writer_buffer_posn; +extern int suspend_read_fd; +extern struct extent_iterate_saved_state suspend_writer_posn_save[3]; +extern struct extent_iterate_state suspend_writer_posn; diff -urN oldtree/kernel/power/checksum.h newtree/kernel/power/checksum.h --- oldtree/kernel/power/checksum.h 1970-01-01 00:00:00.000000000 +0000 +++ 
newtree/kernel/power/checksum.h	2006-02-18 15:24:31.380818320 +0000
@@ -0,0 +1,18 @@
+/*
+ * Image checksumming hooks.
+ *
+ * The real implementations are only built (suspend_checksums.o) when
+ * CONFIG_SUSPEND2_CHECKSUMMING is selected; otherwise callers get the
+ * empty inline stubs below.
+ */
+#ifdef CONFIG_SUSPEND2_CHECKSUMMING
+extern void suspend_verify_checksums(void);
+extern void suspend_checksum_calculate_checksums(void);
+extern void suspend_checksum_print_differences(void);
+extern int suspend_allocate_checksum_pages(void);
+#else
+static inline void suspend_verify_checksums(void) { }
+static inline void suspend_checksum_calculate_checksums(void) { }
+static inline void suspend_checksum_print_differences(void) { }
+static inline int suspend_allocate_checksum_pages(void) { return 0; }
+#endif
diff -urN oldtree/kernel/power/compression.c newtree/kernel/power/compression.c
--- oldtree/kernel/power/compression.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/compression.c	2006-02-18 15:24:31.382818016 +0000
@@ -0,0 +1,638 @@
+/*
+ * kernel/power/suspend_core/compression.c
+ *
+ * Copyright (C) 2003-2005 Nigel Cunningham
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file contains data compression routines for suspend,
+ * using LZH compression.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include "suspend2.h"
+#include "modules.h"
+#include "proc.h"
+#include "suspend2_common.h"
+#include "io.h"
+
+#define S2C_WRITE 0
+#define S2C_READ 1
+
+static int suspend_expected_compression = 0;
+
+static struct suspend_module_ops suspend_compression_ops;
+static struct suspend_module_ops *next_driver;
+
+static char suspend_compressor_name[32];
+static struct crypto_tfm *suspend_compressor_transform;
+
+static u8 *local_buffer = NULL;
+static u8 *page_buffer = NULL;
+static unsigned int bufofs;
+
+static int position = 0;
+
+/* ---- Local buffer management ---- */
+
+/* allocate_local_buffer
+ *
+ * Description:	Allocates a page of memory for buffering output.
+ * Returns:	Int: Zero if successful, -ENOMEM otherwise.
+ */ + +static int allocate_local_buffer(void) +{ + if (!local_buffer) { + local_buffer = (char *) get_zeroed_page(GFP_ATOMIC); + + if (!local_buffer) { + printk(KERN_ERR + "Failed to allocate the local buffer for " + "suspend2 compression driver.\n"); + return -ENOMEM; + } + } + + if (!page_buffer) { + page_buffer = (char *) get_zeroed_page(GFP_ATOMIC); + + if (!page_buffer) { + printk(KERN_ERR + "Failed to allocate the page buffer for " + "suspend2 compression driver.\n"); + return -ENOMEM; + } + } + + return 0; +} + +/* free_local_buffer + * + * Description: Frees memory allocated for buffering output. + */ + +static inline void free_local_buffer(void) +{ + if (local_buffer) + free_page((unsigned long) local_buffer); + + local_buffer = NULL; + + if (page_buffer) + free_page((unsigned long) page_buffer); + + page_buffer = NULL; +} + +/* suspend_crypto_cleanup + * + * Description: Frees memory allocated for our labours. + */ + +static void suspend_crypto_cleanup(void) +{ + if (suspend_compressor_transform) { + crypto_free_tfm(suspend_compressor_transform); + suspend_compressor_transform = NULL; + } +} + +/* suspend_crypto_prepare + * + * Description: Prepare to do some work by allocating buffers and transforms. + * Returns: Int: Zero if successful, -ENONEM otherwise. + */ + +static int suspend_compress_crypto_prepare(int mode) +{ + if (!*suspend_compressor_name) { + printk("Suspend2: Compression enabled but no compressor name set.\n"); + return 1; + } + + if (!(suspend_compressor_transform = crypto_alloc_tfm(suspend_compressor_name, 0))) { + printk("Suspend2: Failed to initialise the %s compression transform.\n", + suspend_compressor_name); + return 1; + } + + return 0; +} + +/* ---- Exported functions ---- */ + +/* write_init() + * + * Description: Allocate buffers and prepare to compress data. + * Arguments: Stream_number: Ignored. + * Returns: Zero on success, -ENOMEM if unable to vmalloc. 
+ */ + +static int suspend_compress_write_init(int stream_number) +{ + int result; + + next_driver = suspend_get_next_filter(&suspend_compression_ops); + + if (!next_driver) { + printk("Compression Driver: Argh! No one wants my output!"); + return -ECHILD; + } + + if ((result = suspend_compress_crypto_prepare(S2C_WRITE))) { + return result; + } + + if ((result = allocate_local_buffer())) + return result; + + /* Only reset the stats if starting to write an image */ + if (stream_number == 2) + bytes_in = bytes_out = 0; + + bufofs = 0; + + position = 0; + + return 0; +} + +/* suspend_compress_write() + * + * Description: Helper function for write_chunk. Write the compressed data. + * Arguments: u8*: Output buffer to be written. + * unsigned int: Length of buffer. + * Return: int: Result to be passed back to caller. + */ + +static int suspend_compress_write (u8 *buffer, unsigned int len) +{ + int ret; + + bytes_out += len; + + while (len + bufofs > PAGE_SIZE) { + unsigned int chunk = PAGE_SIZE - bufofs; + memcpy (local_buffer + bufofs, buffer, chunk); + buffer += chunk; + len -= chunk; + bufofs = 0; + if ((ret = next_driver->ops.filter.write_chunk(virt_to_page(local_buffer))) < 0) + return ret; + } + memcpy (local_buffer + bufofs, buffer, len); + bufofs += len; + return 0; +} + +/* suspend_compress_write_chunk() + * + * Description: Compress a page of data, buffering output and passing on + * filled pages to the next module in the pipeline. + * Arguments: Buffer_page: Pointer to a buffer of size PAGE_SIZE, + * containing data to be compressed. + * Returns: 0 on success. Otherwise the error is that returned by later + * modules, -ECHILD if we have a broken pipeline or -EIO if + * zlib errs. 
+ */
+
+static int suspend_compress_write_chunk(struct page *buffer_page)
+{
+	int ret;
+	unsigned int len;
+	u16 len_written;
+	char *buffer_start;
+
+	/* No transform allocated: hand the page to the next module untouched. */
+	if (!suspend_compressor_transform)
+		return next_driver->ops.filter.write_chunk(buffer_page);
+
+	buffer_start = kmap(buffer_page);
+
+	bytes_in += PAGE_SIZE;
+
+	len = PAGE_SIZE;
+
+	ret = crypto_comp_compress(suspend_compressor_transform,
+			buffer_start, PAGE_SIZE,
+			page_buffer, &len);
+
+	if (ret) {
+		printk("Compression failed.\n");
+		goto failure;
+	}
+
+	len_written = (u16) len;
+
+	/*
+	 * Each record is: 2-byte length, the running stream position (checked
+	 * again by suspend_compress_read_chunk), then the payload.
+	 */
+	if ((ret = suspend_compress_write((u8 *)&len_written, 2)) >= 0) {
+		if ((ret = suspend_compress_write((u8 *) &position, sizeof(position)))) {
+			/* Leave through the common exit so the page gets unmapped. */
+			ret = -EIO;
+			goto failure;
+		}
+		if (len < PAGE_SIZE) { // some compression
+			position += len;
+			ret = suspend_compress_write(page_buffer, len);
+		} else {
+			ret = suspend_compress_write(buffer_start, PAGE_SIZE);
+			position += PAGE_SIZE;
+		}
+	}
+	position += 2 + sizeof(int);
+
+
+failure:
+	kunmap(buffer_page);
+	return ret;
+}
+
+/* write_cleanup()
+ *
+ * Description:	Write unflushed data and free workspace.
+ * Returns:	Result of writing last page.
+ */
+
+static int suspend_compress_write_cleanup(void)
+{
+	int ret = 0;
+
+	if (suspend_compressor_transform)
+		ret = next_driver->ops.filter.write_chunk(virt_to_page(local_buffer));
+
+	suspend_crypto_cleanup();
+	free_local_buffer();
+
+	return ret;
+}
+
+/* read_init()
+ *
+ * Description:	Prepare to read a new stream of data.
+ * Arguments:	int: Section of image about to be read.
+ * Returns:	int: Zero on success, error number otherwise.
+ */
+
+static int suspend_compress_read_init(int stream_number)
+{
+	int result;
+
+	next_driver = suspend_get_next_filter(&suspend_compression_ops);
+
+	if (!next_driver) {
+		printk("Compression Driver: Argh! No one wants "
+			"to feed me data!");
+		return -ECHILD;
+	}
+
+	if ((result = suspend_compress_crypto_prepare(S2C_READ)))
+		return result;
+
+	if ((result = allocate_local_buffer()))
+		return result;
+
+	/* Mark the local buffer as fully consumed so the first read refills it. */
+	bufofs = PAGE_SIZE;
+
+	position = 0;
+
+	return 0;
+}
+
+/* suspend_compress_read()
+ *
+ * Description:	Read data into compression buffer.
+ * Arguments:	u8 *:		Address of the buffer.
+ * 		unsigned int:	Length
+ * Returns:	int:		Result of reading the image chunk.
+ */
+
+static int suspend_compress_read (u8 *buffer, unsigned int len)
+{
+	int ret;
+
+	while (len + bufofs > PAGE_SIZE) {
+		unsigned int chunk = PAGE_SIZE - bufofs;
+		memcpy(buffer, local_buffer + bufofs, chunk);
+		buffer += chunk;
+		len -= chunk;
+		bufofs = 0;
+		if ((ret = next_driver->ops.filter.read_chunk(
+				virt_to_page(local_buffer), SUSPEND_SYNC)) < 0) {
+			return ret;
+		}
+	}
+	memcpy (buffer, local_buffer + bufofs, len);
+	bufofs += len;
+	return 0;
+}
+
+/* suspend_compress_read_chunk()
+ *
+ * Description:	Retrieve data from later modules and decompress it until the
+ * 		input buffer is filled.
+ * Arguments:	Buffer_start: 	Pointer to a buffer of size PAGE_SIZE.
+ * 		Sync:		Whether the previous module (or core) wants its
+ * 				data synchronously.
+ * Returns:	Zero if successful. Error condition from me or from downstream
+ * 		on failure.
+ */
+
+static int suspend_compress_read_chunk(struct page *buffer_page, int sync)
+{
+	int ret, position_saved;
+	unsigned int len;
+	u16 len_written;
+	char *buffer_start;
+
+	/*
+	 * NOTE(review): the pass-through ignores @sync and always asks for
+	 * SUSPEND_ASYNC - confirm this is intended.
+	 */
+	if (!suspend_compressor_transform)
+		return next_driver->ops.filter.read_chunk(buffer_page, SUSPEND_ASYNC);
+
+	/*
+	 * All our reads must be synchronous - we can't decompress
+	 * data that hasn't been read yet.
+	 */
+
+	buffer_start = kmap(buffer_page);
+
+	if ((ret = suspend_compress_read ((u8 *)&len_written, 2)) >= 0) {
+		len = (unsigned int) len_written;
+		ret = suspend_compress_read((u8 *) &position_saved, sizeof(position_saved));
+		if (ret)
+			goto out;	/* must still kunmap the page */
+
+		if (position != position_saved) {
+			printk("Position saved (%d) != position I'm at now (%d).\n",
+					position_saved, position);
+			BUG_ON(1);
+		}
+		if (len >= PAGE_SIZE) { // uncompressed
+			ret = suspend_compress_read(buffer_start, PAGE_SIZE);
+			if (ret)
+				goto out;	/* must still kunmap the page */
+
+			position += PAGE_SIZE;
+		} else { // compressed
+			if ((ret = suspend_compress_read(page_buffer, len)) >= 0) {
+				unsigned int outlen = PAGE_SIZE;
+				/* Important note.
+				 *
+				 * For Deflate, decompression return values may represent
+				 * errors. Deflate complains when everything is alright, so
+				 * we ignore the errors unless the number of output bytes is
+				 * not PAGE_SIZE.
+				 */
+				crypto_comp_decompress(suspend_compressor_transform,
+						page_buffer, len,
+						buffer_start, &outlen);
+				if (outlen != PAGE_SIZE) {
+					printk("Decompression yielded %u bytes instead of %lu.\n",
+						outlen, (unsigned long) PAGE_SIZE);
+					ret = -EIO;
+				} else
+					ret = 0;
+			}
+			position += len;
+		}
+		position += 2 + sizeof(int);
+	} else
+		printk("Compress_read returned %d.", ret);
+out:
+	kunmap(buffer_page);
+	return ret;
+}
+
+/* read_cleanup()
+ *
+ * Description:	Clean up after reading part or all of a stream of data.
+ * Returns:	int: Always zero. Never fails.
+ */
+
+static int suspend_compress_read_cleanup(void)
+{
+	suspend_crypto_cleanup();
+	free_local_buffer();
+	return 0;
+}
+
+/* suspend_compress_print_debug_stats
+ *
+ * Description:	Print information to be recorded for debugging purposes into a
+ * 		buffer.
+ * Arguments:	buffer: Pointer to a buffer into which the debug info will be
+ * 			printed.
+ * 		size:	Size of the buffer.
+ * Returns:	Number of characters written to the buffer.
+ */
+
+static int suspend_compress_print_debug_stats(char *buffer, int size)
+{
+	int pages_in = bytes_in >> PAGE_SHIFT,
+		pages_out = bytes_out >> PAGE_SHIFT;
+	int len;
+
+	/* Output the compression ratio achieved. */
+	len = snprintf_used(buffer, size, "- Compressor %s enabled.\n",
+			suspend_compressor_name);
+	if (pages_in)
+		len+= snprintf_used(buffer+len, size - len,
+		  " Compressed %ld bytes into %ld (%d percent compression).\n",
+		  bytes_in, bytes_out, (pages_in - pages_out) * 100 / pages_in);
+	return len;
+}
+
+/* compression_memory_needed
+ *
+ * Description:	Tell the caller how much memory we need to operate during
+ * 		suspend/resume.
+ * Returns:	Unsigned long. Maximum number of bytes of memory required for
+ * 		operation.
+ */
+
+static unsigned long suspend_compress_memory_needed(void)
+{
+	/* allocate_local_buffer() takes two pages: local_buffer and page_buffer. */
+	return 2 * PAGE_SIZE;
+}
+
+/* suspend_compress_storage_needed
+ *
+ * Description:	Tell the caller how many bytes
+ * 		suspend_compress_save_config_info() writes: four unsigned
+ * 		longs followed by the NUL-terminated compressor name.
+ * Returns:	Unsigned long. Number of bytes of config data.
+ */
+
+static unsigned long suspend_compress_storage_needed(void)
+{
+	return 4 * sizeof(unsigned long) + strlen(suspend_compressor_name) + 1;
+}
+
+/* suspend_compress_save_config_info
+ *
+ * Description:	Save information needed when reloading the image at resume time.
+ * Arguments:	Buffer:		Pointer to a buffer of size PAGE_SIZE.
+ * Returns:	Number of bytes used for saving our data.
+ */
+
+static int suspend_compress_save_config_info(char *buffer)
+{
+	int namelen = strlen(suspend_compressor_name) + 1;
+	int total_len;
+
+	*((unsigned long *) buffer) = bytes_in;
+	*((unsigned long *) (buffer + 1 * sizeof(unsigned long))) = bytes_out;
+	*((unsigned long *) (buffer + 2 * sizeof(unsigned long))) = suspend_expected_compression;
+	*((unsigned long *) (buffer + 3 * sizeof(unsigned long))) = namelen;
+	strncpy(buffer + 4 * sizeof(unsigned long), suspend_compressor_name, namelen);
+	total_len = 4 * sizeof(unsigned long) + namelen;
+	return total_len;
+}
+
+/* suspend_compress_load_config_info
+ *
+ * Description:	Reload information needed for decompressing the image at
+ * 		resume time.
+ * Arguments:	Buffer:		Pointer to the start of the data.
+ * 		Size:		Number of bytes that were saved.
+ */
+
+static void suspend_compress_load_config_info(char *buffer, int size)
+{
+	int namelen;
+	const int max_namelen = sizeof(suspend_compressor_name) - 1;
+
+	bytes_in = *((unsigned long *) buffer);
+	bytes_out = *((unsigned long *) (buffer + 1 * sizeof(unsigned long)));
+	suspend_expected_compression = *((unsigned long *) (buffer + 2 * sizeof(unsigned long)));
+	namelen = *((unsigned long *) (buffer + 3 * sizeof(unsigned long)));
+
+	/*
+	 * The length comes from the saved image; never let it overrun
+	 * suspend_compressor_name, and always leave the name terminated.
+	 */
+	if (namelen < 0 || namelen > max_namelen)
+		namelen = max_namelen;
+	strncpy(suspend_compressor_name, buffer + 4 * sizeof(unsigned long), namelen);
+	suspend_compressor_name[namelen] = '\0';
+	return;
+}
+
+/* suspend_expected_compression_ratio
+ *
+ * Description:	Returns the expected ratio between data passed into this module
+ * 		and the amount of data output when writing.
+ * Returns:	100 if the module is disabled. Otherwise the value set by the
+ * 		user via our proc entry.
+ */
+
+int suspend_expected_compression_ratio(void)
+{
+	if (suspend_compression_ops.disabled)
+		return 100;
+	else
+		return 100 - suspend_expected_compression;
+}
+
+/* Disable the module whenever no compressor name has been configured. */
+static void suspend_compressor_disable_if_empty(void)
+{
+	suspend_compression_ops.disabled = !(*suspend_compressor_name);
+}
+
+static int suspend_compress_initialise(int starting_cycle)
+{
+	if (starting_cycle)
+		suspend_compressor_disable_if_empty();
+
+	return 0;
+}
+/*
+ * data for our proc entries.
+ */
+
+static struct suspend_proc_data proc_params[] = {
+	{
+		.filename			= "expected_compression",
+		.permissions			= PROC_RW,
+		.type				= SUSPEND_PROC_DATA_INTEGER,
+		.data = {
+			.integer = {
+				.variable	= &suspend_expected_compression,
+				.minimum	= 0,
+				.maximum	= 99,
+			}
+		}
+	},
+
+	{
+		.filename			= "disable_compression",
+		.permissions			= PROC_RW,
+		.type				= SUSPEND_PROC_DATA_INTEGER,
+		.data = {
+			.integer = {
+				.variable	= &suspend_compression_ops.disabled,
+				.minimum	= 0,
+				.maximum	= 1,
+			}
+		}
+	},
+
+	{
+		.filename			= "compressor",
+		.permissions			= PROC_RW,
+		.type				= SUSPEND_PROC_DATA_STRING,
+		.data = {
+			.string = {
+				.variable	= suspend_compressor_name,
+				.max_length	= 31,
+			}
+		},
+		.write_proc			= &suspend_compressor_disable_if_empty,
+	}
+};
+
+/*
+ * Ops structure.
+ */
+
+static struct suspend_module_ops suspend_compression_ops = {
+	.type			= FILTER_PLUGIN,
+	.name			= "Suspend2 Compressor",
+	.module			= THIS_MODULE,
+	.memory_needed 		= suspend_compress_memory_needed,
+	.print_debug_info	= suspend_compress_print_debug_stats,
+	.save_config_info	= suspend_compress_save_config_info,
+	.load_config_info	= suspend_compress_load_config_info,
+	.storage_needed		= suspend_compress_storage_needed,
+
+	.initialise		= suspend_compress_initialise,
+
+	.write_init		= suspend_compress_write_init,
+	.write_cleanup		= suspend_compress_write_cleanup,
+	.read_init		= suspend_compress_read_init,
+	.read_cleanup		= suspend_compress_read_cleanup,
+
+	.ops = {
+		.filter = {
+			.write_chunk	= suspend_compress_write_chunk,
+			.read_chunk	= suspend_compress_read_chunk,
+		}
+	}
+};
+
+/* ---- Registration ---- */
+
+/* Register the module, then one proc file per proc_params entry. */
+static __init int suspend_compress_load(void)
+{
+	int result;
+	int i, numfiles = sizeof(proc_params) / sizeof(struct suspend_proc_data);
+
+	printk("Suspend2 Compression Driver loading.\n");
+	if (!(result = suspend_register_module(&suspend_compression_ops))) {
+		for (i=0; i< numfiles; i++)
+			suspend_register_procfile(&proc_params[i]);
+	} else
+		printk("Suspend2 Compression Driver unable to register!\n");
+	return result;
+}
+
+#ifdef MODULE
+/* Undo suspend_compress_load(): drop the proc files, then the module. */
+static __exit void suspend_compress_unload(void)
+{
+	int i, numfiles = sizeof(proc_params) / sizeof(struct suspend_proc_data);
+
+	printk("Suspend2 Compression Driver unloading.\n");
+	for (i=0; i< numfiles; i++)
+		suspend_unregister_procfile(&proc_params[i]);
+	suspend_unregister_module(&suspend_compression_ops);
+}
+
+
+module_init(suspend_compress_load);
+module_exit(suspend_compress_unload);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Nigel Cunningham");
+MODULE_DESCRIPTION("Compression Support for Suspend2");
+#else
+late_initcall(suspend_compress_load);
+#endif
diff -urN oldtree/kernel/power/debug_pagealloc.c newtree/kernel/power/debug_pagealloc.c
--- oldtree/kernel/power/debug_pagealloc.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/debug_pagealloc.c	2006-02-18 15:24:31.383817864 +0000
@@ -0,0 +1,111 @@
+#include
+#ifdef CONFIG_DEBUG_PAGEALLOC
+#include
+#include
+
+#include "pageflags.h"
+#include "suspend2.h"
+#include "pagedir.h"
+
+
+extern pte_t *lookup_address(unsigned long address);
+
+extern void kernel_map_pages(struct page *page, int numpages, int enable);
+
+static int page_is_kernel_mapped(struct page *page)
+{
+	pte_t *kpte;
+	unsigned long address;
+
+	if (PageHighMem(page))
+		return 0;
+
+	address = (unsigned long)page_address(page);
+
+	kpte = lookup_address(address);
+	if (!kpte)
+		return 0;
+
+	if (pte_same(*kpte, mk_pte(page, PAGE_KERNEL)))
+		return 1;
+
+	return 0;
+}
+
+/* Returns whether it was already in the requested state */
+int suspend_map_kernel_page(struct page *page, int enable)
+{
+	int is_already_mapped = page_is_kernel_mapped(page);
+
+	if (enable == is_already_mapped)
+		return 1;
+
+	kernel_map_pages(page, 1, enable);
+
+	return 0;
+}
+
+/*
+ * suspend_map_atomic_copy_pages
+ *
+ * When DEBUG_PAGEALLOC is enabled, we need to map the pages before
+ * an atomic copy.
+ */
+void suspend_map_atomic_copy_pages(void)
+{
+	int src_pfn = -1, dst_pfn = -1;
+	int n;
+
+	for (n = 0; n < pagedir1.pageset_size; n++) {
+		struct page *src, *dst;
+
+		/* Walk both bitmaps in step: the n-th source page is paired
+		 * with the n-th copy page. */
+		src_pfn = get_next_bit_on(pageset1_map, src_pfn);
+		dst_pfn = get_next_bit_on(pageset1_copy_map, dst_pfn);
+
+		src = pfn_to_page(src_pfn);
+		dst = pfn_to_page(dst_pfn);
+
+		/*
+		 * Pages we had to map ourselves are flagged so that
+		 * suspend_unmap_atomic_copy_pages() can undo the mapping,
+		 * except while resuming. Highmem source pages are left
+		 * alone.
+		 */
+		if (!PageHighMem(src) &&
+		    !suspend_map_kernel_page(src, 1) &&
+		    !test_suspend_state(SUSPEND_NOW_RESUMING))
+			SetPageUnmap(src);
+
+		if (!suspend_map_kernel_page(dst, 1) &&
+		    !test_suspend_state(SUSPEND_NOW_RESUMING))
+			SetPageUnmap(dst);
+	}
+}
+
+/*
+ * suspend_unmap_atomic_copy_pages
+ *
+ * With DEBUG_PAGEALLOC enabled, scan every populated zone and unmap each
+ * page carrying the Unmap flag.
+ */
+void suspend_unmap_atomic_copy_pages(void)
+{
+	struct zone *zone;
+	int offset;
+
+	for_each_zone(zone) {
+		if (zone->present_pages) {
+			for (offset = 0; offset < zone->spanned_pages; offset++) {
+				struct page *candidate =
+					pfn_to_page(zone->zone_start_pfn + offset);
+
+				if (PageUnmap(candidate))
+					suspend_map_kernel_page(candidate, 0);
+			}
+		}
+	}
+}
+#else
+/* Without CONFIG_DEBUG_PAGEALLOC there is nothing to map or unmap. */
+void suspend_map_atomic_copy_pages(void)
+{
+}
+
+void suspend_unmap_atomic_copy_pages(void)
+{
+}
+
+/* Always reports the page as already being in the requested state. */
+int suspend_map_kernel_page(struct page *page, int enable)
+{
+	return 1;
+}
+#endif
diff -urN oldtree/kernel/power/debug_pagealloc.h newtree/kernel/power/debug_pagealloc.h
--- oldtree/kernel/power/debug_pagealloc.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/debug_pagealloc.h	2006-02-18 15:24:31.384817712 +0000
@@ -0,0 +1,3 @@
+extern void suspend_map_atomic_copy_pages(void);
+extern void suspend_unmap_atomic_copy_pages(void);
+extern int suspend_map_kernel_page(struct page *page, int enable);
diff -urN oldtree/kernel/power/disk.c newtree/kernel/power/disk.c
--- oldtree/kernel/power/disk.c	2006-02-18 15:18:30.087743216 +0000
+++ newtree/kernel/power/disk.c 2006-02-18 15:24:31.385817560 +0000 @@ -10,6 +10,7 @@ */ #include +#include #include #include #include @@ -99,7 +100,7 @@ if (!(error = swsusp_shrink_memory())) return 0; thaw: - thaw_processes(); + thaw_processes(FREEZER_ALL_THREADS); enable_nonboot_cpus(); pm_restore_console(); return error; @@ -108,7 +109,7 @@ static void unprepare_processes(void) { platform_finish(); - thaw_processes(); + thaw_processes(FREEZER_ALL_THREADS); enable_nonboot_cpus(); pm_restore_console(); } diff -urN oldtree/kernel/power/encryption.c newtree/kernel/power/encryption.c --- oldtree/kernel/power/encryption.c 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/encryption.c 2006-02-18 15:24:31.386817408 +0000 @@ -0,0 +1,597 @@ +/* + * kernel/power/suspend_core/encryption.c + * + * Copyright (C) 2003-2005 Nigel Cunningham + * + * This file is released under the GPLv2. + * + * This file contains data encryption routines for suspend, + * using cryptoapi transforms. + * + * ToDo: + * - Apply min/max_keysize the cipher changes. + * - Test. + */ + +#include +#include +#include +#include +#include +#include + +#include "suspend2.h" +#include "modules.h" +#include "proc.h" +#include "suspend2_common.h" +#include "io.h" + +#define S2C_WRITE 0 +#define S2C_READ 1 + +static struct suspend_module_ops suspend_encryption_ops; +static struct suspend_module_ops *next_driver; + +static char suspend_encryptor_name[32]; +static struct crypto_tfm *suspend_encryptor_transform; +static char suspend_encryptor_key[256]; +static int suspend_key_len; +static char suspend_encryptor_iv[256]; +static int suspend_encryptor_mode; +static int suspend_encryptor_save_key_and_iv; + +static u8 *page_buffer = NULL; +static unsigned int bufofs; + +static struct scatterlist suspend_crypt_sg[PAGE_SIZE/8]; + +/* ---- Local buffer management ---- */ + +/* allocate_local_buffer + * + * Description: Allocates a page of memory for buffering output. 
+ * Returns: Int: Zero if successful, -ENOMEM if no page could be
+ * allocated, -EINVAL if the current key length cannot be
+ * used to describe the page in suspend_crypt_sg[].
+ */
+
+static int allocate_local_buffer(void)
+{
+	const int max_sg = sizeof(suspend_crypt_sg) / sizeof(suspend_crypt_sg[0]);
+	int i, num_sg;
+
+	/* Buffer and scatterlist were set up by an earlier call. */
+	if (page_buffer)
+		return 0;
+
+	/*
+	 * The page is described as PAGE_SIZE / suspend_key_len scatterlist
+	 * entries of suspend_key_len bytes each. An empty key would divide
+	 * by zero and a very short key would need more entries than
+	 * suspend_crypt_sg[] has, so reject both before touching anything.
+	 */
+	num_sg = (suspend_key_len > 0) ? PAGE_SIZE / suspend_key_len : 0;
+	if (!num_sg || num_sg > max_sg) {
+		printk(KERN_ERR
+			"Key length %d cannot be used by the "
+			"suspend2 encryption driver.\n", suspend_key_len);
+		return -EINVAL;
+	}
+
+	page_buffer = (u8 *) get_zeroed_page(GFP_ATOMIC);
+
+	if (!page_buffer) {
+		printk(KERN_ERR
+			"Failed to allocate the page buffer for "
+			"suspend2 encryption driver.\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < num_sg; i++) {
+		suspend_crypt_sg[i].page = virt_to_page(page_buffer);
+		suspend_crypt_sg[i].offset = suspend_key_len * i;
+		suspend_crypt_sg[i].length = suspend_key_len;
+	}
+
+	return 0;
+}
+
+/* free_local_buffer
+ *
+ * Description: Frees memory allocated for buffering output. Safe to call
+ * when no buffer is allocated.
+ */
+
+static void free_local_buffer(void)
+{
+	if (page_buffer)
+		free_page((unsigned long) page_buffer);
+
+	page_buffer = NULL;
+}
+
+/* suspend_crypto_cleanup
+ *
+ * Description: Frees the cipher transform, if one is allocated, and
+ * clears the pointer so that a second call does nothing.
+ */
+
+static void suspend_crypto_cleanup(void)
+{
+	if (suspend_encryptor_transform) {
+		crypto_free_tfm(suspend_encryptor_transform);
+		suspend_encryptor_transform = NULL;
+	}
+}
+
+/* suspend_encrypt_crypto_prepare
+ *
+ * Description: Prepare to do some work by allocating the transform and
+ * setting its key (and, when writing, its IV).
+ * Returns: Int: Zero if successful, 1 otherwise.
+ */
+
+static int suspend_encrypt_crypto_prepare(int mode)
+{
+	if (!*suspend_encryptor_name) {
+		printk("Suspend2: Encryptor enabled but no name set.\n");
+		return 1;
+	}
+
+	/* The proc-configured mode number selects the flag bit passed on. */
+	if (!(suspend_encryptor_transform = crypto_alloc_tfm(suspend_encryptor_name,
+			1 << suspend_encryptor_mode))) {
+		printk("Suspend2: Failed to initialise the encryption transform (%s, mode %d).\n",
+				suspend_encryptor_name, suspend_encryptor_mode);
+		return 1;
+	}
+
+	if (mode)
+		bufofs = PAGE_SIZE;
+	else
+		bufofs = 0;
+
+	suspend_key_len = strlen(suspend_encryptor_key);
+
+	/* On failure the transform stays allocated; callers release it with
+	 * suspend_crypto_cleanup(). */
+	if (crypto_cipher_setkey(suspend_encryptor_transform, suspend_encryptor_key,
+				suspend_key_len)) {
+		printk("%d is an invalid key length for cipher %s.\n",
+					suspend_key_len,
+					suspend_encryptor_name);
+		return 1;
+	}
+
+	/* The IV is only set when preparing to write (mode == S2C_WRITE). */
+	if (!mode) {
+		crypto_cipher_set_iv(suspend_encryptor_transform,
+				suspend_encryptor_iv,
+				crypto_tfm_alg_ivsize(suspend_encryptor_transform));
+	}
+
+	return 0;
+}
+
+/* ---- Exported functions ---- */
+
+/* write_init()
+ *
+ * Description: Allocate buffers and prepare to encrypt data.
+ * Arguments: Stream_number: Section of the image about to be written.
+ * The byte counters are reset when it is 2.
+ * Returns: Zero on success, -ECHILD if no module follows us in the
+ * pipeline, otherwise the non-zero result of preparing the
+ * cipher or allocating the page buffer.
+ */
+
+static int suspend_encrypt_write_init(int stream_number)
+{
+	int result;
+
+	next_driver = suspend_get_next_filter(&suspend_encryption_ops);
+
+	if (!next_driver) {
+		printk("Encryption Driver: Argh! No one wants my output!\n");
+		return -ECHILD;
+	}
+
+	if ((result = suspend_encrypt_crypto_prepare(S2C_WRITE))) {
+		set_result_state(SUSPEND_ENCRYPTION_SETUP_FAILED);
+		suspend_crypto_cleanup();
+		return result;
+	}
+
+	if ((result = allocate_local_buffer())) {
+		/* Don't leave the transform behind without a buffer. */
+		suspend_crypto_cleanup();
+		return result;
+	}
+
+	/* Only reset the stats if starting to write an image */
+	if (stream_number == 2)
+		bytes_in = bytes_out = 0;
+
+	bufofs = 0;
+
+	return 0;
+}
+
+/* suspend_encrypt_write_chunk()
+ *
+ * Description: Encrypt a page of data, buffering output and passing on
+ * filled pages to the next module in the pipeline.
+ * Arguments: Buffer_page: The page containing the data to be encrypted.
+ * Returns: 0 on success, -EIO if the cipher reports an error, otherwise
+ * whatever the next module's write_chunk returns.
+ */
+
+static int suspend_encrypt_write_chunk(struct page *buffer_page)
+{
+	int ret;
+	char *buffer_start;
+
+	/* No transform: pass the page through untouched. */
+	if (!suspend_encryptor_transform)
+		return next_driver->ops.filter.write_chunk(buffer_page);
+
+	/* Work on a private copy so the caller's page is not modified. */
+	buffer_start = kmap(buffer_page);
+	memcpy(page_buffer, buffer_start, PAGE_SIZE);
+	kunmap(buffer_page);
+
+	bytes_in += PAGE_SIZE;
+
+	/* Encrypt in place: source and destination scatterlists are the
+	 * same and both describe page_buffer. */
+	ret = crypto_cipher_encrypt(suspend_encryptor_transform,
+			suspend_crypt_sg, suspend_crypt_sg, PAGE_SIZE);
+
+	if (ret) {
+		printk("Encryption failed.\n");
+		return -EIO;
+	}
+
+	return next_driver->ops.filter.write_chunk(virt_to_page(page_buffer));
+}
+
+/* write_cleanup()
+ *
+ * Description: Free the transform and the page buffer.
+ * Returns: Always zero.
+ */
+
+static int suspend_encrypt_write_cleanup(void)
+{
+	suspend_crypto_cleanup();
+	free_local_buffer();
+
+	return 0;
+}
+
+/* read_init()
+ *
+ * Description: Prepare to read a new stream of data.
+ * Arguments: int: Section of image about to be read.
+ * Returns: int: Zero on success, -ECHILD if no module follows us in
+ * the pipeline, otherwise the non-zero result of preparing
+ * the cipher or allocating the page buffer.
+ */
+
+static int suspend_encrypt_read_init(int stream_number)
+{
+	int result;
+
+	next_driver = suspend_get_next_filter(&suspend_encryption_ops);
+
+	if (!next_driver) {
+		printk("Encryption Driver: Argh! No one wants "
+				"to feed me data!\n");
+		return -ECHILD;
+	}
+
+	if ((result = suspend_encrypt_crypto_prepare(S2C_READ))) {
+		set_result_state(SUSPEND_ENCRYPTION_SETUP_FAILED);
+		suspend_crypto_cleanup();
+		return result;
+	}
+
+	if ((result = allocate_local_buffer())) {
+		/* Don't leave the transform behind without a buffer. */
+		suspend_crypto_cleanup();
+		return result;
+	}
+
+	bufofs = PAGE_SIZE;
+
+	return 0;
+}
+
+/* suspend_encrypt_read_chunk()
+ *
+ * Description: Retrieve a page from later modules and decrypt it into
+ * the caller's page.
+ * Arguments: Buffer_page: The page to be filled with decrypted data.
+ * Sync: Whether the previous module (or core) wants its
+ * data synchronously. Only honoured when no
+ * transform is active; otherwise reads are
+ * always synchronous.
+ * Returns: Zero if successful. Error condition from me or from downstream
+ * on failure.
+ */
+
+static int suspend_encrypt_read_chunk(struct page *buffer_page, int sync)
+{
+	int ret;
+	char *buffer_start;
+
+	if (!suspend_encryptor_transform)
+		return next_driver->ops.filter.read_chunk(buffer_page, sync);
+
+	/*
+	 * All our reads must be synchronous - we can't decrypt
+	 * data that hasn't been read yet.
+	 */
+
+	if ((ret = next_driver->ops.filter.read_chunk(
+			virt_to_page(page_buffer), SUSPEND_SYNC)) < 0) {
+		printk("Failed to read an encrypted block.\n");
+		return ret;
+	}
+
+	ret = crypto_cipher_decrypt(suspend_encryptor_transform,
+			suspend_crypt_sg, suspend_crypt_sg, PAGE_SIZE);
+
+	if (ret)
+		printk("Decrypt function returned %d.\n", ret);
+
+	/* The buffer is copied out even if decryption reported an error;
+	 * the error is still returned to the caller. */
+	buffer_start = kmap(buffer_page);
+	memcpy(buffer_start, page_buffer, PAGE_SIZE);
+	kunmap(buffer_page);
+	return ret;
+}
+
+/* read_cleanup()
+ *
+ * Description: Clean up after reading part or all of a stream of data.
+ * Returns: int: Always zero. Never fails.
+ */
+
+static int suspend_encrypt_read_cleanup(void)
+{
+	suspend_crypto_cleanup();
+	free_local_buffer();
+	return 0;
+}
+
+/* suspend_encrypt_print_debug_stats
+ *
+ * Description: Print information to be recorded for debugging purposes into a
+ * buffer.
+ * Arguments: buffer: Pointer to a buffer into which the debug info will be
+ * printed.
+ * size: Size of the buffer.
+ * Returns: Whatever snprintf_used() reports for the line written.
+ */
+
+static int suspend_encrypt_print_debug_stats(char *buffer, int size)
+{
+	return snprintf_used(buffer, size, "- Encryptor %s enabled.\n",
+			suspend_encryptor_name);
+}
+
+/* encryption_memory_needed
+ *
+ * Description: Report the memory this module needs while suspending or
+ * resuming: one page.
+ * Returns: Unsigned long. PAGE_SIZE.
+ */
+
+static unsigned long suspend_encrypt_memory_needed(void)
+{
+	const unsigned long one_page = PAGE_SIZE;
+
+	return one_page;
+}
+
+/* encryption_storage_needed
+ *
+ * Description: Report the storage this module asks for.
+ * NOTE(review): suspend_encrypt_save_config_info can store the
+ * cipher name plus, optionally, key and IV, which may exceed
+ * this figure - confirm what the caller uses it for.
+ * Returns: Unsigned long. A fixed number of bytes.
+ */
+static unsigned long suspend_encrypt_storage_needed(void)
+{
+	return sizeof(int) + 2 * sizeof(unsigned long);
+}
+
+/* suspend_encrypt_save_config_info
+ *
+ * Description: Save information needed when reloading the image at resume time.
+ * Arguments: Buffer: Pointer to a buffer of size PAGE_SIZE.
+ * Returns: Number of bytes used for saving our data.
+ */
+
+/*
+ * Store one string as: a length byte, the characters, and the terminating
+ * NUL. Returns the number of bytes written (length + 2).
+ */
+static int encrypt_store_string(char *dest, const char *src)
+{
+	int len = strlen(src);
+
+	dest[0] = (char) len;
+	memcpy(dest + 1, src, len + 1);
+	return len + 2;
+}
+
+static int suspend_encrypt_save_config_info(char *buffer)
+{
+	char *pos = buffer;
+
+	/* Layout: name, mode byte, save-key flag byte, then key and IV
+	 * only if the flag is set. */
+	pos += encrypt_store_string(pos, suspend_encryptor_name);
+
+	*pos++ = (char) suspend_encryptor_mode;
+	*pos++ = (char) suspend_encryptor_save_key_and_iv;
+
+	if (suspend_encryptor_save_key_and_iv) {
+		pos += encrypt_store_string(pos, suspend_encryptor_key);
+		pos += encrypt_store_string(pos, suspend_encryptor_iv);
+	}
+
+	return pos - buffer;
+}
+
+/* suspend_encrypt_load_config_info
+ *
+ * Description: Reload information needed for decrypting the image at
+ * resume time.
+ * Arguments: Buffer: Pointer to the start of the data.
+ * Size: Number of bytes that were saved.
+ */
+
+/* encrypt_load_string
+ *
+ * Description: Copy one string stored as a length byte, the characters
+ * and a terminating NUL (the layout written by
+ * suspend_encrypt_save_config_info) into dest.
+ * Arguments: dest, dest_size: Destination array and its size.
+ * src, avail: Saved data and the bytes left in it.
+ * Returns: Number of bytes consumed, or -1 if the string does not fit in
+ * dest or runs past the end of the saved data.
+ */
+static int encrypt_load_string(char *dest, int dest_size,
+		const char *src, int avail)
+{
+	int len;
+
+	if (avail < 2)
+		return -1;
+
+	/* Read the length unsigned so that values above 127 do not come
+	 * back negative where char is signed. */
+	len = (unsigned char) *src;
+
+	if (len >= dest_size || len + 2 > avail)
+		return -1;
+
+	memcpy(dest, src + 1, len);
+	dest[len] = 0;
+
+	return len + 2;
+}
+
+static void suspend_encrypt_load_config_info(char *buffer, int size)
+{
+	int buf_offset, used;
+
+	used = encrypt_load_string(suspend_encryptor_name,
+			sizeof(suspend_encryptor_name), buffer, size);
+	if (used < 0)
+		goto malformed;
+	buf_offset = used;
+
+	/* Mode and the save-key flag take one byte each. */
+	if (size - buf_offset < 2)
+		goto malformed;
+
+	suspend_encryptor_mode = (int) *(buffer + buf_offset);
+	buf_offset++;
+
+	suspend_encryptor_save_key_and_iv = (int) *(buffer + buf_offset);
+	buf_offset++;
+
+	if (suspend_encryptor_save_key_and_iv) {
+		used = encrypt_load_string(suspend_encryptor_key,
+				sizeof(suspend_encryptor_key),
+				buffer + buf_offset, size - buf_offset);
+		if (used < 0)
+			goto malformed;
+		buf_offset += used;
+
+		used = encrypt_load_string(suspend_encryptor_iv,
+				sizeof(suspend_encryptor_iv),
+				buffer + buf_offset, size - buf_offset);
+		if (used < 0)
+			goto malformed;
+		buf_offset += used;
+	} else {
+		*suspend_encryptor_key = 0;
+		*suspend_encryptor_iv = 0;
+	}
+
+	if (buf_offset != size) {
+		printk("Suspend Encryptor config info size mismatch (%d != %d): settings ignored.\n",
+				buf_offset, size);
+		*suspend_encryptor_key = 0;
+		*suspend_encryptor_iv = 0;
+	}
+	return;
+
+malformed:
+	/* Treated like a size mismatch: drop any key and IV. */
+	printk("Suspend Encryptor config info is malformed: settings ignored.\n");
+	*suspend_encryptor_key = 0;
+	*suspend_encryptor_iv = 0;
+}
+
+/* Disable the module whenever no cipher name is configured. */
+static void suspend_encryptor_disable_if_empty(void)
+{
+	suspend_encryption_ops.disabled = !(*suspend_encryptor_name);
+}
+
+/* suspend_encrypt_initialise
+ *
+ * Description: When starting_cycle is non-zero, re-evaluate whether the
+ * module should be disabled.
+ * Returns: Always zero.
+ */
+static int suspend_encrypt_initialise(int starting_cycle)
+{
+	if (starting_cycle)
+		suspend_encryptor_disable_if_empty();
+
+	return 0;
+}
+/*
+ * data for our proc entries.
+ */
+
+static struct suspend_proc_data proc_params[] = {
+	/* Cipher name; writing it re-evaluates whether we are disabled. */
+	{
+		.filename	= "encryptor",
+		.permissions	= PROC_RW,
+		.type		= SUSPEND_PROC_DATA_STRING,
+		.data.string	= {
+			.variable	= suspend_encryptor_name,
+			.max_length	= 31,
+		},
+		.write_proc	= suspend_encryptor_disable_if_empty,
+	},
+
+	/* Mode number, 0-3. */
+	{
+		.filename	= "encryption_mode",
+		.permissions	= PROC_RW,
+		.type		= SUSPEND_PROC_DATA_INTEGER,
+		.data.integer	= {
+			.variable	= &suspend_encryptor_mode,
+			.minimum	= 0,
+			.maximum	= 3,
+		},
+	},
+
+	/* Whether key and IV are stored with the saved configuration. */
+	{
+		.filename	= "encryption_save_key_and_iv",
+		.permissions	= PROC_RW,
+		.type		= SUSPEND_PROC_DATA_INTEGER,
+		.data.integer	= {
+			.variable	= &suspend_encryptor_save_key_and_iv,
+			.minimum	= 0,
+			.maximum	= 1,
+		},
+	},
+
+	/* Key, up to 255 characters. */
+	{
+		.filename	= "encryption_key",
+		.permissions	= PROC_RW,
+		.type		= SUSPEND_PROC_DATA_STRING,
+		.data.string	= {
+			.variable	= suspend_encryptor_key,
+			.max_length	= 255,
+		},
+	},
+
+	/* IV, up to 255 characters. */
+	{
+		.filename	= "encryption_iv",
+		.permissions	= PROC_RW,
+		.type		= SUSPEND_PROC_DATA_STRING,
+		.data.string	= {
+			.variable	= suspend_encryptor_iv,
+			.max_length	= 255,
+		},
+	},
+
+	/* Direct access to the module's disabled flag. */
+	{
+		.filename	= "disable_encryption",
+		.permissions	= PROC_RW,
+		.type		= SUSPEND_PROC_DATA_INTEGER,
+		.data.integer	= {
+			.variable	= &suspend_encryption_ops.disabled,
+			.minimum	= 0,
+			.maximum	= 1,
+		},
+	},
+};
+
+/*
+ * Ops structure.
+ */
+
+static struct suspend_module_ops suspend_encryption_ops = {
+	.type			= FILTER_PLUGIN,
+	.name			= "Encryptor",
+	.module			= THIS_MODULE,
+
+	/* Housekeeping and configuration hooks. */
+	.memory_needed		= suspend_encrypt_memory_needed,
+	.storage_needed		= suspend_encrypt_storage_needed,
+	.print_debug_info	= suspend_encrypt_print_debug_stats,
+	.save_config_info	= suspend_encrypt_save_config_info,
+	.load_config_info	= suspend_encrypt_load_config_info,
+	.initialise		= suspend_encrypt_initialise,
+
+	/* Per-stream setup and teardown. */
+	.write_init		= suspend_encrypt_write_init,
+	.write_cleanup		= suspend_encrypt_write_cleanup,
+	.read_init		= suspend_encrypt_read_init,
+	.read_cleanup		= suspend_encrypt_read_cleanup,
+
+	/* Per-page filter operations. */
+	.ops.filter = {
+		.write_chunk	= suspend_encrypt_write_chunk,
+		.read_chunk	= suspend_encrypt_read_chunk,
+	},
+};
+
+/* ---- Registration ---- */
+
+/* suspend_encrypt_load
+ *
+ * Description: Register the encryption module and, if that succeeds, one
+ * proc file per entry of proc_params[].
+ * Returns: The result of suspend_register_module().
+ */
+static __init int suspend_encrypt_load(void)
+{
+	int numfiles = sizeof(proc_params) / sizeof(struct suspend_proc_data);
+	int result, n;
+
+	printk("Suspend2 Encryption Driver loading.\n");
+
+	result = suspend_register_module(&suspend_encryption_ops);
+	if (result) {
+		printk("Suspend2 Encryption Driver unable to register!\n");
+		return result;
+	}
+
+	for (n = 0; n < numfiles; n++)
+		suspend_register_procfile(&proc_params[n]);
+
+	return result;
+}
+
+late_initcall(suspend_encrypt_load);
diff -urN oldtree/kernel/power/extent.c newtree/kernel/power/extent.c
--- oldtree/kernel/power/extent.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/extent.c	2006-02-18 15:24:31.387817256 +0000
@@ -0,0 +1,247 @@
+/* kernel/power/extent.c
+ *
+ * (C) 2003-2005 Nigel Cunningham
+ *
+ * Distributed under GPLv2.
+ *
+ * These functions encapsulate the manipulation of storage metadata. For
+ * pageflags, we use dynamically allocated bitmaps.
+ */
+
+#include
+#include
+#include "modules.h"
+#include "extent.h"
+#include "ui.h"
+
+int extents_allocated = 0;
+
+/* get_extent
+ *
+ * Returns a free extent. May fail, returning NULL instead.
+ */
+
+static struct extent *get_extent(void)
+{
+	struct extent *fresh = kmalloc(sizeof(struct extent), GFP_ATOMIC);
+
+	if (!fresh)
+		return NULL;
+
+	extents_allocated++;
+
+	/* Hand back an empty, unlinked extent. */
+	fresh->minimum = 0;
+	fresh->maximum = 0;
+	fresh->next = NULL;
+
+	return fresh;
+}
+
+/* put_extent.
+ *
+ * Releases a single extent and adjusts the global count. The caller must
+ * already have unlinked it; passing NULL is a bug.
+ */
+void put_extent(struct extent *extent)
+{
+	BUG_ON(!extent);
+
+	kfree(extent);
+	extents_allocated--;
+}
+
+/* put_extent_chain.
+ *
+ * Releases every extent on a chain and resets the chain to empty. The
+ * chain's frees must match its allocs once everything has been released.
+ */
+void put_extent_chain(struct extent_chain *chain)
+{
+	struct extent *cursor, *following;
+
+	for (cursor = chain->first; cursor; cursor = following) {
+		/* Remember the successor before the node goes away. */
+		following = cursor->next;
+		kfree(cursor);
+		chain->frees++;
+		extents_allocated--;
+	}
+
+	BUG_ON(chain->frees != chain->allocs);
+
+	chain->first = NULL;
+	chain->last = NULL;
+	chain->size = 0;
+	chain->allocs = 0;
+	chain->frees = 0;
+}
+
+/* append_extent_to_extent_chain
+ *
+ * Adds the range [minimum, maximum] as a new extent at the tail of the
+ * chain, without attempting to merge it with the current last extent.
+ *
+ * Returns 0 on success, 2 if no extent could be allocated.
+ */
+
+int append_extent_to_extent_chain(struct extent_chain *chain,
+		unsigned long minimum, unsigned long maximum)
+{
+	struct extent *tail = get_extent();
+
+	if (!tail) {
+		printk("Error unable to append a new extent to the chain.\n");
+		return 2;
+	}
+
+	tail->minimum = minimum;
+	tail->maximum = maximum;
+	tail->next = NULL;
+
+	chain->allocs++;
+	chain->size += (maximum - minimum + 1);
+
+	/* Link after the old tail, or start the list if it was empty. */
+	if (chain->last)
+		chain->last->next = tail;
+	else
+		chain->first = tail;
+	chain->last = tail;
+
+	return 0;
+}
+
+/* serialise_extent_chain
+ *
+ * Write a chain in the image.
+ *
+ * The chain header is written without its trailing first/last pointers,
+ * followed by the minimum/maximum pair of each extent in list order.
+ *
+ * Returns 0 on success, or the first non-zero result from the writer's
+ * write_header_chunk.
+ */
+int serialise_extent_chain(struct extent_chain *chain)
+{
+	struct extent *this;
+	int ret;
+
+	if ((ret = suspend_active_writer->ops.writer.write_header_chunk((char *) chain,
+			sizeof(struct extent_chain) - 2 * sizeof(struct extent *))))
+		return ret;
+
+	this = chain->first;
+	while (this) {
+		if ((ret = suspend_active_writer->ops.writer.write_header_chunk((char *) this,
+				2 * sizeof(unsigned long))))
+			return ret;
+		this = this->next;
+	}
+	return ret;
+}
+
+/* load_extent_chain
+ *
+ * Read back a chain saved in the image: first the header (without its
+ * first/last pointers), then (allocs - frees) extents, which are linked
+ * in the order read.
+ *
+ * Returns the result of the last read_header_chunk call. A zero result
+ * from read_header_chunk is treated as failure and ends loading early.
+ * NOTE(review): this presumes read_header_chunk returns a non-zero count
+ * on success - confirm against the writers.
+ */
+int load_extent_chain(struct extent_chain *chain)
+{
+	struct extent *this, *last = NULL;
+	int i, ret;
+
+	if (!(ret = suspend_active_writer->ops.writer.read_header_chunk((char *) chain,
+			sizeof(struct extent_chain) - 2 * sizeof(struct extent *))))
+		return ret;
+
+	/* The pointers are not part of the saved header; start empty so a
+	 * chain with no extents does not keep a stale first pointer. */
+	chain->first = NULL;
+
+	for (i = 0; i < (chain->allocs - chain->frees); i++) {
+		this = kmalloc(sizeof(struct extent), GFP_ATOMIC);
+		BUG_ON(!this); /* Shouldn't run out of memory trying this! */
+		this->next = NULL;
+		if (!(ret = suspend_active_writer->ops.writer.read_header_chunk((char *) this,
+				2 * sizeof(unsigned long)))) {
+			/* Not linked yet, so nobody else would free it. */
+			kfree(this);
+			chain->last = last;
+			return ret;
+		}
+		/* put_extent_chain decrements this for every extent freed. */
+		extents_allocated++;
+		if (last)
+			last->next = this;
+		else
+			chain->first = this;
+		last = this;
+	}
+	chain->last = last;
+	return ret;
+}
+
+/* extent_state_next
+ *
+ * Given a state, progress to the next valid entry. We may begin in an
+ * invalid state, as we do when invoked from extent_state_goto_start below.
+ *
+ * Returns the new current offset, or 0 once current_chain has moved past
+ * num_chains.
+ */
+unsigned long extent_state_next(struct extent_iterate_state *state)
+{
+	if (state->current_chain > state->num_chains)
+		return 0;
+
+	/* Step within the current chain first. */
+	if (state->current_extent)
+		GET_EXTENT_NEXT(state->current_extent, state->current_offset);
+
+	/* Ran off the end of a chain (or never started): move on to the
+	 * next chain that has at least one extent. */
+	while (!state->current_extent) {
+		state->current_chain++;
+
+		if (state->current_chain > state->num_chains)
+			return 0;
+
+		state->current_extent =
+			state->chains[state->current_chain].first;
+
+		if (state->current_extent)
+			state->current_offset = state->current_extent->minimum;
+	}
+
+	return state->current_offset;
+}
+
+/* extent_state_goto_start
+ *
+ * Reset the iterator to the position before the first chain, so that the
+ * next call to extent_state_next yields the first value.
+ */
+void extent_state_goto_start(struct extent_iterate_state *state)
+{
+	state->current_offset = 0;
+	state->current_extent = NULL;
+	state->current_chain = -1;
+}
+
+/* extent_state_save
+ *
+ * Record the current position as (chain number, index of the extent
+ * within that chain, offset) rather than as pointers, so that it can be
+ * used with relocated chains (at resume time).
+ */
+
+void extent_state_save(struct extent_iterate_state *state,
+		struct extent_iterate_saved_state *saved_state)
+{
+	struct extent *walker;
+
+	saved_state->chain_num = state->current_chain;
+	saved_state->offset = state->current_offset;
+	saved_state->extent_num = 0;
+
+	/* Before the first chain there is no extent to count up to. */
+	if (state->current_chain == -1)
+		return;
+
+	for (walker = state->chains[state->current_chain].first;
+	     walker != state->current_extent;
+	     walker = walker->next)
+		saved_state->extent_num++;
+}
+
+/* extent_state_restore
+ *
+ * Restore the position saved by extent_state_save.
+ */
+
+void extent_state_restore(struct extent_iterate_state *state,
+		struct extent_iterate_saved_state *saved_state)
+{
+	int remaining;
+
+	/* A saved chain number of -1 means "before the first chain". */
+	if (saved_state->chain_num == -1) {
+		extent_state_goto_start(state);
+		return;
+	}
+
+	state->current_chain = saved_state->chain_num;
+	state->current_offset = saved_state->offset;
+	state->current_extent = state->chains[saved_state->chain_num].first;
+
+	/* Walk forward to the extent with the saved index. */
+	for (remaining = saved_state->extent_num; remaining != 0; remaining--)
+		state->current_extent = state->current_extent->next;
+}
diff -urN oldtree/kernel/power/extent.h newtree/kernel/power/extent.h
--- oldtree/kernel/power/extent.h	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/extent.h	2006-02-18 15:24:31.388817104 +0000
@@ -0,0 +1,105 @@
+/*
+ * kernel/power/extent.h
+ *
+ * Copyright (C) 2004-2005 Nigel Cunningham
+ *
+ * This file is released under the GPLv2.
+ *
+ * It contains declarations related to extents. Extents are
+ * suspend's method of storing some of the metadata for the image.
+ * See extent.c for more info.
+ *
+ */
+
+#ifndef EXTENT_H
+#define EXTENT_H
+
+/*
+ * A singly linked list of extents plus bookkeeping.
+ *
+ * first and last must stay the final two members: extent.c saves and
+ * loads sizeof(struct extent_chain) minus two pointers.
+ */
+struct extent_chain {
+	int size;		/* sum of (maximum - minimum + 1) over all extents */
+	int allocs;		/* extents added to the chain */
+	int frees;		/* extents released from the chain */
+	int debug;
+	char *name;
+	struct extent *first;	/* head of the list, NULL when empty */
+	struct extent *last;	/* tail of the list, NULL when empty */
+};
+
+/*
+ * We rely on extents not fitting evenly into a page.
+ * The last four bytes are used to store the number
+ * of the page, to make saving & reloading pages simpler.
+ *
+ * NOTE(review): nothing in struct extent below stores a page number, so
+ * the paragraph above may be stale - confirm before relying on it.
+ */
+struct extent {
+	unsigned long minimum;	/* first value covered (inclusive) */
+	unsigned long maximum;	/* last value covered (inclusive) */
+	struct extent *next;	/* next extent in the chain, or NULL */
+};
+
+struct extent_iterate_state {
+	struct extent_chain *chains;	/* array of chains being walked */
+	int num_chains;			/* limit compared against current_chain */
+	int current_chain;		/* index into chains; -1 before the start */
+	struct extent *current_extent;	/* extent holding current_offset, or NULL */
+	unsigned long current_offset;	/* value most recently produced */
+};
+
+struct extent_iterate_saved_state {
+	int chain_num;		/* saved current_chain */
+	int extent_num;		/* index of the current extent within its chain */
+	unsigned long offset;	/* saved current_offset */
+};
+
+#define extent_state_eof(state) ((state)->num_chains < (state)->current_chain) /* true once current_chain has passed num_chains */
+
+#define extent_for_each(extent_chain, extentpointer, value) \
+if ((extent_chain)->first) \
+	for ((extentpointer) = (extent_chain)->first, (value) = \
+			(extentpointer)->minimum; \
+		((extentpointer) && ((extentpointer)->next || (value) <= \
+				(extentpointer)->maximum)); \
+		(((value) == (extentpointer)->maximum) ? \
+			((extentpointer) = (extentpointer)->next, (value) = \
+				((extentpointer) ? (extentpointer)->minimum : 0)) : \
+			(value)++))
+
+/*
+ * When using compression and expected_compression > 0,
+ * we allocate fewer swap entries, so GET_EXTENT_NEXT can
+ * validly run out of data to return.
+ *
+ * GET_EXTENT_NEXT advances (currentextent, currentval) by one value:
+ * within the extent if possible, else to the minimum of the next extent,
+ * else to (NULL, 0). Both arguments are evaluated more than once and are
+ * assigned to, so they must be side-effect-free lvalues.
+ *
+ * The expansion is a bare block (not do { } while (0)), kept that way so
+ * that existing invocations with or without a trailing semicolon still
+ * compile.
+ */
+#define GET_EXTENT_NEXT(currentextent, currentval) \
+{ \
+	if (currentextent) { \
+		if ((currentval) == (currentextent)->maximum) { \
+			if ((currentextent)->next) { \
+				(currentextent) = (currentextent)->next; \
+				(currentval) = (currentextent)->minimum; \
+			} else { \
+				(currentextent) = NULL; \
+				(currentval) = 0; \
+			} \
+		} else \
+			(currentval)++; \
+	} \
+}
+
+extern int extents_allocated;
+void put_extent(struct extent *extent);
+void put_extent_chain(struct extent_chain *chain);
+int append_extent_to_extent_chain(struct extent_chain *chain,
+		unsigned long minimum, unsigned long maximum);
+int serialise_extent_chain(struct extent_chain *chain);
+int load_extent_chain(struct extent_chain *chain);
+
+/* swap_entry_to_extent_val & extent_val_to_swap_entry:
+ * We are putting offset in the low bits so consecutive swap entries
+ * make consecutive extent values */
+#define swap_entry_to_extent_val(swp_entry) ((swp_entry).val)
+#define extent_val_to_swap_entry(val) (swp_entry_t) { (val) }
+
+void extent_state_save(struct extent_iterate_state *state,
+		struct extent_iterate_saved_state *saved_state);
+void extent_state_restore(struct extent_iterate_state *state,
+		struct extent_iterate_saved_state *saved_state);
+void extent_state_goto_start(struct extent_iterate_state *state);
+unsigned long extent_state_next(struct extent_iterate_state *state);
+#endif
diff -urN oldtree/kernel/power/io.c newtree/kernel/power/io.c
--- oldtree/kernel/power/io.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/io.c	2006-02-18 15:24:31.390816800 +0000
@@ -0,0 +1,1026 @@
+/*
+ * kernel/power/io.c
+ *
+ * Copyright (C) 1998-2001 Gabor Kuti
+ * Copyright (C) 1998,2001,2002 Pavel Machek
+ * Copyright (C) 2002-2003 Florent Chabaud
+ * Copyright (C) 2002-2005 Nigel Cunningham
+ *
+ * This file is released under the GPLv2.
+ *
+ * It contains high level IO routines for suspending.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include "version.h"
+#include "modules.h"
+#include "pageflags.h"
+#include "io.h"
+#include "ui.h"
+#include "suspend2_common.h"
+#include "suspend2.h"
+#include "debug_pagealloc.h"
+#include "storage.h"
+
+/* attempt_to_parse_resume_device
+ *
+ * Work out whether we can suspend using the current resume2= parameter,
+ * by offering it to each enabled writer in turn. Suspending is marked
+ * disabled on entry and only re-enabled if a writer accepts the
+ * parameter.
+ *
+ * Returns 1 if a writer accepted it, 0 otherwise (including when storage
+ * could not be activated).
+ */
+int attempt_to_parse_resume_device(void)
+{
+	struct suspend_module_ops *candidate;
+	int usable = 0;
+
+	if (suspend_activate_storage(0))
+		return 0;
+
+	suspend_active_writer = NULL;
+	clear_suspend_state(SUSPEND_RESUME_DEVICE_OK);
+	set_suspend_state(SUSPEND_DISABLED);
+	clear_result_state(SUSPEND_ABORTED);
+
+	if (!suspend_num_writers) {
+		printk(name_suspend "No writers have been registered. Suspending will be disabled.\n");
+		goto cleanup;
+	}
+
+	if (!resume2_file[0]) {
+		printk(name_suspend "Resume2 parameter is empty. Suspending will be disabled.\n");
+		goto cleanup;
+	}
+
+	list_for_each_entry(candidate, &suspend_writers,
+			ops.writer.writer_list) {
+		int result;
+
+		/*
+		 * Not sure why you'd want to disable a writer, but
+		 * we should honour the flag if we're providing it
+		 */
+		if (candidate->disabled) {
+			printk(name_suspend
+				"Writer '%s' is disabled. Ignoring it.\n",
+				candidate->name);
+			continue;
+		}
+
+		result = candidate->ops.writer.parse_sig_location(
+				resume2_file, (suspend_num_writers == 1));
+
+		/* Meant for this writer, but not a valid configuration.
+		 * The error has already been printed. */
+		if (result == -EINVAL)
+			goto cleanup;
+
+		/* Any other non-zero result: not for this writer. */
+		if (result)
+			continue;
+
+		/* For this writer and valid. */
+		suspend_active_writer = candidate;
+
+		set_suspend_state(SUSPEND_RESUME_DEVICE_OK);
+		clear_suspend_state(SUSPEND_DISABLED);
+		printk(name_suspend "Suspending enabled.\n");
+
+		usable = 1;
+		goto cleanup;
+	}
+	printk(name_suspend "No matching enabled writer found. Suspending disabled.\n");
+cleanup:
+	suspend_deactivate_storage(0);
+	return usable;
+}
+
+/* Same check, bracketed by suspend_prepare_usm()/suspend_cleanup_usm();
+ * the result is discarded. */
+void attempt_to_parse_resume_device2(void)
+{
+	suspend_prepare_usm();
+	attempt_to_parse_resume_device();
+	suspend_cleanup_usm();
+}
+
+/* noresume_reset_modules
+ *
+ * Description: When we read the start of an image, modules (and especially the
+ * active writer) might need to reset data structures if we decide
+ * to invalidate the image rather than resuming from it. Calls
+ * the noresume_reset hook of every filter that has one, then
+ * that of the active writer.
+ */
+
+static void noresume_reset_modules(void)
+{
+	struct suspend_module_ops *filter;
+
+	list_for_each_entry(filter, &suspend_filters, ops.filter.filter_list)
+		if (filter->ops.filter.noresume_reset)
+			filter->ops.filter.noresume_reset();
+
+	if (!suspend_active_writer)
+		return;
+
+	if (suspend_active_writer->ops.writer.noresume_reset)
+		suspend_active_writer->ops.writer.noresume_reset();
+}
+
+/* fill_suspend_header()
+ *
+ * Description: Zero the header and fill it from the current kernel and
+ * suspend state.
+ * Arguments: struct suspend_header: Header data structure to be filled.
+ */
+
+static void fill_suspend_header(struct suspend_header *sh)
+{
+	int row, col;
+
+	memset((char *)sh, 0, sizeof(*sh));
+
+	/* Identity of the kernel and machine writing the image. */
+	sh->version_code = LINUX_VERSION_CODE;
+	strncpy(sh->machine, system_utsname.machine, 65);
+	strncpy(sh->version, system_utsname.version, 65);
+	sh->page_size = PAGE_SIZE;
+	sh->num_physpages = num_physpages;
+	sh->orig_mem_free = suspend_orig_mem_free;
+	sh->root_fs = current->fs->rootmnt->mnt_sb->s_dev;
+
+	/* Image layout. */
+	sh->pagedir = pagedir1;
+	sh->pageset_2_size = pagedir2.pageset_size;
+
+	/* Settings carried across to the resumed kernel. */
+	sh->param0 = suspend_result;
+	sh->param1 = suspend_action;
+	sh->param2 = suspend_debug_state;
+	sh->param3 = console_loglevel;
+
+	for (row = 0; row < 2; row++)
+		for (col = 0; col < 2; col++)
+			sh->io_time[row][col] = suspend_io_time[row][col];
+}
+
+/*
+ * rw_init_modules
+ *
+ * Iterate over modules, preparing the ones that will be used to read or write
+ * data.
+ */
+
+/*
+ * Run a module's write_init or read_init hook, whichever matches the
+ * direction. Returns the hook's result, or 0 if the module has no hook.
+ */
+static int rw_init_one_module(struct suspend_module_ops *module,
+		int write, int which)
+{
+	if (write)
+		return module->write_init ? module->write_init(which) : 0;
+
+	return module->read_init ? module->read_init(which) : 0;
+}
+
+/*
+ * Order: filters, then the active writer, then all remaining modules.
+ * Disabled modules are skipped. Returns 0 on success, 1 as soon as any
+ * init hook reports failure.
+ */
+static int rw_init_modules(int write, int which)
+{
+	struct suspend_module_ops *module;
+	int writer_failed;
+
+	/* Initialise page transformers */
+	list_for_each_entry(module, &suspend_filters,
+			ops.filter.filter_list) {
+		if (!module->disabled &&
+		    rw_init_one_module(module, write, which)) {
+			abort_suspend("Failed to initialise the %s filter.",
+				module->name);
+			return 1;
+		}
+	}
+
+	/* Initialise writer */
+	if (write)
+		writer_failed = suspend_active_writer->write_init(which);
+	else
+		writer_failed = suspend_active_writer->read_init(which);
+
+	if (writer_failed) {
+		abort_suspend("Failed to initialise the writer.");
+		if (!write)
+			suspend_active_writer->ops.writer.invalidate_image();
+		return 1;
+	}
+
+	/* Initialise other modules */
+	list_for_each_entry(module, &suspend_modules, module_list) {
+		if (module->disabled)
+			continue;
+
+		/* Filters and writers were handled above. */
+		if (module->type == FILTER_PLUGIN ||
+		    module->type == WRITER_PLUGIN)
+			continue;
+
+		if (rw_init_one_module(module, write, which)) {
+			set_result_state(SUSPEND_ABORTED);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * rw_cleanup_modules
+ *
+ * Cleanup components after reading or writing a set of pages.
+ * Only the writer may fail.
+ */
+
+/*
+ * Run a module's write_cleanup or read_cleanup hook, whichever matches
+ * the direction. Returns the hook's result, or 0 if there is no hook.
+ */
+static int rw_cleanup_one_module(struct suspend_module_ops *module, int write)
+{
+	if (write)
+		return module->write_cleanup ? module->write_cleanup() : 0;
+
+	return module->read_cleanup ? module->read_cleanup() : 0;
+}
+
+/*
+ * Order: miscellaneous modules, then filters, then the active writer.
+ * Disabled modules are skipped. The return value is the bitwise OR of
+ * every cleanup result.
+ */
+static int rw_cleanup_modules(int write)
+{
+	struct suspend_module_ops *module;
+	int combined = 0;
+
+	/* Cleanup other modules */
+	list_for_each_entry(module, &suspend_modules, module_list) {
+		if (module->disabled)
+			continue;
+
+		/* Filters and the writer are handled below. */
+		if (module->type == FILTER_PLUGIN ||
+		    module->type == WRITER_PLUGIN)
+			continue;
+
+		combined |= rw_cleanup_one_module(module, write);
+	}
+
+	/* Flush data and cleanup */
+	list_for_each_entry(module, &suspend_filters,
+			ops.filter.filter_list) {
+		if (!module->disabled)
+			combined |= rw_cleanup_one_module(module, write);
+	}
+
+	combined |= write ? suspend_active_writer->write_cleanup() :
+			suspend_active_writer->read_cleanup();
+
+	return combined;
+}
+
+/*
+ * do_rw_loop
+ *
+ * The main I/O loop for reading or writing pages.
+ */
+static int do_rw_loop(int write, int finish_at, dyn_pageflags_t *pageflags,
+		int base, int barmax)
+{
+	struct suspend_module_ops *first_filter = suspend_get_next_filter(NULL);
+	int pfn = get_next_bit_on(*pageflags, -1);
+	int next_mark = finish_at / 5;	/* page count at which to print 20% */
+	int fifths = 1, next_status = 0, done, result;
+
+	/* Push (or pull) one page per iteration through the first filter */
+	for (done = 0; done < finish_at; done++) {
+		struct page *page = pfn_to_page(pfn);
+		int was_mapped;
+
+		/* Progress bar */
+		if ((done + base) >= next_status)
+			next_status = suspend_update_status(done + base, barmax,
+				" %d/%d MB ", MB(base + done + 1), MB(barmax));
+
+		/* Console progress, one message per fifth of the pageset */
+		if ((done + 1) == next_mark) {
+			printk("%d%%...", 20 * fifths);
+			fifths++;
+			next_mark = finish_at * fifths / 5;
+		}
+
+		/* Map the page for the I/O; unmap it again if we mapped it */
+		was_mapped = suspend_map_kernel_page(page, 1);
+		result = write ?
+			first_filter->ops.filter.write_chunk(page) :
+			first_filter->ops.filter.read_chunk(page, SUSPEND_ASYNC);
+		if (!was_mapped)
+			suspend_map_kernel_page(page, 0);
+
+		/* A failed read panics; a failed write aborts the cycle */
+		if (result && !write)
+			panic("Failed to read chunk %d/%d of the image. (%d)",
+				done, finish_at, result);
+		if (result) {
+			printk("Write chunk returned %d.\n", result);
+			abort_suspend("Failed to write a chunk of the image.");
+			return result;
+		}
+
+		/* Interactivity */
+		check_shift_keys(0, NULL);
+
+		if (test_result_state(SUSPEND_ABORTED) && write)
+			return 1;
+
+		/* Advance to the next page flagged in the map */
+		pfn = get_next_bit_on(*pageflags, pfn);
+	}
+
+	printk("done.\n");
+
+	suspend_update_status(base + finish_at, barmax, " %d/%d MB ",
+			MB(base + finish_at), MB(barmax));
+	return 0;
+}
+
+/* write_pageset()
+ *
+ * Description:	Write a pageset to disk.
+ * Arguments:	pagedir:	Pointer to the pagedir to be saved.
+ *		whichtowrite:	Controls what debugging output is printed.
+ * Returns:	Zero on success, non-zero on failure.
+ */
+
+int write_pageset(struct pagedir *pagedir, int whichtowrite)
+{
+	int finish_at, base = 0, start_time, end_time;
+	int barmax = pagedir1.pageset_size + pagedir2.pageset_size;
+	long error = 0;
+	dyn_pageflags_t *pageflags;
+
+	/*
+	 * Even if there is nothing to read or write, the writer
+	 * may need the init/cleanup for its housekeeping. (eg:
+	 * Pageset1 may start where pageset2 ends when writing).
+	 */
+	finish_at = pagedir->pageset_size;
+
+	if (whichtowrite == 1) {
+		suspend_prepare_status(DONT_CLEAR_BAR,
+				"Writing kernel & process data...");
+		/* Progress for pageset1 continues after pageset2's share */
+		base = pagedir2.pageset_size;
+		if (test_action_state(SUSPEND_TEST_FILTER_SPEED) ||
+		    test_action_state(SUSPEND_TEST_BIO))
+			pageflags = &pageset1_map;
+		else
+			pageflags = &pageset1_copy_map;
+	} else {
+		suspend_prepare_status(CLEAR_BAR, "Writing caches...");
+		pageflags = &pageset2_map;
+		bytes_in = bytes_out = 0;
+	}
+
+	start_time = jiffies;
+
+	/*
+	 * A failed initialisation must be reported to the caller too,
+	 * just as read_pageset() does; previously 0 was returned here.
+	 */
+	if (rw_init_modules(1, whichtowrite))
+		error = 1;
+	else
+		error = do_rw_loop(1, finish_at, pageflags, base, barmax);
+
+	/* Cleanup is run whether or not the init and the loop succeeded */
+	if (rw_cleanup_modules(1)) {
+		abort_suspend("Failed to cleanup after writing.");
+		error = 1;
+	}
+
+	/* Statistics */
+	end_time = jiffies;
+
+	if ((end_time - start_time) && (!test_result_state(SUSPEND_ABORTED))) {
+		suspend_io_time[0][0] += finish_at;
+		suspend_io_time[0][1] += (end_time - start_time);
+	}
+
+	return error;
+}
+
+/* read_pageset()
+ *
+ * Description:	Read a pageset from disk.
+ * Arguments:	pagedir:	Pointer to the pagedir to be read.
+ *		whichtoread:	Controls what debugging output is printed.
+ *		overwrittenpagesonly: Whether to read the whole pageset or
+ *		only part.
+ * Returns:	Zero on success, non-zero on failure.
+ */ + +static int read_pageset(struct pagedir *pagedir, int whichtoread, + int overwrittenpagesonly) +{ + int result = 0, base = 0, start_time, end_time; + int finish_at = pagedir->pageset_size; + int barmax = pagedir1.pageset_size + pagedir2.pageset_size; + dyn_pageflags_t *pageflags; + + if (whichtoread == 1) { + suspend_prepare_status(CLEAR_BAR, + "Reading kernel & process data..."); + pageflags = &pageset1_copy_map; + } else { + suspend_prepare_status(DONT_CLEAR_BAR, "Reading caches..."); + if (overwrittenpagesonly) + barmax = finish_at = min(pagedir1.pageset_size, + pagedir2.pageset_size); + else { + base = pagedir1.pageset_size; + } + pageflags = &pageset2_map; + } + + start_time = jiffies; + + if (rw_init_modules(0, whichtoread)) { + suspend_active_writer->ops.writer.invalidate_image(); + result = 1; + } else + result = do_rw_loop(0, finish_at, pageflags, base, barmax); + + if (rw_cleanup_modules(0)) { + abort_suspend("Failed to cleanup after reading."); + result = 1; + } + + /* Statistics */ + end_time=jiffies; + + if ((end_time - start_time) && (!test_result_state(SUSPEND_ABORTED))) { + suspend_io_time[1][0] += finish_at, + suspend_io_time[1][1] += (end_time - start_time); + } + + return result; +} + +/* write_module_configs() + * + * Description: Store the configuration for each module in the image header. + * Returns: Int: Zero on success, Error value otherwise. + */ +static int write_module_configs(void) +{ + struct suspend_module_ops *this_module; + char *buffer = (char *) get_zeroed_page(GFP_ATOMIC); + int len, index = 1; + struct suspend_module_header suspend_module_header; + + if (!buffer) { + printk("Failed to allocate a buffer for saving " + "module configuration info.\n"); + return -ENOMEM; + } + + /* + * We have to know which data goes with which module, so we at + * least write a length of zero for a module. Note that we are + * also assuming every module's config data takes <= PAGE_SIZE. 
+ */ + + /* For each module (in registration order) */ + list_for_each_entry(this_module, &suspend_modules, module_list) { + + /* Get the data from the module */ + len = 0; + if (this_module->save_config_info) + len = this_module->save_config_info(buffer); + + /* Save the details of the module */ + suspend_module_header.disabled = this_module->disabled; + suspend_module_header.type = this_module->type; + suspend_module_header.index = index++; + strncpy(suspend_module_header.name, this_module->name, + sizeof(suspend_module_header.name)); + suspend_active_writer->ops.writer.write_header_chunk( + (char *) &suspend_module_header, + sizeof(suspend_module_header)); + + /* Save the size of the data and any data returned */ + suspend_active_writer->ops.writer.write_header_chunk((char *) &len, + sizeof(int)); + if (len) + suspend_active_writer->ops.writer.write_header_chunk( + buffer, len); + } + + /* Write a blank header to terminate the list */ + suspend_module_header.name[0] = '\0'; + suspend_active_writer->ops.writer.write_header_chunk( + (char *) &suspend_module_header, + sizeof(suspend_module_header)); + + free_page((unsigned long) buffer); + return 0; +} + +/* read_module_configs() + * + * Description: Reload module configurations from the image header. + * Returns: Int. Zero on success, error value otherwise. + */ + +static int read_module_configs(void) +{ + struct suspend_module_ops *this_module; + char *buffer = (char *) get_zeroed_page(GFP_ATOMIC); + int len, result = 0; + struct suspend_module_header suspend_module_header; + + if (!buffer) { + printk("Failed to allocate a buffer for reloading module " + "configuration info.\n"); + return -ENOMEM; + } + + /* All modules are initially disabled. That way, if we have a module + * loaded now that wasn't loaded when we suspended, it won't be used + * in trying to read the data. 
+ */ + list_for_each_entry(this_module, &suspend_modules, module_list) + this_module->disabled = 1; + + /* Get the first module header */ + result = suspend_active_writer->ops.writer.read_header_chunk( + (char *) &suspend_module_header, sizeof(suspend_module_header)); + if (!result) { + printk("Failed to read the next module header.\n"); + free_page((unsigned long) buffer); + return -EINVAL; + } + + /* For each module (in registration order) */ + while (suspend_module_header.name[0]) { + + /* Find the module */ + this_module = suspend_find_module_given_name(suspend_module_header.name); + + if (!this_module) { + /* + * Is it used? Only need to worry about filters. The active + * writer must be loaded! + */ + if ((!suspend_module_header.disabled) && + (suspend_module_header.type == FILTER_PLUGIN)) { + suspend_early_boot_message(1, SUSPEND_CONTINUE_REQ, + "It looks like we need module %s for " + "reading the image but it hasn't been " + "registered.\n", + suspend_module_header.name); + if (!(test_suspend_state(SUSPEND_CONTINUE_REQ))) { + suspend_active_writer->ops.writer.invalidate_image(); + result = -EINVAL; + noresume_reset_modules(); + free_page((unsigned long) buffer); + return -EINVAL; + } + } else + printk("Plugin %s configuration data found, but the module " + "hasn't registered. Looks like it was disabled, so " + "we're ignoring it's data.", + suspend_module_header.name); + } + + /* Get the length of the data (if any) */ + result = suspend_active_writer->ops.writer.read_header_chunk( + (char *) &len, sizeof(int)); + if (!result) { + printk("Failed to read the length of the module %s's" + " configuration data.\n", + suspend_module_header.name); + free_page((unsigned long) buffer); + return -EINVAL; + } + + /* Read any data and pass to the module (if we found one) */ + if (len) { + suspend_active_writer->ops.writer.read_header_chunk(buffer, len); + if (this_module) { + if (!this_module->save_config_info) { + printk("Huh? 
Plugin %s appears to have a " + "save_config_info, but not a " + "load_config_info function!\n", + this_module->name); + } else + this_module->load_config_info(buffer, len); + } + } + + if (this_module) { + /* Now move this module to the tail of its lists. This will put it + * in order. Any new modules will end up at the top of the lists. + * They should have been set to disabled when loaded (people will + * normally not edit an initrd to load a new module and then + * suspend without using it!). + */ + + suspend_move_module_tail(this_module); + + /* + * We apply the disabled state; modules don't need to save whether they + * were disabled and if they do, we override them anyway. + */ + this_module->disabled = suspend_module_header.disabled; + } + + /* Get the next module header */ + result = suspend_active_writer->ops.writer.read_header_chunk( + (char *) &suspend_module_header, + sizeof(suspend_module_header)); + + if (!result) { + printk("Failed to read the next module header.\n"); + free_page((unsigned long) buffer); + return -EINVAL; + } + + } + + free_page((unsigned long) buffer); + return 0; +} + +/* write_image_header() + * + * Description: Write the image header after write the image proper. + * Returns: Int. Zero on success or -1 on failure. 
+ */
+
+int write_image_header(void)
+{
+	int total = pagedir1.pageset_size + pagedir2.pageset_size+2;
+	char *page = NULL;
+
+	/* Let the writer get ready for header data */
+	if (suspend_active_writer->ops.writer.write_header_init()) {
+		abort_suspend("Active writer's write_header_init function failed.");
+		goto abort_with_cleanup;
+	}
+
+	/* A zeroed page holds the suspend header while it is filled in */
+	page = (char *) get_zeroed_page(GFP_ATOMIC);
+	if (page == NULL) {
+		abort_suspend("Out of memory when trying to get page for header!");
+		goto abort_with_cleanup;
+	}
+
+	/* Fill in and write the suspend header, then drop the page */
+	fill_suspend_header((struct suspend_header *) page);
+	suspend_active_writer->ops.writer.write_header_chunk(page,
+			sizeof(struct suspend_header));
+	free_page((unsigned long) page);
+
+	/* Per-module configuration comes next */
+	if (write_module_configs()) {
+		abort_suspend("Failed to write module configs.");
+		goto abort_with_cleanup;
+	}
+
+	save_dyn_pageflags(pageset1_map);
+
+	/* serialise_extents is an optional writer hook */
+	if (suspend_active_writer->ops.writer.serialise_extents) {
+		if (suspend_active_writer->ops.writer.serialise_extents()) {
+			abort_suspend("Active writer's prepare_save_extents function failed.");
+			goto abort_with_cleanup;
+		}
+	}
+
+	/* Flush data and let writer cleanup */
+	if (suspend_active_writer->ops.writer.write_header_cleanup()) {
+		abort_suspend("Failed to cleanup writing header.");
+		goto abort_cleanup_done;
+	}
+
+	if (test_result_state(SUSPEND_ABORTED))
+		goto abort_cleanup_done;
+
+	suspend_message(SUSPEND_IO, SUSPEND_VERBOSE, 1, "|\n");
+	suspend_update_status(total, total, NULL);
+
+	return 0;
+
+abort_with_cleanup:
+	/* Failed before write_header_cleanup() was reached: call it now */
+	suspend_active_writer->ops.writer.write_header_cleanup();
+abort_cleanup_done:
+	return -1;
+}
+
+/* sanity_check()
+ *
+ * Description:	Perform a few checks, seeking to ensure that the kernel being
+ * 		booted matches the one suspended. 
They need to match so we can
+ * 		be _sure_ things will work. It is not absolutely impossible for
+ * 		resuming from a different kernel to work, just not assured.
+ * Arguments:	Struct suspend_header. The header which was saved at suspend
+ * 		time.
+ * Returns:	NULL if every check passes, otherwise a message describing
+ * 		the first check that failed.
+ */
+static char *sanity_check(struct suspend_header *sh)
+{
+	char *problem = NULL;
+
+	/* The checks run in this order and the first mismatch is reported */
+	if (sh->version_code != LINUX_VERSION_CODE)
+		problem = "Incorrect kernel version.";
+	else if (sh->num_physpages != num_physpages)
+		problem = "Incorrect memory size.";
+	else if (strncmp(sh->machine, system_utsname.machine,
+				sizeof(sh->machine)))
+		problem = "Incorrect machine type.";
+	else if (strncmp(sh->version, system_utsname.version,
+				sizeof(sh->version)))
+		problem = "Right kernel version but wrong build number.";
+	else if (sh->page_size != PAGE_SIZE)
+		problem = "Incorrect PAGE_SIZE.";
+	else if ((sh->root_fs == current->fs->rootmnt->mnt_sb->s_dev) &&
+		 (!test_suspend_state(SUSPEND_IGNORE_ROOTFS)))
+		problem = "Root filesystem has been mounted prior to trying to resume.";
+
+	return problem;
+}
+
+/* __read_pageset1
+ *
+ * Description:	Test for the existence of an image and attempt to load it.
+ * Returns:	Int. Zero if image found and pageset1 successfully loaded.
+ *		Error if no image found or loaded.
+ */ +static int __read_pageset1(void) +{ + int i, result = 0; + char *header_buffer = (char *) get_zeroed_page(GFP_ATOMIC), *sanity_error = NULL; + struct suspend_header *suspend_header; + + if (!header_buffer) + return -ENOMEM; + + /* Check for an image */ + if (!(result = suspend_active_writer->ops.writer.image_exists())) { + result = -ENODATA; + noresume_reset_modules(); + goto out; + } + + /* Check for noresume command line option */ + if (test_suspend_state(SUSPEND_NORESUME_SPECIFIED)) { + suspend_active_writer->ops.writer.invalidate_image(); + result = -EINVAL; + noresume_reset_modules(); + goto out; + } + + /* Check whether we've resumed before */ + if (test_suspend_state(SUSPEND_RESUMED_BEFORE)) { + int resumed_before_default = 0; + if (test_suspend_state(SUSPEND_RETRY_RESUME)) + resumed_before_default = SUSPEND_CONTINUE_REQ; + suspend_early_boot_message(1, resumed_before_default, NULL); + clear_suspend_state(SUSPEND_RETRY_RESUME); + if (!(test_suspend_state(SUSPEND_CONTINUE_REQ))) { + suspend_active_writer->ops.writer.invalidate_image(); + result = -EINVAL; + noresume_reset_modules(); + goto out; + } + } + + clear_suspend_state(SUSPEND_CONTINUE_REQ); + + /* + * Prepare the active writer for reading the image header. The + * activate writer might read its own configuration. + * + * NB: This call may never return because there might be a signature + * for a different image such that we warn the user and they choose + * to reboot. (If the device ids look erroneous (2.4 vs 2.6) or the + * location of the image might be unavailable if it was stored on a + * network connection. 
+ */ + + if ((result = suspend_active_writer->ops.writer.read_header_init())) { + noresume_reset_modules(); + goto out; + } + + /* Read suspend header */ + if ((result = suspend_active_writer->ops.writer.read_header_chunk( + header_buffer, sizeof(struct suspend_header))) < 0) { + noresume_reset_modules(); + goto out; + } + + suspend_header = (struct suspend_header *) header_buffer; + + /* + * NB: This call may also result in a reboot rather than returning. + */ + + if ((sanity_error = sanity_check(suspend_header)) && + suspend_early_boot_message(1, SUSPEND_CONTINUE_REQ, sanity_error)) { + suspend_active_writer->ops.writer.invalidate_image(); + result = -EINVAL; + noresume_reset_modules(); + goto out; + } + + /* + * We have an image and it looks like it will load okay. + */ + + /* Get metadata from header. Don't override commandline parameters. + * + * We don't need to save the image size limit because it's not used + * during resume and will be restored with the image anyway. + */ + + suspend_orig_mem_free = suspend_header->orig_mem_free; + memcpy((char *) &pagedir1, + (char *) &suspend_header->pagedir, sizeof(pagedir1)); + suspend_result = suspend_header->param0; + if (!test_suspend_state(SUSPEND_ACT_USED)) + suspend_action = suspend_header->param1; + if (!test_suspend_state(SUSPEND_DBG_USED)) + suspend_debug_state = suspend_header->param2; + if (!test_suspend_state(SUSPEND_LVL_USED)) + suspend_default_console_level = suspend_header->param3; + clear_suspend_state(SUSPEND_IGNORE_LOGLEVEL); + pagedir2.pageset_size = suspend_header->pageset_2_size; + for (i = 0; i < 4; i++) + suspend_io_time[i/2][i%2] = + suspend_header->io_time[i/2][i%2]; + + /* Read module configurations */ + if ((result = read_module_configs())) { + noresume_reset_modules(); + pagedir1.pageset_size = + pagedir2.pageset_size = 0; + goto out; + } + + suspend_prepare_console(); + + check_shift_keys(1, "About to read original pageset1 locations."); + /* Read original pageset1 locations. 
These are the addresses we can't use for + * the data to be restored */ + allocate_dyn_pageflags(&pageset1_map); + load_dyn_pageflags(pageset1_map); + + allocate_dyn_pageflags(&conflicting_pages_map); + + set_suspend_state(SUSPEND_NOW_RESUMING); + + /* Relocate it so that it's not overwritten while we're using it to + * copy the original contents back */ + relocate_dyn_pageflags(&pageset1_map); + relocate_dyn_pageflags(&conflicting_pages_map); + + allocate_dyn_pageflags(&pageset1_copy_map); + relocate_dyn_pageflags(&pageset1_copy_map); + + /* Read extent pages */ + if (suspend_active_writer->ops.writer.load_extents && + (result = suspend_active_writer->ops.writer.load_extents())) { + noresume_reset_modules(); + abort_suspend("Active writer's load_extents " + "function failed."); + goto out_reset_console; + } + + /* Clean up after reading the header */ + if ((result = suspend_active_writer->ops.writer.read_header_cleanup())) { + noresume_reset_modules(); + goto out_reset_console; + } + + check_shift_keys(1, "About to read pagedir."); + + /* + * Get the addresses of pages into which we will load the kernel to + * be copied back + */ + if (suspend_get_pageset1_load_addresses()) { + result = -ENOMEM; + noresume_reset_modules(); + goto out_reset_console; + } + + /* Read the original kernel back */ + check_shift_keys(1, "About to read pageset 1."); + + if (read_pageset(&pagedir1, 1, 0)) { + suspend_prepare_status(CLEAR_BAR, "Failed to read pageset 1."); + result = -EPERM; + noresume_reset_modules(); + goto out_reset_console; + } + + check_shift_keys(1, "About to restore original kernel."); + result = 0; + + if (!test_action_state(SUSPEND_KEEP_IMAGE) && + suspend_active_writer->ops.writer.mark_resume_attempted) + suspend_active_writer->ops.writer.mark_resume_attempted(); + +out: + free_page((unsigned long) header_buffer); + return result; + +out_reset_console: + free_dyn_pageflags(&pageset1_map); + free_dyn_pageflags(&pageset1_copy_map); + 
free_dyn_pageflags(&conflicting_pages_map); + suspend_cleanup_console(); + goto out; +} + +/* read_pageset1() + * + * Description: Attempt to read the header and pageset1 of a suspend image. + * Handle the outcome, complaining where appropriate. + */ + +int read_pageset1(void) +{ + int error; + + error = __read_pageset1(); + + switch (error) { + case 0: + case -ENODATA: + case -EINVAL: /* non fatal error */ + return error; + case -EIO: + printk(KERN_CRIT name_suspend "I/O error\n"); + break; + case -ENOENT: + printk(KERN_CRIT name_suspend "No such file or directory\n"); + break; + case -EPERM: + printk(KERN_CRIT name_suspend "Sanity check error\n"); + break; + default: + printk(KERN_CRIT name_suspend "Error %d resuming\n", error); + break; + } + abort_suspend("Error %d in read_pageset1",error); + return error; +} + +/* + * get_have_image_data() + */ + +char *get_have_image_data(void) +{ + char *output_buffer = (char *) get_zeroed_page(GFP_ATOMIC); + struct suspend_header *suspend_header; + + if (!output_buffer) { + printk("Output buffer null.\n"); + return NULL; + } + + /* Check for an image */ + if (!suspend_active_writer->ops.writer.image_exists() || + suspend_active_writer->ops.writer.read_header_init() || + suspend_active_writer->ops.writer.read_header_chunk( + output_buffer, sizeof(struct suspend_header)) != + sizeof(struct suspend_header)) { + sprintf(output_buffer, "0\n"); + goto out; + } + + suspend_header = (struct suspend_header *) output_buffer; + + sprintf(output_buffer, "1\n%s\n%s\n", + suspend_header->machine, + suspend_header->version); + + /* Check whether we've resumed before */ + if (test_suspend_state(SUSPEND_RESUMED_BEFORE)) + strcat(output_buffer, "Resumed before.\n"); + +out: + noresume_reset_modules(); + return output_buffer; +} + +/* read_pageset2() + * + * Description: Read in part or all of pageset2 of an image, depending upon + * whether we are suspending and have only overwritten a portion + * with pageset1 pages, or are resuming and 
need to read them + * all. + * Arguments: Int. Boolean. Read only pages which would have been + * overwritten by pageset1? + * Returns: Int. Zero if no error, otherwise the error value. + */ +int read_pageset2(int overwrittenpagesonly) +{ + int result = 0; + + if (!pagedir2.pageset_size) + return 0; + + result = read_pageset(&pagedir2, 2, overwrittenpagesonly); + + suspend_update_status(100, 100, NULL); + check_shift_keys(1, "Pagedir 2 read."); + + return result; +} diff -urN oldtree/kernel/power/io.h newtree/kernel/power/io.h --- oldtree/kernel/power/io.h 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/io.h 2006-02-18 15:24:31.391816648 +0000 @@ -0,0 +1,38 @@ +/* + * kernel/power/io.h + */ + +#include "pagedir.h" + +/* Non-module data saved in our image header */ +struct suspend_header { + u32 version_code; + unsigned long num_physpages; + unsigned long orig_mem_free; + char machine[65]; + char version[65]; + int num_cpus; + int page_size; + int pageset_2_size; + int param0; + int param1; + int param2; + int param3; + int progress0; + int progress1; + int progress2; + int progress3; + int io_time[2][2]; + struct pagedir pagedir; + dev_t root_fs; +}; + +extern int write_pageset(struct pagedir *pagedir, int whichtowrite); +extern int write_image_header(void); +extern int read_pageset1(void); +extern int read_pageset2(int overwrittenpagesonly); + +extern int attempt_to_parse_resume_device(void); +extern void attempt_to_parse_resume_device2(void); +extern dev_t name_to_dev_t(char *line); +extern __nosavedata unsigned long bytes_in, bytes_out; diff -urN oldtree/kernel/power/main.c newtree/kernel/power/main.c --- oldtree/kernel/power/main.c 2006-02-18 15:18:30.087743216 +0000 +++ newtree/kernel/power/main.c 2006-02-18 15:24:31.392816496 +0000 @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -95,7 +96,7 @@ if (pm_ops->finish) pm_ops->finish(state); Thaw: - thaw_processes(); + thaw_processes(FREEZER_ALL_THREADS); Enable_cpu: 
enable_nonboot_cpus(); pm_restore_console(); @@ -103,7 +104,7 @@ } -static int suspend_enter(suspend_state_t state) +int suspend_enter(suspend_state_t state) { int error = 0; unsigned long flags; @@ -133,7 +134,7 @@ static void suspend_finish(suspend_state_t state) { device_resume(); - thaw_processes(); + thaw_processes(FREEZER_ALL_THREADS); enable_nonboot_cpus(); if (pm_ops && pm_ops->finish) pm_ops->finish(state); @@ -146,7 +147,7 @@ static char *pm_states[PM_SUSPEND_MAX] = { [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", -#ifdef CONFIG_SOFTWARE_SUSPEND +#if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_SUSPEND2) [PM_SUSPEND_DISK] = "disk", #endif }; @@ -177,7 +178,7 @@ static int enter_state(suspend_state_t state) { - int error; + int error = 0; if (!valid_state(state)) return -ENODEV; diff -urN oldtree/kernel/power/modules.c newtree/kernel/power/modules.c --- oldtree/kernel/power/modules.c 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/modules.c 2006-02-18 15:24:31.393816344 +0000 @@ -0,0 +1,312 @@ +/* + * kernel/power/modules.c + * + * Copyright (C) 2004-2005 Nigel Cunningham + * + */ + +#include +#include +#include "suspend2.h" +#include "modules.h" + +struct list_head suspend_filters, suspend_writers, suspend_modules; +struct suspend_module_ops *suspend_active_writer; +static int suspend_num_filters; +int suspend_num_writers, suspend_num_modules; + +/* + * suspend_header_storage_for_modules + * + * Returns the amount of space needed to store configuration + * data needed by the modules prior to copying back the original + * kernel. We can exclude data for pageset2 because it will be + * available anyway once the kernel is copied back. 
+ */ +unsigned long suspend_header_storage_for_modules(void) +{ + struct suspend_module_ops *this_module; + unsigned long bytes = 0; + + list_for_each_entry(this_module, &suspend_modules, module_list) { + if (this_module->disabled) + continue; + if (this_module->storage_needed) + bytes += this_module->storage_needed(); + } + + return bytes; +} + +/* + * suspend_memory_for_modules + * + * Returns the amount of memory requested by modules for + * doing their work during the cycle. + */ + +unsigned long suspend_memory_for_modules(void) +{ + unsigned long bytes = 0; + struct suspend_module_ops *this_module; + + list_for_each_entry(this_module, &suspend_modules, module_list) { + if (this_module->disabled) + continue; + if (this_module->memory_needed) + bytes += this_module->memory_needed(); + } + + return ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT); +} + +/* suspend_find_module_given_name + * Functionality : Return a module (if found), given a pointer + * to its name + */ + +struct suspend_module_ops *suspend_find_module_given_name(char *name) +{ + struct suspend_module_ops *this_module, *found_module = NULL; + + list_for_each_entry(this_module, &suspend_modules, module_list) { + if (!strcmp(name, this_module->name)) { + found_module = this_module; + break; + } + } + + return found_module; +} + +/* + * suspend_print_module_debug_info + * Functionality : Get debugging info from modules into a buffer. + */ +int suspend_print_module_debug_info(char *buffer, int buffer_size) +{ + struct suspend_module_ops *this_module; + int len = 0; + + list_for_each_entry(this_module, &suspend_modules, module_list) { + if (this_module->disabled) + continue; + if (this_module->print_debug_info) { + int result; + result = this_module->print_debug_info(buffer + len, + buffer_size - len); + len += result; + } + } + + return len; +} + +/* + * suspend_register_module + * + * Register a module. 
+ */ +int suspend_register_module(struct suspend_module_ops *module) +{ + if (suspend_find_module_given_name(module->name)) + return -EBUSY; + + switch (module->type) { + case FILTER_PLUGIN: + list_add_tail(&module->ops.filter.filter_list, + &suspend_filters); + suspend_num_filters++; + break; + + case WRITER_PLUGIN: + list_add_tail(&module->ops.writer.writer_list, + &suspend_writers); + suspend_num_writers++; + break; + + case MISC_PLUGIN: + break; + + default: + printk("Hmmm. Plugin '%s' has an invalid type." + " It has been ignored.\n", module->name); + return -EINVAL; + } + list_add_tail(&module->module_list, &suspend_modules); + suspend_num_modules++; + + return 0; +} + +/* + * suspend_unregister_module + * + * Remove a module. + */ +void suspend_unregister_module(struct suspend_module_ops *module) +{ + switch (module->type) { + case FILTER_PLUGIN: + list_del(&module->ops.filter.filter_list); + suspend_num_filters--; + break; + + case WRITER_PLUGIN: + list_del(&module->ops.writer.writer_list); + suspend_num_writers--; + if (suspend_active_writer == module) { + suspend_active_writer = NULL; + set_suspend_state(SUSPEND_DISABLED); + } + break; + + case MISC_PLUGIN: + break; + + default: + printk("Hmmm. Plugin '%s' has an invalid type." + " It has been ignored.\n", module->name); + return; + } + list_del(&module->module_list); + suspend_num_modules--; +} + +/* + * suspend_move_module_tail + * + * Rearrange modules when reloading the config. + */ +void suspend_move_module_tail(struct suspend_module_ops *module) +{ + switch (module->type) { + case FILTER_PLUGIN: + if (suspend_num_filters > 1) + list_move_tail(&module->ops.filter.filter_list, + &suspend_filters); + break; + + case WRITER_PLUGIN: + if (suspend_num_writers > 1) + list_move_tail(&module->ops.writer.writer_list, + &suspend_writers); + break; + + case MISC_PLUGIN: + break; + default: + printk("Hmmm. Plugin '%s' has an invalid type." 
+ " It has been ignored.\n", module->name); + return; + } + if ((suspend_num_filters + suspend_num_writers) > 1) + list_move_tail(&module->module_list, &suspend_modules); +} + +/* + * suspend_initialise_modules + * + * Get ready to do some work! + */ +int suspend_initialise_modules(int starting_cycle) +{ + struct suspend_module_ops *this_module; + int result; + + list_for_each_entry(this_module, &suspend_modules, module_list) { + if (this_module->disabled) + continue; + if (this_module->initialise) { + suspend_message(SUSPEND_MEMORY, SUSPEND_MEDIUM, 1, + "Initialising module %s.\n", + this_module->name); + if ((result = this_module->initialise(starting_cycle))) { + printk("%s didn't initialise okay.\n", + this_module->name); + return result; + } + } + } + + return 0; +} + +/* + * suspend_cleanup_modules + * + * Tell modules the work is done. + */ +void suspend_cleanup_modules(int finishing_cycle) +{ + struct suspend_module_ops *this_module; + + list_for_each_entry(this_module, &suspend_modules, module_list) { + if (this_module->disabled) + continue; + if (this_module->cleanup) { + suspend_message(SUSPEND_MEMORY, SUSPEND_MEDIUM, 1, + "Cleaning up module %s.\n", + this_module->name); + this_module->cleanup(finishing_cycle); + } + } +} + +/* + * suspend_get_next_filter + * + * Get the next filter in the pipeline. + */ +struct suspend_module_ops *suspend_get_next_filter(struct suspend_module_ops *filter_sought) +{ + struct suspend_module_ops *last_filter = NULL, *this_filter = NULL; + + list_for_each_entry(this_filter, &suspend_filters, ops.filter.filter_list) { + if (this_filter->disabled) + continue; + if ((last_filter == filter_sought) || (!filter_sought)) + return this_filter; + last_filter = this_filter; + } + + return suspend_active_writer; +} + +/* suspend_get_modules + * + * Take a reference to modules so they can't go away under us. 
+ */ + +int suspend_get_modules(void) +{ + struct suspend_module_ops *this_module; + + list_for_each_entry(this_module, &suspend_modules, module_list) { + if (!try_module_get(this_module->module)) { + /* Failed! Reverse gets and return error */ + struct suspend_module_ops *this_module2; + list_for_each_entry(this_module2, &suspend_modules, module_list) { + if (this_module == this_module2) + return -EINVAL; + module_put(this_module2->module); + } + } + } + + return 0; +} + +/* suspend_put_modules + * + * Release our references to modules we used. + */ + +void suspend_put_modules(void) +{ + struct suspend_module_ops *this_module; + + list_for_each_entry(this_module, &suspend_modules, module_list) { + module_put(this_module->module); + } +} diff -urN oldtree/kernel/power/modules.h newtree/kernel/power/modules.h --- oldtree/kernel/power/modules.h 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/modules.h 2006-02-18 15:24:31.394816192 +0000 @@ -0,0 +1,180 @@ +/* + * kernel/power/modules.h + * + * Copyright (C) 2004-2005 Nigel Cunningham + * + * This file is released under the GPLv2. + * + * It contains declarations for modules. Plugins are additions to + * suspend2 that provide facilities such as image compression or + * encryption, backends for storage of the image and user interfaces. + * + */ + +/* This is the maximum size we store in the image header for a module name */ +#define SUSPEND_MAX_PLUGIN_NAME_LENGTH 30 + +/* Per-module metadata */ +struct suspend_module_header { + char name[SUSPEND_MAX_PLUGIN_NAME_LENGTH]; + int disabled; + int type; + int index; + int data_length; + unsigned long signature; +}; + +extern int suspend_num_modules, suspend_num_writers; + +enum { + FILTER_PLUGIN, + WRITER_PLUGIN, + MISC_PLUGIN, /* Block writer, eg. 
*/ + CHECKSUM_PLUGIN +}; + +enum { + SUSPEND_ASYNC, + SUSPEND_SYNC +}; + +struct suspend_filter_ops { + /* Writing the image proper */ + int (*write_chunk) (struct page *buffer_page); + + /* Reading the image proper */ + int (*read_chunk) (struct page *buffer_page, int sync); + + /* Reset module if image exists but reading aborted */ + void (*noresume_reset) (void); + struct list_head filter_list; +}; + +struct suspend_writer_ops { + + /* Writing the image proper */ + int (*write_chunk) (struct page *buffer_page); + + /* Reading the image proper */ + int (*read_chunk) (struct page *buffer_page, int sync); + + /* Reset module if image exists but reading aborted */ + void (*noresume_reset) (void); + + /* Calls for allocating storage */ + + /* Maximum size of image we can save (incl. space already allocated).*/ + int (*storage_available) (void); + + /* Amount of storage already allocated */ + int (*storage_allocated) (void); + + int (*release_storage) (void); + + /* + * Header space is allocated separately. Note that allocation + * of space for the header might result in allocated space + * being stolen from the main pool if there is no unallocated + * space. We have to be able to allocate enough space for + * the header. We can eat memory to ensure there is enough + * for the main pool. 
+ */ + int (*allocate_header_space) (int space_requested); + int (*allocate_storage) (int space_requested); + + /* Read and write the metadata */ + int (*write_header_init) (void); + int (*write_header_chunk) (char *buffer_start, int buffer_size); + int (*write_header_cleanup) (void); + + int (*read_header_init) (void); + int (*read_header_chunk) (char *buffer_start, int buffer_size); + int (*read_header_cleanup) (void); + + /* Prepare metadata to be saved (relativise/absolutise extents) */ + int (*serialise_extents) (void); + int (*load_extents) (void); + + /* Attempt to parse an image location */ + int (*parse_sig_location) (char *buffer, int only_writer); + + /* Determine whether image exists that we can restore */ + int (*image_exists) (void); + + /* Mark the image as having tried to resume */ + void (*mark_resume_attempted) (void); + + /* Destroy image if one exists */ + int (*invalidate_image) (void); + + /* Wait on I/O */ + int (*wait_on_io) (int flush_all); + + struct list_head writer_list; +}; + +struct suspend_module_ops { + /* Functions common to all modules */ + int type; + char *name; + struct module *module; + int disabled; + struct list_head module_list; + + /* Bytes */ + unsigned long (*memory_needed) (void); + unsigned long (*storage_needed) (void); + + int (*print_debug_info) (char *buffer, int size); + int (*save_config_info) (char *buffer); + void (*load_config_info) (char *buffer, int len); + + /* Initialise & cleanup - general routines called + * at the start and end of a cycle. 
*/ + int (*initialise) (int starting_cycle); + void (*cleanup) (int finishing_cycle); + + int (*write_init) (int stream_number); + int (*write_cleanup) (void); + + int (*read_init) (int stream_number); + int (*read_cleanup) (void); + + union { + struct suspend_filter_ops filter; + struct suspend_writer_ops writer; + } ops; +}; + +extern struct suspend_module_ops *suspend_active_writer; +extern struct list_head suspend_filters, suspend_writers, suspend_modules; + +extern void suspend_prepare_console_modules(void); +extern void suspend_cleanup_console_modules(void); + +extern struct suspend_module_ops *suspend_find_module_given_name(char *name), + *suspend_get_next_filter(struct suspend_module_ops *); + +extern int suspend_register_module(struct suspend_module_ops *module); +extern void suspend_move_module_tail(struct suspend_module_ops *module); + +extern unsigned long suspend_header_storage_for_modules(void); +extern unsigned long suspend_memory_for_modules(void); + +extern int suspend_print_module_debug_info(char *buffer, int buffer_size); +extern int suspend_register_module(struct suspend_module_ops *module); +extern void suspend_unregister_module(struct suspend_module_ops *module); + +extern int suspend_initialise_modules(int starting_cycle); +extern void suspend_cleanup_modules(int finishing_cycle); + +int suspend_get_modules(void); +void suspend_put_modules(void); + +static inline void suspend_initialise_module_lists(void) { + INIT_LIST_HEAD(&suspend_filters); + INIT_LIST_HEAD(&suspend_writers); + INIT_LIST_HEAD(&suspend_modules); +} + +extern int suspend_expected_compression_ratio(void); diff -urN oldtree/kernel/power/netlink.c newtree/kernel/power/netlink.c --- oldtree/kernel/power/netlink.c 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/netlink.c 2006-02-18 15:24:31.395816040 +0000 @@ -0,0 +1,370 @@ +/* + * netlink.c + * + * Functions for communicating with a userspace helper via netlink. 
+ */ + + +#include +#include "netlink.h" + +#ifdef CONFIG_NET +struct user_helper_data *uhd_list = NULL; + +/* + * Refill our pool of SKBs for use in emergencies (eg, when eating memory and none + * can be allocated). + */ +static void suspend_fill_skb_pool(struct user_helper_data *uhd) +{ + while (uhd->pool_level < uhd->pool_limit) { + struct sk_buff *new_skb = + alloc_skb(NLMSG_SPACE(uhd->skb_size), GFP_ATOMIC); + + if (!new_skb) + break; + + new_skb->next = uhd->emerg_skbs; + uhd->emerg_skbs = new_skb; + uhd->pool_level++; + } +} + +/* + * Try to allocate a single skb. If we can't get one, try to use one from + * our pool. + */ +static struct sk_buff *suspend_get_skb(struct user_helper_data *uhd) +{ + struct sk_buff *skb = + alloc_skb(NLMSG_SPACE(uhd->skb_size), GFP_ATOMIC); + + if (skb) + return skb; + + skb = uhd->emerg_skbs; + if (skb) { + uhd->pool_level--; + uhd->emerg_skbs = skb->next; + skb->next = NULL; + } + + return skb; +}
+
+/*
+ * Hand an skb back. While the emergency pool is below pool_limit the skb is
+ * pushed onto the emerg_skbs list, otherwise it is freed.
+ *
+ * pool_level is incremented for every skb kept, so that it keeps matching
+ * the list length the way suspend_fill_skb_pool() and suspend_get_skb()
+ * maintain it. Without that the limit test above never trips and the
+ * counter goes negative once pooled skbs are taken out again.
+ */
+static void put_skb(struct user_helper_data *uhd, struct sk_buff *skb)
+{
+	if (uhd->pool_level < uhd->pool_limit) {
+		skb->next = uhd->emerg_skbs;
+		uhd->emerg_skbs = skb;
+		uhd->pool_level++;
+	} else
+		kfree_skb(skb);
+}
+ + +static void suspend_notify_userspace(void* data) +{ + struct task_struct *t; + struct user_helper_data *uhd = (struct user_helper_data *) data; + + BUG_ON(!uhd); + + read_lock(&tasklist_lock); + if ((t = find_task_by_pid(uhd->pid))) + wake_up_process(t); + read_unlock(&tasklist_lock); +} + +DECLARE_WORK(suspend_notify_userspace_work, suspend_notify_userspace, NULL); + +void suspend_send_netlink_message(struct user_helper_data *uhd, + int type, void* params, size_t len) +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + void *dest; + + skb = suspend_get_skb(uhd); + if (!skb) { + printk("suspend_netlink: Can't allocate skb!\n"); + return; + } + + /* NLMSG_PUT contains a hidden goto nlmsg_failure */ + nlh = NLMSG_PUT(skb, 0, uhd->sock_seq, type, len); + uhd->sock_seq++; + + dest = NLMSG_DATA(nlh); + if (params && len > 0) + 
memcpy(dest, params, len); + + netlink_unicast(uhd->nl, skb, uhd->pid, 0); + + /* We may be in an interrupt context so defer waking up userspace */ + suspend_notify_userspace_work.data = uhd; + schedule_work(&suspend_notify_userspace_work); + + return; + +nlmsg_failure: + if (skb) + put_skb(uhd, skb); +} + +#ifdef CONFIG_PM_DEBUG +static int is_debugging = 1; +#else +static int is_debugging = 0; +#endif + +static void send_whether_debugging(struct user_helper_data *uhd) +{ + suspend_send_netlink_message(uhd, NETLINK_MSG_IS_DEBUGGING, + &is_debugging, sizeof(int)); +} + +/* + * Set the PF_NOFREEZE flag on the given process to ensure it can run whilst we + * are suspending. + */ +static int nl_set_nofreeze(struct user_helper_data *uhd, int pid) +{ + struct task_struct *t; + + read_lock(&tasklist_lock); + if ((t = find_task_by_pid(pid)) == NULL) { + read_unlock(&tasklist_lock); + printk("Strange. Can't find the userspace task %d.\n", pid); + return -EINVAL; + } + + t->flags |= PF_NOFREEZE; + + read_unlock(&tasklist_lock); + uhd->pid = pid; + + suspend_send_netlink_message(uhd, NETLINK_MSG_NOFREEZE_ACK, NULL, 0); + + return 0; +} + +/* + * Called when the userspace process has informed us that it's ready to roll. + */ +static int nl_ready(struct user_helper_data *uhd, int version) +{ + if (version != uhd->interface_version) { + printk("%s userspace process using invalid interface version." + " Trying to continue without it.\n", + uhd->name); + if (uhd->not_ready) + uhd->not_ready(); + return 1; + } + + complete(&uhd->wait_for_process); + + return 0; +} + +static int suspend_nl_gen_rcv_msg(struct user_helper_data *uhd, + struct sk_buff *skb, struct nlmsghdr *nlh) +{ + int type; + int *data; + int err; + + /* Let the more specific handler go first. It returns + * 1 for valid messages that it doesn't know. 
*/ + if ((err = uhd->rcv_msg(skb, nlh)) != 1) + return err; + + type = nlh->nlmsg_type; + + /* Only allow one task to receive NOFREEZE privileges */ + if (type == NETLINK_MSG_NOFREEZE_ME && uhd->pid != -1) { + printk("Received extra nofreeze me requests.\n"); + return -EBUSY; + } + + data = (int*)NLMSG_DATA(nlh); + + switch (type) { + case NETLINK_MSG_NOFREEZE_ME: + if ((err = nl_set_nofreeze(uhd, nlh->nlmsg_pid)) != 0) + return err; + break; + case NETLINK_MSG_GET_DEBUGGING: + send_whether_debugging(uhd); + break; + case NETLINK_MSG_READY: + if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) { + printk("Invalid ready mesage.\n"); + return -EINVAL; + } + if ((err = nl_ready(uhd, *data)) != 0) + return err; + break; + } + + return 0; +} + +static void suspend_user_rcv_skb(struct user_helper_data *uhd, + struct sk_buff *skb) +{ + int err; + struct nlmsghdr *nlh; + + while (skb->len >= NLMSG_SPACE(0)) { + u32 rlen; + + nlh = (struct nlmsghdr *) skb->data; + if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) + return; + + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + + if ((err = suspend_nl_gen_rcv_msg(uhd, skb, nlh)) != 0) + netlink_ack(skb, nlh, err); + else if (nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); + skb_pull(skb, rlen); + } +} + +static void suspend_netlink_input(struct sock *sk, int len) +{ + struct user_helper_data *uhd = uhd_list; + + while (uhd && uhd->netlink_id != sk->sk_protocol) + uhd= uhd->next; + + BUG_ON(!uhd); + + do { + struct sk_buff *skb; + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + suspend_user_rcv_skb(uhd, skb); + put_skb(uhd, skb); + } + } while (uhd->nl && uhd->nl->sk_receive_queue.qlen); +} + +static int netlink_prepare(struct user_helper_data *uhd) +{ + uhd->next = uhd_list; + uhd_list = uhd; + + uhd->sock_seq = 0x42c0ffee; + uhd->nl = netlink_kernel_create(uhd->netlink_id, 0, + suspend_netlink_input, THIS_MODULE); + if (!uhd->nl) { + printk("Failed to allocate 
netlink socket for %s.\n", + uhd->name); + return -ENOMEM; + } + + suspend_fill_skb_pool(uhd); + + return 0; +}
+
+/*
+ * suspend_netlink_close
+ *
+ * Release the netlink socket (if one was created) and free every skb held
+ * on the emergency list.
+ */
+void suspend_netlink_close(struct user_helper_data *uhd)
+{
+	if (uhd->nl) {
+		sock_release(uhd->nl->sk_socket);
+		uhd->nl = NULL;
+	}
+
+	while (uhd->emerg_skbs) {
+		struct sk_buff *next = uhd->emerg_skbs->next;
+		kfree_skb(uhd->emerg_skbs);
+		uhd->emerg_skbs = next;
+	}
+
+	/*
+	 * The list is empty again, so the count has to say so; otherwise
+	 * suspend_fill_skb_pool() sees a full pool at the next setup and
+	 * allocates nothing.
+	 */
+	uhd->pool_level = 0;
+}
+
+/* Command words accepted before the optional channel argument. */
+#define SUSPEND_HELPER_MAX_ARGS 7
+
+/*
+ * suspend2_launch_userspace_program
+ *
+ * Split command into blank separated words (at most
+ * SUSPEND_HELPER_MAX_ARGS; further words are ignored), append
+ * "-c<channel_no>" when channel_no is non-zero and start the program with
+ * call_usermodehelper().
+ *
+ * Returns 1 if command holds no words, -ENOMEM if a word cannot be
+ * duplicated, and otherwise whatever call_usermodehelper() returned.
+ */
+int suspend2_launch_userspace_program(char *command, int channel_no)
+{
+	static char *envp[] = {
+		"HOME=/",
+		"TERM=linux",
+		"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+		NULL };
+	/* Room for the command words, the channel option and the final NULL. */
+	static char *argv[SUSPEND_HELPER_MAX_ARGS + 2];
+	char channel[16];	/* "-c" followed by any int, NUL terminated */
+	char *posn = command;
+	int arg = 0, i, retval;
+
+	if (!command)
+		return 1;
+
+	/* argv is static: make sure nothing from an earlier call survives. */
+	memset(argv, 0, sizeof(argv));
+
+	while (arg < SUSPEND_HELPER_MAX_ARGS) {
+		char *start;
+		int size;
+
+		/* Skip separators; stop at the end of the string. */
+		while (*posn == ' ' || *posn == '\t' || *posn == '\n')
+			posn++;
+		if (!*posn)
+			break;
+
+		start = posn;
+		while (*posn && *posn != ' ' && *posn != '\t' && *posn != '\n')
+			posn++;
+		size = posn - start;
+
+		argv[arg] = kmalloc(size + 1, GFP_ATOMIC);
+		if (!argv[arg]) {
+			retval = -ENOMEM;
+			goto out;
+		}
+		memcpy(argv[arg], start, size);
+		argv[arg][size] = '\0';
+		arg++;
+	}
+
+	/* Empty or all-blank command: there is nothing to run. */
+	if (!arg)
+		return 1;
+
+	if (channel_no) {
+		snprintf(channel, sizeof(channel), "-c%d", channel_no);
+		argv[arg] = channel;
+	}
+
+	retval = call_usermodehelper(argv[0], argv, envp, 0);
+
+	if (retval)
+		printk("Failed to launch userspace program '%s': Error %d\n",
+				command, retval);
+
+out:
+	/* Only the first arg entries were allocated; channel is on the stack. */
+	for (i = 0; i < arg; i++)
+		kfree(argv[i]);
+	memset(argv, 0, sizeof(argv));
+
+	return retval;
+}
+
+int suspend_netlink_setup(struct user_helper_data *uhd) +{ + if (netlink_prepare(uhd) < 0) { + printk("Netlink prepare failed.\n"); + return 1; + } + + if (suspend2_launch_userspace_program(uhd->program, uhd->netlink_id) < 0) { + printk("Launch userspace program failed.\n"); + suspend_netlink_close(uhd); + return 1; + } + + /* Wait 2 seconds for the userspace process to make contact */ + wait_for_completion_timeout(&uhd->wait_for_process, 2*HZ); + + if (uhd->pid == -1) { + printk("%s: Failed to 
contact userspace process.\n", + uhd->name); + suspend_netlink_close(uhd); + return 1; + } + + return 0; +} + +#else +#endif diff -urN oldtree/kernel/power/netlink.h newtree/kernel/power/netlink.h --- oldtree/kernel/power/netlink.h 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/netlink.h 2006-02-18 15:24:31.396815888 +0000 @@ -0,0 +1,43 @@ +/* + * netlink.h + * + * Declarations for functions for communicating with a userspace helper + * via netlink. + */ + +#include +#include + +#define NETLINK_MSG_BASE 0x10 + +#define NETLINK_MSG_READY 0x10 +#define NETLINK_MSG_NOFREEZE_ME 0x16 +#define NETLINK_MSG_GET_DEBUGGING 0x19 +#define NETLINK_MSG_CLEANUP 0x24 +#define NETLINK_MSG_NOFREEZE_ACK 0x27 +#define NETLINK_MSG_IS_DEBUGGING 0x28 + +struct user_helper_data { + int (*rcv_msg) (struct sk_buff *skb, struct nlmsghdr *nlh); + void (* not_ready) (void); + struct sock *nl; + u32 sock_seq; + pid_t pid; + char *comm; + char program[256]; + int pool_level; + int pool_limit; + struct sk_buff *emerg_skbs; + int skb_size; + int netlink_id; + char *name; + struct user_helper_data *next; + struct completion wait_for_process; + int interface_version; + int must_init; +}; + +void suspend_send_netlink_message(struct user_helper_data *uhd, + int type, void* params, size_t len); +int suspend_netlink_setup(struct user_helper_data *uhd); +void suspend_netlink_close(struct user_helper_data *uhd); diff -urN oldtree/kernel/power/pagedir.c newtree/kernel/power/pagedir.c --- oldtree/kernel/power/pagedir.c 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/pagedir.c 2006-02-18 15:24:31.397815736 +0000 @@ -0,0 +1,370 @@ +/* + * kernel/power/pagedir.c + * + * Copyright (C) 1998-2001 Gabor Kuti + * Copyright (C) 1998,2001,2002 Pavel Machek + * Copyright (C) 2002-2003 Florent Chabaud + * Copyright (C) 2002-2005 Nigel Cunningham + * + * This file is released under the GPLv2. + * + * Routines for handling pagesets. + * Note that pbes aren't actually stored as such. 
They're stored as + * bitmaps and extents. + */ + +#include +#include +#include +#include + +#include "pageflags.h" +#include "ui.h" +#include "pagedir.h" + +int extra_pagedir_pages_allocated = 0; + +/* Not static so allocation routine can BUG if recursively called */ +dyn_pageflags_t conflicting_pages_map; + +#define PageConflicting(page) (test_dynpageflag(&conflicting_pages_map, page)) +#define SetPageConflicting(page) (set_dynpageflag(&conflicting_pages_map, page)) +#define ClearPageConflicting(page) (clear_dynpageflag(&conflicting_pages_map, page)) + +/* suspend_free_extra_pagedir_memory + * + * Description: Free a previously pagedir metadata. + */ +void suspend_free_extra_pagedir_memory(void) +{ + unsigned long pagenumber; + + free_dyn_pageflags(&pageset1_map); + free_dyn_pageflags(&pageset2_map); + free_dyn_pageflags(&pageset1_copy_map); + + /* Free allocated pages */ + if (allocd_pages_map) { + BITMAP_FOR_EACH_SET(allocd_pages_map, pagenumber) { + struct page *page = pfn_to_page(pagenumber); + ClearPageNosave(page); + __free_page(page); + extra_pagedir_pages_allocated--; + } + free_dyn_pageflags(&allocd_pages_map); + } +} + +/* suspend_allocate_extra_pagedir_memory + * + * Description: Allocate memory for making the atomic copy of pagedir1 in the + * case where it is bigger than pagedir2. + * Arguments: struct pagedir *: The pagedir for which we should + * allocate memory. + * int: Size of pageset 1. + * int: Size of pageset 2. + * Result: int. Zero on success. One if unable to allocate enough memory. 
+ */ +int suspend_allocate_extra_pagedir_memory(struct pagedir *p, int pageset_size, + int alloc_from) +{ + int num_to_alloc = pageset_size - alloc_from - extra_pagedir_pages_allocated; + int j, order; + + if (num_to_alloc < 1) + num_to_alloc = 0; + + if (num_to_alloc) { + int num_added = 0; + + order = generic_fls(num_to_alloc); + if (order >= MAX_ORDER) + order = MAX_ORDER - 1; + + while (num_added < num_to_alloc) { + struct page *newpage; + unsigned long virt; + + while ((1 << order) > (num_to_alloc - num_added)) + order--; + + virt = __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, order); + while ((!virt) && (order > 0)) { + order--; + virt = __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, order); + } + + if (!virt) { + p->pageset_size += num_added; + return 1; + } + + newpage = virt_to_page(virt); + for (j = 0; j < (1 << order); j++) { + SetPageNosave(newpage + j); + /* Pages will be freed one at a time. */ + set_page_count(newpage + j, 1); + SetPageAllocd(newpage + j); + extra_pagedir_pages_allocated++; + } + num_added+= (1 << order); + } + } + + return 0; +} + +/* + * suspend_mark_task_as_pageset1 + * Functionality : Marks all the pages belonging to a given process as + * pageset 1 pages. + * Called From : pagedir.c - mark_pages_for_pageset2 + * + */ +extern struct page *suspend2_follow_page(struct mm_struct *mm, unsigned long address); + +void suspend_mark_task_as_pageset1(struct task_struct *t) +{ + struct vm_area_struct *vma; + struct mm_struct *mm; + + mm = t->active_mm; + + if (!mm || !mm->mmap) return; + + /* Don't try to take the sem when processes are frozen, + * drivers are suspended and irqs are disabled. We're + * not racing with anything anyway. 
*/ + BUG_ON(in_atomic() && !irqs_disabled()); + + if (!irqs_disabled()) + down_read(&mm->mmap_sem); + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->vm_flags & VM_PFNMAP) + continue; + if (vma->vm_start) { + unsigned long posn; + for (posn = vma->vm_start; posn < vma->vm_end; + posn += PAGE_SIZE) { + struct page *page = + suspend2_follow_page(mm, posn); + if (page) + ClearPagePageset2(page); + } + } + } + + BUG_ON(in_atomic() && !irqs_disabled()); + + if (!irqs_disabled()) + up_read(&mm->mmap_sem); +} + +/* mark_pages_for_pageset2 + * + * Description: Mark unshared pages in processes not needed for suspend as + * being able to be written out in a separate pagedir. + * HighMem pages are simply marked as pageset2. They won't be + * needed during suspend. + */ + +struct attention_list { + struct task_struct *task; + struct attention_list *next; +}; + +#define HALT_ON(condition) \ + do { if (unlikely(condition)) { \ + printk("Suspend2: Halting at line %d. Please report to nigel@suspend2.net.\n", __LINE__); \ + while(1) \ + cpu_relax(); \ + } } while(0) + +void suspend_mark_pages_for_pageset2(void) +{ + struct zone *zone; + struct task_struct *p; + struct attention_list *attention_list = NULL, *last = NULL; + unsigned long flags, i; + + HALT_ON(in_atomic() && !irqs_disabled()); + + clear_dyn_pageflags(pageset2_map); + + if (test_action_state(SUSPEND_NO_PAGESET2)) + return; + + /* + * Note that we don't clear the map to begin with! + * This is because if we eat memory, we loose track + * of LRU pages that are still in use but taken off + * the LRU. If I can figure out how the VM keeps + * track of them, I might be able to tweak this a + * little further and decrease pageset one's size + * further. + * + * (Memory grabbing clears the pageset2 flag on + * pages that are really freed!). 
+ */ + + for_each_zone(zone) { + spin_lock_irqsave(&zone->lru_lock, flags); + if (zone->nr_inactive) { + struct page *page; + list_for_each_entry(page, &zone->inactive_list, lru) + SetPagePageset2(page); + } + if (zone->nr_active) { + struct page *page; + list_for_each_entry(page, &zone->active_list, lru) + SetPagePageset2(page); + } + spin_unlock_irqrestore(&zone->lru_lock, flags); + } + + HALT_ON(in_atomic() && !irqs_disabled()); + + /* Now we find all userspace process (with task->mm) marked PF_NOFREEZE + * and move them into pageset1. + */ + read_lock(&tasklist_lock); + for_each_process(p) + if ((p->mm || p->active_mm) && (p->flags & PF_NOFREEZE)) { + struct attention_list *this = kmalloc(sizeof(struct attention_list), GFP_ATOMIC); + BUG_ON(!this); + this->task = p; + this->next = NULL; + if (attention_list) { + last->next = this; + last = this; + } else + attention_list = last = this; + } + read_unlock(&tasklist_lock); + + HALT_ON(in_atomic() && !irqs_disabled()); + + /* Because the tasks in attention_list are ones related to suspending, + * we know that they won't go away under us. + */ + + while (attention_list) { + suspend_mark_task_as_pageset1(attention_list->task); + last = attention_list; + attention_list = attention_list->next; + kfree(last); + } + + HALT_ON(in_atomic() && !irqs_disabled()); + + for_each_zone(zone) { + if (!zone->present_pages) + continue; + for (i = 0; i < zone->spanned_pages; i++) { + struct page *page = pfn_to_page(zone->zone_start_pfn + i); + BUG_ON(PagePageset2(page) && PageSlab(page)); + } + } + + HALT_ON(in_atomic() && !irqs_disabled()); + +} + +/* suspend_get_nonconflicting_pages + * + * Description: Gets higher-order pages that won't be overwritten + * while copying the original pages. + * + * Note that if only one of the allocated pages overlaps + * with the pages that overlap, another set must be + * tried. Therefore, you shouldn't use this function + * much, and not with high orders. 
+ */ + +unsigned long suspend_get_nonconflicting_pages(const int order) +{ + struct page *page; + unsigned long new_page, i; + int more = 0; + + do { + new_page = __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, order); + if (!new_page) + return 0; + page = virt_to_page(new_page); + more = 0; + for (i = 0; i < (1UL << order); i++) { + if (PagePageset1(page + i)) { + more = 1; + break; + } + } + if (more) { + for (i = 0; i < (1UL << order); i++) + if (PagePageset1(page + i)) + SetPageConflicting(page + i); + else { + set_page_count(page + i, 1); + __free_pages(page + i, 0); + } + } + } + while (more); + + memset((void*)new_page, 0, PAGE_SIZE * (1< + * + * This file is released under the GPLv2. + * + * Declarations for routines for handling pagesets. + */ + +/* Pagedir + * + * Contains the metadata for a set of pages saved in the image. + */ + +struct pagedir { + int pageset_size; + int lastpageset_size; +}; + +extern struct pagedir pagedir1, pagedir2; + +extern void suspend_copy_pageset1(void); + +extern void suspend_free_extra_pagedir_memory(void); + +extern int suspend_allocate_extra_pagedir_memory(struct pagedir *p, int pageset_size, int alloc_from); + +extern void suspend_mark_task_as_pageset1 (struct task_struct *t); +extern void suspend_mark_pages_for_pageset2(void); + +extern void suspend_relocate_if_required(unsigned long *current_value, unsigned int size); +extern int suspend_get_pageset1_load_addresses(void); + +extern int extra_pagedir_pages_allocated; + +extern unsigned long suspend_get_nonconflicting_pages(int order); diff -urN oldtree/kernel/power/pageflags.c newtree/kernel/power/pageflags.c --- oldtree/kernel/power/pageflags.c 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/pageflags.c 2006-02-18 15:24:31.399815432 +0000 @@ -0,0 +1,150 @@ +/* + * kernel/power/suspend_core/pageflags.c + * + * Copyright (C) 2004-2005 Nigel Cunningham + * + * This file is released under the GPLv2. 
+ * + * Routines for dynamically allocating and releasing bitmaps + * used as pseudo-pageflags. + * + * Arrays are not contiguous. The first sizeof(void *) bytes are + * the pointer to the next page in the bitmap. This allows us to + * 1) work under low memory conditions where order 0 might be all + * that's available + * 2) save the pages at suspend time, reload and relocate them as + * necessary at resume time without breaking anything (cf + * extent pages). + */ + +#include +#include +#include +#include +#include +#include +#include "pageflags.h" +#include "modules.h" +#include "pagedir.h" + +/* Maps used in copying the image back are in builtin.c */ +dyn_pageflags_t pageset1_map; +dyn_pageflags_t pageset1_copy_map; +dyn_pageflags_t pageset2_map; +dyn_pageflags_t in_use_map; +dyn_pageflags_t allocd_pages_map; +#ifdef CONFIG_DEBUG_PAGEALLOC +dyn_pageflags_t unmap_map; +#endif +dyn_pageflags_t checksum_map; + +static int num_zones(void) +{ + int result = 0; + struct zone *zone; + + for_each_zone(zone) + result++; + + return result; +} + +static int pages_for_zone(struct zone *zone) +{ + return (zone->spanned_pages + (PAGE_SIZE << 3) - 1) / + (PAGE_SIZE << 3); +} + +/* save_dyn_pageflags + * + * Description: Save a set of pageflags. + * Arguments: dyn_pageflags_t *: Pointer to the bitmap being saved. 
+ */
+
+void save_dyn_pageflags(dyn_pageflags_t pagemap)
+{
+	int i, zone_num = 0;
+	struct zone *zone;
+
+	/*
+	 * Nothing is written for a map that is not allocated. pagemap itself
+	 * is tested first so that *pagemap is never read through NULL.
+	 */
+	if (!pagemap || !*pagemap)
+		return;
+
+	for_each_zone(zone) {
+		int size = pages_for_zone(zone);
+
+		/* Per zone: its index, its page count, then the bitmap pages. */
+		suspend_active_writer->ops.writer.write_header_chunk((char *) &zone_num, sizeof(int));
+		suspend_active_writer->ops.writer.write_header_chunk((char *) &size, sizeof(int));
+
+		for (i = 0; i < size; i++)
+			suspend_active_writer->ops.writer.write_header_chunk((char *) pagemap[zone_num][i], PAGE_SIZE);
+		zone_num++;
+	}
+
+	/* -1 is the end marker load_dyn_pageflags() checks for. */
+	zone_num = -1;
+	suspend_active_writer->ops.writer.write_header_chunk((char *) &zone_num, sizeof(int));
+}
+
+/* load_dyn_pageflags
+ *
+ * Description:	Load a set of pageflags in the layout written by
+ * 		save_dyn_pageflags: for each zone its index, its page
+ * 		count and the bitmap pages, then a -1 end marker.
+ * Arguments:	dyn_pageflags_t: The bitmap being loaded.
+ * 		(It must be allocated before calling this routine).
+ */
+
+void load_dyn_pageflags(dyn_pageflags_t pagemap)
+{
+	int i, zone_num = 0, zone_check = 0;
+	struct zone *zone;
+
+	if (!pagemap)
+		return;
+
+	for_each_zone(zone) {
+		int size = 0;
+		suspend_active_writer->ops.writer.read_header_chunk((char *) &zone_check, sizeof(int));
+		if (zone_check != zone_num) {
+			printk("Zone check (%d) != zone_num (%d).\n", zone_check, zone_num);
+			BUG();
+		}
+		suspend_active_writer->ops.writer.read_header_chunk((char *) &size, sizeof(int));
+
+		/*
+		 * size comes from the image. The map only has
+		 * pages_for_zone() pages for this zone, so a larger count
+		 * would make the loop below write past them.
+		 */
+		if (size < 0 || size > pages_for_zone(zone)) {
+			printk("Bad dyn pageflag page count (%d) for zone %d.\n", size, zone_num);
+			BUG();
+		}
+
+		for (i = 0; i < size; i++)
+			suspend_active_writer->ops.writer.read_header_chunk((char *) pagemap[zone_num][i], PAGE_SIZE);
+		zone_num++;
+	}
+	suspend_active_writer->ops.writer.read_header_chunk((char *) &zone_check, sizeof(int));
+	if (zone_check != -1) {
+		printk("Didn't read end of dyn pageflag data marker.(%x)\n", zone_check);
+		BUG();
+	}
+}
+
+/* relocate_dyn_pageflags + * + * Description: Relocate a set of pageflags to ensure they don't collide with + * pageset 1 data which will get overwritten on copyback. + * Arguments: dyn_pageflags_t *: Pointer to the bitmap being relocated. 
+ */ + +extern int num_zones(void); + +void relocate_dyn_pageflags(dyn_pageflags_t *pagemap) +{ + int i, zone_num = 0; + struct zone *zone; + + if (!*pagemap) + return; + + suspend_relocate_if_required((void *) pagemap, sizeof (void *) * num_zones()); + + for_each_zone(zone) { + int pages = (zone->spanned_pages + (PAGE_SIZE << 3) - 1) >> + (PAGE_SHIFT + 3); + + suspend_relocate_if_required((void *) &((*pagemap)[zone_num]), sizeof(void *) * pages); + + for (i = 0; i < pages; i++) + suspend_relocate_if_required((void *) &((*pagemap)[zone_num][i]), + PAGE_SIZE); + zone_num++; + } +} diff -urN oldtree/kernel/power/pageflags.h newtree/kernel/power/pageflags.h --- oldtree/kernel/power/pageflags.h 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/pageflags.h 2006-02-18 15:24:31.400815280 +0000 @@ -0,0 +1,86 @@ +/* + * kernel/power/pageflags.h + * + * Copyright (C) 2004-2005 Nigel Cunningham + * + * This file is released under the GPLv2. + * + * Suspend2 needs a few pageflags while working that aren't otherwise + * used. To save the struct page pageflags, we dynamically allocate + * a bitmap and use that. These are the only non order-0 allocations + * we do. + * + * NOTE!!! + * We assume that PAGE_SIZE - sizeof(void *) is a multiple of + * sizeof(unsigned long). Is this ever false? + */ + +#include +#include + +extern dyn_pageflags_t in_use_map; +extern dyn_pageflags_t allocd_pages_map; +#ifdef CONFIG_DEBUG_PAGEALLOC +extern dyn_pageflags_t unmap_map; +#endif +extern dyn_pageflags_t pageset2_map; +extern dyn_pageflags_t conflicting_pages_map; +extern dyn_pageflags_t checksum_map; + +/* + * inusemap is used in two ways: + * - During suspend, to tag pages which are not used (to speed up + * count_data_pages); + * - During resume, to tag pages which are in pagedir1. This does not tag + * pagedir2 pages, so !== first use. 
+ */ + +#define PageInUse(page) (test_dynpageflag(&in_use_map, page)) +#define SetPageInUse(page) (set_dynpageflag(&in_use_map, page)) +#define ClearPageInUse(page) (clear_dynpageflag(&in_use_map, page)) + +#define PagePageset1(page) (test_dynpageflag(&pageset1_map, page)) +#define SetPagePageset1(page) (set_dynpageflag(&pageset1_map, page)) +#define ClearPagePageset1(page) (clear_dynpageflag(&pageset1_map, page)) + +#define PagePageset1Copy(page) (test_dynpageflag(&pageset1_copy_map, page)) +#define SetPagePageset1Copy(page) (set_dynpageflag(&pageset1_copy_map, page)) +#define ClearPagePageset1Copy(page) (clear_dynpageflag(&pageset1_copy_map, page)) + +#define PagePageset2(page) (test_dynpageflag(&pageset2_map, page)) +#define SetPagePageset2(page) (set_dynpageflag(&pageset2_map, page)) +#define ClearPagePageset2(page) (clear_dynpageflag(&pageset2_map, page)) + +#define PageAllocd(page) (test_dynpageflag(&allocd_pages_map, page)) +#define SetPageAllocd(page) (set_dynpageflag(&allocd_pages_map, page)) +#define ClearPageAllocd(page) (clear_dynpageflag(&allocd_pages_map, page)) + +#ifdef CONFIG_DEBUG_PAGEALLOC +#define PageUnmap(page) (test_dynpageflag(&unmap_map, page)) +#define SetPageUnmap(page) (set_dynpageflag(&unmap_map, page)) +#define ClearPageUnmap(page) (clear_dynpageflag(&unmap_map, page)) +#endif + +static inline int PageChecksumIgnore(struct page *page) +{ + return checksum_map ? 
+ test_dynpageflag(&checksum_map, page) : + 0; +} + +static inline void SetPageChecksumIgnore(struct page *page) +{ + if (checksum_map) + set_dynpageflag(&checksum_map, page); +}; + +static inline void ClearPageChecksumIgnore(struct page *page) +{ + if (checksum_map) + clear_dynpageflag(&checksum_map, page); +}; + +extern void save_dyn_pageflags(dyn_pageflags_t pagemap); +extern void load_dyn_pageflags(dyn_pageflags_t pagemap); +void relocate_dyn_pageflags(dyn_pageflags_t *pagemap); + diff -urN oldtree/kernel/power/power.h newtree/kernel/power/power.h --- oldtree/kernel/power/power.h 2006-02-18 15:18:30.087743216 +0000 +++ newtree/kernel/power/power.h 2006-02-18 15:24:31.401815128 +0000 @@ -1,6 +1,8 @@ #include #include +#include "suspend.h" + struct swsusp_info { struct new_utsname uts; u32 version_code; @@ -35,7 +37,7 @@ extern struct subsystem power_subsys; /* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; +//extern const void __nosave_begin, __nosave_end; extern unsigned int nr_copy_pages; extern struct pbe *pagedir_nosave; diff -urN oldtree/kernel/power/power_off.c newtree/kernel/power/power_off.c --- oldtree/kernel/power/power_off.c 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/power_off.c 2006-02-18 15:24:31.402814976 +0000 @@ -0,0 +1,78 @@ +/* + * kernel/power/power_off.c + * + * Copyright (C) 2005 Nigel Cunningham + * + * This file is released under the GPLv2. + * + * Support for powering down. 
+ */ + +#include +#include +#include +#include +#include +#include "suspend2_common.h" +#include "suspend2.h" +#include "ui.h" + +unsigned long suspend_powerdown_method = 0; /* 0 - Kernel power off */ + +extern struct pm_ops *pm_ops; + +/* Use suspend_enter from main.c */ +extern int suspend_enter(suspend_state_t state);
+
+/*
+ * try_pm_state_powerdown
+ *
+ * Try to power down by entering the pm state selected by
+ * suspend_powerdown_method.
+ *
+ * Returns 1 if suspend_enter() succeeded and 0 on any failure. Every path
+ * past a successful prepare() runs the matching finish(), and devices are
+ * resumed whether or not the state could be entered, so a caller that falls
+ * back to another power-off method does not inherit a half-prepared system.
+ */
+int try_pm_state_powerdown(void)
+{
+	int result = 0;
+
+	if (pm_ops && pm_ops->prepare && suspend_powerdown_method &&
+	    pm_ops->prepare(suspend_powerdown_method))
+		return 0;
+
+	if (suspend_powerdown_method > 3)
+		kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
+	else if (device_suspend(PMSG_SUSPEND)) {
+		printk(KERN_ERR "Some devices failed to suspend\n");
+		goto finish;
+	}
+
+	if (!suspend_enter(suspend_powerdown_method))
+		result = 1;
+
+	device_resume();
+
+finish:
+	/* We are past prepare(), so finish() is owed on every path from here. */
+	if (pm_ops && pm_ops->finish && suspend_powerdown_method)
+		pm_ops->finish(suspend_powerdown_method);
+
+	return result;
+}
+
+/* + * suspend_power_down + * Functionality : Powers down or reboots the computer once the image + * has been written to disk. + * Key Assumptions : Able to reboot/power down via code called or that + * the warning emitted if the calls fail will be visible + * to the user (ie printk resumes devices). 
+ * Called From     : do_suspend2_suspend_2
+ */
+
+void suspend_power_down(void)
+{
+	/* A reboot request is handled before any power-down method. */
+	if (test_action_state(SUSPEND_REBOOT)) {
+		suspend_prepare_status(DONT_CLEAR_BAR, "Ready to reboot.");
+		kernel_restart(NULL);
+	}
+
+	/*
+	 * Only try a PM state when one was requested and the platform
+	 * provides an enter hook; a successful attempt leaves nothing to do.
+	 */
+	if (pm_ops && pm_ops->enter && suspend_powerdown_method) {
+		if (try_pm_state_powerdown())
+			return;
+	}
+
+	kernel_power_off();
+
+	/* Only reached if kernel_power_off() returned. */
+	suspend_prepare_status(DONT_CLEAR_BAR, "Powerdown failed");
+	for (;;)
+		cpu_relax();
+}
+
diff -urN oldtree/kernel/power/power_off.h newtree/kernel/power/power_off.h
--- oldtree/kernel/power/power_off.h 1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/power_off.h 2006-02-18 15:24:31.403814824 +0000
@@ -0,0 +1,13 @@
+/*
+ * kernel/power/power_off.h
+ *
+ * Copyright (C) 2005 Nigel Cunningham
+ *
+ * This file is released under the GPLv2.
+ *
+ * Support for powering down.
+ */
+
+int suspend_pm_state_finish(void);
+void suspend_power_down(void);
+extern unsigned long suspend_powerdown_method;
diff -urN oldtree/kernel/power/prepare_image.c newtree/kernel/power/prepare_image.c
--- oldtree/kernel/power/prepare_image.c 1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/prepare_image.c 2006-02-18 15:24:31.404814672 +0000
@@ -0,0 +1,751 @@
+/*
+ * kernel/power/prepare_image.c
+ *
+ * Copyright (C) 2003-2005 Nigel Cunningham
+ *
+ * This file is released under the GPLv2.
+ *
+ * We need to eat memory until we can:
+ * 1. Perform the save without changing anything (RAM_NEEDED < max_pfn)
+ * 2. Fit it all in available space (suspend_active_writer->available_space() >=
+ *    storage_needed())
+ * 3. Reload the pagedir and pageset1 to places that don't collide with their
+ *    final destinations, not knowing to what extent the resumed kernel will
+ *    overlap with the one loaded at boot time. I think the resumed kernel
+ *    should overlap completely, but I don't want to rely on this as it is
+ *    an unproven assumption. We therefore assume there will be no overlap at
+ *    all (worst case).
+ * 4. 
Meet the user's requested limit (if any) on the size of the image.
+ *    The limit is in MB, so pages/256 (assuming 4K pages).
+ *
+ */
+
+#include
+#include
+#include
+
+#include "suspend2.h"
+#include "pageflags.h"
+#include "modules.h"
+#include "suspend2_common.h"
+#include "io.h"
+#include "ui.h"
+#include "extent.h"
+#include "prepare_image.h"
+#include "checksum.h"
+
+static int are_frozen = 0, num_nosave = 0;
+static int header_space_allocated = 0;
+static int storage_allocated = 0;
+static int storage_available = 0;
+int extra_pd1_pages_allowance = 100;
+
+/* Sum the page counts held on every possible CPU's per-cpu lists. */
+static int num_pcp_pages(void)
+{
+	struct zone *zone;
+	int result = 0, i = 0;
+
+	/* PCP lists */
+	for_each_zone(zone) {
+		struct per_cpu_pageset *pset;
+		int cpu;
+
+		if (!zone->present_pages)
+			continue;
+
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			if (!cpu_possible(cpu))
+				continue;
+
+			pset = zone_pcp(zone, cpu);
+
+			for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
+				struct per_cpu_pages *pcp;
+
+				pcp = &(pset->pcp[i]);
+				result += pcp->count;
+			}
+		}
+	}
+	return result;
+}
+
+/* Free pages as reported by nr_free_pages() plus those on per-cpu lists. */
+int real_nr_free_pages(void)
+{
+	return nr_free_pages() + num_pcp_pages();
+}
+
+/*
+ * Measure how many free pages disappear while devices are suspended and
+ * powered down, and store that (plus 100) in extra_pd1_pages_allowance.
+ */
+static void get_extra_pd1_allowance(void)
+{
+	int orig_num_free = real_nr_free_pages(), final;
+
+	suspend_prepare_status(CLEAR_BAR, "Finding allowance for drivers.");
+	device_suspend(PMSG_FREEZE);
+	local_irq_disable(); /* irqs might have been re-enabled on us */
+	device_power_down(PMSG_FREEZE);
+
+	final = real_nr_free_pages();
+
+	device_power_up();
+	local_irq_enable();
+
+	device_resume();
+
+	extra_pd1_pages_allowance = orig_num_free - final + 100;
+}
+
+/*
+ * Pages needed for both pagesets (optionally plus the pagedir1 allowance),
+ * scaled by the expected compression ratio (a percentage) when use_ecr is
+ * set.
+ */
+static int main_storage_needed(int use_ecr,
+		int ignore_extra_pd1_allow)
+{
+	return ((pagedir1.pageset_size + pagedir2.pageset_size +
+		(ignore_extra_pd1_allow ? 0 : extra_pd1_pages_allowance)) *
+		(use_ecr ? suspend_expected_compression_ratio() : 100) / 100);
+}
+
+/* Size of the image header in bytes, rounded up to whole pages. */
+static int header_storage_needed(void)
+{
+	unsigned long bytes = ((extents_allocated * 2 * sizeof(unsigned long)) +
+		sizeof(struct suspend_header) +
+		sizeof(struct suspend_module_header) +
+		(int) suspend_header_storage_for_modules() +
+		(dyn_pageflags_pages_per_bitmap() << PAGE_SHIFT) +
+		suspend_num_modules *
+		(sizeof(struct suspend_module_header) + sizeof(int)));
+
+	return ((int) ((bytes + (int) PAGE_SIZE - 1) >> PAGE_SHIFT));
+}
+
+/* display_stats
+ *
+ * Description:	Format a one-line summary of the memory and storage
+ *		counters used while preparing the image.
+ * Arguments:	always - non-zero: emit with printk; zero: emit via
+ *		suspend_message at SUSPEND_MEDIUM level.
+ *		sub_extra_pd1_allow - passed to storage_needed() as its
+ *		ignore_extra_pd1_allow argument.
+ *
+ * The formatted text is always passed as a "%s" argument, never as the
+ * format string itself.
+ */
+static void display_stats(int always, int sub_extra_pd1_allow)
+{
+	/* Named so that it does not shadow the file-scope storage_allocated. */
+	unsigned long allocated = suspend_active_writer->ops.writer.storage_allocated();
+	char buffer[255];
+
+	/* storage_needed() and storage_available are ints, hence %d. */
+	snprintf(buffer, sizeof(buffer),
+		"Free:%d(%d). Sets:%d(%d),%d(%d). Header:%d. Nosave:%d-%d=%d. Storage:%lu/%d(%d). Needed:%d|%d|%d.\n",
+
+		/* Free */
+		nr_free_pages(),
+		nr_free_pages() - nr_free_highpages(),
+
+		/* Sets */
+		pagedir1.pageset_size, pageset1_sizelow,
+		pagedir2.pageset_size, pageset2_sizelow,
+
+		/* Header */
+		header_storage_needed(),
+
+		/* Nosave */
+		num_nosave, extra_pagedir_pages_allocated,
+		num_nosave - extra_pagedir_pages_allocated,
+
+		/* Storage - converted to pages for comparison */
+		allocated,
+		storage_needed(1, sub_extra_pd1_allow),
+		storage_available,
+
+		/* Needed */
+		ram_to_suspend() - nr_free_pages() - nr_free_highpages(),
+		storage_needed(1, sub_extra_pd1_allow) - storage_available,
+		(image_size_limit > 0) ? (storage_needed(1, sub_extra_pd1_allow) - (image_size_limit << 8)) : 0);
+	if (always)
+		printk("%s", buffer);
+	else
+		suspend_message(SUSPEND_EAT_MEMORY, SUSPEND_MEDIUM, 1, "%s", buffer);
+}
+
+/* generate_free_page_map
+ *
+ * Description:	This routine generates a bitmap of free pages from the
+ *		lists used by the memory manager. We then use the bitmap
+ *		to quickly calculate which pages to save and in which
+ *		pagesets.
+ */
+static void generate_free_page_map(void)
+{
+	unsigned long offset, flags;
+	int order, loop, cpu, list;
+	struct page *page;
+	struct zone *zone;
+	struct per_cpu_pageset *pset;
+	struct per_cpu_pages *pcp;
+
+	/* Pass 1: mark every page of every populated zone as in use. */
+	for_each_zone(zone) {
+		if (!zone->present_pages)
+			continue;
+		/* spanned_pages is unsigned long, so the index is too. */
+		for (offset = 0; offset < zone->spanned_pages; offset++)
+			SetPageInUse(pfn_to_page(zone->zone_start_pfn + offset));
+	}
+
+	/*
+	 * Pass 2: clear the InUse and Pageset2 flags for every page found
+	 * on a buddy free list or a per-cpu list.
+	 */
+	for_each_zone(zone) {
+		if (!zone->present_pages)
+			continue;
+		spin_lock_irqsave(&zone->lock, flags);
+		for (order = MAX_ORDER - 1; order >= 0; --order) {
+			/* Each entry heads a block of (1 << order) pages. */
+			list_for_each_entry(page, &zone->free_area[order].free_list, lru)
+				for (loop = 0; loop < (1 << order); loop++) {
+					ClearPageInUse(page + loop);
+					ClearPagePageset2(page + loop);
+				}
+		}
+
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			if (!cpu_possible(cpu))
+				continue;
+
+			pset = zone_pcp(zone, cpu);
+
+			for (list = 0; list < ARRAY_SIZE(pset->pcp); list++) {
+				pcp = &pset->pcp[list];
+				list_for_each_entry(page, &pcp->list, lru) {
+					ClearPageInUse(page);
+					ClearPagePageset2(page);
+				}
+			}
+		}
+
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+}
+
+/* size_of_free_region
+ *
+ * Description:	Return the number of pages that are free, beginning with and
+ *		including this one.
+ */
+static int size_of_free_region(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+	struct page *posn = page, *last_in_zone =
+		zone->zone_mem_map + zone->spanned_pages - 1;
+
+	/*
+	 * NOTE(review): the scan stops before last_in_zone, so a free final
+	 * page of the zone is never included in the count - confirm whether
+	 * that is intended.
+	 */
+	while (posn < last_in_zone && !PageInUse(posn))
+		posn++;
+	return (posn - page);
+}
+
+static struct page *rotext_start, *rotext_end;
+static struct page *nosave_start, *nosave_end;
+#ifdef CONFIG_DEBUG_RODATA
+static struct page *rodata_start, *rodata_end;
+extern char __start_rodata, __end_rodata;
+#endif
+#ifdef CONFIG_PPC_RTAS
+/* Declared under the same option that page_nosave_init() assigns them. */
+static struct page *rtas_start, *rtas_end;
+extern unsigned int rtas_data, rtas_size;
+#endif
+#ifdef CONFIG_PPC
+extern char _etext[];
+#else
+extern char _text[], _etext[];
+#endif
+
+#ifdef CONFIG_X86_32 /* 2.6.15 and later */
+extern int bad_ppro;
+
+/*
+ * Copied from arch/i386/mm/init.c. It should be moved to
+ * an include file after testing.
+ */
+static inline int page_kills_ppro(unsigned long pagenr)
+{
+	if (pagenr >= 0x70000 && pagenr <= 0x7003F)
+		return 1;
+	return 0;
+}
+
+#else
+#define bad_ppro (0)
+#define page_kills_ppro(pfn) (0)
+#endif
+
+/*
+ * Cache the struct page boundaries of the kernel text, the nosave section
+ * and (when configured) the rodata and RTAS regions. Always returns 0.
+ */
+static __init int page_nosave_init(void)
+{
+#ifdef CONFIG_DEBUG_RODATA
+	rodata_start = virt_to_page(&__start_rodata);
+	rodata_end = virt_to_page(&__end_rodata);
+#endif
+#ifdef CONFIG_PPC
+	rotext_start = virt_to_page(PAGE_OFFSET);
+#else
+	rotext_start = virt_to_page(&_text);
+#endif
+	rotext_end = virt_to_page(&_etext);
+
+	nosave_start = virt_to_page(&__nosave_begin);
+	/* __nosave_end is one past the section, hence the - 1. */
+	nosave_end = virt_to_page(((char *) &__nosave_end) - 1);
+
+#ifdef CONFIG_PPC_RTAS
+	rtas_start = virt_to_page(__va(rtas_data));
+	rtas_end = virt_to_page(__va(rtas_data) + rtas_size);
+#endif
+	return 0;
+}
+
+subsys_initcall(page_nosave_init);
+
+/* count_data_pages
+ *
+ * This routine generates our lists of pages to be stored in each
+ * pageset. Since we store the data using extents, and adding new
+ * extents might allocate a new extent page, this routine may well
+ * be called more than once.
+ */
+static struct pageset_sizes_result count_data_pages(void)
+{
+	int chunk_size, num_free = 0;
+	unsigned long loop;
+	int use_pagedir2;
+	struct pageset_sizes_result result;
+	struct zone *zone;
+
+	result.size1 = 0;
+	result.size1low = 0;
+	result.size2 = 0;
+	result.size2low = 0;
+
+	num_nosave = 0;
+
+	clear_dyn_pageflags(pageset1_map);
+	clear_dyn_pageflags(pageset1_copy_map);
+
+	generate_free_page_map();
+
+	if (test_result_state(SUSPEND_ABORTED))
+		return result;
+
+	/*
+	 * Pages not to be saved are marked Nosave irrespective of being reserved.
+	 * Only the nosave-section and PageAllocd tests are compiled in; the
+	 * remaining tests sit inside the #if 0 region below.
+	 */
+	for_each_zone(zone) {
+		for (loop = 0; loop < zone->spanned_pages; loop++) {
+			unsigned long pfn = zone->zone_start_pfn + loop;
+			struct page *page = pfn_to_page(pfn);
+
+			if (
+#if 0
+#ifdef CONFIG_DEBUG_RODATA
+				(page >= rodata_start && page <= rodata_end) ||
+#endif
+#ifdef CONFIG_DEBUG_ROTEXT
+				(page >= rotext_start && page <= rotext_end) ||
+#endif
+#ifdef CONFIG_PPC_RTAS
+				(page >= rtas_start && page <= rtas_end) ||
+#endif
+				!pfn_valid(pfn) ||
+				(bad_ppro && page_kills_ppro(pfn)) ||
+				(checksum_map && PageChecksumIgnore(page)) ||
+				!page_is_ram(pfn)) {
+#endif
+				(page >= nosave_start && page <= nosave_end) ||
+				PageAllocd(page)) {
+				num_nosave++;
+				continue;
+			}
+			if (!PageReserved(page)) {
+				/* Skip a whole run of free pages in one step. */
+				if ((chunk_size=size_of_free_region(page))!=0) {
+					num_free += chunk_size;
+					loop += chunk_size - 1;
+					continue;
+				}
+			} else {
+				if (PageHighMem(page)) {
+					/* HighMem pages may be marked Reserved. We ignore them. */
+					num_nosave++;
+					continue;
+				}
+			};
+
+			use_pagedir2 = PagePageset2(page);
+
+			if (use_pagedir2) {
+				result.size2++;
+				if (!PageHighMem(page))
+					result.size2low++;
+				SetPagePageset1Copy(page);
+			} else {
+				result.size1++;
+				SetPagePageset1(page);
+				if (!PageHighMem(page))
+					result.size1low++;
+			}
+		}
+	}
+
+	suspend_message(SUSPEND_EAT_MEMORY, SUSPEND_MEDIUM, 0,
+		"Count data pages: Set1 (%d) + Set2 (%d) + Nosave (%d) + NumFree (%d) = %d.\n",
+		result.size1, result.size2, num_nosave, num_free,
+		result.size1 + result.size2 + num_nosave + num_free);
+	BITMAP_FOR_EACH_SET(allocd_pages_map, loop)
+		SetPagePageset1Copy(pfn_to_page(loop));
+	return result;
+}
+
+/* amount_needed
+ *
+ * Calculates the amount by which the image size needs to be reduced to meet
+ * our constraints: the larger of the RAM shortfall and the storage
+ * shortfall and, when use_image_size_limit is set and image_size_limit is
+ * positive, the excess over that limit (image_size_limit << 8 pages).
+ */
+static int amount_needed(int use_image_size_limit)
+{
+
+	int max1 = max( (int) (ram_to_suspend() - real_nr_free_pages() -
+				nr_free_highpages()),
+			((int) (storage_needed(1, 0) -
+			storage_available)));
+	if (use_image_size_limit)
+		return max( max1,
+			(image_size_limit > 0) ?
+			((int) (storage_needed(1, 0) - (image_size_limit << 8))) : 0);
+	return max1;
+}
+
+/* suspend_recalculate_stats
+ *
+ * Re-mark pageset2, recount the data pages and refresh the cached pageset
+ * sizes. Unless storage_unavailable is set, also re-read the writer's
+ * available storage and display the statistics.
+ *
+ * Returns the sizes reported by count_data_pages().
+ */
+struct pageset_sizes_result suspend_recalculate_stats(int storage_unavailable)
+{
+	struct pageset_sizes_result result;
+
+	suspend_mark_pages_for_pageset2(); /* Need to call this before getting pageset1_size! */
+	BUG_ON(in_atomic() && !irqs_disabled());
+	result = count_data_pages();
+	pageset1_sizelow = result.size1low;
+	pageset2_sizelow = result.size2low;
+	pagedir1.lastpageset_size = pagedir1.pageset_size = result.size1;
+	pagedir2.lastpageset_size = pagedir2.pageset_size = result.size2;
+	if (!storage_unavailable) {
+		storage_available = suspend_active_writer->ops.writer.storage_available();
+		display_stats(0, 0);
+	}
+	BUG_ON(in_atomic() && !irqs_disabled());
+	return result;
+}
+
+/* update_image
+ *
+ * Allocate [more] memory and storage for the image.
+ *
+ * Returns non-zero while more memory or storage is still needed, zero once
+ * all the checks in the final return statement are satisfied.
+ * NOTE(review): 'result' is assigned below but never read in this function.
+ */
+static int update_image(void)
+{
+	struct pageset_sizes_result result;
+	int result2, param_used;
+
+	result = suspend_recalculate_stats(0);
+
+	if (suspend_allocate_checksum_pages()) {
+		suspend_message(SUSPEND_ANY_SECTION, SUSPEND_LOW, 1,
+			"Still need to get more pages for checksum pages.\n");
+		return 1;
+	}
+
+	/* Include allowance for growth in pagedir1 while writing pagedir 2 */
+	if (suspend_allocate_extra_pagedir_memory(&pagedir1,
+				pagedir1.pageset_size + extra_pd1_pages_allowance,
+				pageset2_sizelow)) {
+		suspend_message(SUSPEND_EAT_MEMORY, SUSPEND_LOW, 1,
+			"Still need to get more pages for pagedir 1.\n");
+		return 1;
+	}
+
+	/* Every exit path below this thaw re-freezes before returning. */
+	thaw_processes(FREEZER_KERNEL_THREADS);
+
+	param_used = main_storage_needed(1, 0);
+	if ((result2 = suspend_active_writer->ops.writer.allocate_storage(param_used))) {
+		suspend_message(SUSPEND_EAT_MEMORY, SUSPEND_LOW, 1,
+			"Allocate storage returned %d. Still need to get more storage space for the image proper.\n",
+			result2);
+		storage_allocated = suspend_active_writer->ops.writer.storage_allocated();
+		if (freeze_processes()) {
+			set_result_state(SUSPEND_FREEZING_FAILED);
+			set_result_state(SUSPEND_ABORTED);
+		}
+		return 1;
+	}
+
+	param_used = header_storage_needed();
+	if ((result2 = suspend_active_writer->ops.writer.allocate_header_space(param_used))) {
+		suspend_message(SUSPEND_EAT_MEMORY, SUSPEND_LOW, 1,
+			"Still need to get more storage space for header.\n");
+		if (freeze_processes()) {
+			set_result_state(SUSPEND_FREEZING_FAILED);
+			set_result_state(SUSPEND_ABORTED);
+		}
+		storage_allocated = suspend_active_writer->ops.writer.storage_allocated();
+		return 1;
+	}
+
+	header_space_allocated = param_used;
+
+	/*
+	 * Allocate remaining storage space, if possible, up to the
+	 * maximum we know we'll need. It's okay to allocate the
+	 * maximum if the writer is the swapwriter, but
+	 * we don't want to grab all available space on an NFS share.
+	 * We therefore ignore the expected compression ratio here,
+	 * thereby trying to allocate the maximum image size we could
+	 * need (assuming compression doesn't expand the image), but
+	 * don't complain if we can't get the full amount we're after.
+	 */
+
+	suspend_active_writer->ops.writer.allocate_storage(
+		min(storage_available,
+			main_storage_needed(0, 1)));
+
+	storage_allocated = suspend_active_writer->ops.writer.storage_allocated();
+
+	if (freeze_processes()) {
+		set_result_state(SUSPEND_FREEZING_FAILED);
+		set_result_state(SUSPEND_ABORTED);
+	}
+
+	suspend_recalculate_stats(0);
+
+	suspend_message(SUSPEND_EAT_MEMORY, SUSPEND_LOW, 1,
+		"Amount still needed (%d) > 0:%d. Header: %d < %d: %d,"
+		" Storage allocd: %d < %d + %d: %d.\n",
+		amount_needed(0),
+		(amount_needed(0) > 0),
+		header_space_allocated, header_storage_needed(),
+		header_space_allocated < header_storage_needed(),
+		storage_allocated,
+		header_storage_needed(), main_storage_needed(1, 1),
+		storage_allocated <
+			(header_storage_needed() + main_storage_needed(1, 1)));
+
+	check_shift_keys(0, NULL);
+
+	return ((amount_needed(0) > 0) ||
+		header_space_allocated < header_storage_needed() ||
+		storage_allocated <
+			(header_storage_needed() + main_storage_needed(1, 1)));
+}
+
+/* attempt_to_freeze
+ *
+ * Try to freeze processes. Thaws everything first, then freezes again.
+ * Returns the result of freeze_processes(); on failure the ABORTED and
+ * FREEZING_FAILED result states are set, on success are_frozen is set.
+ */
+
+static int attempt_to_freeze(void)
+{
+	int result;
+
+	/* Stop processes before checking again */
+	thaw_processes(FREEZER_ALL_THREADS);
+	suspend_prepare_status(CLEAR_BAR, "Freezing processes");
+	result = freeze_processes();
+
+	if (result) {
+		set_result_state(SUSPEND_ABORTED);
+		set_result_state(SUSPEND_FREEZING_FAILED);
+	} else
+		are_frozen = 1;
+
+	return result;
+}
+
+int storage_needed(int use_ecr, int ignore_extra_pd1_allow)
+{
+	return (main_storage_needed(use_ecr, ignore_extra_pd1_allow) +
+		header_storage_needed());
+}
+
+int ram_to_suspend(void)
+{
+	return (1 +
+		max((pagedir1.pageset_size + extra_pd1_pages_allowance -
+			pageset2_sizelow), 0) +
+		MIN_FREE_RAM + suspend_memory_for_modules());
+}
+
+
+/* eat_memory
+ *
+ * Try to free some memory, either to meet hard or soft constraints on the image
+ * characteristics.
+ *
+ * Hard constraints:
+ * - Pageset1 must be < half of memory;
+ * - We must have enough memory free at resume time to have pageset1
+ *   be able to be loaded in pages that don't conflict with where it has to
+ *   be restored.
+ * Soft constraints
+ * - User specified image size limit.
+ */
+static int eat_memory(void)
+{
+	int orig_memory_still_to_eat, last_amount_needed = 0, times_criteria_met = 0;
+	/* NOTE(review): free_flags is assigned in the switch below but never read here. */
+	int free_flags = 0, did_eat_memory = 0;
+
+	/*
+	 * Note that if we have enough storage space and enough free memory, we may
+	 * exit without eating anything. We give up when the last 10 iterations ate
+	 * no extra pages because we're not going to get much more anyway, but
+	 * the few pages we get will take a lot of time.
+	 *
+	 * We freeze processes before beginning, and then unfreeze them if we
+	 * need to eat memory until we think we have enough. If our attempts
+	 * to freeze fail, we give up and abort.
+	 */
+
+	/* -- Stage 1: Freeze Processes -- */
+
+
+	suspend_recalculate_stats(0);
+
+	orig_memory_still_to_eat = amount_needed(1);
+	last_amount_needed = orig_memory_still_to_eat;
+
+	switch (image_size_limit) {
+		case -1: /* Don't eat any memory */
+			if (orig_memory_still_to_eat) {
+				set_result_state(SUSPEND_ABORTED);
+				set_result_state(SUSPEND_WOULD_EAT_MEMORY);
+			}
+			break;
+		case -2: /* Free caches only */
+			free_flags = GFP_NOIO | __GFP_HIGHMEM;
+			break;
+		default:
+			free_flags = GFP_ATOMIC | __GFP_HIGHMEM;
+	}
+
+	thaw_processes(FREEZER_KERNEL_THREADS);
+
+	/* -- Stage 2: Eat memory -- */
+
+	/*
+	 * Loop until nothing more is needed (or forever-until-stalled when
+	 * image_size_limit == -2), the cycle is aborted, or ten iterations
+	 * each reduced the amount needed by fewer than 10 pages.
+	 */
+	while (((amount_needed(1) > 0) || (image_size_limit == -2)) &&
+		(!test_result_state(SUSPEND_ABORTED)) &&
+		(times_criteria_met < 10)) {
+		/* NOTE(review): amount_freed and amount_wanted are set but never read. */
+		int amount_freed;
+		int amount_wanted = orig_memory_still_to_eat - amount_needed(1);
+
+		suspend_prepare_status(CLEAR_BAR, "Seeking to free %dMB of memory.", MB(amount_needed(1)));
+
+		if (amount_wanted < 1)
+			amount_wanted = 1; /* image_size_limit == -2 */
+
+		if (orig_memory_still_to_eat)
+			suspend_update_status(orig_memory_still_to_eat - amount_needed(1),
+				orig_memory_still_to_eat,
+				" Image size %d ",
+				MB(storage_needed(1, 0)));
+		else
+			suspend_update_status(0, 1, "Image size %d ",
+				MB(storage_needed(1, 0)));
+
+		if ((last_amount_needed - amount_needed(1)) < 10)
+			times_criteria_met++;
+		else
+			times_criteria_met = 0;
+		last_amount_needed = amount_needed(1);
+		amount_freed = shrink_all_memory(last_amount_needed);
+		suspend_recalculate_stats(0);
+
+		did_eat_memory = 1;
+
+		check_shift_keys(0, NULL);
+	}
+
+	if (freeze_processes()) {
+		set_result_state(SUSPEND_FREEZING_FAILED);
+		set_result_state(SUSPEND_ABORTED);
+	}
+
+	if (did_eat_memory) {
+		/*
+		 * NOTE(review): the state is read and immediately restored with
+		 * nothing in between - looks like a leftover; confirm.
+		 */
+		unsigned long orig_state = get_suspend_state();
+		/* Freeze_processes will call sys_sync too */
+		restore_suspend_state(orig_state);
+		suspend_recalculate_stats(0);
+	}
+
+	/* Blank out image size display */
+	suspend_update_status(100, 100, NULL);
+
+	if (!test_result_state(SUSPEND_ABORTED)) {
+		/* Include image size limit when checking what to report */
+		if (amount_needed(1) - extra_pd1_pages_allowance > 0)
+			set_result_state(SUSPEND_UNABLE_TO_FREE_ENOUGH_MEMORY);
+
+		/* But don't include it when deciding whether to abort (soft limit) */
+		if ((amount_needed(0) - extra_pd1_pages_allowance > 0)) {
+			printk("Unable to free sufficient memory to suspend. Still need %d pages.\n",
+				amount_needed(1));
+			display_stats(1, 1);
+			set_result_state(SUSPEND_ABORTED);
+		}
+
+		check_shift_keys(1, "Memory eating completed.");
+	}
+
+	/* Always 0: failures are reported through the result state bits. */
+	return 0;
+}
+
+/* prepare_image
+ *
+ * Entry point to the whole image preparation section.
+ *
+ * We do four things:
+ * - Freeze processes;
+ * - Ensure image size constraints are met;
+ * - Complete all the preparation for saving the image,
+ *   including allocation of storage. The only memory
+ *   that should be needed when we're finished is that
+ *   for actually storing the image (and we know how
+ *   much is needed for that because the modules tell
+ *   us).
+ * - Make sure that all dirty buffers are written out.
+ */
+
+#define MAX_TRIES 4
+int suspend_prepare_image(void)
+{
+	int result = 1, tries = 0;
+
+	are_frozen = 0;
+
+	header_space_allocated = 0;
+
+	if (attempt_to_freeze())
+		return 1;
+
+	/* Only measured when the allowance has been set to zero. */
+	if (!extra_pd1_pages_allowance)
+		get_extra_pd1_allowance();
+
+	storage_available = suspend_active_writer->ops.writer.storage_available();
+
+	if (!storage_available) {
+		printk(KERN_ERR "You need some storage available to be able to suspend.\n");
+		set_result_state(SUSPEND_ABORTED);
+		set_result_state(SUSPEND_NOSTORAGE_AVAILABLE);
+		return 1;
+	}
+
+	/* Alternate eating memory and allocating until update_image() is happy. */
+	do {
+		suspend_prepare_status(CLEAR_BAR, "Preparing Image.");
+
+		if (eat_memory() || test_result_state(SUSPEND_ABORTED))
+			break;
+
+		result = update_image();
+
+		check_shift_keys(0, NULL);
+
+		tries++;
+
+	} while ((result) && (tries < MAX_TRIES) && (!test_result_state(SUSPEND_ABORTED)) &&
+		(!test_result_state(SUSPEND_UNABLE_TO_FREE_ENOUGH_MEMORY)));
+
+	/*
+	 * Only give up when the last permitted attempt still failed; an
+	 * update_image() that succeeds on attempt number MAX_TRIES must not
+	 * abort the cycle.
+	 */
+	if (result && tries == MAX_TRIES) {
+		abort_suspend("Unable to successfully prepare the image.\n");
+		display_stats(1, 0);
+	}
+
+	check_shift_keys(1, "Image preparation complete.");
+
+	return result;
+}
diff -urN oldtree/kernel/power/prepare_image.h newtree/kernel/power/prepare_image.h
--- oldtree/kernel/power/prepare_image.h 1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/prepare_image.h 2006-02-18 15:24:31.405814520 +0000
@@ -0,0 +1,31 @@
+/*
+ * kernel/power/prepare_image.h
+ */
+
+extern int suspend_prepare_image(void);
+extern struct pageset_sizes_result suspend_recalculate_stats(int storage_unavailable);
+extern int real_nr_free_pages(void);
+extern int image_size_limit;
+extern int pageset1_sizelow, pageset2_sizelow;
+
+struct pageset_sizes_result {
+	int size1; /* Can't be unsigned - breaks MAX function */
+	int size1low;
+	int size2;
+	int size2low;
+};
+
+#ifdef CONFIG_CRYPTO
+extern int suspend_expected_compression_ratio(void);
+#else
+static inline int suspend_expected_compression_ratio(void)
+{
+	return 100; /* percent: callers multiply by this, then divide by 100 */
+}
+#endif
+
+#define MIN_FREE_RAM (max_low_pfn >> 7)
+
+extern int extra_pd1_pages_allowance; +extern int storage_needed(int use_ecr, int ignore_extra_p1_allowance); +extern int ram_to_suspend(void); diff -urN oldtree/kernel/power/proc.c newtree/kernel/power/proc.c --- oldtree/kernel/power/proc.c 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/proc.c 2006-02-18 15:24:31.406814368 +0000 @@ -0,0 +1,305 @@ +/* + * /kernel/power/proc.c + * + * Copyright (C) 2002-2005 Nigel Cunningham + * + * This file is released under the GPLv2. + * + * This file contains support for proc entries for tuning Suspend2. + * + * We have a generic handler that deals with the most common cases, and + * hooks for special handlers to use. + */ + +#include +#include +#include + +#include "proc.h" +#include "suspend2.h" +#include "storage.h" + +static int suspend_proc_initialised = 0; + +static struct list_head suspend_proc_entries; +static struct proc_dir_entry *suspend_dir; +static struct suspend_proc_data proc_params[]; + +extern void __suspend_try_resume(void); +extern void suspend_main(void); + +/* suspend_read_proc + * + * Generic handling for reading the contents of bits, integers, + * unsigned longs and strings. 
+ */
+static int suspend_read_proc(char *page, char **start, off_t off, int count,
+		int *eof, void *data)
+{
+	int len = 0;
+	struct suspend_proc_data *proc_data = (struct suspend_proc_data *) data;
+
+	if (suspend_start_anything(0))
+		return -EBUSY;
+
+	/* Bit 0 of needs_storage_manager: reads need the storage manager. */
+	if (proc_data->needs_storage_manager & 1)
+		suspend_prepare_usm();
+
+	switch (proc_data->type) {
+		case SUSPEND_PROC_DATA_CUSTOM:
+			if (proc_data->data.special.read_proc) {
+				read_proc_t *read_proc = proc_data->data.special.read_proc;
+				len = read_proc(page, start, off, count, eof, data);
+			} else
+				len = 0;
+			break;
+		case SUSPEND_PROC_DATA_BIT:
+			/*
+			 * Normalise to 0/1: the value test_bit() returns for
+			 * a set bit differs between architectures.
+			 */
+			len = sprintf(page, "%d\n",
+				!!test_bit(proc_data->data.bit.bit,
+					proc_data->data.bit.bit_vector));
+			break;
+		case SUSPEND_PROC_DATA_INTEGER:
+			{
+				int *variable = proc_data->data.integer.variable;
+				len = sprintf(page, "%d\n", *variable);
+				break;
+			}
+		case SUSPEND_PROC_DATA_UL:
+			{
+				/* The field is unsigned long * and is printed with %lu. */
+				unsigned long *variable = proc_data->data.ul.variable;
+				len = sprintf(page, "%lu\n", *variable);
+				break;
+			}
+		case SUSPEND_PROC_DATA_STRING:
+			{
+				char *variable = proc_data->data.string.variable;
+				len = sprintf(page, "%s\n", variable);
+				break;
+			}
+		default:
+			/* Unknown type: nothing to show. */
+			break;
+	}
+	/* Side effect routine? */
+	if (proc_data->read_proc)
+		proc_data->read_proc();
+
+	if (len <= count)
+		*eof = 1;
+
+	if (proc_data->needs_storage_manager & 1)
+		suspend_cleanup_usm();
+
+	suspend_finish_anything(0);
+
+	return len;
+}
+/* suspend_write_proc
+ *
+ * Generic routine for handling writing to files representing
+ * bits, integers, unsigned longs and strings.
+ */
+
+static int suspend_write_proc(struct file *file, const char *buffer,
+		unsigned long count, void *data)
+{
+	struct suspend_proc_data *proc_data = (struct suspend_proc_data *) data;
+	char *my_buf = (char *) get_zeroed_page(GFP_ATOMIC);
+	/* Taken before count is clamped, so the whole write is reported consumed. */
+	int result = count, assigned_temp_buffer = 0;
+
+	if (!my_buf)
+		return -ENOMEM;
+
+	/* Leave room in the page for the terminating NUL written below. */
+	if (count > PAGE_SIZE - 1)
+		count = PAGE_SIZE - 1;
+
+	if (copy_from_user(my_buf, buffer, count)) {
+		result = -EFAULT;
+		goto out_free;
+	}
+
+	if (suspend_start_anything(proc_data == &proc_params[0])) {
+		result = -EBUSY;
+		goto out_free;
+	}
+
+	my_buf[count] = 0;
+
+	/* Bit 1 of needs_storage_manager: writes need the storage manager. */
+	if (proc_data->needs_storage_manager & 2)
+		suspend_prepare_usm();
+
+	switch (proc_data->type) {
+		case SUSPEND_PROC_DATA_CUSTOM:
+			if (proc_data->data.special.write_proc) {
+				write_proc_t *write_proc = proc_data->data.special.write_proc;
+				result = write_proc(file, buffer, count, data);
+			}
+			break;
+		case SUSPEND_PROC_DATA_BIT:
+			{
+				int value = simple_strtoul(my_buf, NULL, 0);
+				if (value)
+					set_bit(proc_data->data.bit.bit,
+						(proc_data->data.bit.bit_vector));
+				else
+					clear_bit(proc_data->data.bit.bit,
+						(proc_data->data.bit.bit_vector));
+			}
+			break;
+		case SUSPEND_PROC_DATA_INTEGER:
+			{
+				int *variable = proc_data->data.integer.variable;
+				int minimum = proc_data->data.integer.minimum;
+				int maximum = proc_data->data.integer.maximum;
+				*variable = simple_strtol(my_buf, NULL, 0);
+				if (((*variable) < minimum))
+					*variable = minimum;
+
+				if (((*variable) > maximum))
+					*variable = maximum;
+				break;
+			}
+		case SUSPEND_PROC_DATA_UL:
+			{
+				unsigned long *variable = proc_data->data.ul.variable;
+				unsigned long minimum = proc_data->data.ul.minimum;
+				unsigned long maximum = proc_data->data.ul.maximum;
+				*variable = simple_strtoul(my_buf, NULL, 0);
+
+				/* A zero bound means "no bound". */
+				if (minimum && ((*variable) < minimum))
+					*variable = minimum;
+
+				if (maximum && ((*variable) > maximum))
+					*variable = maximum;
+				break;
+			}
+		case SUSPEND_PROC_DATA_STRING:
+			{
+				int copy_len = count;
+				int max_length = proc_data->data.string.max_length;
+				char *variable = proc_data->data.string.variable;
+
+				/*
+				 * NOTE(review): max_length is treated as the
+				 * size of the destination including its NUL, so
+				 * that the terminator below can never land
+				 * beyond it - confirm against the users of this
+				 * field.
+				 */
+				if (max_length && (copy_len > max_length - 1))
+					copy_len = max_length - 1;
+
+				if (!variable) {
+					variable = (char *) get_zeroed_page(GFP_ATOMIC);
+					if (!variable) {
+						result = -ENOMEM;
+						free_page((unsigned long) my_buf);
+						goto out_finish;
+					}
+					proc_data->data.string.variable = variable;
+					assigned_temp_buffer = 1;
+				}
+				strncpy(variable, my_buf, copy_len);
+				/* Drop one trailing newline, then terminate at copy_len. */
+				if ((copy_len) &&
+					(my_buf[copy_len - 1] == '\n'))
+					variable[copy_len - 1] = 0;
+				variable[copy_len] = 0;
+			}
+			break;
+		default:
+			break;
+	}
+	/* Released before the side effect routine runs, as before. */
+	free_page((unsigned long) my_buf);
+	/* Side effect routine? */
+	if (proc_data->write_proc)
+		proc_data->write_proc();
+
+	/* Free temporary buffers */
+	if (assigned_temp_buffer) {
+		free_page((unsigned long) proc_data->data.string.variable);
+		proc_data->data.string.variable = NULL;
+	}
+
+out_finish:
+	if (proc_data->needs_storage_manager & 2)
+		suspend_cleanup_usm();
+
+	suspend_finish_anything(proc_data == &proc_params[0]);
+
+	return result;
+
+out_free:
+	/* Failed before suspend_start_anything() succeeded: only the page to undo. */
+	free_page((unsigned long) my_buf);
+	return result;
+}
+
+/* Non-module proc entries.
+ *
+ * This array contains entries that are automatically registered at
+ * boot. Plugins and the console code register their own entries separately.
+ *
+ * NB: If you move do_suspend, change suspend_write_proc's test so that
+ * suspend_start_anything still gets a 1 when the user echos > do_suspend!
+ */
+
+static struct suspend_proc_data proc_params[] = {
+	{ .filename			= "do_suspend",
+	  .permissions			= PROC_WRITEONLY,
+	  .type				= SUSPEND_PROC_DATA_CUSTOM,
+	  .write_proc			= suspend_main,
+	  .needs_storage_manager	= 2,
+	},
+
+	{ .filename			= "do_resume",
+	  .permissions			= PROC_WRITEONLY,
+	  .type				= SUSPEND_PROC_DATA_CUSTOM,
+	  .write_proc			= __suspend_try_resume,
+	  .needs_storage_manager	= 2,
+	},
+};
+
+/* suspend_initialise_proc
+ *
+ * Initialise the /proc/suspend2 directory.
+ */
+
+static void suspend_initialise_proc(void)
+{
+	int i;
+	int numfiles = sizeof(proc_params) / sizeof(struct suspend_proc_data);
+
+	if (suspend_proc_initialised)
+		return;
+
+	suspend_dir = proc_mkdir("suspend2", NULL);
+
+	BUG_ON(!suspend_dir);
+
+	INIT_LIST_HEAD(&suspend_proc_entries);
+
+	/*
+	 * Set before registering: suspend_register_procfile() calls back
+	 * into this function while the flag is still clear.
+	 */
+	suspend_proc_initialised = 1;
+
+	for (i=0; i< numfiles; i++)
+		suspend_register_procfile(&proc_params[i]);
+}
+
+/* suspend_register_procfile
+ *
+ * Helper for registering a new /proc/suspend2 entry.
+ *
+ * Returns the new proc_dir_entry, or NULL if create_proc_entry failed. In
+ * the failure case the entry's list head is self-initialised so that a
+ * later suspend_unregister_procfile() on it is a no-op.
+ */
+
+struct proc_dir_entry *suspend_register_procfile(
+		struct suspend_proc_data *suspend_proc_data)
+{
+	struct proc_dir_entry *new_entry;
+
+	if (!suspend_proc_initialised)
+		suspend_initialise_proc();
+
+	new_entry = create_proc_entry(
+			suspend_proc_data->filename,
+			suspend_proc_data->permissions,
+			suspend_dir);
+	if (new_entry) {
+		list_add_tail(&suspend_proc_data->proc_data_list, &suspend_proc_entries);
+		new_entry->read_proc = suspend_read_proc;
+		new_entry->write_proc = suspend_write_proc;
+		new_entry->data = suspend_proc_data;
+	} else {
+		printk("Error! create_proc_entry returned NULL.\n");
+		INIT_LIST_HEAD(&suspend_proc_data->proc_data_list);
+	}
+	return new_entry;
+}
+
+/* suspend_unregister_procfile
+ *
+ * Helper for removing unwanted /proc/suspend2 entries.
+ *
+ * Entries that are not on the list (failed registration, or already
+ * unregistered) are ignored.
+ */
+void suspend_unregister_procfile(struct suspend_proc_data *suspend_proc_data)
+{
+	if (list_empty(&suspend_proc_data->proc_data_list))
+		return;
+
+	remove_proc_entry(
+		suspend_proc_data->filename,
+		suspend_dir);
+	/*
+	 * list_del_init rather than list_del: it leaves the head
+	 * self-linked, which is what the list_empty() guard above tests.
+	 */
+	list_del_init(&suspend_proc_data->proc_data_list);
+}
diff -urN oldtree/kernel/power/proc.h newtree/kernel/power/proc.h
--- oldtree/kernel/power/proc.h 1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/proc.h 2006-02-18 15:24:31.407814216 +0000
@@ -0,0 +1,70 @@
+/*
+ * kernel/power/proc.h
+ *
+ * Copyright (C) 2004-2005 Nigel Cunningham
+ *
+ * This file is released under the GPLv2.
+ *
+ * It provides declarations for suspend to use in managing
+ * /proc/suspend2. 
When we switch to kobjects, + * this will become redundant. + * + */ + +#include + +struct suspend_proc_data { + char *filename; + int permissions; + int type; + int needs_storage_manager; + union { + struct { + unsigned long *bit_vector; + int bit; + } bit; + struct { + int *variable; + int minimum; + int maximum; + } integer; + struct { + unsigned long *variable; + unsigned long minimum; + unsigned long maximum; + } ul; + struct { + char *variable; + int max_length; + } string; + struct { + read_proc_t *read_proc; + write_proc_t *write_proc; + void *data; + } special; + } data; + + /* Side effects routines. Used, eg, for reparsing the + * resume2 entry when it changes */ + void (*read_proc) (void); + void (*write_proc) (void); + struct list_head proc_data_list; +}; + +enum { + SUSPEND_PROC_DATA_NONE, + SUSPEND_PROC_DATA_CUSTOM, + SUSPEND_PROC_DATA_BIT, + SUSPEND_PROC_DATA_INTEGER, + SUSPEND_PROC_DATA_UL, + SUSPEND_PROC_DATA_STRING +}; + +#define PROC_WRITEONLY 0200 +#define PROC_READONLY 0400 +#define PROC_RW 0600 + +struct proc_dir_entry *suspend_register_procfile( + struct suspend_proc_data *suspend_proc_data); +void suspend_unregister_procfile(struct suspend_proc_data *suspend_proc_data); + diff -urN oldtree/kernel/power/process.c newtree/kernel/power/process.c --- oldtree/kernel/power/process.c 2006-01-03 03:21:10.000000000 +0000 +++ newtree/kernel/power/process.c 2006-02-18 15:24:31.409813912 +0000 @@ -1,134 +1,422 @@ /* - * drivers/power/process.c - Functions for starting/stopping processes on - * suspend transitions. + * kernel/power/process.c * - * Originally from swsusp. + * Copyright (C) 1998-2001 Gabor Kuti + * Copyright (C) 1998,2001,2002 Pavel Machek + * Copyright (C) 2002-2003 Florent Chabaud + * Copyright (C) 2002-2004 Nigel Cunningham + * + * This file is released under the GPLv2. 
+ * + * Freeze_and_free contains the routines software suspend uses to freeze other + * processes during the suspend cycle and to (if necessary) free up memory in + * accordance with limitations on the image size. + * + * Ideally, the image saved to disk would be an atomic copy of the entire + * contents of all RAM and related hardware state. One of the first + * prerequisites for getting our approximation of this is stopping the activity + * of other processes. We can't stop all other processes, however, since some + * are needed in doing the I/O to save the image. Freeze_and_free.c contains + * the routines that control suspension and resuming of these processes. + * + * Under high I/O load, we need to be careful about the order in which we + * freeze processes. If we freeze processes in the wrong order, we could + * deadlock others. The freeze_order array this specifies the order in which + * critical processes are frozen. All others are suspended after these have + * entered the refrigerator. + * + * Another complicating factor is that freeing memory requires the processes + * to not be frozen, but at the end of freeing memory, they need to be frozen + * so that we can be sure we actually have eaten enough memory. This is why + * freezing and freeing are in the one file. The freezer is not called from + * the main logic, but indirectly, via the code for eating memory. The eat + * memory logic is iterative, first freezing processes and checking the stats, + * then (if necessary) unfreezing them and eating more memory until it looks + * like the criteria are met (at which point processes are frozen & stats + * checked again). */ - -#undef DEBUG - -#include -#include #include +#include #include +#include +#include +#include +#include +#include + +unsigned long freezer_state = 0; + +#if 0 +//#ifdef CONFIG_PM_DEBUG +#define freezer_message(msg, a...) do { printk(msg, ##a); } while(0) +#else +#define freezer_message(msg, a...) 
do { } while(0) +#endif + +/* Timeouts when freezing */ +#define FREEZER_TOTAL_TIMEOUT (5 * HZ) +#define FREEZER_CHECK_TIMEOUT (HZ / 10) + +DECLARE_COMPLETION(kernelspace_thaw); +DECLARE_COMPLETION(userspace_thaw); +static atomic_t nr_userspace_frozen; +static atomic_t nr_kernelspace_frozen; + +struct frozen_fs +{ + struct list_head fsb_list; + struct super_block *sb; +}; + +LIST_HEAD(frozen_fs_list); + +void freezer_make_fses_rw(void) +{ + struct frozen_fs *fs, *next_fs; + + list_for_each_entry_safe(fs, next_fs, &frozen_fs_list, fsb_list) { + thaw_bdev(fs->sb->s_bdev, fs->sb); + + list_del(&fs->fsb_list); + kfree(fs); + } +} /* - * Timeout for stopping processes + * Done after userspace is frozen, so there should be no danger of + * fses being unmounted while we're in here. */ -#define TIMEOUT (6 * HZ) +int freezer_make_fses_ro(void) +{ + struct frozen_fs *fs; + struct super_block *sb; + + /* Generate the list */ + list_for_each_entry(sb, &super_blocks, s_list) { + if (!sb->s_root || !sb->s_bdev || + (sb->s_frozen == SB_FREEZE_TRANS) || + (sb->s_flags & MS_RDONLY)) + continue; + fs = kmalloc(sizeof(struct frozen_fs), GFP_ATOMIC); + fs->sb = sb; + list_add_tail(&fs->fsb_list, &frozen_fs_list); + }; + + /* Do the freezing in reverse order so filesystems dependant + * upon others are frozen in the right order. (Eg loopback + * on ext3). */ + list_for_each_entry_reverse(fs, &frozen_fs_list, fsb_list) + freeze_bdev(fs->sb->s_bdev); -static inline int freezeable(struct task_struct * p) + return 0; +} + +/* + * freezeable + * + * Description: Determine whether a process should be frozen yet. + * Parameters: struct task_struct * The process to consider. + * int Boolean - 0 = userspace else all. + * Returns: int 0 if don't freeze yet, otherwise do. 
+ */ +static int freezeable(struct task_struct * p, int all_freezable) { if ((p == current) || + (p->flags & PF_FROZEN) || (p->flags & PF_NOFREEZE) || (p->exit_state == EXIT_ZOMBIE) || (p->exit_state == EXIT_DEAD) || (p->state == TASK_STOPPED) || - (p->state == TASK_TRACED)) + (p->state == TASK_TRACED) || + (!p->mm && !all_freezable)) return 0; return 1; } -/* Refrigerator is place where frozen processes are stored :-). */ -void refrigerator(void) +static void __freeze_process(struct completion *completion_handler, + atomic_t *nr_frozen) { - /* Hmm, should we be allowed to suspend when there are realtime - processes around? */ long save; + + freezer_message("%s (%d) frozen.\n", + current->comm, current->pid); save = current->state; - pr_debug("%s entered refrigerator\n", current->comm); - printk("="); + + atomic_inc(nr_frozen); + wait_for_completion(completion_handler); + atomic_dec(nr_frozen); + + current->state = save; + freezer_message("%s (%d) leaving freezer.\n", + current->comm, current->pid); +} + +/* + * Refrigerator + */ +void refrigerator(void) +{ + unsigned long flags; + might_sleep(); + + /* Locking to handle race against waking the process in + * freeze threads. 
*/ + spin_lock_irqsave(¤t->sighand->siglock, flags); frozen_process(current); - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* We sent fake signal, clean it up */ - spin_unlock_irq(¤t->sighand->siglock); - - while (frozen(current)) { - current->state = TASK_UNINTERRUPTIBLE; - schedule(); + + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + + if (test_freezer_state(FREEZER_ON)) { + if (current->mm) + __freeze_process(&userspace_thaw, &nr_userspace_frozen); + else + __freeze_process(&kernelspace_thaw, + &nr_kernelspace_frozen); } - pr_debug("%s left refrigerator\n", current->comm); - current->state = save; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + recalc_sigpending(); + current->flags &= ~PF_FROZEN; + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + + return; } -/* 0 = success, else # of processes that we failed to stop */ -int freeze_processes(void) +void thaw_processes(int do_all_threads) { - int todo; - unsigned long start_time; + if (do_all_threads) { + clear_freezer_state(FREEZER_ON); + clear_freezer_state(ABORT_FREEZING); + } + + complete_all(&kernelspace_thaw); + while (atomic_read(&nr_kernelspace_frozen) > 0) + yield(); + + init_completion(&kernelspace_thaw); + freezer_make_fses_rw(); + + if (do_all_threads) { + complete_all(&userspace_thaw); + while (atomic_read(&nr_userspace_frozen) > 0) + yield(); + init_completion(&userspace_thaw); + } +} + +/* + * num_freezeable + * + * Description: Determine how many processes of our type are still to be + * frozen. As a side effect, update the progress bar too. + * Parameters: int Which type we are trying to freeze. + * int Whether we are displaying our progress. 
+ */ +static int num_freezeable(int do_all_threads) { + struct task_struct *g, *p; - unsigned long flags; + int todo_this_type = 0; - printk( "Stopping tasks: " ); - start_time = jiffies; - do { - todo = 0; - read_lock(&tasklist_lock); - do_each_thread(g, p) { - if (!freezeable(p)) - continue; - if (frozen(p)) - continue; + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (freezeable(p, do_all_threads)) + todo_this_type++; + } while_each_thread(g, p); + read_unlock(&tasklist_lock); - freeze(p); - spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, 0); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - todo++; - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - yield(); /* Yield is okay here */ - if (todo && time_after(jiffies, start_time + TIMEOUT)) { - printk( "\n" ); - printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo ); - break; - } - } while(todo); + return todo_this_type; +} - /* This does not unfreeze processes that are already frozen - * (we have slightly ugly calling convention in that respect, - * and caller must call thaw_processes() if something fails), - * but it cleans up leftover PF_FREEZE requests. - */ - if (todo) { - read_lock(&tasklist_lock); - do_each_thread(g, p) - if (freezing(p)) { - pr_debug(" clean up: %s\n", p->comm); - p->flags &= ~PF_FREEZE; - spin_lock_irqsave(&p->sighand->siglock, flags); - recalc_sigpending_tsk(p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - } - while_each_thread(g, p); - read_unlock(&tasklist_lock); - return todo; - } +/* + * num_uninterruptible + * + * Description: Determine how many processes of our type are in state + * task uninterruptible. + * Parameters: int Which type we are trying to freeze. 
+ */ +static int num_uninterruptible(int do_all_threads) { + + struct task_struct *g, *p; + int count = 0; - printk( "|\n" ); - BUG_ON(in_atomic()); - return 0; + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (freezeable(p, do_all_threads) && + p->state == TASK_UNINTERRUPTIBLE) + count++; + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + + return count; } -void thaw_processes(void) +/* + * Tell threads of the type to enter the freezer. + */ +static void signal_threads(int do_all_threads) { struct task_struct *g, *p; + unsigned long flags; - printk( "Restarting tasks..." ); read_lock(&tasklist_lock); do_each_thread(g, p) { - if (!freezeable(p)) + if (!freezeable(p, do_all_threads)) continue; - if (!thaw_process(p)) - printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); + + freeze(p); + spin_lock_irqsave(&p->sighand->siglock, flags); + signal_wake_up(p, 0); + spin_unlock_irqrestore(&p->sighand->siglock, flags); } while_each_thread(g, p); + read_unlock(&tasklist_lock); +} +/* + * Prod processes that haven't entered the refrigerator yet. + */ +static void prod_processes(int do_all_threads) +{ + struct task_struct *g, *p; + unsigned long flags; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (!freezeable(p, do_all_threads)) + continue; + + spin_lock_irqsave(&p->sighand->siglock, flags); + if (!(p->flags & PF_FROZEN)) { + recalc_sigpending(); + signal_wake_up(p, 0); + } + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } while_each_thread(g, p); read_unlock(&tasklist_lock); - schedule(); - printk( " done\n" ); } +/* + * Freezer failure. + * + * Check whether we failed to freeze all the processes that + * should be frozen. If we find a task that failed to freeze, + * we give useful information on what failed and how. 
+ */
+static int freezer_failure(int do_all_threads)
+{
+	int result = 0;
+	struct task_struct *g, *p;
+
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		/*
+		 * Tasks in TASK_UNINTERRUPTIBLE are not reported as failures;
+		 * freeze_threads() discounts them in the same way.
+		 */
+		if (!freezeable(p, do_all_threads) ||
+		    p->state == TASK_UNINTERRUPTIBLE)
+			continue;
+
+		/* First offender found: print the header once and flag the abort. */
+		if (!result) {
+			printk(KERN_ERR "Stopping tasks failed.\n");
+			printk(KERN_ERR "Tasks that refused to be "
+			 "refrigerated and haven't since exited:\n");
+			set_freezer_state(ABORT_FREEZING);
+			result = 1;
+		}
+
+		if ((freezing(p))) {
+			printk(" - %s (#%d) signalled but "
+				"didn't enter refrigerator.\n",
+				p->comm, p->pid);
+		} else
+			printk(" - %s (#%d) signalled "
+				"and todo list empty.\n",
+				p->comm, p->pid);
+	} while_each_thread(g, p);
+	read_unlock(&tasklist_lock);
+
+	return result;
+}
+
+/*
+ * freeze_threads
+ *
+ * Signal one class of tasks to enter the refrigerator and wait for them.
+ *
+ * do_all_threads: 0 = only tasks that have an mm (userspace),
+ *                 non-zero = every freezeable task, kernel threads
+ *                 included; filesystems are frozen first in this case.
+ *
+ * Polls every FREEZER_CHECK_TIMEOUT until no freezeable task other than
+ * those in TASK_UNINTERRUPTIBLE remains, ABORT_FREEZING is set, or
+ * FREEZER_TOTAL_TIMEOUT has elapsed.
+ *
+ * Returns 0 on success, or the result of freezer_failure() (1 if it
+ * found a task that did not freeze) when the timeout expired with
+ * tasks still outstanding.
+ */
+static int freeze_threads(int do_all_threads)
+{
+	int result = 0, still_to_do;
+	unsigned long start_time = jiffies;
+
+	if (do_all_threads)
+		freezer_make_fses_ro();
+
+	signal_threads(do_all_threads);
+
+	/* Watch them do it, wake them if they ignore us. */
+	do {
+		prod_processes(do_all_threads);
+
+		set_task_state(current, TASK_INTERRUPTIBLE);
+		schedule_timeout(FREEZER_CHECK_TIMEOUT);
+
+		still_to_do = num_freezeable(do_all_threads) -
+			num_uninterruptible(do_all_threads);
+
+	} while(still_to_do && (!test_freezer_state(ABORT_FREEZING)) &&
+		!time_after(jiffies, start_time + FREEZER_TOTAL_TIMEOUT));
+
+	/*
+	 * Did we time out? See if we failed to freeze processes as well.
+	 *
+	 */
+	if ((time_after(jiffies, start_time + FREEZER_TOTAL_TIMEOUT))
+			&& (still_to_do))
+		result = freezer_failure(do_all_threads);
+
+	BUG_ON(in_atomic());
+
+	/*
+	 * Propagate freezer_failure()'s verdict instead of unconditionally
+	 * reporting success.
+	 */
+	return result;
+}
+
+/*
+ * freeze_processes - Freeze processes prior to saving an image of memory.
+ *
+ * Return value: 0 = success, 1 = failure.
+ */ +int freeze_processes(void) +{ + enum system_states old_state = system_state; + int result = 0; + + if (!test_freezer_state(FREEZER_ON)) { + /* + * No race. While !FREEZER_ON, processes + * won't enter __freeze_process + */ + init_completion(&userspace_thaw); + init_completion(&kernelspace_thaw); + set_freezer_state(FREEZER_ON); + } + + /* Now freeze processes that were syncing and are still running */ + if (freeze_threads(0) || (test_freezer_state(ABORT_FREEZING))) { + result = 1; + goto out; + } + + /* Freeze kernel threads */ + if (freeze_threads(1) || (test_freezer_state(ABORT_FREEZING))) + result = 1; + +out: + system_state = old_state; + return result; +} + +EXPORT_SYMBOL(freezer_state); EXPORT_SYMBOL(refrigerator); diff -urN oldtree/kernel/power/snapshot.c newtree/kernel/power/snapshot.c --- oldtree/kernel/power/snapshot.c 2006-02-18 15:18:30.088743064 +0000 +++ newtree/kernel/power/snapshot.c 2006-02-18 15:24:31.410813760 +0000 @@ -172,7 +172,7 @@ return 0; page = pfn_to_page(pfn); - BUG_ON(PageReserved(page) && PageNosave(page)); + //BUG_ON(PageReserved(page) && PageNosave(page)); if (PageNosave(page)) return 0; if (PageReserved(page) && pfn_is_nosave(pfn)) diff -urN oldtree/kernel/power/snapshot.c.orig newtree/kernel/power/snapshot.c.orig --- oldtree/kernel/power/snapshot.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/snapshot.c.orig 2006-02-18 15:18:30.000000000 +0000 @@ -0,0 +1,510 @@ +/* + * linux/kernel/power/snapshot.c + * + * This file provide system snapshot/restore functionality. + * + * Copyright (C) 1998-2005 Pavel Machek + * + * This file is released under the GPLv2, and is based on swsusp.c. 
+ * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "power.h" + +struct pbe *pagedir_nosave; +unsigned int nr_copy_pages; + +#ifdef CONFIG_HIGHMEM +unsigned int count_highmem_pages(void) +{ + struct zone *zone; + unsigned long zone_pfn; + unsigned int n = 0; + + for_each_zone (zone) + if (is_highmem(zone)) { + mark_free_pages(zone); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) { + struct page *page; + unsigned long pfn = zone_pfn + zone->zone_start_pfn; + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + if (PageReserved(page)) + continue; + if (PageNosaveFree(page)) + continue; + n++; + } + } + return n; +} + +struct highmem_page { + char *data; + struct page *page; + struct highmem_page *next; +}; + +static struct highmem_page *highmem_copy; + +static int save_highmem_zone(struct zone *zone) +{ + unsigned long zone_pfn; + mark_free_pages(zone); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { + struct page *page; + struct highmem_page *save; + void *kaddr; + unsigned long pfn = zone_pfn + zone->zone_start_pfn; + + if (!(pfn%1000)) + printk("."); + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + /* + * This condition results from rvmalloc() sans vmalloc_32() + * and architectural memory reservations. This should be + * corrected eventually when the cases giving rise to this + * are better understood. 
+ */ + if (PageReserved(page)) + continue; + BUG_ON(PageNosave(page)); + if (PageNosaveFree(page)) + continue; + save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); + if (!save) + return -ENOMEM; + save->next = highmem_copy; + save->page = page; + save->data = (void *) get_zeroed_page(GFP_ATOMIC); + if (!save->data) { + kfree(save); + return -ENOMEM; + } + kaddr = kmap_atomic(page, KM_USER0); + memcpy(save->data, kaddr, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + highmem_copy = save; + } + return 0; +} + +int save_highmem(void) +{ + struct zone *zone; + int res = 0; + + pr_debug("swsusp: Saving Highmem\n"); + for_each_zone (zone) { + if (is_highmem(zone)) + res = save_highmem_zone(zone); + if (res) + return res; + } + return 0; +} + +int restore_highmem(void) +{ + printk("swsusp: Restoring Highmem\n"); + while (highmem_copy) { + struct highmem_page *save = highmem_copy; + void *kaddr; + highmem_copy = save->next; + + kaddr = kmap_atomic(save->page, KM_USER0); + memcpy(kaddr, save->data, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + free_page((long) save->data); + kfree(save); + } + return 0; +} +#endif + +static int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} + +/** + * saveable - Determine whether a page should be cloned or not. + * @pfn: The page + * + * We save a page if it's Reserved, and not in the range of pages + * statically defined as 'unsaveable', or if it isn't reserved, and + * isn't part of a free chunk of pages. 
+ */ + +static int saveable(struct zone *zone, unsigned long *zone_pfn) +{ + unsigned long pfn = *zone_pfn + zone->zone_start_pfn; + struct page *page; + + if (!pfn_valid(pfn)) + return 0; + + page = pfn_to_page(pfn); + BUG_ON(PageReserved(page) && PageNosave(page)); + if (PageNosave(page)) + return 0; + if (PageReserved(page) && pfn_is_nosave(pfn)) + return 0; + if (PageNosaveFree(page)) + return 0; + + return 1; +} + +unsigned int count_data_pages(void) +{ + struct zone *zone; + unsigned long zone_pfn; + unsigned int n = 0; + + for_each_zone (zone) { + if (is_highmem(zone)) + continue; + mark_free_pages(zone); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + n += saveable(zone, &zone_pfn); + } + return n; +} + +static void copy_data_pages(struct pbe *pblist) +{ + struct zone *zone; + unsigned long zone_pfn; + struct pbe *pbe, *p; + + pbe = pblist; + for_each_zone (zone) { + if (is_highmem(zone)) + continue; + mark_free_pages(zone); + /* This is necessary for swsusp_free() */ + for_each_pb_page (p, pblist) + SetPageNosaveFree(virt_to_page(p)); + for_each_pbe (p, pblist) + SetPageNosaveFree(virt_to_page(p->address)); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { + if (saveable(zone, &zone_pfn)) { + struct page *page; + page = pfn_to_page(zone_pfn + zone->zone_start_pfn); + BUG_ON(!pbe); + pbe->orig_address = (unsigned long)page_address(page); + /* copy_page is not usable for copying task structs. 
*/ + memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); + pbe = pbe->next; + } + } + } + BUG_ON(pbe); +} + + +/** + * free_pagedir - free pages allocated with alloc_pagedir() + */ + +void free_pagedir(struct pbe *pblist) +{ + struct pbe *pbe; + + while (pblist) { + pbe = (pblist + PB_PAGE_SKIP)->next; + ClearPageNosave(virt_to_page(pblist)); + ClearPageNosaveFree(virt_to_page(pblist)); + free_page((unsigned long)pblist); + pblist = pbe; + } +} + +/** + * fill_pb_page - Create a list of PBEs on a given memory page + */ + +static inline void fill_pb_page(struct pbe *pbpage) +{ + struct pbe *p; + + p = pbpage; + pbpage += PB_PAGE_SKIP; + do + p->next = p + 1; + while (++p < pbpage); +} + +/** + * create_pbe_list - Create a list of PBEs on top of a given chain + * of memory pages allocated with alloc_pagedir() + */ + +static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) +{ + struct pbe *pbpage, *p; + unsigned int num = PBES_PER_PAGE; + + for_each_pb_page (pbpage, pblist) { + if (num >= nr_pages) + break; + + fill_pb_page(pbpage); + num += PBES_PER_PAGE; + } + if (pbpage) { + for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) + p->next = p + 1; + p->next = NULL; + } +} + +/** + * On resume it is necessary to trace and eventually free the unsafe + * pages that have been allocated, because they are needed for I/O + * (on x86-64 we likely will "eat" these pages once again while + * creating the temporary page translation tables) + */ + +struct eaten_page { + struct eaten_page *next; + char padding[PAGE_SIZE - sizeof(void *)]; +}; + +static struct eaten_page *eaten_pages = NULL; + +void release_eaten_pages(void) +{ + struct eaten_page *p, *q; + + p = eaten_pages; + while (p) { + q = p->next; + /* We don't want swsusp_free() to free this page again */ + ClearPageNosave(virt_to_page(p)); + free_page((unsigned long)p); + p = q; + } + eaten_pages = NULL; +} + +/** + * @safe_needed - on resume, for storing the PBE 
list and the image, + * we can only use memory pages that do not conflict with the pages + * which had been used before suspend. + * + * The unsafe pages are marked with the PG_nosave_free flag + * + * Allocated but unusable (ie eaten) memory pages should be marked + * so that swsusp_free() can release them + */ + +static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) +{ + void *res; + + if (safe_needed) + do { + res = (void *)get_zeroed_page(gfp_mask); + if (res && PageNosaveFree(virt_to_page(res))) { + /* This is for swsusp_free() */ + SetPageNosave(virt_to_page(res)); + ((struct eaten_page *)res)->next = eaten_pages; + eaten_pages = res; + } + } while (res && PageNosaveFree(virt_to_page(res))); + else + res = (void *)get_zeroed_page(gfp_mask); + if (res) { + SetPageNosave(virt_to_page(res)); + SetPageNosaveFree(virt_to_page(res)); + } + return res; +} + +unsigned long get_safe_page(gfp_t gfp_mask) +{ + return (unsigned long)alloc_image_page(gfp_mask, 1); +} + +/** + * alloc_pagedir - Allocate the page directory. + * + * First, determine exactly how many pages we need and + * allocate them. + * + * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE + * struct pbe elements (pbes) and the last element in the page points + * to the next page. + * + * On each page we set up a list of struct_pbe elements. 
+ */ + +struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed) +{ + unsigned int num; + struct pbe *pblist, *pbe; + + if (!nr_pages) + return NULL; + + pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); + pblist = alloc_image_page(gfp_mask, safe_needed); + /* FIXME: rewrite this ugly loop */ + for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; + pbe = pbe->next, num += PBES_PER_PAGE) { + pbe += PB_PAGE_SKIP; + pbe->next = alloc_image_page(gfp_mask, safe_needed); + } + if (!pbe) { /* get_zeroed_page() failed */ + free_pagedir(pblist); + pblist = NULL; + } else + create_pbe_list(pblist, nr_pages); + return pblist; +} + +/** + * Free pages we allocated for suspend. Suspend pages are alocated + * before atomic copy, so we need to free them after resume. + */ + +void swsusp_free(void) +{ + struct zone *zone; + unsigned long zone_pfn; + + for_each_zone(zone) { + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + if (pfn_valid(zone_pfn + zone->zone_start_pfn)) { + struct page *page; + page = pfn_to_page(zone_pfn + zone->zone_start_pfn); + if (PageNosave(page) && PageNosaveFree(page)) { + ClearPageNosave(page); + ClearPageNosaveFree(page); + free_page((long) page_address(page)); + } + } + } +} + + +/** + * enough_free_mem - Make sure we enough free memory to snapshot. + * + * Returns TRUE or FALSE after checking the number of available + * free pages. 
+ */ + +static int enough_free_mem(unsigned int nr_pages) +{ + struct zone *zone; + unsigned int n = 0; + + for_each_zone (zone) + if (!is_highmem(zone)) + n += zone->free_pages; + pr_debug("swsusp: available memory: %u pages\n", n); + return n > (nr_pages + PAGES_FOR_IO + + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); +} + +int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) +{ + struct pbe *p; + + for_each_pbe (p, pblist) { + p->address = (unsigned long)alloc_image_page(gfp_mask, safe_needed); + if (!p->address) + return -ENOMEM; + } + return 0; +} + +static struct pbe *swsusp_alloc(unsigned int nr_pages) +{ + struct pbe *pblist; + + if (!(pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, 0))) { + printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); + return NULL; + } + + if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { + printk(KERN_ERR "suspend: Allocating image pages failed.\n"); + swsusp_free(); + return NULL; + } + + return pblist; +} + +asmlinkage int swsusp_save(void) +{ + unsigned int nr_pages; + + pr_debug("swsusp: critical section: \n"); + + drain_local_pages(); + nr_pages = count_data_pages(); + printk("swsusp: Need to copy %u pages\n", nr_pages); + + pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n", + nr_pages, + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, + PAGES_FOR_IO, nr_free_pages()); + + if (!enough_free_mem(nr_pages)) { + printk(KERN_ERR "swsusp: Not enough free memory\n"); + return -ENOMEM; + } + + pagedir_nosave = swsusp_alloc(nr_pages); + if (!pagedir_nosave) + return -ENOMEM; + + /* During allocating of suspend pagedir, new cold pages may appear. + * Kill them. + */ + drain_local_pages(); + copy_data_pages(pagedir_nosave); + + /* + * End of critical section. From now on, we can write to memory, + * but we should not touch disk. This specially means we must _not_ + * touch swap space! Except we must write out our image of course. 
+ */ + + nr_copy_pages = nr_pages; + + printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); + return 0; +} diff -urN oldtree/kernel/power/storage.c newtree/kernel/power/storage.c --- oldtree/kernel/power/storage.c 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/storage.c 2006-02-18 15:24:31.412813456 +0000 @@ -0,0 +1,323 @@ +/* + * kernel/power/storage.c + * + * Copyright (C) 2005 Nigel Cunningham + * + * This file is released under the GPLv2. + * + * Routines for talking to a userspace program that manages storage. + * + * The kernel side: + * - starts the userspace program; + * - sends messages telling it when to open and close the connection; + * - tells it when to quit; + * + * The user space side: + * - passes messages regarding status; + * + */ + +#include +#include + +#include "proc.h" +#include "modules.h" +#include "netlink.h" +#include "storage.h" +#include "ui.h" + +static struct user_helper_data usm_helper_data; +static struct suspend_module_ops usm_ops; +static int message_received = 0; +static int activations = 0; +static int usm_prepare_count = 0; +static int storage_manager_last_action = 0; +static int storage_manager_action = 0; + +static int usm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + int type; + int *data; + + type = nlh->nlmsg_type; + + /* A control message: ignore them */ + if (type < NETLINK_MSG_BASE) + return 0; + + /* Unknown message: reply with EINVAL */ + if (type >= USM_MSG_MAX) + return -EINVAL; + + /* All operations require privileges, even GET */ + if (security_netlink_recv(skb)) + return -EPERM; + + /* Only allow one task to receive NOFREEZE privileges */ + if (type == NETLINK_MSG_NOFREEZE_ME && usm_helper_data.pid != -1) + return -EBUSY; + + data = (int*)NLMSG_DATA(nlh); + + switch (type) { + case USM_MSG_SUCCESS: + case USM_MSG_FAILED: + message_received = type; + complete(&usm_helper_data.wait_for_process); + break; + default: + printk("Storage manager doesn't recognise message 
%d.\n", type); + } + + return 1; +} + +int suspend_activate_storage(int force) +{ + int tries = 1; + + if (usm_helper_data.pid == -1 || usm_ops.disabled) + return 0; + + message_received = 0; + activations++; + + if (activations > 1 && !force) + return 0; + + while ((!message_received || message_received == USM_MSG_FAILED) && tries < 2) { + suspend_prepare_status(DONT_CLEAR_BAR, "Activate storage attempt %d.\n", tries); + + init_completion(&usm_helper_data.wait_for_process); + + suspend_send_netlink_message(&usm_helper_data, + USM_MSG_CONNECT, + NULL, 0); + + /* Wait 2 seconds for the userspace process to make contact */ + wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ); + + tries++; + } + + return 0; +} + +int suspend_deactivate_storage(int force) +{ + if (usm_helper_data.pid == -1 || usm_ops.disabled) + return 0; + + message_received = 0; + activations--; + + if (activations && !force) + return 0; + + init_completion(&usm_helper_data.wait_for_process); + + suspend_send_netlink_message(&usm_helper_data, + USM_MSG_DISCONNECT, + NULL, 0); + + wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ); + + if (!message_received || message_received == USM_MSG_FAILED) { + printk("Returning failure disconnecting storage.\n"); + return 1; + } + + return 0; +} + +#ifdef CONFIG_PM_DEBUG +static void storage_manager_simulate(void) +{ + printk("--- Storage manager simulate ---\n"); + suspend_prepare_usm(); + schedule(); + printk("--- Deactivate storage 1 ---\n"); + suspend_deactivate_storage(1); + schedule(); + printk("--- Activate storage 1 ---\n"); + suspend_activate_storage(1); + schedule(); + printk("--- Cleanup usm ---\n"); + suspend_cleanup_usm(); + schedule(); + printk("--- Storage manager simulate ends ---\n"); +} +#endif + +static unsigned long usm_storage_needed(void) +{ + return strlen(usm_helper_data.program); +} + +static int usm_save_config_info(char *buf) +{ + int len = strlen(usm_helper_data.program); + memcpy(buf, 
usm_helper_data.program, len); + return len; +} + +static void usm_load_config_info(char *buf, int size) +{ + /* Don't load the saved path if one has already been set */ + if (usm_helper_data.program[0]) + return; + + memcpy(usm_helper_data.program, buf, size); +} + +static unsigned long usm_memory_needed(void) +{ + /* ball park figure of 32 pages */ + return (32 * PAGE_SIZE); +} + +/* suspend_prepare_usm + */ +int suspend_prepare_usm(void) +{ + usm_prepare_count++; + + if (usm_prepare_count > 1 || usm_ops.disabled) + return 0; + + usm_helper_data.pid = -1; + + if (!*usm_helper_data.program) + return 0; + + suspend_netlink_setup(&usm_helper_data); + + if (usm_helper_data.pid == -1) + printk("Suspend2 Storage Manager wanted, but couldn't start it.\n"); + + suspend_activate_storage(0); + + return (usm_helper_data.pid != -1); +} + +void suspend_cleanup_usm(void) +{ + usm_prepare_count--; + + if (usm_helper_data.pid > -1 && !usm_prepare_count) { + struct task_struct *t; + + suspend_deactivate_storage(0); + + suspend_send_netlink_message(&usm_helper_data, + NETLINK_MSG_CLEANUP, NULL, 0); + + read_lock(&tasklist_lock); + if ((t = find_task_by_pid(usm_helper_data.pid))) + t->flags &= ~PF_NOFREEZE; + read_unlock(&tasklist_lock); + + suspend_netlink_close(&usm_helper_data); + + usm_helper_data.pid = -1; + } +} + +static void storage_manager_activate(void) +{ + if (storage_manager_action == storage_manager_last_action) + return; + + if (storage_manager_action) + suspend_prepare_usm(); + else + suspend_cleanup_usm(); + + storage_manager_last_action = storage_manager_action; +} + +/* + * User interface specific /proc/suspend entries. 
+ */ + +static struct suspend_proc_data proc_params[] = { + { .filename = "disable_storage_manager", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_INTEGER, + .data = { + .integer = { + .variable = &usm_ops.disabled, + .minimum = 0, + .maximum = 1, + } + } + }, + { .filename = "storage_manager", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_STRING, + .data = { + .string = { + .variable = usm_helper_data.program, + .max_length = 254, + } + } + }, + { .filename = "activate_storage", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_INTEGER, + .data = { + .integer = { + .variable = &storage_manager_action, + .minimum = 0, + .maximum = 1, + } + }, + .write_proc = storage_manager_activate, + }, + +#ifdef CONFIG_PM_DEBUG + { .filename = "simulate_atomic_copy", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_NONE, + .write_proc = storage_manager_simulate, + } +#endif +}; + +static struct suspend_module_ops usm_ops = { + .type = MISC_PLUGIN, + .name = "Userspace Storage Manager", + .module = THIS_MODULE, + .storage_needed = usm_storage_needed, + .save_config_info = usm_save_config_info, + .load_config_info = usm_load_config_info, + .memory_needed = usm_memory_needed, +}; + +/* suspend_usm_proc_init + * Description: Boot time initialisation for user interface. 
+ */ +static __init int suspend_usm_proc_init(void) +{ + int result, i, numfiles = sizeof(proc_params) / sizeof(struct suspend_proc_data); + + if (!(result = suspend_register_module(&usm_ops))) + for (i=0; i< numfiles; i++) + suspend_register_procfile(&proc_params[i]); + + usm_helper_data.nl = NULL; + usm_helper_data.program[0] = '\0'; + usm_helper_data.pid = -1; + usm_helper_data.skb_size = 0; + usm_helper_data.pool_limit = 6; + usm_helper_data.netlink_id = NETLINK_SUSPEND2_USM; + usm_helper_data.name = "userspace storage manager"; + usm_helper_data.rcv_msg = usm_user_rcv_msg; + usm_helper_data.interface_version = 1; + usm_helper_data.must_init = 0; + init_completion(&usm_helper_data.wait_for_process); + + return result; +} + +late_initcall(suspend_usm_proc_init); diff -urN oldtree/kernel/power/storage.h newtree/kernel/power/storage.h --- oldtree/kernel/power/storage.h 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/storage.h 2006-02-18 15:24:31.413813304 +0000 @@ -0,0 +1,21 @@ +/* + * + */ + +int suspend_prepare_usm(void); +void suspend_cleanup_usm(void); + +int suspend_activate_storage(int force); +int suspend_deactivate_storage(int force); + +enum { + USM_MSG_BASE = 0x10, + + /* Kernel -> Userspace */ + USM_MSG_CONNECT = 0x30, + USM_MSG_DISCONNECT = 0x31, + USM_MSG_SUCCESS = 0x40, + USM_MSG_FAILED = 0x41, + + USM_MSG_MAX, +}; diff -urN oldtree/kernel/power/suspend.c newtree/kernel/power/suspend.c --- oldtree/kernel/power/suspend.c 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/suspend.c 2006-02-18 15:24:31.415813000 +0000 @@ -0,0 +1,1132 @@ +/* + * kernel/power/suspend2.c + */ +/** \mainpage Suspend2. + * + * Suspend2 provides support for saving and restoring an image of + * system memory to an arbitrary storage device, either on the local computer, + * or across some network. The support is entirely OS based, so Suspend2 + * works without requiring BIOS, APM or ACPI support. 
The vast majority of the + * code is also architecture independent, so it should be very easy to port + * the code to new architectures. Suspend includes support for SMP, 4G HighMem + * and preemption. Initramfses and initrds are also supported. + * + * Suspend2 uses a modular design, in which the method of storing the image is + * completely abstracted from the core code, as are transformations on the data + * such as compression and/or encryption (multiple 'modules' can be used to + * provide arbitrary combinations of functionality). The user interface is also + * modular, so that arbitrarily simple or complex interfaces can be used to + * provide anything from debugging information through to eye candy. + * + * \section Copyright + * + * Suspend2 is released under the GPLv2. + * + * Copyright (C) 1998-2001 Gabor Kuti
+ * Copyright (C) 1998,2001,2002 Pavel Machek
+ * Copyright (C) 2002-2003 Florent Chabaud
+ * Copyright (C) 2002-2005 Nigel Cunningham
+ * + * \section Credits + * + * Nigel would like to thank the following people for their work: + * + * Pavel Machek
+ * Modifications, defectiveness pointing, being with Gabor at the very beginning, + * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. + * + * Steve Doddi
+ * Support the possibility of hardware state restoring. + * + * Raph
+ * Support for preserving states of network devices and virtual console + * (including X and svgatextmode) + * + * Kurt Garloff
+ * Straightened the critical function in order to prevent compilers from + * playing tricks with local variables. + * + * Andreas Mohr + * + * Alex Badea
+ * Fixed runaway init + * + * Jeff Snyder
+ * ACPI patch + * + * Nathan Friess
+ * Some patches. + * + * Michael Frank
+ * Extensive testing and help with improving stability. Nigel was constantly + * amazed by the quality and quantity of Michael's help. + * + * Bernard Blackham
+ * Web page & Wiki administration, some coding. Another person without whom + * Suspend would not be where it is. + * + * ..and of course the myriads of Suspend2 users who have helped diagnose + * and fix bugs, made suggestions on how to improve the code, proofread + * documentation, and donated time and money. + * + * Thanks also to corporate sponsors: + * + * Cyclades.com. Nigel's employers from Dec 2004, who allow him to work on + * Suspend and PM related issues on company time. + * + * LinuxFund.org. Sponsored Nigel's work on Suspend for four months Oct 2003 + * to Jan 2004. + * + * LAC Linux. Donated P4 hardware that enabled development and ongoing + * maintenance of SMP and Highmem support. + * + * OSDL. Provided access to various hardware configurations, make occasional + * small donations to the project. + */ + +#define SUSPEND_MAIN_C + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "version.h" +#include "suspend2.h" +#include "modules.h" +#include "proc.h" +#include "pageflags.h" +#include "prepare_image.h" +#include "io.h" +#include "ui.h" +#include "suspend2_common.h" +#include "extent.h" +#include "power_off.h" +#include "atomic_copy.h" +#include "debug_pagealloc.h" +#include "storage.h" + +#ifdef CONFIG_X86 +#include /* for kernel_fpu_end */ +#endif + +/* Variables to be preserved over suspend */ +int pageset1_sizelow = 0, pageset2_sizelow = 0, image_size_limit = 0; +unsigned long suspend_orig_mem_free = 0; + +static dyn_pageflags_t pageset1_check_map; +static dyn_pageflags_t pageset2_check_map; +static char *debug_info_buffer; +static char suspend_core_version[] = SUSPEND_CORE_VERSION; + +extern void do_suspend2_lowlevel(int resume); +extern __nosavedata char resume_commandline[COMMAND_LINE_SIZE]; + +unsigned long suspend_action = 0; +unsigned long suspend_result = 0; +unsigned long suspend_debug_state = 0; + +/* + * --- Variables ----- + * + * The following are used by the arch 
specific low level routines + * and only needed if suspend2 is compiled in. Other variables, + * used by the freezer even if suspend2 is not compiled in, are + * found in process.c + */ + +/*! How long I/O took. */ +int suspend_io_time[2][2]; + +/* Compression ratio */ +__nosavedata unsigned long bytes_in = 0, bytes_out = 0; + +/*! Pageset metadata. */ +struct pagedir pagedir1 = { 0, 0}, pagedir2 = { 0, 0}; + +/* Suspend2 variables used by built-in routines. */ + +/*! The number of suspends we have started (some may have been cancelled) */ +unsigned int nr_suspends = 0; + +/*! The console log level we default to. */ +int suspend_default_console_level = 0; + +/* + * For resume2= kernel option. It's pointless to compile + * suspend2 without any writers, but compilation shouldn't + * fail if you do. + */ + +unsigned long software_suspend_state = ((1 << SUSPEND_DISABLED) | (1 << SUSPEND_BOOT_TIME) | + (1 << SUSPEND_RESUME_NOT_DONE) | (1 << SUSPEND_IGNORE_LOGLEVEL)); + +mm_segment_t oldfs; + +char resume2_file[256] = CONFIG_SUSPEND2_DEFAULT_RESUME2; + +static atomic_t actions_running; + +extern int block_dump; + +int block_dump_save; + +/* + * Basic clean-up routine. + */ +void suspend_finish_anything(int finishing_cycle) +{ + if (atomic_dec_and_test(&actions_running)) { + suspend_cleanup_modules(finishing_cycle); + suspend_put_modules(); + clear_suspend_state(SUSPEND_RUNNING); + } + + set_fs(oldfs); + + if (finishing_cycle) + block_dump = block_dump_save; +} + +/* + * Basic set-up routine. 
+ */ +int suspend_start_anything(int starting_cycle) +{ + oldfs = get_fs(); + + if (atomic_add_return(1, &actions_running) == 1) { + set_fs(KERNEL_DS); + + set_suspend_state(SUSPEND_RUNNING); + + if (suspend_get_modules()) { + printk("Get modules failed!\n"); + clear_suspend_state(SUSPEND_RUNNING); + set_fs(oldfs); + return -EBUSY; + } + + if (suspend_initialise_modules(starting_cycle)) { + printk("Initialise modules failed!\n"); + suspend_finish_anything(starting_cycle); + return -EBUSY; + } + + if (starting_cycle) { + block_dump_save = block_dump; + block_dump = 0; + } + } + + return 0; +} + +/* + * save_image + * Result code (int): Zero on success, non zero on failure. + * Functionality : High level routine which performs the steps necessary + * to prepare and save the image after preparatory steps + * have been taken. + * Key Assumptions : Processes frozen, sufficient memory available, drivers + * suspended. + * Called from : suspend_suspend_2 + */ + +static int save_image(void) +{ + int temp_result; + + suspend_message(SUSPEND_ANY_SECTION, SUSPEND_LOW, 1, + " - Final values: %d and %d.\n", + pagedir1.pageset_size, + pagedir2.pageset_size); + + check_shift_keys(1, "About to write pagedir2."); + + temp_result = write_pageset(&pagedir2, 2); + + if (temp_result == -1 || test_result_state(SUSPEND_ABORTED)) + return -1; + + check_shift_keys(1, "About to copy pageset 1."); + + suspend_deactivate_storage(1); + + suspend_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy."); + + do_suspend2_lowlevel(0); + + return 0; +} + +/* + * Save the second part of the image. + */ +int save_image_part1(void) +{ + int temp_result, old_ps1_size = pagedir1.pageset_size; + dyn_pageflags_t temp; + + /* Quick switch: We want to compare the old stats with the new ones. 
*/ + temp = pageset1_map; + pageset1_map = pageset1_check_map; + pageset1_check_map = temp; + + temp = pageset2_map; + pageset2_map = pageset2_check_map; + pageset2_check_map = temp; + + BUG_ON(!irqs_disabled()); + + suspend_recalculate_stats(1); + + if ((pagedir1.pageset_size - old_ps1_size) > extra_pd1_pages_allowance) { + abort_suspend("Pageset1 has grown by %d pages." + " Only %d growth is allowed for!\n", + pagedir1.pageset_size - old_ps1_size, + extra_pd1_pages_allowance); + return -1; + } + + suspend_map_atomic_copy_pages(); + + BUG_ON(!irqs_disabled()); + + if (!test_action_state(SUSPEND_TEST_FILTER_SPEED) && + !test_action_state(SUSPEND_TEST_BIO)) + suspend_copy_pageset1(); + + /* + * ---- FROM HERE ON, NEED TO REREAD PAGESET2 IF ABORTING!!! ----- + * + */ + + suspend_unmap_atomic_copy_pages(); + +#ifdef CONFIG_X86 + kernel_fpu_end(); +#endif + + device_power_up(); + + local_irq_enable(); + + device_resume(); + + if (suspend_activate_storage(1)) + panic("Failed to reactivate our storage."); + + suspend_update_status(pagedir2.pageset_size, + pagedir1.pageset_size + pagedir2.pageset_size, + NULL); + + if (test_result_state(SUSPEND_ABORTED)) + goto abort_reloading_pagedir_two; + + check_shift_keys(1, "About to write pageset1."); + + /* + * End of critical section. + */ + + suspend_message(SUSPEND_ANY_SECTION, SUSPEND_LOW, 1, + "-- Writing pageset1\n"); + + temp_result = write_pageset(&pagedir1, 1); + + /* We didn't overwrite any memory, so no reread needs to be done. 
*/ + if (test_action_state(SUSPEND_TEST_FILTER_SPEED)) + return -1; + + if (temp_result == -1 || test_result_state(SUSPEND_ABORTED)) + goto abort_reloading_pagedir_two; + + check_shift_keys(1, "About to write header."); + + if (test_result_state(SUSPEND_ABORTED)) + goto abort_reloading_pagedir_two; + + temp_result = write_image_header(); + + if (test_action_state(SUSPEND_TEST_BIO)) + return -1; + + if (temp_result || (test_result_state(SUSPEND_ABORTED))) + goto abort_reloading_pagedir_two; + + check_shift_keys(1, "About to power down or reboot."); + + return 0; + +abort_reloading_pagedir_two: + temp_result = read_pageset2(1); + + /* If that failed, we're sunk. Panic! */ + if (temp_result) + panic("Attempt to reload pagedir 2 while aborting " + "a suspend failed."); + + return -1; + +} + +#define SNPRINTF(a...) len += snprintf_used(debug_info_buffer + len, \ + PAGE_SIZE - len - 1, ## a) + +static int io_MB_per_second(int read_write) +{ + if (!suspend_io_time[read_write][1]) + return 0; + + return MB((unsigned long) suspend_io_time[read_write][0]) * HZ / + suspend_io_time[read_write][1]; +} + +/* get_debug_info + * Functionality: Store debug info in a buffer. + * Called from: suspend2_try_suspend. + */ + + +static int get_suspend_debug_info(void) +{ + int len = 0; + if (!debug_info_buffer) { + debug_info_buffer = (char *) get_zeroed_page(GFP_ATOMIC); + if (!debug_info_buffer) { + printk("Error! Unable to allocate buffer for" + "software suspend debug info.\n"); + return 0; + } + } + + SNPRINTF("Suspend2 debugging info:\n"); + SNPRINTF("- SUSPEND core : %s\n", SUSPEND_CORE_VERSION); + SNPRINTF("- Kernel Version : %s\n", UTS_RELEASE); + SNPRINTF("- Compiler vers. 
: %d.%d\n", __GNUC__, __GNUC_MINOR__); + SNPRINTF("- Attempt number : %d\n", nr_suspends); + SNPRINTF("- Parameters : %ld %ld %ld %d %d %ld\n", + suspend_result, + suspend_action, + suspend_debug_state, + suspend_default_console_level, + image_size_limit, + suspend_powerdown_method); + SNPRINTF("- Overall expected compression percentage: %d.\n", + 100 - suspend_expected_compression_ratio()); + len+= suspend_print_module_debug_info(debug_info_buffer + len, + PAGE_SIZE - len - 1); + if (suspend_io_time[0][1]) { + if ((io_MB_per_second(0) < 5) || (io_MB_per_second(1) < 5)) { + SNPRINTF("- I/O speed: Write %d KB/s", + (KB((unsigned long) suspend_io_time[0][0]) * HZ / + suspend_io_time[0][1])); + if (suspend_io_time[1][1]) + SNPRINTF(", Read %d KB/s", + (KB((unsigned long) suspend_io_time[1][0]) * HZ / + suspend_io_time[1][1])); + } else { + SNPRINTF("- I/O speed: Write %d MB/s", + (MB((unsigned long) suspend_io_time[0][0]) * HZ / + suspend_io_time[0][1])); + if (suspend_io_time[1][1]) + SNPRINTF(", Read %d MB/s", + (MB((unsigned long) suspend_io_time[1][0]) * HZ / + suspend_io_time[1][1])); + } + SNPRINTF(".\n"); + } + else + SNPRINTF("- No I/O speed stats available.\n"); + + return len; +} + +/* + * debuginfo_read_proc + * Functionality : Displays information that may be helpful in debugging + * software suspend. 
+ */ +int debuginfo_read_proc(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + int info_len, copy_len; + + info_len = get_suspend_debug_info(); + + copy_len = min(info_len - (int) off, count); + if (copy_len < 0) + copy_len = 0; + + if (copy_len) { + memcpy(page, debug_info_buffer + off, copy_len); + *start = page; + } + + if (copy_len + off == info_len) + *eof = 1; + + free_page((unsigned long) debug_info_buffer); + debug_info_buffer = NULL; + return copy_len; +} + +static int allocate_bitmaps(void) +{ + suspend_message(SUSPEND_MEMORY, SUSPEND_VERBOSE, 1, + "Allocating in_use_map\n"); + if (allocate_dyn_pageflags(&in_use_map) || + allocate_dyn_pageflags(&pageset1_map) || + allocate_dyn_pageflags(&pageset1_copy_map) || + allocate_dyn_pageflags(&allocd_pages_map) || + allocate_dyn_pageflags(&pageset2_map) || +#ifdef CONFIG_DEBUG_PAGEALLOC + allocate_dyn_pageflags(&unmap_map) || +#endif + allocate_dyn_pageflags(&pageset1_check_map) || + allocate_dyn_pageflags(&pageset2_check_map)) + return 1; + + return 0; +} + +static void free_metadata(void) +{ + free_dyn_pageflags(&pageset1_map); + free_dyn_pageflags(&pageset1_copy_map); + free_dyn_pageflags(&allocd_pages_map); + free_dyn_pageflags(&pageset2_map); + free_dyn_pageflags(&in_use_map); + free_dyn_pageflags(&pageset1_check_map); + free_dyn_pageflags(&pageset2_check_map); +} + +static int check_still_keeping_image(void) +{ + if (test_action_state(SUSPEND_KEEP_IMAGE)) { + printk("Image already stored: powering down immediately."); + suspend_power_down(); + return 1; /* Just in case we're using S3 */ + } + + printk("Invalidating previous image.\n"); + suspend_active_writer->ops.writer.invalidate_image(); + + return 0; +} + +static int suspend_init(void) +{ + suspend_result = 0; + + printk(name_suspend "Initiating a software suspend cycle.\n"); + + nr_suspends++; + clear_suspend_state(SUSPEND_NOW_RESUMING); + + suspend_io_time[0][0] = suspend_io_time[0][1] = + suspend_io_time[1][0] = + 
suspend_io_time[1][1] = 0; + + suspend_prepare_console(); + + free_metadata(); /* We might have kept it */ + + //attempt_to_parse_resume_device(); + + if (test_suspend_state(SUSPEND_DISABLED)) + return 0; + + if (allocate_bitmaps()) + return 0; + + disable_nonboot_cpus(); + + return 1; +} + +void suspend_cleanup(void) +{ + int i; + + i = get_suspend_debug_info(); + + suspend_free_extra_pagedir_memory(); + + pagedir1.pageset_size = pagedir2.pageset_size = 0; + + thaw_processes(FREEZER_KERNEL_THREADS); + +#ifdef CONFIG_SUSPEND2_KEEP_IMAGE + if (test_action_state(SUSPEND_KEEP_IMAGE) && + !test_result_state(SUSPEND_ABORTED)) { + suspend_message(SUSPEND_ANY_SECTION, SUSPEND_LOW, 1, + name_suspend "Not invalidating the image due " + "to Keep Image being enabled.\n"); + set_result_state(SUSPEND_KEPT_IMAGE); + } else +#endif + if (suspend_active_writer) + suspend_active_writer->ops.writer.invalidate_image(); + + free_metadata(); + +#ifdef CONFIG_DEBUG_PAGE_ALLOC + free_dyn_pageflags(&unmap_map); +#endif + + if (debug_info_buffer) { + /* Printk can only handle 1023 bytes, including + * its level mangling. */ + for (i = 0; i < 3; i++) + printk("%s", debug_info_buffer + (1023 * i)); + free_page((unsigned long) debug_info_buffer); + debug_info_buffer = NULL; + } + + thaw_processes(FREEZER_ALL_THREADS); + + suspend_cleanup_console(); + + enable_nonboot_cpus(); +} + +static int can_suspend(void) +{ + if (test_suspend_state(SUSPEND_DISABLED)) + attempt_to_parse_resume_device(); + + if (test_suspend_state(SUSPEND_DISABLED)) { + printk(name_suspend "Software suspend is disabled.\n" + "This may be because you haven't put something along the " + "lines of\n\nresume2=swap:/dev/hda1\n\n" + "in lilo.conf or equivalent. (Where /dev/hda1 is your " + "swap partition).\n"); + set_result_state(SUSPEND_ABORTED); + return 0; + } + + return 1; +} + +/* + * suspend_main + * Functionality : First level of code for software suspend invocations. 
+ * Stores and restores load averages (to avoid a spike), + * allocates bitmaps, freezes processes and eats memory + * as required before suspending drivers and invoking + * the 'low level' code to save the state to disk. + * By the time we return from do_suspend2_lowlevel, we + * have either failed to save the image or successfully + * suspended and reloaded the image. The difference can + * be discerned by checking SUSPEND_ABORTED. + * Called From : + */ + +void suspend_main(void) +{ + if (suspend_activate_storage(0)) + return; + + if (!can_suspend()) + goto cleanup; + + /* + * If kept image and still keeping image and suspending to RAM, we will + * return 1 after suspending and resuming (provided the power doesn't + * run out. + */ + if (test_result_state(SUSPEND_KEPT_IMAGE) && check_still_keeping_image()) + goto cleanup; + + + if (suspend_init() && !suspend_prepare_image() && !test_result_state(SUSPEND_ABORTED) && + !test_action_state(SUSPEND_FREEZER_TEST)) { + suspend_prepare_status(DONT_CLEAR_BAR, "Starting to save the image.."); + save_image(); + } + + suspend_cleanup(); +cleanup: + suspend_deactivate_storage(0); +} + +/* image_exists_read + * + * Return 0 or 1, depending on whether an image is found. + */ + +char *get_have_image_data(void); + +static int image_exists_read(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + int len = 0; + char *result; + + if (suspend_activate_storage(0)) + return count; + + if (!test_suspend_state(SUSPEND_RESUME_DEVICE_OK)) + attempt_to_parse_resume_device(); + + if (!suspend_active_writer) { + len = sprintf(page, "-1\n"); + } else { + result = get_have_image_data(); + printk("get_have_image_data returned %p.\n", result); + if (result) { + len = sprintf(page, "%s", result); + free_page((unsigned long) result); + } + } + + *eof = 1; + + suspend_deactivate_storage(0); + + return len; +} + +/* image_exists_read + * + * Return 0 or 1, depending on whether an image is found. 
+ */ +static int image_exists_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + if (suspend_activate_storage(0)) + return count; + + if (suspend_active_writer && suspend_active_writer->ops.writer.image_exists()) + suspend_active_writer->ops.writer.invalidate_image(); + + suspend_deactivate_storage(0); + + return count; +} + +/* + * Core proc entries that aren't built in. + * + * This array contains entries that are automatically registered at + * boot. Plugins and the console code register their own entries separately. + */ +static struct suspend_proc_data proc_params[] = { + { .filename = "debug_info", + .permissions = PROC_READONLY, + .type = SUSPEND_PROC_DATA_CUSTOM, + .data = { + .special = { + .read_proc = debuginfo_read_proc, + } + } + }, + + { .filename = "extra_pages_allowance", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_INTEGER, + .data = { + .integer = { + .variable = &extra_pd1_pages_allowance, + .minimum = 0, + .maximum = 32767, + } + } + }, + + { .filename = "ignore_rootfs", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_BIT, + .data = { + .bit = { + .bit_vector = &suspend_action, + .bit = SUSPEND_IGNORE_ROOTFS, + } + } + }, + + { .filename = "image_exists", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_CUSTOM, + .needs_storage_manager = 3, + .data = { + .special = { + .read_proc = image_exists_read, + .write_proc = image_exists_write, + } + } + }, + + { .filename = "image_size_limit", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_INTEGER, + .data = { + .integer = { + .variable = &image_size_limit, + .minimum = -2, + .maximum = 32767, + } + } + }, + + { .filename = "last_result", + .permissions = PROC_READONLY, + .type = SUSPEND_PROC_DATA_UL, + .data = { + .ul = { + .variable = &suspend_result, + } + } + }, + + { .filename = "reboot", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_BIT, + .data = { + .bit = { + .bit_vector = &suspend_action, + .bit = SUSPEND_REBOOT, + } + } + }, 
+ + { .filename = "resume2", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_STRING, + .needs_storage_manager = 2, + .data = { + .string = { + .variable = resume2_file, + .max_length = 255, + } + }, + .write_proc = attempt_to_parse_resume_device2, + }, + + { .filename = "resume_commandline", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_STRING, + .data = { + .string = { + .variable = resume_commandline, + .max_length = COMMAND_LINE_SIZE, + } + }, + }, + + { .filename = "version", + .permissions = PROC_READONLY, + .type = SUSPEND_PROC_DATA_STRING, + .data = { + .string = { + .variable = suspend_core_version, + } + } + }, + +#ifdef CONFIG_PM_DEBUG + { .filename = "freezer_test", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_BIT, + .data = { + .bit = { + .bit_vector = &suspend_action, + .bit = SUSPEND_FREEZER_TEST, + } + } + }, + + { .filename = "test_bio", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_BIT, + .data = { + .bit = { + .bit_vector = &suspend_action, + .bit = SUSPEND_TEST_BIO, + } + } + }, + + { .filename = "test_filter_speed", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_BIT, + .data = { + .bit = { + .bit_vector = &suspend_action, + .bit = SUSPEND_TEST_FILTER_SPEED, + } + } + }, + + { .filename = "slow", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_BIT, + .data = { + .bit = { + .bit_vector = &suspend_action, + .bit = SUSPEND_SLOW, + } + } + }, + + { .filename = "no_pageset2", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_BIT, + .data = { + .bit = { + .bit_vector = &suspend_action, + .bit = SUSPEND_NO_PAGESET2, + } + } + }, + +#endif + +#if defined(CONFIG_ACPI) + { .filename = "powerdown_method", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_UL, + .data = { + .ul = { + .variable = &suspend_powerdown_method, + .minimum = 0, + .maximum = 5, + } + } + }, +#endif + +#ifdef CONFIG_SUSPEND2_KEEP_IMAGE + { .filename = "keep_image", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_BIT, + .data = 
{ + .bit = { + .bit_vector = &suspend_action, + .bit = SUSPEND_KEEP_IMAGE, + } + } + }, +#endif +}; + + +/* + * Called from init kernel_thread. + * We check if we have an image and if so we try to resume. + * We also start ksuspendd if configuration looks right. + */ + +int suspend_resume(void) +{ + int read_image_result = 0; + + if (sizeof(swp_entry_t) != sizeof(long)) { + printk(KERN_WARNING name_suspend + "The size of swp_entry_t != size of long. " + "Please report this!\n"); + return 1; + } + + if (!resume2_file[0]) + printk(KERN_WARNING name_suspend + "You need to use a resume2= command line parameter to " + "tell Suspend2 where to look for an image.\n"); + + suspend_activate_storage(0); + + if (!(test_suspend_state(SUSPEND_RESUME_DEVICE_OK)) && + !attempt_to_parse_resume_device()) { + /* + * Without a usable storage device we can do nothing - + * even if noresume is given + */ + + if (!suspend_num_writers) + printk(KERN_ALERT name_suspend + "No writers have been registered.\n"); + else + printk(KERN_ALERT name_suspend + "Missing or invalid storage location " + "(resume2= parameter). 
Please correct and " + "rerun lilo (or equivalent) before " + "suspending.\n"); + suspend_deactivate_storage(0); + return 1; + } + + suspend_orig_mem_free = real_nr_free_pages(); + + read_image_result = read_pageset1(); /* non fatal error ignored */ + + if (test_suspend_state(SUSPEND_NORESUME_SPECIFIED)) + printk(KERN_WARNING name_suspend "Resuming disabled as requested.\n"); + + suspend_deactivate_storage(0); + + if (read_image_result) + return 1; + + suspend_atomic_restore(); + + BUG(); + + return 0; +} + +static __init int core_load(void) +{ + int i, numfiles = sizeof(proc_params) / sizeof(struct suspend_proc_data); + + printk("Suspend2 Core.\n"); + + suspend_initialise_module_lists(); + + for (i=0; i< numfiles; i++) + suspend_register_procfile(&proc_params[i]); + + return 0; +} + +/* -- Functions for kickstarting a suspend or resume --- */ + +/* + * Check if we have an image and if so try to resume. + */ + +void __suspend_try_resume(void) +{ + set_suspend_state(SUSPEND_TRYING_TO_RESUME); + + clear_suspend_state(SUSPEND_RESUME_NOT_DONE); + + suspend_resume(); + + clear_suspend_state(SUSPEND_IGNORE_LOGLEVEL); + clear_suspend_state(SUSPEND_TRYING_TO_RESUME); +} + +/* Wrapper for when called from init/do_mounts.c */ +void suspend2_try_resume(void) +{ + if (suspend_start_anything(0)) + return; + + __suspend_try_resume(); + + /* + * For initramfs, we have to clear the boot time + * flag after trying to resume + */ + clear_suspend_state(SUSPEND_BOOT_TIME); + + suspend_finish_anything(0); +} + +/* + * suspend2_try_suspend + * Functionality : Wrapper around suspend_main. + * Called From : drivers/acpi/sleep/main.c + * kernel/reboot.c + */ + +void suspend2_try_suspend(void) +{ + if (suspend_start_anything(0)) + return; + + suspend_main(); + + suspend_finish_anything(0); +} + +/* -- Commandline Parameter Handling --- + * + * Resume setup: obtain the storage device. 
+ */ + +static int __init resume2_setup(char *str) +{ + if (!*str) + return 0; + + strncpy(resume2_file, str, 255); + return 0; +} + +/* + * Allow the user to set the action parameter from lilo, prior to resuming. + */ +static int __init suspend_act_setup(char *str) +{ + if(str) + suspend_action=simple_strtol(str,NULL,0); + set_suspend_state(SUSPEND_ACT_USED); + return 0; +} + +/* + * Allow the user to set the debug parameter from lilo, prior to resuming. + */ +/* + * Allow the user to specify that we should ignore any image found and + * invalidate the image if necesssary. This is equivalent to running + * the task queue and a sync and then turning off the power. The same + * precautions should be taken: fsck if you're not journalled. + */ +static int __init noresume2_setup(char *str) +{ + set_suspend_state(SUSPEND_NORESUME_SPECIFIED); + return 0; +} + +static int __init suspend_retry_resume_setup(char *str) +{ + set_suspend_state(SUSPEND_RETRY_RESUME); + return 0; +} + +#ifdef CONFIG_PM_DEBUG + +static int __init suspend_dbg_setup(char *str) +{ + if(str) + suspend_debug_state=simple_strtol(str,NULL,0); + set_suspend_state(SUSPEND_DBG_USED); + return 0; +} + +/* + * Allow the user to set the debug level parameter from lilo, prior to + * resuming. 
+ */ +static int __init suspend_lvl_setup(char *str) +{ + if(str) + console_loglevel = + suspend_default_console_level = + simple_strtol(str,NULL,0); + set_suspend_state(SUSPEND_LVL_USED); + clear_suspend_state(SUSPEND_IGNORE_LOGLEVEL); + return 0; +} + +__setup("suspend_dbg=", suspend_dbg_setup); +__setup("suspend_lvl=", suspend_lvl_setup); +#endif + +__setup("noresume2", noresume2_setup); +__setup("resume2=", resume2_setup); +__setup("suspend_act=", suspend_act_setup); +__setup("suspend_retry_resume", suspend_retry_resume_setup); + +late_initcall(core_load); +EXPORT_SYMBOL(software_suspend_state); diff -urN oldtree/kernel/power/suspend.h newtree/kernel/power/suspend.h --- oldtree/kernel/power/suspend.h 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/suspend.h 2006-02-18 15:24:31.416812848 +0000 @@ -0,0 +1,28 @@ +/* + * kernel/power/suspend.h + * + * Copyright (C) 2004-2005 Nigel Cunningham + * + * This file is released under the GPLv2. + * + * It contains declarations used throughout swsusp. + * + */ + +#ifndef KERNEL_POWER_SUSPEND_H +#define KERNEL_POWER_SUSPEND_H + +#define SUSPEND_PD_PAGES(x) (((x)*sizeof(struct pbe))/PAGE_SIZE+1) + +/* mm/page_alloc.c */ +extern void drain_local_pages(void); + +void save_processor_state(void); +void restore_processor_state(void); +struct saved_context; +void __save_processor_state(struct saved_context *ctxt); +void __restore_processor_state(struct saved_context *ctxt); + +extern suspend_pagedir_t *pagedir_nosave __nosavedata; + +#endif diff -urN oldtree/kernel/power/suspend2.h newtree/kernel/power/suspend2.h --- oldtree/kernel/power/suspend2.h 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/suspend2.h 2006-02-18 15:24:31.417812696 +0000 @@ -0,0 +1,31 @@ +/* + * kernel/power/suspend2.h + * + * Copyright (C) 2004-2005 Nigel Cunningham + * + * This file is released under the GPLv2. + * + * It contains declarations used throughout swsusp and suspend2. 
+ * + */ +#ifndef KERNEL_POWER_SUSPEND_CORE_H +#define KERNEL_POWER_SUSPEND_CORE_H + +#include +#include + +extern unsigned long suspend_orig_mem_free; + +#define KB(x) ((x) << (PAGE_SHIFT - 10)) +#define MB(x) ((x) >> (20 - PAGE_SHIFT)) + +extern int suspend_start_anything(int starting_cycle); +extern void suspend_finish_anything(int finishing_cycle); + +#if 1 +#define PRINTK(a...) do { } while(0) +#else +#define PRINTK(fmt, arg...) printk(KERN_DEBUG fmt, ##arg) +#endif + +#endif diff -urN oldtree/kernel/power/suspend2_common.h newtree/kernel/power/suspend2_common.h --- oldtree/kernel/power/suspend2_common.h 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/suspend2_common.h 2006-02-18 15:24:31.417812696 +0000 @@ -0,0 +1,25 @@ +#ifdef CONFIG_PM_DEBUG +#define set_debug_state(bit) (test_and_set_bit(bit, &suspend_debug_state)) +#define clear_debug_state(bit) (test_and_clear_bit(bit, &suspend_debug_state)) +#else +#define set_debug_state(bit) (0) +#define clear_debug_state(bit) (0) +#endif + +#define set_result_state(bit) (test_and_set_bit(bit, &suspend_result)) +#define clear_result_state(bit) (test_and_clear_bit(bit, &suspend_result)) + +enum { + SUSPEND_ABORT_REQUESTED = 1, + SUSPEND_NOSTORAGE_AVAILABLE, + SUSPEND_INSUFFICIENT_STORAGE, + SUSPEND_FREEZING_FAILED, + SUSPEND_UNEXPECTED_ALLOC, + SUSPEND_KEPT_IMAGE, + SUSPEND_WOULD_EAT_MEMORY, + SUSPEND_UNABLE_TO_FREE_ENOUGH_MEMORY, + SUSPEND_ENCRYPTION_SETUP_FAILED +}; + +extern int suspend_default_console_level; +extern unsigned int nr_suspends; diff -urN oldtree/kernel/power/suspend_block_io.c newtree/kernel/power/suspend_block_io.c --- oldtree/kernel/power/suspend_block_io.c 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/suspend_block_io.c 2006-02-18 15:24:31.419812392 +0000 @@ -0,0 +1,1086 @@ +/* + * block_io.c + * + * Copyright 2004-2005 Nigel Cunningham + * + * Distributed under GPLv2. + * + * This file contains block io functions for suspend2. 
These are + * used by the swapwriter and it is planned that they will also + * be used by the NFSwriter. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "suspend2.h" +#include "proc.h" +#include "modules.h" +#include "prepare_image.h" +#include "block_io.h" +#include "extent.h" +#include "suspend2_common.h" +#include "ui.h" + +/* Bits in struct io_info->flags */ +enum { + IO_WRITING, + IO_RESTORE_PAGE_PROT, + IO_AWAITING_READ, + IO_AWAITING_WRITE, + IO_AWAITING_SUBMIT, + IO_AWAITING_CLEANUP, + IO_HANDLE_PAGE_PROT +}; + +#define MAX_OUTSTANDING_IO 2048 + +/* + * + * IO in progress information storage and helpers + * + */ + +struct io_info { + struct bio *sys_struct; + sector_t block[MAX_BUF_PER_PAGE]; + struct page *buffer_page; + struct page *data_page; + unsigned long flags; + struct block_device *dev; + struct list_head list; + int readahead_index; + struct work_struct work; + int printme; +}; + +/* Locks separated to allow better SMP support. + * An io_struct moves through the lists as follows. 
+ * free -> submit_batch -> busy -> ready_for_cleanup -> free + */ +static LIST_HEAD(ioinfo_free); +static DEFINE_SPINLOCK(ioinfo_free_lock); + +static LIST_HEAD(ioinfo_ready_for_cleanup); +static DEFINE_SPINLOCK(ioinfo_ready_lock); + +static LIST_HEAD(ioinfo_submit_batch); +static DEFINE_SPINLOCK(ioinfo_submit_lock); + +static LIST_HEAD(ioinfo_busy); +static DEFINE_SPINLOCK(ioinfo_busy_lock); + +static atomic_t submit_batch; +static int submit_batch_size = 64; +static int submit_batched(void); + +struct task_struct *suspend_bio_task; + +/* [Max] number of I/O operations pending */ +static atomic_t outstanding_io; +static int max_outstanding_io = 0; +static atomic_t buffer_allocs, buffer_frees; + +/* [Max] number of pages used for above struct */ +static int infopages = 0; +static int maxinfopages = 0; + +static volatile unsigned long suspend_readahead_flags[(MAX_OUTSTANDING_IO + BITS_PER_LONG - 1) / BITS_PER_LONG]; +static spinlock_t suspend_readahead_flags_lock = SPIN_LOCK_UNLOCKED; +static struct page *suspend_readahead_pages[MAX_OUTSTANDING_IO]; +static int readahead_index, readahead_submit_index; + +static int current_stream; +struct extent_iterate_saved_state suspend_writer_posn_save[3]; + +/* Pointer to current entry being loaded/saved. */ +struct extent_iterate_state suspend_writer_posn; + +/* Not static, so that the allocators can setup and complete + * writing the header */ +char *suspend_writer_buffer; +int suspend_writer_buffer_posn; + +int suspend_read_fd; + +static unsigned long nr_schedule_calls[8]; + +static char *sch_caller[] = { + "get_io_info_struct #1 ", + "get_io_info_struct #2 ", + "get_io_info_struct #3 ", + "suspend_finish_all_io ", + "wait_on_one_page ", + "submit ", + "start_one ", + "suspend_wait_on_readahead", +}; + +static struct suspend_bdev_info *suspend_devinfo; +int need_extra_next; + +/* + * suspend_reset_io_stats + * + * Description: Reset all our sanity-checking statistics. 
+ */ +static void suspend_reset_io_stats(void) +{ + int i; + + max_outstanding_io = 0; + maxinfopages = 0; + + for (i = 0; i < 8; i++) + nr_schedule_calls[i] = 0; +} + +/* + * suspend_check_io_stats + * + * Description: Check that our statistics look right and print + * any debugging info wanted. + */ +static void suspend_check_io_stats(void) +{ + int i; + + BUG_ON(atomic_read(&outstanding_io)); + BUG_ON(infopages); + BUG_ON(!list_empty(&ioinfo_submit_batch)); + BUG_ON(!list_empty(&ioinfo_busy)); + BUG_ON(!list_empty(&ioinfo_ready_for_cleanup)); + BUG_ON(!list_empty(&ioinfo_free)); + BUG_ON(atomic_read(&buffer_allocs) != atomic_read(&buffer_frees)); + + suspend_message(SUSPEND_WRITER, SUSPEND_LOW, 0, + "Maximum outstanding_io was %d.\n", + max_outstanding_io); + suspend_message(SUSPEND_WRITER, SUSPEND_LOW, 0, + "Max info pages was %d.\n", + maxinfopages); + if (atomic_read(&buffer_allocs) != atomic_read(&buffer_frees)) + suspend_message(SUSPEND_WRITER, SUSPEND_MEDIUM, 0, + "Buffer allocs (%d) != buffer frees (%d)", + atomic_read(&buffer_allocs), + atomic_read(&buffer_frees)); + for(i = 0; i < 8; i++) + suspend_message(SUSPEND_WRITER, SUSPEND_MEDIUM, 0, + "Nr schedule calls %s: %lu.\n", sch_caller[i], nr_schedule_calls[i]); +} + +/* + * cleanup_one + * + * Description: Clean up after completing I/O on a page. + * Arguments: struct io_info: Data for I/O to be completed. + */ +static void __suspend_bio_cleanup_one(struct io_info *io_info) +{ + struct page *buffer_page; + struct page *data_page; + char *buffer_address, *data_address; + int reading; + + buffer_page = io_info->buffer_page; + data_page = io_info->data_page; + + reading = test_bit(IO_AWAITING_READ, &io_info->flags); + suspend_message(SUSPEND_WRITER, SUSPEND_HIGH, 0, + "Cleanup IO: [%p]\n", + io_info); + + if (reading && io_info->readahead_index == -1) { + /* + * Copy the page we read into the buffer our caller provided. 
+ */ + data_address = (char *) kmap(data_page); + buffer_address = (char *) kmap(buffer_page); + memcpy(data_address, buffer_address, PAGE_SIZE); + kunmap(data_page); + kunmap(buffer_page); + + } + + if (!reading || io_info->readahead_index == -1) { + /* Sanity check */ + if (page_count(buffer_page) != 2) + printk(KERN_EMERG "Cleanup IO: Page count on page %p is %d. Not good!\n", + buffer_page, page_count(buffer_page)); + put_page(buffer_page); + __free_page(buffer_page); + atomic_inc(&buffer_frees); + } else + put_page(buffer_page); + + bio_put(io_info->sys_struct); + io_info->sys_struct = NULL; + io_info->flags = 0; +} + +/* __suspend_io_cleanup + */ + +static int suspend_bio_cleanup_one(void *data) +{ + struct io_info *io_info = (struct io_info *) data; + int readahead_index; + unsigned long flags; + + /* + * If this I/O was a readahead, remember its index. + */ + readahead_index = io_info->readahead_index; + + /* + * Add it to the free list. + */ + list_del_init(&io_info->list); + + /* + * Do the cleanup. + */ + __suspend_bio_cleanup_one(io_info); + + /* + * Record the readahead as done. + */ + if (readahead_index > -1) { + int index = readahead_index/BITS_PER_LONG; + int bit = readahead_index - (index * BITS_PER_LONG); + spin_lock_irqsave(&suspend_readahead_flags_lock, flags); + set_bit(bit, &suspend_readahead_flags[index]); + spin_unlock_irqrestore(&suspend_readahead_flags_lock, flags); + } + + spin_lock_irqsave(&ioinfo_free_lock, flags); + list_add_tail(&io_info->list, &ioinfo_free); + spin_unlock_irqrestore(&ioinfo_free_lock, flags); + + /* Important: Must be last thing we do to avoid a race with + * finish_all_io when using keventd to do the cleanup */ + atomic_dec(&outstanding_io); + + return 0; +} + +/* suspend_cleanup_some_completed_io + * + * NB: This is designed so that multiple callers can be in here simultaneously. 
+ */ + +static void suspend_cleanup_some_completed_io(void) +{ + int num_cleaned = 0; + struct io_info *first; + unsigned long flags; + + spin_lock_irqsave(&ioinfo_ready_lock, flags); + while(!list_empty(&ioinfo_ready_for_cleanup)) { + int result; + first = list_entry(ioinfo_ready_for_cleanup.next, struct io_info, list); + + BUG_ON(!test_and_clear_bit(IO_AWAITING_CLEANUP, &first->flags)); + + list_del_init(&first->list); + + spin_unlock_irqrestore(&ioinfo_ready_lock, flags); + + result = suspend_bio_cleanup_one((void *) first); + + spin_lock_irqsave(&ioinfo_ready_lock, flags); + if (result) + continue; + num_cleaned++; + if (num_cleaned == submit_batch_size) + break; + } + spin_unlock_irqrestore(&ioinfo_ready_lock, flags); +} + +/* do_bio_wait + * + * Actions taken when we want some I/O to get run. + * + * Submit any I/O that's batched up (if we're not already doing + * that, unplug queues, schedule and clean up whatever we can. + */ +static void do_bio_wait(int caller) +{ + int num_submitted = 0; + + nr_schedule_calls[caller]++; + + /* Don't want to wait on I/O we haven't submitted! */ + num_submitted = submit_batched(); + + kblockd_flush(); + + io_schedule(); + + suspend_cleanup_some_completed_io(); +} + +/* + * suspend_finish_all_io + * + * Description: Finishes all IO and frees all IO info struct pages. + */ +static void suspend_finish_all_io(void) +{ + struct io_info *this, *next = NULL; + unsigned long flags; + + /* Wait for all I/O to complete. */ + while (atomic_read(&outstanding_io)) + do_bio_wait(2); + + spin_lock_irqsave(&ioinfo_free_lock, flags); + + /* + * Two stages, to avoid using freed pages. + * + * First free all io_info structs on a page except the first. 
+ */ + list_for_each_entry_safe(this, next, &ioinfo_free, list) { + if (((unsigned long) this) & ~PAGE_MASK) + list_del(&this->list); + } + + /* + * Now we have only one reference to each page, and can safely + * free pages, knowing we're not going to be trying to access the + * same page after freeing it. + */ + list_for_each_entry_safe(this, next, &ioinfo_free, list) { + list_del(&this->list); + free_page((unsigned long) this); + infopages--; + suspend_message(SUSPEND_MEMORY, SUSPEND_VERBOSE, 0, + "[FreedIOPage %lx]", this); + } + + spin_unlock_irqrestore(&ioinfo_free_lock, flags); +} + +/* + * wait_on_one_page + * + * Description: Wait for a particular I/O to complete. + */ +static void wait_on_one_page(struct io_info *io_info) +{ + do { do_bio_wait(3); } while (io_info->flags); +} + +/* + * wait_on_readahead + * + * Wait until a particular readahead is ready. + */ +static void suspend_wait_on_readahead(int readahead_index) +{ + int index = readahead_index / BITS_PER_LONG; + int bit = readahead_index - index * BITS_PER_LONG; + + /* read_ahead_index is the one we want to return */ + while (!test_bit(bit, &suspend_readahead_flags[index])) + do_bio_wait(6); +} + +/* + * readahead_done + * + * Returns whether the readahead requested is ready. 
+ */ + +static int suspend_readahead_ready(int readahead_index) +{ + int index = readahead_index / BITS_PER_LONG; + int bit = readahead_index - (index * BITS_PER_LONG); + + return test_bit(bit, &suspend_readahead_flags[index]); +} + +/* suspend_readahead_prepare + * Set up for doing readahead on an image */ +static int suspend_prepare_readahead(int index) +{ + unsigned long new_page = get_zeroed_page(GFP_ATOMIC); + + if(!new_page) + return -ENOMEM; + + suspend_readahead_pages[index] = virt_to_page(new_page); + return 0; +} + +/* suspend_readahead_cleanup + * Clean up structures used for readahead */ +static void suspend_cleanup_readahead(int page) +{ + __free_page(suspend_readahead_pages[page]); + suspend_readahead_pages[page] = 0; + return; +} + +/* + * suspend_end_bio + * + * Description: Function called by block driver from interrupt context when I/O + * is completed. This is the reason we use spinlocks in + * manipulating the io_info lists. + * Nearly the fs/buffer.c version, but we want to mark the page as + * done in our own structures too. + */ + +static int suspend_end_bio(struct bio *bio, unsigned int num, int err) +{ + struct io_info *io_info = bio->bi_private; + unsigned long flags; + + spin_lock_irqsave(&ioinfo_busy_lock, flags); + list_del_init(&io_info->list); + spin_unlock_irqrestore(&ioinfo_busy_lock, flags); + + set_bit(IO_AWAITING_CLEANUP, &io_info->flags); + + spin_lock_irqsave(&ioinfo_ready_lock, flags); + list_add_tail(&io_info->list, &ioinfo_ready_for_cleanup); + spin_unlock_irqrestore(&ioinfo_ready_lock, flags); + return 0; +} + +/** + * submit - submit BIO request. + * @rw: READ or WRITE. + * @io_info: IO info structure. + * + * Based on Patrick's pmdisk code from long ago: + * "Straight from the textbook - allocate and initialize the bio. + * If we're writing, make sure the page is marked as dirty. + * Then submit it and carry on." + * + * With a twist, though - we handle block_size != PAGE_SIZE. 
+ * Caller has already checked that our page is not fragmented. + */ + +static int submit(int rw, struct io_info *io_info) +{ + int error = 0; + struct bio *bio = NULL; + unsigned long flags; + + while (!bio) { + bio = bio_alloc(GFP_ATOMIC,1); + if (!bio) + do_bio_wait(4); + } + + bio->bi_bdev = io_info->dev; + bio->bi_sector = io_info->block[0]; + bio->bi_private = io_info; + bio->bi_end_io = suspend_end_bio; + bio->bi_flags |= (1 << BIO_SUSPEND2); + io_info->sys_struct = bio; + if (io_info->printme) + PRINTK("%s dev %p block %ld => sector %ld\n", + rw ? "Write" : "Read", + bio->bi_bdev, io_info->block[0], + (unsigned long) bio->bi_sector); + + if (bio_add_page(bio, io_info->buffer_page, PAGE_SIZE, 0) < PAGE_SIZE) { + printk("ERROR: adding page to bio at %lld\n", + (unsigned long long) io_info->block[0]); + bio_put(bio); + return -EFAULT; + } + + if (rw == WRITE) + bio_set_pages_dirty(bio); + + spin_lock_irqsave(&ioinfo_busy_lock, flags); + list_add_tail(&io_info->list, &ioinfo_busy); + spin_unlock_irqrestore(&ioinfo_busy_lock, flags); + + submit_bio(rw,bio); + + return error; +} + +/* + * submit a batch. The submit function can wait on I/O, so we have + * simple locking to avoid infinite recursion. 
+ */
+static int submit_batched(void)
+{
+	static int running_already = 0;
+	struct io_info *first;
+	unsigned long flags;
+	int num_submitted = 0;
+
+	/*
+	 * submit() can call do_bio_wait() when bio_alloc() fails, and
+	 * do_bio_wait() calls straight back into this function. The flag
+	 * used to be set and cleared without ever being tested, so the
+	 * recursion was not actually prevented. Bail out here and let the
+	 * outer invocation carry on draining the batch list.
+	 */
+	if (running_already)
+		return 0;
+
+	running_already = 1;
+	spin_lock_irqsave(&ioinfo_submit_lock, flags);
+	while(!list_empty(&ioinfo_submit_batch)) {
+		first = list_entry(ioinfo_submit_batch.next, struct io_info, list);
+
+		BUG_ON(!test_and_clear_bit(IO_AWAITING_SUBMIT, &first->flags));
+
+		list_del_init(&first->list);
+
+		atomic_dec(&submit_batch);
+
+		/* submit() may wait on I/O, so the list lock is dropped around it. */
+		spin_unlock_irqrestore(&ioinfo_submit_lock, flags);
+
+		if (test_bit(IO_AWAITING_READ, &first->flags))
+			submit(READ, first);
+		else
+			submit(WRITE, first);
+
+		spin_lock_irqsave(&ioinfo_submit_lock, flags);
+
+		/* Submit at most one batch worth of I/O per call. */
+		num_submitted++;
+		if (num_submitted == submit_batch_size)
+			break;
+	}
+	spin_unlock_irqrestore(&ioinfo_submit_lock, flags);
+	running_already = 0;
+
+	return num_submitted;
+}
+
+/*
+ * add_to_batch
+ *
+ * Description:	Flag an io_info as awaiting submission and append it to the
+ * 		submit batch list. If no suspend_bio_task is set and the
+ * 		batch has reached submit_batch_size, submit it immediately.
+ */
+static void add_to_batch(struct io_info *io_info)
+{
+	unsigned long flags;
+
+	set_bit(IO_AWAITING_SUBMIT, &io_info->flags);
+
+	/* Put our prepared I/O struct on the batch list. */
+	spin_lock_irqsave(&ioinfo_submit_lock, flags);
+	list_add_tail(&io_info->list, &ioinfo_submit_batch);
+	spin_unlock_irqrestore(&ioinfo_submit_lock, flags);
+
+	atomic_inc(&submit_batch);
+
+	if ((!suspend_bio_task) && (atomic_read(&submit_batch) >= submit_batch_size))
+		submit_batched();
+}
+
+/*
+ * get_io_info_struct
+ *
+ * Description:	Get an I/O struct.
+ * Returns:	Pointer to the struct prepared for use.
+ */
+static struct io_info *get_io_info_struct(void)
+{
+	unsigned long newpage = 0, flags;
+	struct io_info *this = NULL;
+	int remaining = 0;
+
+	do {
+		while (atomic_read(&outstanding_io) >= MAX_OUTSTANDING_IO)
+			do_bio_wait(0);
+
+		/* Can start a new I/O. Is there a free one? */
+		if (!list_empty(&ioinfo_free)) {
+			/* Yes. Grab it. */
+			spin_lock_irqsave(&ioinfo_free_lock, flags);
+			break;
+		}
+
+		/* No. Need to allocate a new page for I/O info structs.
*/ + newpage = get_zeroed_page(GFP_ATOMIC); + if (!newpage) { + do_bio_wait(1); + continue; + } + + suspend_message(SUSPEND_MEMORY, SUSPEND_VERBOSE, 0, + "[NewIOPage %lx]", newpage); + infopages++; + if (infopages > maxinfopages) + maxinfopages++; + + /* Prepare the new page for use. */ + this = (struct io_info *) newpage; + remaining = PAGE_SIZE; + spin_lock_irqsave(&ioinfo_free_lock, flags); + while (remaining >= (sizeof(struct io_info))) { + list_add_tail(&this->list, &ioinfo_free); + this = (struct io_info *) (((char *) this) + + sizeof(struct io_info)); + remaining -= sizeof(struct io_info); + } + break; + } while (1); + + /* + * We have an I/O info struct. Remove it from the free list. + * It will be added to the submit or busy list later. + */ + this = list_entry(ioinfo_free.next, struct io_info, list); + list_del_init(&this->list); + spin_unlock_irqrestore(&ioinfo_free_lock, flags); + return this; +} + +/* + * start_one + * + * Description: Prepare and start a read or write operation. + * Note that we use our own buffer for reading or writing. + * This simplifies doing readahead and asynchronous writing. + * We can begin a read without knowing the location into which + * the data will eventually be placed, and the buffer passed + * for a write can be reused immediately (essential for the + * modules system). + * Failure? What's that? + * Returns: The io_info struct created. 
+ */ +static struct io_info *start_one(int rw, struct submit_params *submit_info) +{ + struct io_info *io_info = get_io_info_struct(); + unsigned long buffer_virt = 0; + char *to, *from; + struct page *buffer_page; + + if (!io_info) + return NULL; + + /* Get our local buffer */ + suspend_message(SUSPEND_WRITER, SUSPEND_HIGH, 1, + "Start_IO: [%p]", io_info); + + /* Copy settings to the io_info struct */ + io_info->data_page = submit_info->page; + io_info->readahead_index = submit_info->readahead_index; + io_info->printme = submit_info->printme; + + if (io_info->readahead_index == -1) { + while (!(buffer_virt = get_zeroed_page(GFP_ATOMIC))) + do_bio_wait(5); + + atomic_inc(&buffer_allocs); + suspend_message(SUSPEND_WRITER, SUSPEND_HIGH, 0, + "[ALLOC BUFFER]->%d", + real_nr_free_pages()); + buffer_page = virt_to_page(buffer_virt); + + io_info->buffer_page = buffer_page; + } else { + unsigned long flags; + int index = io_info->readahead_index / BITS_PER_LONG; + int bit = io_info->readahead_index - index * BITS_PER_LONG; + + spin_lock_irqsave(&suspend_readahead_flags_lock, flags); + clear_bit(bit, &suspend_readahead_flags[index]); + spin_unlock_irqrestore(&suspend_readahead_flags_lock, flags); + + io_info->buffer_page = buffer_page = submit_info->page; + } + + /* If writing, copy our data. The data is probably in + * lowmem, but we cannot be certain. If there is no + * compression/encryption, we might be passed the + * actual source page's address. 
*/ + if (rw == WRITE) { + set_bit(IO_WRITING, &io_info->flags); + + to = (char *) buffer_virt; + from = kmap_atomic(io_info->data_page, KM_USER1); + memcpy(to, from, PAGE_SIZE); + kunmap_atomic(from, KM_USER1); + } + + /* Submit the page */ + get_page(buffer_page); + + io_info->dev = submit_info->dev; + io_info->block[0] = submit_info->block[0]; + + if (rw == READ) + set_bit(IO_AWAITING_READ, &io_info->flags); + else + set_bit(IO_AWAITING_WRITE, &io_info->flags); + + suspend_message(SUSPEND_WRITER, SUSPEND_HIGH, 1, + "-> (PRE BRW) %d\n", + real_nr_free_pages()); + + if (submit_batch_size > 1) + add_to_batch(io_info); + else + submit(rw, io_info); + + atomic_inc(&outstanding_io); + if (atomic_read(&outstanding_io) > max_outstanding_io) + max_outstanding_io++; + + return io_info; +} + +static int suspend_do_io(int rw, + struct submit_params *submit_info, int syncio) +{ + struct io_info *io_info; + + if(!submit_info->dev) + return 1; + + io_info = start_one(rw, submit_info); + + if (!io_info) + return 1; + else if (syncio) + wait_on_one_page(io_info); + + /* If we were the only one, clean everything up */ + if (!atomic_read(&outstanding_io)) + suspend_finish_all_io(); + return 0; +} + +/* We used to use bread here, but it doesn't correctly handle + * blocksize != PAGE_SIZE. Now we create a submit_info to get the data we + * want and use our normal routines (synchronously). + */ + +static int suspend_bdev_page_io(int rw, struct block_device *bdev, long pos, + struct page *page) +{ + struct submit_params submit_info; + + if (!bdev) + return 0; + + submit_info.page = page; + submit_info.dev = bdev; + submit_info.block[0] = pos; + submit_info.readahead_index = -1; + return suspend_do_io(rw, &submit_info, 1); +} + +static unsigned long suspend_bio_memory_needed(void) +{ + /* We want to have at least enough memory so as to have + * MAX_OUTSTANDING_IO transactions on the fly at once. If we + * can to more, fine. 
*/ + return (MAX_OUTSTANDING_IO * (PAGE_SIZE + sizeof(struct request) + + sizeof(struct bio) + sizeof(struct io_info))); +} + +static void suspend_set_devinfo(struct suspend_bdev_info *info) +{ + suspend_devinfo = info; +} + +static int forward_one_page(void) +{ + int i, j; + + for (j = 0; j < need_extra_next + 1; j++) { + extent_state_next(&suspend_writer_posn); + + /* Have to go forward one to ensure we're on the right chain, + * before we can know how many more blocks to skip.*/ + for (i = 1; i < suspend_devinfo[suspend_writer_posn.current_chain].blocks_per_page; i++) + extent_state_next(&suspend_writer_posn); + + if (extent_state_eof(&suspend_writer_posn)) { + printk("Extent state eof.\n"); + return -ENODATA; + } + } + + need_extra_next = 0; + + return 0; +} + +static int __suspend_rw_page(int rw, struct page *page, + int readahead_index, int sync, int debug) +{ + int i, current_chain; + struct submit_params submit_params; + + if (test_action_state(SUSPEND_TEST_FILTER_SPEED)) + return 0; + + submit_params.readahead_index = readahead_index; + submit_params.page = page; + + if (forward_one_page()) + return -ENODATA; + + current_chain = suspend_writer_posn.current_chain; + submit_params.dev = suspend_devinfo[current_chain].bdev; + submit_params.block[0] = (suspend_writer_posn.current_offset - + suspend_devinfo[current_chain].blocks_per_page + 1) << + suspend_devinfo[current_chain].bmap_shift; + + if (debug) + printk("%s: %lx:%lx.\n", rw ? 
"Write" : "Read", + (long) submit_params.dev->bd_dev, + (long) submit_params.block[0]); + + i = suspend_do_io(rw, &submit_params, sync); + + if (i) + return -EIO; + + return 0; +} + +static int suspend_rw_page(int rw, struct page *page, + int readahead_index, int sync) +{ + return __suspend_rw_page(rw, page, readahead_index, sync, 0); +} + +static int suspend_bio_read_chunk(struct page *buffer_page, int sync) +{ + static int last_result; + unsigned long *virt; + + if (sync == SUSPEND_ASYNC) + return suspend_rw_page(READ, buffer_page, -1, sync); + + /* Start new readahead while we wait for our page */ + if (readahead_index == -1) { + last_result = 0; + readahead_index = readahead_submit_index = 0; + } + + /* Start a new readahead? */ + if (last_result) { + /* We failed to submit a read, and have cleaned up + * all the readahead previously submitted */ + if (readahead_submit_index == readahead_index) + return -EPERM; + goto wait; + } + + do { + if (suspend_prepare_readahead(readahead_submit_index)) + break; + + last_result = suspend_rw_page( + READ, + suspend_readahead_pages[readahead_submit_index], + readahead_submit_index, SUSPEND_ASYNC); + if (last_result) { + printk("Begin read chunk for page %d returned %d.\n", + readahead_submit_index, last_result); + suspend_cleanup_readahead(readahead_submit_index); + break; + } + + readahead_submit_index++; + + if (readahead_submit_index == MAX_OUTSTANDING_IO) + readahead_submit_index = 0; + + } while((!last_result) && (readahead_submit_index != readahead_index) && + (!suspend_readahead_ready(readahead_index))); + +wait: + suspend_wait_on_readahead(readahead_index); + + virt = kmap_atomic(buffer_page, KM_USER1); + memcpy(virt, page_address(suspend_readahead_pages[readahead_index]), + PAGE_SIZE); + kunmap_atomic(virt, KM_USER1); + + suspend_cleanup_readahead(readahead_index); + + readahead_index++; + if (readahead_index == MAX_OUTSTANDING_IO) + readahead_index = 0; + + return 0; +} + +static int suspend_read_init(int 
stream_number) +{ + current_stream = stream_number; + extent_state_restore(&suspend_writer_posn, + &suspend_writer_posn_save[current_stream]); + + BUG_ON(!suspend_writer_posn.current_extent); + + suspend_reset_io_stats(); + + readahead_index = readahead_submit_index = -1; + + return 0; +} + +static int suspend_read_cleanup(void) +{ + suspend_finish_all_io(); + while (readahead_index != readahead_submit_index) { + suspend_cleanup_readahead(readahead_index); + readahead_index++; + if (readahead_index == MAX_OUTSTANDING_IO) + readahead_index = 0; + } + suspend_check_io_stats(); + return 0; +} + +static int suspend_write_init(int stream_number) +{ + extent_state_restore(&suspend_writer_posn, + &suspend_writer_posn_save[stream_number]); + current_stream = stream_number; + + BUG_ON(!suspend_writer_posn.current_extent); + + suspend_reset_io_stats(); + + return 0; +} + +static int suspend_write_cleanup(void) +{ + if (current_stream == 2) + extent_state_save(&suspend_writer_posn, + &suspend_writer_posn_save[1]); + + suspend_finish_all_io(); + + suspend_check_io_stats(); + + return 0; +} + +static int suspend_write_chunk(struct page *buffer_page) +{ + return suspend_rw_page(WRITE, buffer_page, -1, 0); +} + +static int suspend_rw_header_chunk(int rw, char *buffer, int buffer_size) +{ + int bytes_left = buffer_size; + + /* Read a chunk of the header */ + while (bytes_left) { + char *source_start = buffer + buffer_size - bytes_left; + char *dest_start = suspend_writer_buffer + suspend_writer_buffer_posn; + int capacity = PAGE_SIZE - suspend_writer_buffer_posn; + char *to = rw ? dest_start : source_start; + char *from = rw ? source_start : dest_start; + + if (bytes_left <= capacity) { + if (test_debug_state(SUSPEND_HEADER)) + printk("Copy %d bytes from %p to %p.\n", + bytes_left, to, from); + memcpy(to, from, bytes_left); + suspend_writer_buffer_posn += bytes_left; + return rw ? 
0 : buffer_size; + } + + /* Next to read the next page */ + if (test_debug_state(SUSPEND_HEADER)) + printk("Copy %d bytes from %p to %p.\n", + capacity, to, from); + memcpy(to, from, capacity); + bytes_left -= capacity; + + if (rw == READ && test_suspend_state(SUSPEND_TRY_RESUME_RD)) + sys_read(suspend_read_fd, + suspend_writer_buffer, BLOCK_SIZE); + else { + if (__suspend_rw_page(rw, + virt_to_page(suspend_writer_buffer), + -1, !rw, + test_debug_state(SUSPEND_HEADER))) + return -EIO; + } + + suspend_writer_buffer_posn = 0; + check_shift_keys(0, NULL); + } + + return rw ? 0 : buffer_size; +} + +static int write_header_chunk_finish(void) +{ + return __suspend_rw_page(WRITE, + virt_to_page(suspend_writer_buffer), + -1, 0, test_debug_state(SUSPEND_HEADER)) ? -EIO : 0; +} + +static int read_header_chunk(char *buffer, int buffer_size) +{ + return suspend_rw_header_chunk(READ, buffer, buffer_size); +} + +static int write_header_chunk(char *buffer, int buffer_size) +{ + return suspend_rw_header_chunk(WRITE, buffer, buffer_size); +} + +struct suspend_bio_ops suspend_bio_ops = { + .submit_io = suspend_do_io, + .bdev_page_io = suspend_bdev_page_io, + .rw_page = suspend_rw_page, + .wait_on_readahead = suspend_wait_on_readahead, + .check_io_stats = suspend_check_io_stats, + .reset_io_stats = suspend_reset_io_stats, + .finish_all_io = suspend_finish_all_io, + .prepare_readahead = suspend_prepare_readahead, + .cleanup_readahead = suspend_cleanup_readahead, + .readahead_pages = suspend_readahead_pages, + .readahead_ready = suspend_readahead_ready, + .need_extra_next = &need_extra_next, + .forward_one_page = forward_one_page, + .set_devinfo = suspend_set_devinfo, + .read_init = suspend_read_init, + .read_chunk = suspend_bio_read_chunk, + .read_cleanup = suspend_read_cleanup, + .write_init = suspend_write_init, + .write_chunk = suspend_write_chunk, + .write_cleanup = suspend_write_cleanup, + .read_header_chunk = read_header_chunk, + .write_header_chunk = write_header_chunk, + 
.write_header_chunk_finish = write_header_chunk_finish, +}; + +static struct suspend_module_ops suspend_blockwriter_ops = +{ + .name = "Block I/O", + .type = MISC_PLUGIN, + .module = THIS_MODULE, + .memory_needed = suspend_bio_memory_needed, +}; + +static __init int suspend_block_io_load(void) +{ + return suspend_register_module(&suspend_blockwriter_ops); +} + +#ifdef MODULE +static __exit void suspend_block_io_unload(void) +{ + suspend_unregister_module(&suspend_blockwriter_ops); +} + +module_init(suspend_block_io_load); +module_exit(suspend_block_io_unload); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Nigel Cunningham"); +MODULE_DESCRIPTION("Suspend2 block io functions"); +#else +late_initcall(suspend_block_io_load); +#endif diff -urN oldtree/kernel/power/suspend_checksums.c newtree/kernel/power/suspend_checksums.c --- oldtree/kernel/power/suspend_checksums.c 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/suspend_checksums.c 2006-02-18 15:24:31.421812088 +0000 @@ -0,0 +1,509 @@ +#include +#include +#ifdef CONFIG_KDB +#include +#include +#endif +#include + +#include "suspend.h" +#include "modules.h" +#include "pageflags.h" +#include "proc.h" +#include "pagedir.h" +#include "ui.h" + +#define CHECKSUMS_PER_PAGE ((PAGE_SIZE - sizeof(void *)) / sizeof(unsigned long)) +#define NEXT_CHECKSUM_PAGE(page) *((unsigned long *) (((char *) (page)) + PAGE_SIZE - sizeof(void *))) + +static int checksum_pages; +static unsigned long *first_checksum_page, *last_checksum_page; +static int num_reload_pages = 0; + +struct reload_data +{ + int pageset; + int pagenumber; + struct page *page_address; + char *base_version; + char *compared_version; + struct reload_data *next; +}; + +static struct reload_data *first_reload_data, *last_reload_data; + +unsigned long suspend_page_checksum(struct page *page) +{ + unsigned long *virt; + int i; + unsigned long value = 0; + + virt = (unsigned long *) kmap_atomic(page, KM_USER0); + for (i = 0; i < (PAGE_SIZE / sizeof(unsigned long)); 
i++)
+		value += *(virt + i);
+	kunmap_atomic(virt, KM_USER0);
+	return value;
+}
+
+extern void get_first_pbe(struct pbe *pbe, struct pagedir *pagedir);
+extern void get_next_pbe(struct pbe *pbe);
+
+/*
+ * __suspend_calculate_checksums
+ *
+ * Description:	Store a checksum for every page set in map, continuing from
+ * 		the caller's position in the chain of checksum pages.
+ * Arguments:	map:			Bitmap of the pages to checksum.
+ * 		current_checksum_page:	In/out; checksum page being filled.
+ * 		page_index:		In/out; next free slot in that page.
+ */
+void __suspend_calculate_checksums(dyn_pageflags_t map, unsigned long **current_checksum_page,
+		int *page_index)
+{
+	int page_number;
+
+	BITMAP_FOR_EACH_SET(map, page_number) {
+		*(*current_checksum_page + *page_index) =
+			suspend_page_checksum(pfn_to_page(page_number));
+		/*
+		 * Parenthesised on purpose: "*page_index++" advances the
+		 * pointer, not the slot counter it points at.
+		 */
+		(*page_index)++;
+		if (*page_index == CHECKSUMS_PER_PAGE) {
+			*page_index = 0;
+			*current_checksum_page = (unsigned long *)
+				NEXT_CHECKSUM_PAGE(*current_checksum_page);
+		}
+	};
+}
+
+/*
+ * suspend_calculate_checksums
+ *
+ * Description:	Checksum every page in pageset1 and then pageset2, storing
+ * 		the results in the checksum pages. Does nothing but report
+ * 		status if no checksum pages exist.
+ */
+void suspend_calculate_checksums(void)
+{
+	int page_index = 0;
+	unsigned long *current_checksum_page = first_checksum_page;
+
+	if (!first_checksum_page) {
+		suspend_prepare_status(1, 0, "Unable to checksum at this point.");
+		return;
+	}
+
+	suspend_prepare_status(1, 0, "Calculating checksums... ");
+
+	__suspend_calculate_checksums(pageset1_map, &current_checksum_page,
+			&page_index);
+
+	__suspend_calculate_checksums(pageset2_map, &current_checksum_page,
+			&page_index);
+
+	suspend_prepare_status(1, 0, "Checksums done.");
+}
+
+/*
+ * __suspend_check_checksums
+ *
+ * Description:	Recompute the checksum of every page in one pageset and
+ * 		compare it with the stored value. For each mismatch, copy
+ * 		the current page contents into the next reload_data entry,
+ * 		if one is available.
+ * Arguments:	whichpagedir:		1 for pageset1, otherwise pageset2.
+ * 		current_checksum_page:	In/out; checksum page being read.
+ * 		page_index:		In/out; slot within that page.
+ * 		next_reload_data:	In/out; next unused reload entry.
+ * 					May be NULL.
+ * Returns:	Number of pages whose checksum differs.
+ */
+int __suspend_check_checksums(int whichpagedir, unsigned long **current_checksum_page,
+		int *page_index, struct reload_data **next_reload_data)
+{
+	int page_number, num_differences = 0;
+	unsigned long sum_now;
+	dyn_pageflags_t map;
+
+	if (whichpagedir == 1)
+		map = pageset1_map;
+	else
+		map = pageset2_map;
+
+	BITMAP_FOR_EACH_SET(map, page_number) {
+		/* Skip pages flagged to be ignored when checksumming */
+		if (!PageChecksumIgnore(pfn_to_page(page_number))) {
+			sum_now = suspend_page_checksum(pfn_to_page(page_number));
+			if (sum_now != *(*current_checksum_page + *page_index)) {
+				num_differences++;
+				/*
+				 * Only record the page if a reload entry is
+				 * left; still count the difference otherwise.
+				 */
+				if (next_reload_data && *next_reload_data) {
+					char *virt;
+					struct reload_data *this = *next_reload_data;
+					this->pageset = whichpagedir;
+					this->pagenumber = page_number;
+					this->page_address = pfn_to_page(page_number);
+					virt = kmap_atomic(pfn_to_page(page_number), KM_USER0);
+					memcpy(this->compared_version,
+						virt, PAGE_SIZE);
+					kunmap_atomic(virt, KM_USER0);
+					*next_reload_data = this->next;
+				}
+			}
+		}
+
+		/*
+		 * A slot is stored for every page in the map, ignored or not,
+		 * so always advance. Parenthesised on purpose:
+		 * "*page_index++" would advance the pointer instead.
+		 */
+		(*page_index)++;
+		if (*page_index == CHECKSUMS_PER_PAGE) {
+			*page_index = 0;
+			*current_checksum_page = (unsigned long *)
+				NEXT_CHECKSUM_PAGE(*current_checksum_page);
+		}
+	}
+
+	return num_differences;
+}
+
+/*
+ * suspend_check_checksums
+ *
+ * Description:	Compare the stored checksums against the current contents
+ * 		of pageset1 and pageset2, recording changed pages in the
+ * 		reload data list.
+ */
+void suspend_check_checksums(void)
+{
+	int page_index = 0, num_differences = 0;
+	unsigned long *current_checksum_page = first_checksum_page;
+	struct reload_data *next_reload_data = first_reload_data;
+
+	if (!first_checksum_page) {
+		suspend_prepare_status(1, 0, "Unable to checksum at this point.");
+		return;
+	}
+
+	num_differences += __suspend_check_checksums(1, &current_checksum_page,
+			&page_index, &next_reload_data);
+
+	num_differences += __suspend_check_checksums(2, &current_checksum_page,
+			&page_index, &next_reload_data);
+}
+
+/*
+ * free_reload_data.
+ *
+ * Reload data begins on a page boundary.
+ */
+void suspend_free_reload_data(void)
+{
+	struct reload_data *this_data = first_reload_data;
+	struct reload_data *prev_reload_data = this_data;
+
+	/*
+	 * First pass: free the data pages hanging off each entry and relink
+	 * the list so that it only visits page-aligned entries.
+	 */
+	while (this_data) {
+		if (this_data->compared_version)
+			free_pages((unsigned long) this_data->compared_version, 0);
+
+		if (this_data->base_version)
+			free_pages((unsigned long) this_data->base_version, 0);
+
+		this_data = this_data->next;
+
+		if (!(((unsigned long) this_data) & ~PAGE_MASK)) {
+			prev_reload_data->next = this_data;
+			prev_reload_data = this_data;
+		}
+	}
+
+	/* Second pass: free the pages holding the entries themselves. */
+	this_data = first_reload_data;
+	while (this_data) {
+		prev_reload_data = this_data;
+		this_data = this_data->next;
+		free_pages((unsigned long) prev_reload_data, 0);
+		num_reload_pages--;
+	}
+
+	first_reload_data = last_reload_data = NULL;
+
+}
+
+/* suspend_reread_pages()
+ *
+ * Description:	Reread pages from an image for diagnosing differences.
+ * Arguments:	page_list:	A list containing information on pages
+ * 				to be reloaded, sorted by pageset and
+ * 				page index.
+ * Returns:	Zero on success, non-zero on failure.
+ */
+
+int suspend_reread_pages(struct reload_data *page_list)
+{
+	int result = 0, whichtoread, pageset_offset = -1;
+	long i = 0;
+	struct suspend_module_ops *this_filter, *first_filter = get_next_filter(NULL);
+	dyn_pageflags_t *pageflags = &pageset1_map;
+
+	if (!page_list)
+		return 0;
+
+	/*
+	 * page_list is advanced inside the loop and becomes NULL once the
+	 * last entry has been read, so it is re-checked before each pageset.
+	 */
+	for (whichtoread = page_list->pageset; page_list && whichtoread <= 2; whichtoread++) {
+		struct pagedir *pagedir;
+
+		switch (whichtoread) {
+			case 1:
+				pagedir = &pagedir1;
+				break;
+			case 2:
+				pagedir = &pagedir2;
+				pageflags = &pageset2_map;
+				pageset_offset = -1;
+				i = -1;
+				break;
+			default:
+				goto out;
+		}
+
+		suspend_message(SUSPEND_IO, SUSPEND_LOW, 0,
+			"Reread pages from pagedir %d.\n", whichtoread);
+
+		/* Initialise page transformers */
+		list_for_each_entry(this_filter, &suspend_filters, ops.filter.filter_list) {
+			if (this_filter->disabled)
+				continue;
+			if (this_filter->read_init &&
+				this_filter->read_init(whichtoread)) {
+				abort_suspend("Failed to initialise a filter.");
+				return 1;
+			}
+		}
+
+		/* Initialise writer */
+		if (active_writer->read_init(whichtoread)) {
+			abort_suspend("Failed to initialise the writer.");
+			result = 1;
+			goto reread_free_buffers;
+		}
+
+		/* Read the pages */
+		while(i <= page_list->pagenumber) {
+			/* Read */
+			result = first_filter->ops.filter.read_chunk(
+					virt_to_page(page_list->base_version),
+					SUSPEND_SYNC);
+
+			if (result) {
+				abort_suspend("Failed to read a chunk of the image.");
+				goto reread_free_buffers;
+			}
+
+			/* Interactivity*/
+			check_shift_keys(0, NULL);
+
+			/* Prepare next */
+			pageset_offset = get_next_bit_on(*pageflags, pageset_offset);
+
+			/* Got the one we're after? */
+			i++;
+
+			if (i == page_list->pagenumber)
+				page_list = page_list->next;
+
+			/*
+			 * Stop at the end of the list (page_list is NULL) or
+			 * when the next entry belongs to the other pageset.
+			 */
+			if (!page_list || page_list->pageset != whichtoread)
+				break;
+		}
+
+reread_free_buffers:
+
+		/* Cleanup reads from this pageset.
*/ + list_for_each_entry(this_filter, &suspend_modules, module_list) { + if (this_filter->disabled) + continue; + if (this_filter->read_cleanup && + this_filter->read_cleanup()) { + abort_suspend("Failed to cleanup a filter."); + result = 1; + } + } + + if (active_writer->read_cleanup()) { + abort_suspend("Failed to cleanup the writer."); + result = 1; + } + } +out: + printk("\n"); + + return result; +} +void suspend_free_checksum_pages(void) +{ + unsigned long *next_checksum_page; + + while(first_checksum_page) { + next_checksum_page = + (unsigned long *) NEXT_CHECKSUM_PAGE(first_checksum_page); + free_pages((unsigned long) first_checksum_page, 0); + first_checksum_page = next_checksum_page; + } + last_checksum_page = NULL; + checksum_pages = 0; +} + +#define PRINTABLE(a) (((a) < 32 || (a) > 122) ? '.' : (a)) +static void local_print_location( + unsigned char *real, + unsigned char *original, + unsigned char *resumetime) +{ + int i; + + for (i = 0; i < 8; i++) + if (*(original + i) != *(resumetime + i)) + break; + if (i == 8) + return; + + suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1, "%p", real); + if (PageChecksumIgnore(virt_to_page(real))) + suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1, + " [NoSave]"); + if (PageSlab(virt_to_page(real))) + suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1, + " [Slab]"); + suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1, "\n"); + +#ifdef CONFIG_KDB + for (i = 0; i < 8; i++) { + static const char *last_sym = NULL; + if (*(original + i) != *(resumetime + i)) { + kdb_symtab_t symtab; + + kdbnearsym((unsigned long) real + i, + &symtab); + + if ((!symtab.sym_name) || + (symtab.sym_name == last_sym)) + continue; + + last_sym = symtab.sym_name; + + suspend_message(SUSPEND_INTEGRITY, SUSPEND_LOW, 1, + "%p = %s\n", + symtab.sym_start, + symtab.sym_name); + } + } +#endif + + for (i = 0; i < 8; i++) + suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1, + "%2x ", *(original + i)); + suspend_message(SUSPEND_INTEGRITY, 
SUSPEND_HIGH, 1, " "); + for (i = 0; i < 8; i++) + suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1, + "%c", PRINTABLE(*(original + i))); + suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1, " "); + + for (i = 0; i < 8; i++) + suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1, + "%2x ", *(resumetime + i)); + suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1, " "); + for (i = 0; i < 8; i++) + suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1, + "%c", PRINTABLE(*(resumetime + i))); + suspend_message(SUSPEND_INTEGRITY, SUSPEND_HIGH, 1, "\n\n"); +} + +int suspend_allocate_reload_data(int pages) +{ + struct reload_data *this_data; + unsigned long data_start; + int i; + + if (num_reload_pages >= pages) + return 0; + + for (i = 1; i <= pages; i++) { + data_start = get_zeroed_page(GFP_ATOMIC); + + if (!data_start) + return -ENOMEM; + + SetPageChecksumIgnore(virt_to_page(data_start)); + this_data = (struct reload_data *) data_start; + num_reload_pages++; + + while (data_start == + ((((unsigned long) (this_data + 1)) - 1) & PAGE_MASK)) { + struct page *page; + unsigned long virt; + + virt = get_zeroed_page(GFP_ATOMIC); + if (!virt) { + printk("Couldn't get a page in which to store " + "a changed page.\n"); + return -ENOMEM; + } + page = virt_to_page(virt); + + this_data->compared_version = (char *) virt; + SetPageChecksumIgnore(page); + + virt = get_zeroed_page(GFP_ATOMIC); + if (!virt) { + printk("Couldn't get a page in which to store " + "a baseline page.\n"); + return -ENOMEM; + } + page = virt_to_page(virt); + + this_data->base_version = (char *) virt; + SetPageChecksumIgnore(page); + + if (last_reload_data) + last_reload_data->next = this_data; + else + first_reload_data = this_data; + + last_reload_data = this_data; + + this_data++; + } + + check_shift_keys(0, NULL); + } + + return 0; +} + +void suspend_print_differences(void) +{ + struct reload_data *this_data = first_reload_data; + int i; + + suspend_reread_pages(first_reload_data); + + while (this_data) { + if 
(this_data->pageset && + this_data->pagenumber) { + suspend_message(SUSPEND_INTEGRITY, SUSPEND_MEDIUM, 1, + "Pagedir %d. Page %d. Address %p." + " Base %p. Copy %p.\n", + this_data->pageset, + this_data->pagenumber, + page_address(this_data->page_address), + this_data->base_version, + this_data->compared_version); + for (i= 0; i < (PAGE_SIZE / 8); i++) { + local_print_location( + page_address(this_data->page_address) + i * 8, + this_data->base_version + i * 8, + this_data->compared_version + i * 8); + check_shift_keys(0, NULL); + } + check_shift_keys(1, NULL); + } else + return; + this_data = this_data->next; + } +} + +int __suspend_allocate_checksum_pages(void) +{ + int pages_required = + (pagedir1.pageset_size + pagedir2.pageset_size) / CHECKSUMS_PER_PAGE; + unsigned long this_page; + + while (checksum_pages <= pages_required) { + this_page = get_zeroed_page(GFP_ATOMIC); + if (!this_page) + return -ENOMEM; + + if (!first_checksum_page) + first_checksum_page = + (unsigned long *) this_page; + else + NEXT_CHECKSUM_PAGE(last_checksum_page) = this_page; + + last_checksum_page = (unsigned long *) this_page; + SetPageChecksumIgnore(virt_to_page(this_page)); + checksum_pages++; + } + + return suspend_allocate_reload_data(2); +} + +int suspend_checksum_init(void) +{ + if (suspend_allocate_dyn_pageflags(&checksum_map)) + return 1; + return 0; +} + + +void suspend_checksum_cleanup(void) +{ + suspend_free_reload_data(); + suspend_free_checksum_pages(); + + suspend_free_dyn_pageflags(&checksum_map); +} diff -urN oldtree/kernel/power/suspend_file.c newtree/kernel/power/suspend_file.c --- oldtree/kernel/power/suspend_file.c 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/suspend_file.c 2006-02-18 15:24:31.422811936 +0000 @@ -0,0 +1,1077 @@ +/* + * Filewriter.c + * + * Copyright 2005 Nigel Cunningham + * + * Distributed under GPLv2. + * + * This file encapsulates functions for usage of a simple file as a + * backing store. 
It is based upon the swapwriter, and shares the
+ * same basic working. Here, though, we have nothing to do with
+ * swapspace, and only one device to worry about.
+ *
+ * The user can just
+ *
+ * 	echo Suspend2 > /path/to/my_file
+ *
+ * and
+ *
+ * 	echo /path/to/my_file > /proc/software_suspend/filewriter_target
+ *
+ * then put what they find in /proc/software_suspend/resume2
+ * as their resume2= parameter in lilo.conf (and rerun lilo if using it).
+ *
+ * Having done this, they're ready to suspend and resume.
+ *
+ * TODO:
+ * - File resizing.
+ */
+
+/*
+ * NOTE(review): the header names of the ten #include lines below are
+ * missing from this copy of the patch; restore them from the original.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "suspend2.h"
+#include "suspend2_common.h"
+#include "version.h"
+#include "proc.h"
+#include "modules.h"
+#include "ui.h"
+#include "extent.h"
+#include "io.h"
+#include "storage.h"
+#include "block_io.h"
+
+static struct suspend_module_ops filewriterops;
+
+/* Details of our target. */
+
+char filewriter_target[256];
+static struct inode *target_inode;
+static struct file *target_file;
+static struct block_device *target_bdev;
+static int used_devt = 0;
+static sector_t target_firstblock = 0;
+static int target_storage_available = 0;
+static int target_claim = 0;
+
+static char HaveImage[] = "HaveImage\n";
+static char NoImage[] = "Suspend2\n";
+
+/* Offset in the header page of the 'resumed before' flag byte. */
+static const int resumed_before_byte = sizeof(HaveImage) + 1;
+
+/*
+ * Bytes reserved for the signature at the start of the header. The flag
+ * byte at resumed_before_byte has to lie inside this area; otherwise it
+ * overlaps the first byte of the data stored right after the signature.
+ */
+#define sig_size (resumed_before_byte + 1)
+
+extern dev_t ROOT_DEV;
+extern char *__initdata root_device_name;
+
+/* Header_pages must be big enough for signature */
+static int header_pages, main_pages;
+
+#define target_is_normal_file() (S_ISREG(target_inode->i_mode))
+
+static struct suspend_bdev_info devinfo;
+
+/*
+ * Record the block device in use and derive the sector shift and the
+ * number of blocks per page from the block size in bits. Zero bits
+ * clears both derived values.
+ */
+static void set_devinfo(struct block_device *bdev, int target_blkbits)
+{
+	devinfo.bdev = bdev;
+	if (!target_blkbits) {
+		devinfo.bmap_shift = devinfo.blocks_per_page = 0;
+	} else {
+		devinfo.bmap_shift = target_blkbits - 9;
+		devinfo.blocks_per_page = (1 << (PAGE_SHIFT - target_blkbits));
+	}
+}
+
+/* Extent chain for blocks */
+static struct extent_chain block_chain;
+
+/* Signature operations */
+enum {
+	GET_IMAGE_EXISTS,
+	INVALIDATE,
+	MARK_RESUME_ATTEMPTED,
+};
+
+/* Helpers. */
+
+/*
+ * Pages of storage the target offers: the cached figure for a regular
+ * file, the partition or disk size for a block device, -1 for sockets,
+ * character devices and fifos, 0 if there is no target.
+ */
+static int filewriter_storage_available(void)
+{
+	int result = 0;
+
+	if (!target_inode)
+		return 0;
+
+	switch (target_inode->i_mode & S_IFMT) {
+		case S_IFSOCK:
+		case S_IFCHR:
+		case S_IFIFO: /* Socket, Char, Fifo */
+			return -1;
+		case S_IFREG: /* Regular file: current size - holes + free space on part */
+			result = target_storage_available;
+			break;
+		case S_IFBLK: /* Block device */
+			if (target_bdev->bd_disk) {
+				if (target_bdev->bd_part)
+					result = (unsigned long)target_bdev->bd_part->nr_sects >> (PAGE_SHIFT - 9);
+				else
+					result = (unsigned long)target_bdev->bd_disk->capacity >> (PAGE_SHIFT - 9);
+			} else {
+				printk("bdev->bd_disk null.\n");
+				return 0;
+			}
+	}
+
+	return result;
+}
+
+/*
+ * Return non-zero if every block of page page_num of the target file is
+ * mapped and the blocks follow each other on disk.
+ */
+static int has_contiguous_blocks(int page_num)
+{
+	int j;
+	sector_t last = 0;
+
+	for (j = 0; j < devinfo.blocks_per_page; j++) {
+		sector_t this = bmap(target_inode,
+				page_num * devinfo.blocks_per_page + j);
+
+		if (!this || (last && (last + 1) != this))
+			break;
+
+		last = this;
+	}
+
+	return (j == devinfo.blocks_per_page);
+}
+
+/*
+ * Count the pages of a regular file that can be used (those whose blocks
+ * are all mapped and contiguous); for any other target return
+ * filewriter_storage_available().
+ */
+static int size_ignoring_sparseness(void)
+{
+	int mappable = 0, i;
+
+	if (target_is_normal_file()) {
+		for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT) ; i++)
+			if (has_contiguous_blocks(i))
+				mappable++;
+
+		return mappable;
+	} else
+		return filewriter_storage_available();
+}
+
+/*
+ * Rebuild block_chain: one extent per run of usable pages for a regular
+ * file, or a single extent starting at block 0 for any other target.
+ */
+static void get_main_pool_phys_params(void)
+{
+	int i;
+
+	if (block_chain.first)
+		put_extent_chain(&block_chain);
+
+	if (target_is_normal_file()) {
+		int extent_min = -1, extent_max = -1;
+
+		for (i = 0;
+		     i < (target_inode->i_size >> PAGE_SHIFT);
+		     i++) {
+			sector_t new_sector;
+
+			if (!has_contiguous_blocks(i))
+				continue;
+
+			new_sector = bmap(target_inode,
+				(i * devinfo.blocks_per_page));
+
+			/*
+			 * I'd love to be able to fill in holes and resize
+			 * files, but not yet...
+			 */
+
+			if (new_sector == extent_max + 1)
+				extent_max+= devinfo.blocks_per_page;
+			else {
+				if (extent_min > -1) {
+					if (test_action_state(SUSPEND_TEST_BIO))
+						printk("Adding extent %d-%d.\n",
+							extent_min << devinfo.bmap_shift,
+							((extent_max + 1) << devinfo.bmap_shift) - 1);
+					append_extent_to_extent_chain(
+						&block_chain,
+						extent_min,
+						extent_max);
+				}
+				extent_min = new_sector;
+				extent_max = extent_min + devinfo.blocks_per_page - 1;
+			}
+		}
+		if (extent_min > -1) {
+			append_extent_to_extent_chain(&block_chain,
+				extent_min, extent_max);
+			if (test_action_state(SUSPEND_TEST_BIO))
+				printk("Adding extent %d-%d.\n",
+					extent_min << devinfo.bmap_shift,
+					((extent_max + 1) << devinfo.bmap_shift) - 1);
+		}
+
+	} else
+		if (target_storage_available > 0) {
+			append_extent_to_extent_chain(&block_chain,
+				0,
+				min(main_pages, target_storage_available) * devinfo.blocks_per_page - 1);
+		}
+}
+
+/*
+ * Bring target_inode, devinfo and (if get_size is set) the cached
+ * storage figure in line with target_bdev. Without a usable block
+ * device all three are cleared.
+ */
+static void get_target_info(int get_size)
+{
+	if (!target_bdev || IS_ERR(target_bdev)) {
+		target_inode = NULL;
+		set_devinfo(NULL, 0);
+		target_storage_available = 0;
+	} else {
+		if (!target_inode)
+			target_inode = target_bdev->bd_inode;
+		set_devinfo(target_bdev, target_inode->i_blkbits);
+		if (get_size)
+			target_storage_available = size_ignoring_sparseness();
+	}
+}
+
+/*
+ * Drop our claim on and reference to the target block device and close
+ * the target file. The argument is not used.
+ */
+static void filewriter_cleanup(int finishing_cycle)
+{
+	if (target_bdev) {
+		if (target_claim) {
+			bd_release(target_bdev);
+			target_claim = 0;
+		}
+
+		if (used_devt) {
+			blkdev_put(target_bdev);
+			used_devt = 0;
+		}
+		target_bdev = NULL;
+		get_target_info(0);
+	}
+
+	/* Only a real file pointer may be handed to filp_close. */
+	if (target_file && !IS_ERR(target_file)) {
+		filp_close(target_file, NULL);
+		target_file = NULL;
+	}
+}
+
+/*
+ * Open the target named by target and fill in target_file, target_bdev,
+ * target_inode and devinfo.
+ *
+ * target:	Path of a file or device node, or (resume2 only) anything
+ *		name_to_dev_t understands.
+ * get_size:	Also work out how much storage the target offers.
+ * resume2:	Non-zero when parsing resume2=; then a failed open falls
+ *		back to looking the device up by number, and
+ *		target_firstblock is left alone.
+ */
+static void filewriter_get_target_info(char *target, int get_size,
+		int resume2)
+{
+	if (target_file)
+		filewriter_cleanup(0);
+
+	if (!target || !strlen(target))
+		return;
+
+	target_file = filp_open(target, O_RDWR, 0);
+
+	if (IS_ERR(target_file) || !target_file) {
+		struct file *open_result = target_file;
+		dev_t resume_dev_t;
+
+		/* Never leave an error pointer behind in target_file. */
+		target_file = NULL;
+
+		if (!resume2) {
+			printk("Open file %s returned %p.\n", target, open_result);
+			return;
+		}
+
+		resume_dev_t = name_to_dev_t(target);
+		if (!resume_dev_t) {
+			struct kstat stat;
+
+			printk("Open file %s returned %p and name_to_devt failed.\n", target, open_result);
+
+			/* Last resort: ask the VFS which device the node names. */
+			if (vfs_stat(target, &stat)) {
+				printk("Stating the file also failed. Nothing more we can do.\n");
+				return;
+			}
+
+			resume_dev_t = stat.rdev;
+			if (!resume_dev_t)
+				return;
+		}
+		target_bdev = open_by_devnum(resume_dev_t, FMODE_READ);
+		if (IS_ERR(target_bdev)) {
+			printk("Got a dev_num (%lx) but failed to open it.\n",
+					(unsigned long) resume_dev_t);
+			/* Callers test target_bdev against NULL. */
+			target_bdev = NULL;
+			return;
+		}
+		used_devt = 1;
+		target_inode = target_bdev->bd_inode;
+	} else
+		target_inode = target_file->f_mapping->host;
+
+	if (S_ISLNK(target_inode->i_mode) ||
+	    S_ISDIR(target_inode->i_mode) ||
+	    S_ISSOCK(target_inode->i_mode) ||
+	    S_ISFIFO(target_inode->i_mode)) {
+		printk("The filewriter works with regular files, character files and block devices.\n");
+		goto cleanup;
+	}
+
+	if (!used_devt) {
+		if (S_ISBLK(target_inode->i_mode)) {
+			target_bdev = I_BDEV(target_inode);
+			if (!bd_claim(target_bdev, &filewriterops))
+				target_claim = 1;
+		} else
+			target_bdev = target_inode->i_sb->s_bdev;
+	}
+
+	get_target_info(get_size);
+
+	/* get_target_info clears target_inode if there is no block device. */
+	if (!resume2 && target_inode)
+		target_firstblock = bmap(target_inode, 0) << devinfo.bmap_shift;
+
+	return;
+cleanup:
+	target_inode = NULL;
+	if (target_file) {
+		filp_close(target_file, NULL);
+		target_file = NULL;
+	}
+	get_target_info(0);
+}
+
+/*
+ * Classify the start of a header page.
+ *
+ * Returns 0 for the 'no image' signature, 1 for the 'have image'
+ * signature (also copying the resumed-before flag into the suspend
+ * state), -1 if neither signature is present.
+ */
+int parse_signature(char *header)
+{
+	int have_image = !memcmp(HaveImage, header, sizeof(HaveImage) - 1);
+	int no_image_header = !memcmp(NoImage, header, sizeof(NoImage) - 1);
+
+	if (no_image_header)
+		return 0;
+
+	if (!have_image)
+		return -1;
+
+	if (header[resumed_before_byte] & 1)
+		set_suspend_state(SUSPEND_RESUMED_BEFORE);
+	else
+		clear_suspend_state(SUSPEND_RESUMED_BEFORE);
+
+	return 1;
+}
+
+/* prepare_signature
*/
+
+/*
+ * Write the 'have image' signature at the start of current_header and
+ * clear the resumed-before flag byte. Always returns 0.
+ */
+static int prepare_signature(char *current_header)
+{
+	/*
+	 * Explicitly put the \0 that clears the 'tried to resume from
+	 * this image before' flag.
+	 */
+	strncpy(current_header, HaveImage, sizeof(HaveImage));
+	current_header[resumed_before_byte] = 0;
+	return 0;
+}
+
+/*
+ * Pages of storage currently set aside: the cached figure for a regular
+ * file, header plus main pages otherwise, 0 if there is no target.
+ */
+static int filewriter_storage_allocated(void)
+{
+	int result;
+
+	if (!target_inode)
+		return 0;
+
+	if (target_is_normal_file()) {
+		result = (int) target_storage_available;
+	} else
+		result = header_pages + main_pages;
+
+	return result;
+}
+
+/*
+ * Forget the allocated extents, unless we are resuming and the image is
+ * to be kept. Always returns 0.
+ */
+static int filewriter_release_storage(void)
+{
+	if ((test_action_state(SUSPEND_KEEP_IMAGE)) &&
+	     test_suspend_state(SUSPEND_NOW_RESUMING))
+		return 0;
+
+	put_extent_chain(&block_chain);
+
+	header_pages = main_pages = 0;
+	return 0;
+}
+
+/*
+ * Reserve the first space_requested pages of the extent chain for the
+ * header and remember where they end. Returns 0, or -ENOSPC if the
+ * chain is too short.
+ */
+static int filewriter_allocate_header_space(int space_requested)
+{
+	int i;
+
+	/* We only steal pages from the main pool. If it doesn't have any yet... */
+
+	if (!block_chain.first)
+		return 0;
+
+	extent_state_goto_start(&suspend_writer_posn);
+
+	for (i = 0; i < space_requested; i++) {
+		if (suspend_bio_ops.forward_one_page())
+			return -ENOSPC;
+	}
+
+	/* The end of header pages will be the start of pageset 2 */
+	extent_state_save(&suspend_writer_posn, &suspend_writer_posn_save[2]);
+	header_pages = space_requested;
+	return 0;
+}
+
+/*
+ * Rebuild the extent chain for space_requested main pages and redo the
+ * header reservation on top of it. Returns 0, or -ENOSPC if the chain
+ * ends up smaller than header plus main pages.
+ */
+static int filewriter_allocate_storage(int space_requested)
+{
+	int result = 0, prev_header_pages;
+	/* FIXME This looks wrong */
+	int blocks_to_get = (space_requested << devinfo.bmap_shift) - block_chain.size;
+
+	/* Only release_storage reduces the size */
+	if (blocks_to_get < 1)
+		return 0;
+
+	main_pages = space_requested;
+
+	get_main_pool_phys_params();
+
+	suspend_message(SUSPEND_WRITER, SUSPEND_MEDIUM, 0,
+		"Finished with block_chain.size == %d.\n",
+		block_chain.size);
+
+	if (block_chain.size < (header_pages + main_pages))
+		result = -ENOSPC;
+
+	prev_header_pages = header_pages;
+	header_pages = 0;
+	filewriter_allocate_header_space(prev_header_pages);
+	return result;
+}
+
+/*
+ * Start writing the image header: the 'no image' signature first, then
+ * the saved extent positions, the device info and the extent chain.
+ * Returns 0 on success or -ENOMEM if no buffer page is available.
+ */
+static int filewriter_write_header_init(void)
+{
+	char new_sig[sig_size];
+
+	extent_state_goto_start(&suspend_writer_posn);
+
+	suspend_writer_buffer = (char *) get_zeroed_page(GFP_ATOMIC);
+	if (!suspend_writer_buffer)
+		return -ENOMEM;
+
+	suspend_writer_buffer_posn = 0;
+
+	/*
+	 * We change it once the whole header is written. Zero the buffer
+	 * first so that no uninitialised stack bytes reach the disk.
+	 */
+	memset(new_sig, 0, sig_size);
+	strcpy(new_sig, NoImage);
+	suspend_bio_ops.write_header_chunk(new_sig, sig_size);
+
+	/* Info needed to bootstrap goes at the start of the header.
+	 * First we save the basic info needed for reading, including the number
+	 * of header pages. Then we save the structs containing data needed
+	 * for reading the header pages back.
+	 * Note that even if header pages take more than one page, when we
+	 * read back the info, we will have restored the location of the
+	 * next header page by the time we go to use it.
+	 */
+	suspend_bio_ops.write_header_chunk((char *) &suspend_writer_posn_save,
+			3 * sizeof(struct extent_iterate_saved_state));
+
+	suspend_bio_ops.write_header_chunk((char *) &devinfo,
+			sizeof(devinfo));
+
+	serialise_extent_chain(&block_chain);
+
+	return 0;
+}
+
+/*
+ * Flush the rest of the header, then rewrite the page at
+ * target_firstblock with the 'have image' signature and release the
+ * buffer page. Always returns 0.
+ */
+static int filewriter_write_header_cleanup(void)
+{
+	/* Write any unsaved data */
+	if (suspend_writer_buffer_posn)
+		suspend_bio_ops.write_header_chunk_finish();
+
+	suspend_bio_ops.finish_all_io();
+
+	extent_state_goto_start(&suspend_writer_posn);
+	suspend_bio_ops.forward_one_page();
+
+	/* Adjust image header */
+	suspend_bio_ops.bdev_page_io(READ, target_bdev,
+			target_firstblock,
+			virt_to_page(suspend_writer_buffer));
+
+	prepare_signature(suspend_writer_buffer);
+
+	suspend_bio_ops.bdev_page_io(WRITE, target_bdev,
+			target_firstblock,
+			virt_to_page(suspend_writer_buffer));
+
+	free_page((unsigned long) suspend_writer_buffer);
+	suspend_writer_buffer = NULL;
+
+	suspend_bio_ops.finish_all_io();
+
+	return 0;
+}
+
+/* HEADER READING */
+
+#ifdef CONFIG_DEVFS_FS
+int create_dev(char *name, dev_t dev, char *devfs_name);
+#else
+/* Replace name with a block device node for dev. devfs_name is unused. */
+static int create_dev(char *name, dev_t dev, char *devfs_name)
+{
+	sys_unlink(name);
+	return sys_mknod(name, S_IFBLK|0600, new_encode_dev(dev));
+}
+#endif
+
+/*
+ * Ramdisk variant of header initialisation: create /dev/root and
+ * /dev/ram, open /dev/root and read its first BLOCK_SIZE bytes into the
+ * header buffer. Returns 0, or -EIO if /dev/root cannot be opened.
+ */
+static int rd_init(void)
+{
+	suspend_writer_buffer_posn = 0;
+
+	create_dev("/dev/root", ROOT_DEV, root_device_name);
+	create_dev("/dev/ram", MKDEV(RAMDISK_MAJOR, 0), NULL);
+
+	suspend_read_fd = sys_open("/dev/root", O_RDONLY, 0);
+	if (suspend_read_fd < 0)
+		goto out;
+
+	sys_read(suspend_read_fd, suspend_writer_buffer, BLOCK_SIZE);
+
+	memcpy(&suspend_writer_posn_save,
+		suspend_writer_buffer + suspend_writer_buffer_posn,
+		sizeof(suspend_writer_posn_save));
+
+	suspend_writer_buffer_posn += sizeof(suspend_writer_posn_save);
+
+	return 0;
+out:
+	sys_unlink("/dev/ram");
+	sys_unlink("/dev/root");
+	return -EIO;
+}
+
+/*
+ * File variant of header initialisation: read the page at
+ * target_firstblock into the header buffer and position the read
+ * offset just after the signature. Always returns 0.
+ */
+static int file_init(void)
+{
+	suspend_writer_buffer_posn = sig_size;
+
+	/* Read filewriter configuration */
+	suspend_bio_ops.bdev_page_io(READ, target_bdev,
+			target_firstblock,
+			virt_to_page((unsigned long) suspend_writer_buffer));
+
+	return 0;
+}
+
+/*
+ * read_header_init()
+ *
+ * Ramdisk support based heavily on init/do_mounts_rd.c
+ *
+ * Description:
+ * 1. Attempt to read the device specified with resume2=.
+ * 2. Check the contents of the header for our signature.
+ * 3. Warn, ignore, reset and/or continue as appropriate.
+ * 4. If continuing, read the filewriter configuration section
+ *    of the header and set up block device info so we can read
+ *    the rest of the header & image.
+ *
+ * Returns:
+ * May not return if the user chooses to reboot at a warning.
+ * -EINVAL if cannot resume at this time. Booting should continue
+ * normally.
+ */
+
+static int filewriter_read_header_init(void)
+{
+	int result;
+	struct block_device *tmp;
+
+	*(suspend_bio_ops.need_extra_next) = 1;
+
+	suspend_writer_buffer = (char *) get_zeroed_page(GFP_ATOMIC);
+	if (!suspend_writer_buffer)
+		return -ENOMEM;
+
+	if (test_suspend_state(SUSPEND_TRY_RESUME_RD))
+		result = rd_init();
+	else
+		result = file_init();
+
+	if (result) {
+		/* Don't leak the header buffer when we cannot go on. */
+		free_page((unsigned long) suspend_writer_buffer);
+		suspend_writer_buffer = NULL;
+		return result;
+	}
+
+	/* The saved extent positions follow the signature... */
+	suspend_writer_buffer_posn = sig_size;
+	memcpy(&suspend_writer_posn_save,
+		suspend_writer_buffer + suspend_writer_buffer_posn,
+		3 * sizeof(struct extent_iterate_saved_state));
+
+	suspend_writer_buffer_posn += 3 * sizeof(struct extent_iterate_saved_state);
+
+	/* ...then the device info, of which we keep our own bdev pointer. */
+	tmp = devinfo.bdev;
+
+	memcpy(&devinfo,
+		suspend_writer_buffer + suspend_writer_buffer_posn,
+		sizeof(struct suspend_bdev_info));
+	devinfo.bdev = tmp;
+	suspend_writer_buffer_posn += sizeof(struct suspend_bdev_info);
+
+	extent_state_goto_start(&suspend_writer_posn);
+	load_extent_chain(&block_chain);
+
+	return 0;
+}
+
+/* Release the header buffer page. Always returns 0. */
+static int filewriter_read_header_cleanup(void)
+{
+	free_page((unsigned long) suspend_writer_buffer);
+	suspend_writer_buffer = NULL;
+	return 0;
+}
+
+/*
+ * Read the page at target_firstblock, classify its signature and, if
+ * the operation asks for it, update the page and write it back.
+ *
+ * op:	GET_IMAGE_EXISTS	only report what was found.
+ *	INVALIDATE		replace a known signature with the
+ *				'no image' one.
+ *	MARK_RESUME_ATTEMPTED	set the resumed-before flag of an image.
+ *
+ * Returns -1 if there is no usable target or (except for a successful
+ * INVALIDATE, which returns 1) the value of parse_signature(); -ENOMEM
+ * if no page could be allocated.
+ */
+static int filewriter_signature_op(int op)
+{
+	char *cur;
+	int result = 0, changed = 0;
+
+	if (!target_bdev || IS_ERR(target_bdev))
+		return -1;
+
+	cur = (char *) get_zeroed_page(GFP_ATOMIC);
+	if (!cur) {
+		printk("Unable to allocate a page for reading the image signature.\n");
+		return -ENOMEM;
+	}
+
+	suspend_bio_ops.bdev_page_io(READ, target_bdev,
+			target_firstblock,
+			virt_to_page(cur));
+
+	result = parse_signature(cur);
+
+	switch (op) {
+		case INVALIDATE:
+			/* Never overwrite data that isn't one of our signatures. */
+			if (result == -1)
+				goto out;
+
+			strcpy(cur, NoImage);
+			cur[resumed_before_byte] = 0;
+			result = changed = 1;
+			break;
+		case MARK_RESUME_ATTEMPTED:
+			if (result == 1) {
+				cur[resumed_before_byte] |= 1;
+				changed = 1;
+			}
+			break;
+		default:
+			/* GET_IMAGE_EXISTS: report only. */
+			break;
+	}
+
+	if (changed)
+		suspend_bio_ops.bdev_page_io(WRITE, target_bdev,
+				target_firstblock,
+				virt_to_page(cur));
+
+out:
+	suspend_bio_ops.finish_all_io();
+	free_page((unsigned long) cur);
+	return result;
+}
+
+/*
+
* filewriter_memory_needed
+ *
+ * Description:
+ * Returns the number of bytes of RAM needed for this
+ * code to do its work. (Used when calculating whether
+ * we have enough memory to be able to suspend & resume).
+ *
+ */
+static unsigned long filewriter_memory_needed(void)
+{
+	return 0;
+}
+
+/* Print debug info
+ *
+ * Description:
+ * Write a short status report into buffer (at most size bytes) and
+ * return the number of bytes used.
+ */
+
+static int filewriter_print_debug_stats(char *buffer, int size)
+{
+	int len = 0;
+
+	if (suspend_active_writer != &filewriterops) {
+		len = snprintf_used(buffer, size, "- Filewriter inactive.\n");
+		return len;
+	}
+
+	len = snprintf_used(buffer, size, "- Filewriter active.\n");
+
+	/* filewriter_storage_allocated() returns an int, hence %d. */
+	len+= snprintf_used(buffer+len, size-len, " Storage available for image: %d pages.\n",
+			filewriter_storage_allocated());
+
+	return len;
+}
+
+/*
+ * Storage needed
+ *
+ * Returns amount of space in the image header required
+ * for the filewriter's data: the target's name including its
+ * terminating NUL, as stored by filewriter_save_config_info.
+ */
+static unsigned long filewriter_storage_needed(void)
+{
+	return strlen(filewriter_target) + 1;
+}
+
+/*
+ * filewriter_invalidate_image
+ *
+ * Release our storage if we have suspended before, then replace the
+ * signature on disk with the 'no image' one. Returns the result of
+ * the signature operation.
+ */
+static int filewriter_invalidate_image(void)
+{
+	int result;
+
+	if (nr_suspends > 0)
+		filewriter_release_storage();
+
+	result = filewriter_signature_op(INVALIDATE);
+	if (result == 1 && !nr_suspends)
+		printk(KERN_WARNING name_suspend "Image invalidated.\n");
+
+	return result;
+}
+
+/*
+ * Image_exists
+ *
+ * Returns what filewriter_signature_op(GET_IMAGE_EXISTS) reports.
+ */
+
+static int filewriter_image_exists(void)
+{
+	return filewriter_signature_op(GET_IMAGE_EXISTS);
+}
+
+/*
+ * Mark resume attempted.
+ *
+ * Record that we tried to resume from this image.
+ */
+
+static void filewriter_mark_resume_attempted(void)
+{
+	filewriter_signature_op(MARK_RESUME_ATTEMPTED);
+}
+
+/*
+ * Build the resume2= string for the current target ("file:/dev/NAME",
+ * followed by ":0xSECTOR" when the first block is not zero), store it
+ * in resume2_file and have it parsed.
+ */
+static void filewriter_set_resume2(void)
+{
+	char *buffer = (char *) get_zeroed_page(GFP_ATOMIC);
+	char *buffer2 = (char *) get_zeroed_page(GFP_ATOMIC);
+	unsigned long sector = 0;
+	int offset = 0;
+
+	if (!buffer || !buffer2) {
+		printk(KERN_ERR name_suspend
+			"Filewriter: Unable to allocate memory for the resume2 string.\n");
+		goto out;
+	}
+
+	/* Only look the first block up once we know there is a target. */
+	if (target_bdev && target_inode) {
+		sector = bmap(target_inode, 0);
+		set_devinfo(target_bdev, target_inode->i_blkbits);
+
+		bdevname(target_bdev, buffer2);
+		offset += snprintf(buffer + offset, PAGE_SIZE - offset,
+				"/dev/%s", buffer2);
+
+		if (sector)
+			offset += snprintf(buffer + offset, PAGE_SIZE - offset,
+				":0x%lx", sector << devinfo.bmap_shift);
+	} else
+		offset += snprintf(buffer + offset, PAGE_SIZE - offset,
+				"%s is not a valid target.", filewriter_target);
+
+	sprintf(resume2_file, "file:%s", buffer);
+
+	attempt_to_parse_resume_device();
+
+out:
+	/* free_page copes with an address of zero. */
+	free_page((unsigned long) buffer);
+	free_page((unsigned long) buffer2);
+}
+
+/*
+ * Reopen the target named in filewriter_target and look for one of our
+ * signatures on it. On success hand the device info and extent chain
+ * to the block I/O code and, unless resume_time is set, update the
+ * resume2 string.
+ *
+ * The target argument is not used; filewriter_target is.
+ *
+ * Returns 0 if a signature was found, 1 otherwise.
+ */
+static int __test_filewriter_target(char *target, int resume_time)
+{
+	filewriter_get_target_info(filewriter_target, 0, 0);
+	if (filewriter_signature_op(GET_IMAGE_EXISTS) > -1) {
+		printk(name_suspend "Filewriter: File signature found.\n");
+		if (!resume_time)
+			filewriter_set_resume2();
+
+		suspend_bio_ops.set_devinfo(&devinfo);
+		suspend_writer_posn.chains = &block_chain;
+		suspend_writer_posn.num_chains = 1;
+
+		return 0;
+	}
+
+	if (*filewriter_target)
+		printk(KERN_ERR name_suspend
+			"Filewriter: Sorry. No signature found at %s.\n",
+			filewriter_target);
+	else
+		printk(KERN_ERR name_suspend
+			"Filewriter: Sorry. No signature found.\n");
+
+	return 1;
+}
+
+/* Proc write handler: recheck the target after it has been changed. */
+static void test_filewriter_target(void)
+{
+	__test_filewriter_target(filewriter_target, 0);
+}
+
+/*
+ * Parse Image Location
+ *
+ * Attempt to parse a resume2= parameter.
+ * File Writer accepts:
+ * resume2=file:DEVNAME[:FIRSTBLOCK][@BLOCKSIZE]
+ *
+ * Where:
+ * DEVNAME is convertible to a dev_t by name_to_dev_t
+ * FIRSTBLOCK is the location of the first block in the file.
+ * BLOCKSIZE is the logical blocksize >= SECTOR_SIZE & <= PAGE_SIZE,
+ * mod SECTOR_SIZE == 0 of the device.
+ * Data is validated by attempting to read a header from the
+ * location given. Failure will result in filewriter refusing to
+ * save an image, and a reboot with correct parameters will be
+ * necessary.
+ *
+ * Returns 0 if the location is usable (or a target is already open),
+ * 1 if the parameter is not ours or no signature was found, a negative
+ * value if the parameter is malformed or the device can't be opened.
+ * The command line is restored to its original contents before return.
+ */
+
+static int filewriter_parse_sig_location(char *commandline, int only_writer)
+{
+	char *thischar, *colon = NULL, *at_symbol = NULL;
+	int result = -EINVAL, target_blocksize = 0;
+
+	if (strncmp(commandline, "file:", 5)) {
+		if (!only_writer)
+			return 1;
+	} else
+		commandline += 5;
+
+	/*
+	 * Don't check signature again if we're beginning a cycle. If we already
+	 * did the initialisation successfully, assume we'll be okay when it comes
+	 * to resuming.
+	 */
+	if (target_bdev)
+		return 0;
+
+	/* Split DEVNAME[:FIRSTBLOCK][@BLOCKSIZE] in place. */
+	thischar = commandline;
+	while ((*thischar != ':') && (*thischar != '@') &&
+		((thischar - commandline) < 250) && (*thischar))
+		thischar++;
+
+	if (*thischar == ':') {
+		colon = thischar;
+		*colon = 0;
+		thischar++;
+	}
+
+	while ((*thischar != '@') && ((thischar - commandline) < 250) && (*thischar))
+		thischar++;
+
+	if (*thischar == '@') {
+		at_symbol = thischar;
+		*at_symbol = 0;
+	}
+
+	/* target_firstblock is a sector_t: don't squeeze it through an int. */
+	if (colon)
+		target_firstblock = simple_strtoul(colon + 1, NULL, 0);
+	else
+		target_firstblock = 0;
+
+	if (at_symbol) {
+		target_blocksize = (int) simple_strtoul(at_symbol + 1, NULL, 0);
+		if (target_blocksize & (SECTOR_SIZE - 1)) {
+			printk("Filewriter: Blocksizes are multiples of %d.\n", SECTOR_SIZE);
+			result = -EINVAL;
+			goto out;
+		}
+
+		/*
+		 * set_devinfo shifts by the number of block bits, so the
+		 * size must be a power of two that fits in a page.
+		 */
+		if (target_blocksize &&
+		    (target_blocksize < 0 ||
+		     target_blocksize > PAGE_SIZE ||
+		     (target_blocksize & (target_blocksize - 1)))) {
+			printk("Filewriter: Blocksizes are powers of two no larger than %lu.\n",
+					(unsigned long) PAGE_SIZE);
+			result = -EINVAL;
+			goto out;
+		}
+	}
+
+	filewriter_get_target_info(commandline, 0, 1);
+
+	if (!target_bdev || IS_ERR(target_bdev)) {
+		target_bdev = NULL;
+		result = -1;
+		goto out;
+	}
+
+	/* generic_ffs counts bits from one; block bits count from zero. */
+	if (target_blocksize)
+		set_devinfo(target_bdev, generic_ffs(target_blocksize) - 1);
+
+	result = __test_filewriter_target(commandline, 1);
+
+out:
+	if (colon)
+		*colon = ':';
+	if (at_symbol)
+		*at_symbol = '@';
+
+	return result;
+}
+
+/*
filewriter_save_config_info
+ *
+ * Description:	Save the target's name, not for resume time, but for all_settings.
+ * Arguments:	Buffer:		Pointer to a buffer of size PAGE_SIZE.
+ * Returns:	Number of bytes used for saving our data.
+ */
+
+static int filewriter_save_config_info(char *buffer)
+{
+	strcpy(buffer, filewriter_target);
+	return strlen(filewriter_target) + 1;
+}
+
+/* filewriter_load_config_info
+ *
+ * Description:	Reload target's name.
+ * Arguments:	Buffer:		Pointer to the start of the data.
+ *		Size:		Number of bytes that were saved.
+ */
+
+static void filewriter_load_config_info(char *buffer, int size)
+{
+	/* Never copy more than was saved or than the target name can hold. */
+	int len = min(size, (int) sizeof(filewriter_target) - 1);
+
+	if (len < 0)
+		len = 0;
+
+	strncpy(filewriter_target, buffer, len);
+	filewriter_target[len] = 0;
+}
+
+/*
+ * Module initialisation hook. At the start of a cycle this does nothing
+ * unless we are the active writer, and insists on a target name and a
+ * valid signature. In every other case it (re)opens the target.
+ * Returns 0 on success, 1 on failure.
+ */
+static int filewriter_initialise(int starting_cycle)
+{
+	int result = 0;
+
+	if (starting_cycle) {
+		if (suspend_active_writer != &filewriterops)
+			return 0;
+
+		if (!*filewriter_target) {
+			printk("Filewriter is the active writer, but no filename has been set.\n");
+			return 1;
+		}
+	}
+
+	/* An empty name is dealt with by filewriter_get_target_info. */
+	filewriter_get_target_info(filewriter_target, starting_cycle, 0);
+
+	if (starting_cycle && (filewriter_image_exists() == -1)) {
+		printk("%s is does not have a valid signature for suspending.\n",
+				filewriter_target);
+		result = 1;
+	}
+
+	return result;
+}
+
+/* Proc entries: the target's name and the switch that disables us. */
+static struct suspend_proc_data filewriter_proc_data[] = {
+
+	{
+	 .filename			= "filewriter_target",
+	 .permissions			= PROC_RW,
+	 .type				= SUSPEND_PROC_DATA_STRING,
+	 .needs_storage_manager		= 2,
+	 .data = {
+		 .string = {
+			 .variable	= filewriter_target,
+			 .max_length	= 256,
+		 }
+	 },
+	 .write_proc			= test_filewriter_target,
+	},
+
+	{ .filename			= "disable_filewriter",
+	  .permissions			= PROC_RW,
+	  .type				= SUSPEND_PROC_DATA_INTEGER,
+	  .data = {
+		.integer = {
+			.variable	= &filewriterops.disabled,
+			.minimum	= 0,
+			.maximum	= 1,
+		}
+	  },
+	  .write_proc			= attempt_to_parse_resume_device2,
+	}
+};
+
+/*
+ * Our entry in the list of modules. The chunk read and write hooks are
+ * filled in from suspend_bio_ops by filewriter_load.
+ */
+static struct suspend_module_ops filewriterops = {
+	.type					= WRITER_PLUGIN,
+	.name					= "File Writer",
+	.module					= THIS_MODULE,
+	.memory_needed				= filewriter_memory_needed,
+	.print_debug_info			= filewriter_print_debug_stats,
+	.save_config_info			= filewriter_save_config_info,
+	.load_config_info			= filewriter_load_config_info,
+	.storage_needed				= filewriter_storage_needed,
+	.initialise				= filewriter_initialise,
+	.cleanup				= filewriter_cleanup,
+
+	.ops = {
+		.writer = {
+		 .storage_available	= filewriter_storage_available,
+		 .storage_allocated	= filewriter_storage_allocated,
+		 .release_storage	= filewriter_release_storage,
+		 .allocate_header_space	= filewriter_allocate_header_space,
+		 .allocate_storage	= filewriter_allocate_storage,
+		 .image_exists		= filewriter_image_exists,
+		 .mark_resume_attempted	= filewriter_mark_resume_attempted,
+		 .write_header_init	= filewriter_write_header_init,
+		 .write_header_cleanup	= filewriter_write_header_cleanup,
+		 .read_header_init	= filewriter_read_header_init,
+		 .read_header_cleanup	= filewriter_read_header_cleanup,
+		 .invalidate_image	= filewriter_invalidate_image,
+		 .parse_sig_location	= filewriter_parse_sig_location,
+		}
+	}
+};
+
+/* ---- Registration ---- */
+
+/*
+ * Borrow the chunk I/O routines from suspend_bio_ops, register the
+ * module and, if that worked, its proc entries. Returns the result of
+ * the registration.
+ */
+static __init int filewriter_load(void)
+{
+	int result;
+	int i, numfiles = sizeof(filewriter_proc_data) / sizeof(struct suspend_proc_data);
+
+	printk("Suspend2 FileWriter loading.\n");
+
+	filewriterops.read_init = suspend_bio_ops.read_init;
+	filewriterops.ops.writer.read_chunk = suspend_bio_ops.read_chunk;
+	filewriterops.read_cleanup = suspend_bio_ops.read_cleanup;
+	filewriterops.write_init = suspend_bio_ops.write_init;
+	filewriterops.ops.writer.write_chunk = suspend_bio_ops.write_chunk;
+	filewriterops.write_cleanup = suspend_bio_ops.write_cleanup;
+	filewriterops.ops.writer.read_header_chunk =
+		suspend_bio_ops.read_header_chunk;
+	filewriterops.ops.writer.write_header_chunk =
+		suspend_bio_ops.write_header_chunk;
+
+	if (!(result = suspend_register_module(&filewriterops))) {
+		for (i=0; i< numfiles; i++)
+			suspend_register_procfile(&filewriter_proc_data[i]);
+	} else
+		printk("Suspend2 FileWriter unable to register!\n");
+
+	return result;
+}
+
+#ifdef MODULE
+/* Remove the proc entries and unregister the module. */
+static __exit void filewriter_unload(void)
+{
+	int i, numfiles = sizeof(filewriter_proc_data) / sizeof(struct suspend_proc_data);
+
+	printk("Suspend2 FileWriter unloading.\n");
+
+	for (i=0; i< numfiles; i++)
+		suspend_unregister_procfile(&filewriter_proc_data[i]);
+	suspend_unregister_module(&filewriterops);
+}
+
+module_init(filewriter_load);
+module_exit(filewriter_unload);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Nigel Cunningham");
+MODULE_DESCRIPTION("Suspend2 filewriter");
+#else
+late_initcall(filewriter_load);
+#endif
diff -urN oldtree/kernel/power/suspend_swap.c newtree/kernel/power/suspend_swap.c
--- oldtree/kernel/power/suspend_swap.c	1970-01-01 00:00:00.000000000 +0000
+++ newtree/kernel/power/suspend_swap.c	2006-02-18 15:24:31.424811632 +0000
@@ -0,0 +1,1213 @@
+/*
+ * Swapwriter.c
+ *
+ * Copyright 2004-2005 Nigel Cunningham
+ *
+ * Distributed under GPLv2.
+ *
+ * This file encapsulates functions for usage of swap space as a
+ * backing store.
+ */
+
+/*
+ * NOTE(review): the header names of the five #include lines below are
+ * missing from this copy of the patch; restore them from the original.
+ */
+#include
+#include
+#include
+#include
+#include
+
+#include "suspend2.h"
+#include "suspend2_common.h"
+#include "version.h"
+#include "proc.h"
+#include "modules.h"
+#include "io.h"
+#include "ui.h"
+#include "extent.h"
+#include "block_io.h"
+
+static struct suspend_module_ops swapwriterops;
+
+#define SIGNATURE_VER 6
+
+/* --- Struct of pages stored on disk */
+
+union diskpage {
+	union swap_header swh;	/* swh.magic is the only member used */
+};
+
+union p_diskpage {
+	union diskpage *pointer;
+	char *ptr;
+	unsigned long address;
+};
+
+/* Devices used for swap */
+static struct suspend_bdev_info devinfo[MAX_SWAPFILES];
+
+/* Extent chains for swap & blocks */
+struct extent_chain swapextents;
+struct extent_chain block_chain[MAX_SWAPFILES];
+
+static dev_t header_dev_t;
+static struct block_device *header_block_device;
+static unsigned long headerblock;
+
+/* For swapfile automatically swapon/off'd.
*/ +static char swapfilename[SWAP_FILENAME_MAXLENGTH] = ""; +extern asmlinkage long sys_swapon(const char *specialfile, int swap_flags); +extern asmlinkage long sys_swapoff(const char *specialfile); +static int suspend_swapon_status; + +/* Header Page Information */ +static int header_pages_allocated; + +/* User Specified Parameters. */ + +static unsigned long resume_firstblock; +static int resume_blocksize; +static dev_t resume_dev_t; +static struct block_device *resume_block_device; + +struct sysinfo swapinfo; +static int swapwriter_invalidate_image(void); + +/* Block devices open. */ +struct bdev_opened +{ + dev_t device; + struct block_device *bdev; + int set_swapinfo; + int claimed; +}; + +/* + * Entry MAX_SWAPFILES is the resume block device, which may + * not be a swap device enabled when we suspend. + * Entry MAX_SWAPFILES + 1 is the header block device, which + * is needed before we find out which slot it occupies. + */ +static struct bdev_opened *bdev_info_list[MAX_SWAPFILES + 2]; + +static void close_bdev(int i) +{ + struct bdev_opened *this = bdev_info_list[i]; + + if (this->claimed) + bd_release(this->bdev); + + /* Release our reference. */ + blkdev_put(this->bdev); + + if (this->set_swapinfo) + swap_info[i].bdev = NULL; + + /* Free our info. 
*/ + kfree(this); + + bdev_info_list[i] = NULL; +} + +static void close_bdevs(void) +{ + int i; + + for (i = 0; i < MAX_SWAPFILES; i++) + if (bdev_info_list[i]) + close_bdev(i); + + resume_block_device = header_block_device = NULL; +} + +static struct block_device *open_bdev(int index, dev_t device) +{ + struct bdev_opened *this; + struct block_device *bdev; + + if (bdev_info_list[index] && (bdev_info_list[index]->device == device)) { + bdev = bdev_info_list[index]->bdev; + return bdev; + } + + if (bdev_info_list[index] && bdev_info_list[index]->device != device) + close_bdev(index); + + bdev = open_by_devnum(device, FMODE_READ); + + if (IS_ERR(bdev) || !bdev) { + suspend_early_boot_message(1,SUSPEND_CONTINUE_REQ, + "Failed to get access to block device " + "%d.\n You could be " + "booting with a 2.6 kernel when you " + "suspended a 2.4 kernel."); + return ERR_PTR(-EINVAL); + } + + this = kmalloc(sizeof(struct bdev_opened), GFP_KERNEL); + BUG_ON(!this); + + bdev_info_list[index] = this; + this->device = device; + this->bdev = bdev; + if ((index < MAX_SWAPFILES) && !swap_info[index].bdev) { + this->set_swapinfo = 1; + devinfo[index].bdev = swap_info[index].bdev = bdev; + } + + return bdev; +} + +/* Must be silent - might be called from cat /proc/suspend/debug_info + * Returns 0 if was off, -EBUSY if was on, error value otherwise. + */ +static int enable_swapfile(void) +{ + int activateswapresult = -EINVAL; + + if (suspend_swapon_status) + return 0; + + if (swapfilename[0]) { + /* Attempt to swap on with maximum priority */ + activateswapresult = sys_swapon(swapfilename, 0xFFFF); + if ((activateswapresult) && (activateswapresult != -EBUSY)) + printk(name_suspend + "The swapfile/partition specified by " + "/proc/suspend/swapfile (%s) could not" + " be turned on (error %d). 
Attempting " + "to continue.\n", + swapfilename, activateswapresult); + if (!activateswapresult) + suspend_swapon_status = 1; + } + return activateswapresult; +} + +/* Returns 0 if was on, -EINVAL if was off, error value otherwise */ +static int disable_swapfile(void) +{ + int result = -EINVAL; + + if (!suspend_swapon_status) + return 0; + + if (swapfilename[0]) { + result = sys_swapoff(swapfilename); + if (result == -EINVAL) + return 0; /* Wasn't on */ + if (!result) + suspend_swapon_status = 0; + } + + return result; +} + +static int try_to_parse_resume_device(char *commandline) +{ + struct kstat stat; + int error; + + resume_dev_t = name_to_dev_t(commandline); + + if (!resume_dev_t) { + error = vfs_stat(commandline, &stat); + if (!error) + resume_dev_t = stat.rdev; + } + + if (!resume_dev_t) { + if (test_suspend_state(SUSPEND_TRYING_TO_RESUME)) + suspend_early_boot_message(1, SUSPEND_CONTINUE_REQ, + "Failed to translate \"%s\" into a device id.\n", + commandline); + else + printk(name_suspend + "Can't translate \"%s\" into a device id yet.\n", + commandline); + return 1; + } + + if (IS_ERR(resume_block_device = + open_bdev(MAX_SWAPFILES, resume_dev_t))) { + suspend_early_boot_message(1, SUSPEND_CONTINUE_REQ, + "Failed to get access to \"%s\", where" + " the swap header should be found.", + commandline); + return 1; + } + + return 0; +} + +/* + * If we have read part of the image, we might have filled memory with + * data that should be zeroed out. 
+ */ +static void swapwriter_noresume_reset(void) +{ + memset((char *) &devinfo, 0, sizeof(devinfo)); + close_bdevs(); +} + +static int parse_signature(char *header, int restore) +{ + int type = -1; + + if (!memcmp("SWAP-SPACE",header,10)) + return 0; + else if (!memcmp("SWAPSPACE2",header,10)) + return 1; + + else if (!memcmp("S1SUSP",header,6)) + type = 4; + else if (!memcmp("S2SUSP",header,6)) + type = 5; + + else if (!memcmp("z",header,1)) + type = 12; + else if (!memcmp("Z",header,1)) + type = 13; + + /* + * Put bdev of suspend header in last byte of swap header + * (unsigned short) + */ + if (type > 11) { + dev_t *header_ptr = (dev_t *) &header[1]; + unsigned char *headerblocksize_ptr = + (unsigned char *) &header[5]; + u32 *headerblock_ptr = (u32 *) &header[6]; + header_dev_t = *header_ptr; + /* + * We are now using the highest bit of the char to indicate + * whether we have attempted to resume from this image before. + */ + clear_suspend_state(SUSPEND_RESUMED_BEFORE); + if (((int) *headerblocksize_ptr) & 0x80) + set_suspend_state(SUSPEND_RESUMED_BEFORE); + headerblock = (unsigned long) *headerblock_ptr; + } + + if ((restore) && (type > 5)) { + /* We only reset our own signatures */ + if (type & 1) + memcpy(header,"SWAPSPACE2",10); + else + memcpy(header,"SWAP-SPACE",10); + } + + return type; +} + +/* + * prepare_signature + */ + +static int prepare_signature(dev_t bdev, unsigned long block, + char *current_header) +{ + int current_type = parse_signature(current_header, 0); + dev_t *header_ptr = (dev_t *) (¤t_header[1]); + unsigned long *headerblock_ptr = + (unsigned long *) (¤t_header[6]); + + if ((current_type > 1) && (current_type < 6)) + return 1; + + /* At the moment, I don't have a way to handle the block being + * > 32 bits. Not enough room in the signature and no way to + * safely put the data elsewhere. */ + + if (BITS_PER_LONG == 64 && ffs(block) > 31) { + suspend_prepare_status(DONT_CLEAR_BAR, + "Header sector requires 33+ bits. 
" + "Would not be able to resume."); + return 1; + } + + if (current_type & 1) + current_header[0] = 'Z'; + else + current_header[0] = 'z'; + *header_ptr = bdev; + /* prev is the first/last swap page of the resume area */ + *headerblock_ptr = (unsigned long) block; + return 0; +} + +static int swapwriter_allocate_storage(int space_requested); + +static int swapwriter_allocate_header_space(int space_requested) +{ + int i; + + if (!swapextents.size) + swapwriter_allocate_storage(space_requested); + + extent_state_goto_start(&suspend_writer_posn); + + for (i = 0; i < space_requested; i++) { + if (suspend_bio_ops.forward_one_page()) { + printk("Out of space while seeking to allocate header pages,\n"); + return -ENOSPC; + } + + header_pages_allocated++; + } + + /* The end of header pages will be the start of pageset 2 */ + extent_state_save(&suspend_writer_posn, &suspend_writer_posn_save[2]); + return 0; +} + +static void get_main_pool_phys_params(void) +{ + struct extent *extentpointer = NULL; + unsigned long address; + int i, extent_min = -1, extent_max = -1, last_chain = -1; + int prev_header_pages_allocated; + + for (i = 0; i < MAX_SWAPFILES; i++) + if (block_chain[i].first) + put_extent_chain(&block_chain[i]); + + extent_for_each(&swapextents, extentpointer, address) { + swp_entry_t swap_address = extent_val_to_swap_entry(address); + unsigned swapfilenum = swp_type(swap_address); + pgoff_t offset = swp_offset(swap_address); + struct swap_info_struct *sis = get_swap_info_struct(swapfilenum); + sector_t new_sector = map_swap_page(sis, offset); + + if ((new_sector == extent_max + 1) && + (last_chain == swapfilenum)) + extent_max++; + else { + if (extent_min > -1) { + if (test_action_state(SUSPEND_TEST_BIO)) + printk("Adding extent %d-%d.\n", + extent_min << + devinfo[last_chain].bmap_shift, + extent_max << + devinfo[last_chain].bmap_shift); + + append_extent_to_extent_chain( + &block_chain[last_chain], + extent_min, extent_max); + } + extent_min = extent_max = 
new_sector; + last_chain = swapfilenum; + } + } + + if (extent_min > -1) { + if (test_action_state(SUSPEND_TEST_BIO)) + printk("Adding extent %d-%d.\n", + extent_min << + devinfo[last_chain].bmap_shift, + extent_max << + devinfo[last_chain].bmap_shift); + append_extent_to_extent_chain( + &block_chain[last_chain], + extent_min, extent_max); + } + + prev_header_pages_allocated = header_pages_allocated; + header_pages_allocated = 0; + swapwriter_allocate_header_space(prev_header_pages_allocated); +} + +static int swapwriter_storage_allocated(void) +{ + return swapextents.size; +} + +static int swapwriter_storage_available(void) +{ + si_swapinfo(&swapinfo); + return swapinfo.freeswap + swapwriter_storage_allocated(); +} + +static int swapwriter_initialise(int starting_cycle) +{ + if (starting_cycle) { + enable_swapfile(); + + if (resume_dev_t && !resume_block_device && + IS_ERR(resume_block_device = + open_bdev(MAX_SWAPFILES, resume_dev_t))) + return 1; + } + + return 0; +} + +static void swapwriter_cleanup(int ending_cycle) +{ + if (ending_cycle) + disable_swapfile(); + + close_bdevs(); +} + +static int swapwriter_release_storage(void) +{ + int i = 0; + + if ((test_action_state(SUSPEND_KEEP_IMAGE)) && + test_suspend_state(SUSPEND_NOW_RESUMING)) + return 0; + + header_pages_allocated = 0; + + if (swapextents.first) { + /* Free swap entries */ + struct extent *extentpointer; + unsigned long extentvalue; + swp_entry_t entry; + extent_for_each(&swapextents, extentpointer, + extentvalue) { + entry = extent_val_to_swap_entry(extentvalue); + swap_free(entry); + } + + put_extent_chain(&swapextents); + + for (i = 0; i < MAX_SWAPFILES; i++) + if (block_chain[i].first) + put_extent_chain(&block_chain[i]); + } + + return 0; +} + +static int swapwriter_allocate_storage(int space_requested) +{ + int i, result = 0, first = 1; + int pages_to_get = space_requested - swapextents.size; + unsigned long extent_min = 0, extent_max = 0; + + if (pages_to_get < 1) + return 0; + + for (i=0; i 
< MAX_SWAPFILES; i++) { + if ((devinfo[i].bdev = swap_info[i].bdev)) + devinfo[i].dev_t = swap_info[i].bdev->bd_dev; + devinfo[i].bmap_shift = 3; + devinfo[i].blocks_per_page = 1; + } + + for(i=0; i < pages_to_get; i++) { + swp_entry_t entry; + unsigned long new_value; + + entry = get_swap_page(); + if (!entry.val) { + printk("Failed to get a swap page.\n"); + result = -ENOSPC; + break; + } + + new_value = swap_entry_to_extent_val(entry); + if (first) { + first = 0; + extent_min = extent_max = new_value; + } else { + if (new_value == extent_max + 1) + extent_max++; + else { + append_extent_to_extent_chain( + &swapextents, + extent_min, extent_max); + extent_min = extent_max = new_value; + } + } + } + + if (!first) + append_extent_to_extent_chain( + &swapextents, + extent_min, extent_max); + + get_main_pool_phys_params(); + return result; +} + +static int swapwriter_write_header_init(void) +{ + int i, result; + + extent_state_goto_start(&suspend_writer_posn); + /* Forward one page will be done prior to the read */ + + for (i = 0; i < MAX_SWAPFILES; i++) + if (swap_info[i].swap_file) + devinfo[i].dev_t = swap_info[i].bdev->bd_dev; + else + devinfo[i].dev_t = (dev_t) 0; + + suspend_writer_buffer = (char *) get_zeroed_page(GFP_ATOMIC); + if (!suspend_writer_buffer) { + printk("Failed to get swapwriter buffer.\n"); + return -ENOMEM; + } + + suspend_writer_buffer_posn = 0; + + /* Info needed to bootstrap goes at the start of the header. + * First we save the positions and devinfo, including the number + * of header pages. Then we save the structs containing data needed + * for reading the header pages back. + * Note that even if header pages take more than one page, when we + * read back the info, we will have restored the location of the + * next header page by the time we go to use it. 
+ */ + if ((result = suspend_bio_ops.write_header_chunk((char *) &suspend_writer_posn_save, + sizeof(suspend_writer_posn_save)))) + return result; + + if ((result = suspend_bio_ops.write_header_chunk((char *) &devinfo, + sizeof(devinfo)))) + return result; + + for (i=0; i < MAX_SWAPFILES; i++) + serialise_extent_chain(&block_chain[i]); + + return 0; +} + +static int swapwriter_write_header_cleanup(void) +{ + int result; + + /* Write any unsaved data */ + if (suspend_writer_buffer_posn) + suspend_bio_ops.write_header_chunk_finish(); + + extent_state_goto_start(&suspend_writer_posn); + suspend_bio_ops.forward_one_page(); + + /* Adjust swap header */ + suspend_bio_ops.bdev_page_io(READ, resume_block_device, + resume_firstblock, + virt_to_page(suspend_writer_buffer)); + + result = prepare_signature(swap_info[suspend_writer_posn.current_chain].bdev->bd_dev, + suspend_writer_posn.current_offset, + ((union swap_header *) suspend_writer_buffer)->magic.magic); + + if (!result) + suspend_bio_ops.bdev_page_io(WRITE, resume_block_device, + resume_firstblock, + virt_to_page(suspend_writer_buffer)); + + free_page((unsigned long) suspend_writer_buffer); + suspend_writer_buffer = NULL; + + suspend_bio_ops.finish_all_io(); + + return result; +} + +/* ------------------------- HEADER READING ------------------------- */ + +/* + * read_header_init() + * + * Description: + * 1. Attempt to read the device specified with resume2=. + * 2. Check the contents of the swap header for our signature. + * 3. Warn, ignore, reset and/or continue as appropriate. + * 4. If continuing, read the swapwriter configuration section + * of the header and set up block device info so we can read + * the rest of the header & image. + * + * Returns: + * May not return if user choose to reboot at a warning. + * -EINVAL if cannot resume at this time. Booting should continue + * normally. 
+ */ + +static int swapwriter_read_header_init(void) +{ + int i; + + BUG_ON(!resume_block_device); + BUG_ON(!resume_dev_t); + + suspend_writer_buffer = (char *) get_zeroed_page(GFP_ATOMIC); + + BUG_ON(!suspend_writer_buffer); + + if (!header_dev_t) { + printk("read_header_init called when we haven't " + "verified there is an image!\n"); + return -EINVAL; + } + + /* + * If the header is not on the resume_dev_t, get the resume device first. + */ + if (header_dev_t != resume_dev_t) { + header_block_device = open_bdev(MAX_SWAPFILES + 1, + header_dev_t); + + if (IS_ERR(header_block_device)) + return PTR_ERR(header_block_device); + } else + header_block_device = resume_block_device; + + /* + * Read swapwriter configuration. + * Headerblock size taken into account already. + */ + suspend_bio_ops.bdev_page_io(READ, header_block_device, + headerblock << 3, + virt_to_page((unsigned long) suspend_writer_buffer)); + + memcpy(&suspend_writer_posn_save, suspend_writer_buffer, 3 * sizeof(struct extent_iterate_saved_state)); + + suspend_writer_buffer_posn = 3 * sizeof(struct extent_iterate_saved_state); + + memcpy(&devinfo, suspend_writer_buffer + suspend_writer_buffer_posn, sizeof(devinfo)); + + suspend_writer_buffer_posn += sizeof(devinfo); + + /* Restore device info */ + for (i = 0; i < MAX_SWAPFILES; i++) { + dev_t thisdevice = devinfo[i].dev_t; + struct block_device *result; + + devinfo[i].bdev = swap_info[i].bdev = NULL; + + if (!thisdevice) + continue; + + if (thisdevice == resume_dev_t) { + devinfo[i].bdev = swap_info[i].bdev = resume_block_device; + bdev_info_list[i] = bdev_info_list[MAX_SWAPFILES]; + BUG_ON(!bdev_info_list[i]); + bdev_info_list[i]->set_swapinfo = 1; + bdev_info_list[MAX_SWAPFILES] = NULL; + continue; + } + + if (thisdevice == header_dev_t) { + devinfo[i].bdev = swap_info[i].bdev = header_block_device; + bdev_info_list[i] = bdev_info_list[MAX_SWAPFILES + 1]; + BUG_ON(!bdev_info_list[i]); + bdev_info_list[i]->set_swapinfo = 1; + 
bdev_info_list[MAX_SWAPFILES + 1] = NULL; + continue; + } + + result = open_bdev(i, thisdevice); + if (IS_ERR(result)) { + close_bdevs(); + return PTR_ERR(result); + } + } + + extent_state_goto_start(&suspend_writer_posn); + *(suspend_bio_ops.need_extra_next) = 1; + + for (i = 0; i < MAX_SWAPFILES; i++) + load_extent_chain(&block_chain[i]); + + return 0; +} + +static int swapwriter_read_header_cleanup(void) +{ + free_page((unsigned long) suspend_writer_buffer); + return 0; +} + +/* swapwriter_invalidate_image + * + */ +static int swapwriter_invalidate_image(void) +{ + union p_diskpage cur; + int result = 0; + char newsig[11]; + + cur.address = get_zeroed_page(GFP_ATOMIC); + if (!cur.address) { + printk("Unable to allocate a page for restoring the swap signature.\n"); + return -ENOMEM; + } + + /* + * If nr_suspends == 0, we must be booting, so no swap pages + * will be recorded as used yet. + */ + + if (nr_suspends > 0) + swapwriter_release_storage(); + + /* + * We don't do a sanity check here: we want to restore the swap + * whatever version of kernel made the suspend image. + * + * We need to write swap, but swap may not be enabled so + * we write the device directly + */ + + suspend_bio_ops.bdev_page_io(READ, resume_block_device, + resume_firstblock, + virt_to_page(cur.pointer)); + + result = parse_signature(cur.pointer->swh.magic.magic, 1); + + if (result < 4) + goto out; + + strncpy(newsig, cur.pointer->swh.magic.magic, 10); + newsig[10] = 0; + + suspend_bio_ops.bdev_page_io(WRITE, resume_block_device, + resume_firstblock, + virt_to_page(cur.pointer)); + + if (!nr_suspends) + printk(KERN_WARNING name_suspend "Image invalidated.\n"); +out: + suspend_bio_ops.finish_all_io(); + free_page(cur.address); + return 0; +} + +/* + * workspace_size + * + * Description: + * Returns the number of bytes of RAM needed for this + * code to do its work. (Used when calculating whether + * we have enough memory to be able to suspend & resume). 
+ *
+ */
+static unsigned long swapwriter_memory_needed(void)
+{
+	/* A fixed figure, whatever the size of the image. */
+	return 1;
+}
+
+/*
+ * swapwriter_print_debug_stats
+ *
+ * Describe the swapwriter's state in 'buffer' (at most 'size' bytes):
+ * whether we are the active writer and, if so, the swapfile we turn on
+ * automatically and the amount of swap available for an image.
+ *
+ * Returns the number of bytes used in the buffer.
+ */
+
+static int swapwriter_print_debug_stats(char *buffer, int size)
+{
+	struct sysinfo si;
+	int used;
+
+	/* Another writer is in use: one line says it all. */
+	if (suspend_active_writer != &swapwriterops)
+		return snprintf_used(buffer, size, "- Swapwriter inactive.\n");
+
+	used = snprintf_used(buffer, size, "- Swapwriter active.\n");
+
+	if (swapfilename[0])
+		used += snprintf_used(buffer + used, size - used,
+			" Attempting to automatically swapon: %s.\n",
+			swapfilename);
+
+	si_swapinfo(&si);
+
+	used += snprintf_used(buffer + used, size - used,
+		" Swap available for image: %ld pages.\n",
+		si.freeswap + swapwriter_storage_allocated());
+
+	return used;
+}
+
+/*
+ * Storage needed
+ *
+ * Returns amount of space in the swap header required
+ * for the swapwriter's data. This ignores the links between
+ * pages, which we factor in when allocating the space.
+ *
+ * We ensure the space is allocated, but actually save the
+ * data from write_header_init and therefore don't also define a
+ * save_config_info routine.
+ */
+static unsigned long swapwriter_storage_needed(void)
+{
+	return sizeof(suspend_writer_posn_save) + sizeof(devinfo);
+}
+
+/*
+ * Image_exists
+ *
+ * Read the swap header from the resume device and check its signature.
+ *
+ * Returns 1 if one of our signatures is found (even one of a different
+ * signature version, in which case the user is warned). Returns 0 for
+ * plain swap, an unrecognised signature, another implementation's
+ * signature, or if the header can't be read at all.
+ */
+
+static int swapwriter_image_exists(void)
+{
+	int signature_found;
+	union p_diskpage diskpage;
+
+	if (!resume_dev_t) {
+		printk("Not even trying to read header "
+				"because resume_dev_t is not set.\n");
+		return 0;
+	}
+
+	if (!resume_block_device &&
+	    IS_ERR(resume_block_device = open_bdev(MAX_SWAPFILES, resume_dev_t)))
+		return 0;
+
+	diskpage.address = get_zeroed_page(GFP_ATOMIC);
+	if (!diskpage.address) {
+		printk(KERN_ERR name_suspend
+			"Unable to allocate a page for reading the swap "
+			"header.\n");
+		return 0;
+	}
+
+	suspend_bio_ops.bdev_page_io(READ, resume_block_device,
+			resume_firstblock,
+			virt_to_page(diskpage.ptr));
+	suspend_bio_ops.finish_all_io();
+
+	signature_found = parse_signature(diskpage.pointer->swh.magic.magic, 0);
+	free_page(diskpage.address);
+
+	/*
+	 * -1 (unrecognised) has to be tested before the "< 2" range
+	 * check, which would otherwise swallow it silently.
+	 */
+	if (signature_found == -1) {
+		printk(KERN_ERR name_suspend
+			"Unable to find a signature. Could you have moved "
+			"a swap file?\n");
+		return 0;
+	} else if (signature_found < 2) {
+		return 0;	/* Normal swap space */
+	} else if (signature_found < 6) {
+		if ((!(test_suspend_state(SUSPEND_NORESUME_SPECIFIED)))
+				&& suspend_early_boot_message(1,
+				SUSPEND_CONTINUE_REQ,
+				"Detected the signature of an alternate "
+				"implementation.\n"))
+			set_suspend_state(SUSPEND_NORESUME_SPECIFIED);
+		return 0;
+	} else if ((signature_found >> 1) != SIGNATURE_VER) {
+		if ((!(test_suspend_state(SUSPEND_NORESUME_SPECIFIED))) &&
+			suspend_early_boot_message(1, SUSPEND_CONTINUE_REQ,
+			 "Found a different style suspend image signature."))
+			set_suspend_state(SUSPEND_NORESUME_SPECIFIED);
+	}
+
+	return 1;
+}
+
+/*
+ * Mark resume attempted.
+ *
+ * Record that we tried to resume from this image.
+ */
+
+static void swapwriter_mark_resume_attempted(void)
+{
+	union p_diskpage diskpage;
+	int signature_found;
+
+	if (!resume_dev_t) {
+		printk("Not even trying to record attempt at resuming"
+				" because resume_dev_t is not set.\n");
+		return;
+	}
+
+	diskpage.address = get_zeroed_page(GFP_ATOMIC);
+	if (!diskpage.address) {
+		printk(KERN_ERR name_suspend
+			"Unable to allocate a page for recording the "
+			"resume attempt.\n");
+		goto out;
+	}
+
+	/*
+	 * NOTE(review): the page is parsed straight after the READ, as
+	 * in swapwriter_invalidate_image(), so bdev_page_io() is assumed
+	 * to complete reads before returning - confirm in block_io.c.
+	 */
+	suspend_bio_ops.bdev_page_io(READ, resume_block_device,
+			resume_firstblock,
+			virt_to_page(diskpage.ptr));
+	signature_found = parse_signature(diskpage.pointer->swh.magic.magic, 0);
+
+	switch (signature_found) {
+		case 12:
+		case 13:
+			/* Top bit of byte 5: "resumed before" flag. */
+			diskpage.pointer->swh.magic.magic[5] |= 0x80;
+
+			suspend_bio_ops.bdev_page_io(WRITE,
+					resume_block_device,
+					resume_firstblock,
+					virt_to_page(diskpage.ptr));
+			break;
+		default:
+			/*
+			 * Not our signature: the page wasn't changed, so
+			 * there is no reason to write to the device.
+			 */
+			break;
+	}
+
+	suspend_bio_ops.finish_all_io();
+	free_page(diskpage.address);
+
+out:
+	close_bdevs();
+	return;
+}
+
+/*
+ * Parse Image Location
+ *
+ * Attempt to parse a resume2= parameter.
+ * Swap Writer accepts:
+ * resume2=swap:DEVNAME[:FIRSTBLOCK][@BLOCKSIZE]
+ *
+ * Where:
+ * DEVNAME is convertable to a dev_t by name_to_dev_t
+ * FIRSTBLOCK is the location of the first block in the swap file
+ * (specifying for a swap partition is nonsensical but not prohibited).
+ * Data is validated by attempting to read a swap header from the
+ * location given. Failure will result in swapwriter refusing to
+ * save an image, and a reboot with correct parameters will be
+ * necessary.
+ */ + +static int swapwriter_parse_sig_location(char *commandline, int only_writer) +{ + char *thischar, *devstart, *colon = NULL, *at_symbol = NULL; + union p_diskpage diskpage; + int signature_found, result = -EINVAL, temp_result; + + if (strncmp(commandline, "swap:", 5)) { + if (!only_writer) + return 1; + } else + commandline += 5; + + devstart = thischar = commandline; + while ((*thischar != ':') && (*thischar != '@') && + ((thischar - commandline) < 250) && (*thischar)) + thischar++; + + if (*thischar == ':') { + colon = thischar; + *colon = 0; + thischar++; + } + + while ((*thischar != '@') && ((thischar - commandline) < 250) && (*thischar)) + thischar++; + + if (*thischar == '@') { + at_symbol = thischar; + *at_symbol = 0; + } + + if (colon) + resume_firstblock = (int) simple_strtoul(colon + 1, NULL, 0); + else + resume_firstblock = 0; + + /* Legacy */ + if (at_symbol) { + resume_blocksize = (int) simple_strtoul(at_symbol + 1, NULL, 0); + if (resume_blocksize & (SECTOR_SIZE - 1)) { + printk("Swapwriter: Blocksizes are multiples of %d!\n", SECTOR_SIZE); + return -EINVAL; + } + resume_firstblock = resume_firstblock * (resume_blocksize / SECTOR_SIZE); + } + + temp_result = try_to_parse_resume_device(devstart); + + if (colon) + *colon = ':'; + if (at_symbol) + *at_symbol = '@'; + + if (temp_result) + return -EINVAL; + + diskpage.address = get_zeroed_page(GFP_ATOMIC); + if (!diskpage.address) { + printk(KERN_ERR name_suspend "Swapwriter: Failed to allocate a diskpage for I/O.\n"); + return -ENOMEM; + } + + temp_result = suspend_bio_ops.bdev_page_io(READ, + resume_block_device, + resume_firstblock, + virt_to_page(diskpage.ptr)); + + suspend_bio_ops.finish_all_io(); + + if (temp_result) { + printk(KERN_ERR name_suspend "Swapwriter: Failed to submit I/O.\n"); + goto invalid; + } + + signature_found = parse_signature(diskpage.pointer->swh.magic.magic, 0); + + if (signature_found != -1) { + printk(name_suspend "Swapwriter: Signature found.\n"); + result = 0; + + 
suspend_bio_ops.set_devinfo(devinfo); + suspend_writer_posn.chains = &block_chain[0]; + suspend_writer_posn.num_chains = MAX_SWAPFILES; + } else + printk(KERN_ERR name_suspend "Swapwriter: No swap signature found at specified location.\n"); +invalid: + free_page((unsigned long) diskpage.address); + return result; + +} + +static int header_locations_read_proc(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + int i, printedpartitionsmessage = 0, len = 0, haveswap = 0; + struct inode *swapf = 0; + int zone; + char *path_page = (char *) __get_free_page(GFP_KERNEL); + char *path; + int path_len; + + *eof = 1; + if (!page) + return 0; + + for (i = 0; i < MAX_SWAPFILES; i++) { + if (!swap_info[i].swap_file) + continue; + + if (S_ISBLK(swap_info[i].swap_file->f_mapping->host->i_mode)) { + haveswap = 1; + if (!printedpartitionsmessage) { + len += sprintf(page + len, + "For swap partitions, simply use the format: resume2=swap:/dev/hda1.\n"); + printedpartitionsmessage = 1; + } + } else { + path_len = 0; + + path = d_path( swap_info[i].swap_file->f_dentry, + swap_info[i].swap_file->f_vfsmnt, + path_page, + PAGE_SIZE); + path_len = snprintf(path_page, 31, "%s", path); + + haveswap = 1; + swapf = swap_info[i].swap_file->f_mapping->host; + if (!(zone = bmap(swapf,0))) { + len+= sprintf(page + len, + "Swapfile %s has been corrupted. 
Reuse mkswap on it and try again.\n", + path_page); + } else { + char name_buffer[255]; + len+= sprintf(page + len, "For swapfile `%s`, use resume2=swap:/dev/%s:0x%x.\n", + path_page, + bdevname(swap_info[i].bdev, name_buffer), + zone << (swapf->i_blkbits - 9)); + } + + } + } + + if (!haveswap) + len = sprintf(page, "You need to turn on swap partitions before examining this file.\n"); + + free_page((unsigned long) path_page); + return len; +} + +static struct suspend_proc_data swapwriter_proc_data[] = { + { + .filename = "swapfilename", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_STRING, + .data = { + .string = { + .variable = swapfilename, + .max_length = 255, + } + } + }, + + { + .filename = "headerlocations", + .permissions = PROC_READONLY, + .type = SUSPEND_PROC_DATA_CUSTOM, + .data = { + .special = { + .read_proc = header_locations_read_proc, + } + } + }, + + { .filename = "disable_swapwriter", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_INTEGER, + .data = { + .integer = { + .variable = &swapwriterops.disabled, + .minimum = 0, + .maximum = 1, + } + }, + .write_proc = attempt_to_parse_resume_device2, + } +}; + +static struct suspend_module_ops swapwriterops = { + .type = WRITER_PLUGIN, + .name = "Swap Writer", + .module = THIS_MODULE, + .memory_needed = swapwriter_memory_needed, + .print_debug_info = swapwriter_print_debug_stats, + .storage_needed = swapwriter_storage_needed, + .initialise = swapwriter_initialise, + .cleanup = swapwriter_cleanup, + + .ops = { + .writer = { + .noresume_reset = swapwriter_noresume_reset, + .storage_available = swapwriter_storage_available, + .storage_allocated = swapwriter_storage_allocated, + .release_storage = swapwriter_release_storage, + .allocate_header_space = swapwriter_allocate_header_space, + .allocate_storage = swapwriter_allocate_storage, + .image_exists = swapwriter_image_exists, + .mark_resume_attempted = swapwriter_mark_resume_attempted, + .write_header_init = swapwriter_write_header_init, + 
.write_header_cleanup = swapwriter_write_header_cleanup, + .read_header_init = swapwriter_read_header_init, + .read_header_cleanup = swapwriter_read_header_cleanup, + .invalidate_image = swapwriter_invalidate_image, + .parse_sig_location = swapwriter_parse_sig_location, + } + } +}; + +/* ---- Registration ---- */ +static __init int swapwriter_load(void) +{ + int result; + int i, numfiles = sizeof(swapwriter_proc_data) / sizeof(struct suspend_proc_data); + + printk("Suspend2 Swap Writer loading.\n"); + + swapwriterops.read_init = suspend_bio_ops.read_init; + swapwriterops.ops.writer.read_chunk = suspend_bio_ops.read_chunk; + swapwriterops.read_cleanup = suspend_bio_ops.read_cleanup; + swapwriterops.write_init = suspend_bio_ops.write_init; + swapwriterops.ops.writer.write_chunk = suspend_bio_ops.write_chunk; + swapwriterops.write_cleanup = suspend_bio_ops.write_cleanup; + swapwriterops.ops.writer.read_header_chunk = + suspend_bio_ops.read_header_chunk; + swapwriterops.ops.writer.write_header_chunk = + suspend_bio_ops.write_header_chunk; + + if (!(result = suspend_register_module(&swapwriterops))) { + + for (i=0; i< numfiles; i++) + suspend_register_procfile(&swapwriter_proc_data[i]); + } else + printk("Suspend2 Swap Writer unable to register!\n"); + return result; +} + +#ifdef MODULE +static __exit void swapwriter_unload(void) +{ + int i, numfiles = sizeof(swapwriter_proc_data) / sizeof(struct suspend_proc_data); + + printk("Suspend2 Swap Writer unloading.\n"); + + for (i=0; i< numfiles; i++) + suspend_unregister_procfile(&swapwriter_proc_data[i]); + suspend_unregister_module(&swapwriterops); +} + +module_init(swapwriter_load); +module_exit(swapwriter_unload); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Nigel Cunningham"); +MODULE_DESCRIPTION("Suspend2 swap writer"); +#else +late_initcall(swapwriter_load); +#endif diff -urN oldtree/kernel/power/swsusp.c newtree/kernel/power/swsusp.c --- oldtree/kernel/power/swsusp.c 2006-02-18 15:18:30.089742912 +0000 +++ 
newtree/kernel/power/swsusp.c 2006-02-18 15:24:31.427811176 +0000 @@ -49,9 +49,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -60,6 +58,7 @@ #include #include #include +#include #include #include @@ -68,6 +67,8 @@ #include #include "power.h" +#include "swsusp.h" +#include "suspend.h" /* * Preferred image size in bytes (tunable via /sys/power/image_size). @@ -573,6 +574,8 @@ unsigned int i = 0; char *p = "-\\|/"; + thaw_processes(FREEZER_KERNEL_THREADS); + printk("Shrinking memory... "); do { size = 2 * count_highmem_pages(); @@ -596,6 +599,8 @@ } while (tmp > 0); printk("\bdone (%lu pages freed)\n", pages); + freeze_processes(); + return 0; } diff -urN oldtree/kernel/power/swsusp.c.orig newtree/kernel/power/swsusp.c.orig --- oldtree/kernel/power/swsusp.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/swsusp.c.orig 2006-02-18 15:18:30.000000000 +0000 @@ -0,0 +1,1038 @@ +/* + * linux/kernel/power/swsusp.c + * + * This file provides code to write suspend image to swap and read it back. + * + * Copyright (C) 1998-2001 Gabor Kuti + * Copyright (C) 1998,2001-2005 Pavel Machek + * + * This file is released under the GPLv2. + * + * I'd like to thank the following people for their work: + * + * Pavel Machek : + * Modifications, defectiveness pointing, being with me at the very beginning, + * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. + * + * Steve Doddi : + * Support the possibility of hardware state restoring. + * + * Raph : + * Support for preserving states of network devices and virtual console + * (including X and svgatextmode) + * + * Kurt Garloff : + * Straightened the critical function in order to prevent compilers from + * playing tricks with local variables. + * + * Andreas Mohr + * + * Alex Badea : + * Fixed runaway init + * + * Rafael J. Wysocki + * Added the swap map data structure and reworked the handling of swap + * + * More state savers are welcome. 
Especially for the scsi layer... + * + * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "power.h" + +/* + * Preferred image size in bytes (tunable via /sys/power/image_size). + * When it is set to N, swsusp will do its best to ensure the image + * size will not exceed N bytes, but if that is impossible, it will + * try to create the smallest image possible. + */ +unsigned long image_size = 500 * 1024 * 1024; + +#ifdef CONFIG_HIGHMEM +unsigned int count_highmem_pages(void); +int save_highmem(void); +int restore_highmem(void); +#else +static int save_highmem(void) { return 0; } +static int restore_highmem(void) { return 0; } +static unsigned int count_highmem_pages(void) { return 0; } +#endif + +extern char resume_file[]; + +#define SWSUSP_SIG "S1SUSPEND" + +static struct swsusp_header { + char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; + swp_entry_t image; + char orig_sig[10]; + char sig[10]; +} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; + +static struct swsusp_info swsusp_info; + +/* + * Saving part... 
+ */ + +static unsigned short root_swap = 0xffff; + +static int mark_swapfiles(swp_entry_t start) +{ + int error; + + rw_swap_page_sync(READ, + swp_entry(root_swap, 0), + virt_to_page((unsigned long)&swsusp_header)); + if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || + !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { + memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); + memcpy(swsusp_header.sig,SWSUSP_SIG, 10); + swsusp_header.image = start; + error = rw_swap_page_sync(WRITE, + swp_entry(root_swap, 0), + virt_to_page((unsigned long) + &swsusp_header)); + } else { + pr_debug("swsusp: Partition is not swap space.\n"); + error = -ENODEV; + } + return error; +} + +/* + * Check whether the swap device is the specified resume + * device, irrespective of whether they are specified by + * identical names. + * + * (Thus, device inode aliasing is allowed. You can say /dev/hda4 + * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs] + * and they'll be considered the same device. This is *necessary* for + * devfs, since the resume code can only recognize the form /dev/hda4, + * but the suspend code would see the long name.) + */ +static inline int is_resume_device(const struct swap_info_struct *swap_info) +{ + struct file *file = swap_info->swap_file; + struct inode *inode = file->f_dentry->d_inode; + + return S_ISBLK(inode->i_mode) && + swsusp_resume_device == MKDEV(imajor(inode), iminor(inode)); +} + +static int swsusp_swap_check(void) /* This is called before saving image */ +{ + int i; + + spin_lock(&swap_lock); + for (i = 0; i < MAX_SWAPFILES; i++) { + if (!(swap_info[i].flags & SWP_WRITEOK)) + continue; + if (!swsusp_resume_device || is_resume_device(swap_info + i)) { + spin_unlock(&swap_lock); + root_swap = i; + return 0; + } + } + spin_unlock(&swap_lock); + return -ENODEV; +} + +/** + * write_page - Write one page to a fresh swap location. + * @addr: Address we're writing. + * @loc: Place to store the entry we used. 
+ * + * Allocate a new swap entry and 'sync' it. Note we discard -EIO + * errors. That is an artifact left over from swsusp. It did not + * check the return of rw_swap_page_sync() at all, since most pages + * written back to swap would return -EIO. + * This is a partial improvement, since we will at least return other + * errors, though we need to eventually fix the damn code. + */ +static int write_page(unsigned long addr, swp_entry_t *loc) +{ + swp_entry_t entry; + int error = -ENOSPC; + + entry = get_swap_page_of_type(root_swap); + if (swp_offset(entry)) { + error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr)); + if (!error || error == -EIO) + *loc = entry; + } + return error; +} + +/** + * Swap map-handling functions + * + * The swap map is a data structure used for keeping track of each page + * written to the swap. It consists of many swap_map_page structures + * that each contain an array of MAP_PAGE_SIZE swap entries. + * These structures are linked together with the help of either the + * .next (in memory) or the .next_swap (in swap) member. + * + * The swap map is created during suspend. At that time we need to keep + * it in memory, because we have to free all of the allocated swap + * entries if an error occurs. The memory needed is preallocated + * so that we know in advance if there's enough of it. + * + * The first swap_map_page structure is filled with the swap entries that + * correspond to the first MAP_PAGE_SIZE data pages written to swap and + * so on. After all of the data pages have been written, the order + * of the swap_map_page structures in the map is reversed so that they + * can be read from swap in the original order. This causes the data + * pages to be loaded in exactly the same order in which they have been + * saved. 
+ * + * During resume we only need to use one swap_map_page structure + * at a time, which means that we only need to use two memory pages for + * reading the image - one for reading the swap_map_page structures + * and the second for reading the data pages from swap. + */ + +#define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \ + / sizeof(swp_entry_t)) + +struct swap_map_page { + swp_entry_t entries[MAP_PAGE_SIZE]; + swp_entry_t next_swap; + struct swap_map_page *next; +}; + +static inline void free_swap_map(struct swap_map_page *swap_map) +{ + struct swap_map_page *swp; + + while (swap_map) { + swp = swap_map->next; + free_page((unsigned long)swap_map); + swap_map = swp; + } +} + +static struct swap_map_page *alloc_swap_map(unsigned int nr_pages) +{ + struct swap_map_page *swap_map, *swp; + unsigned n = 0; + + if (!nr_pages) + return NULL; + + pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages); + swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); + swp = swap_map; + for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) { + swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); + swp = swp->next; + if (!swp) { + free_swap_map(swap_map); + return NULL; + } + } + return swap_map; +} + +/** + * reverse_swap_map - reverse the order of pages in the swap map + * @swap_map + */ + +static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map) +{ + struct swap_map_page *prev, *next; + + prev = NULL; + while (swap_map) { + next = swap_map->next; + swap_map->next = prev; + prev = swap_map; + swap_map = next; + } + return prev; +} + +/** + * free_swap_map_entries - free the swap entries allocated to store + * the swap map @swap_map (this is only called in case of an error) + */ +static inline void free_swap_map_entries(struct swap_map_page *swap_map) +{ + while (swap_map) { + if (swap_map->next_swap.val) + swap_free(swap_map->next_swap); + swap_map = swap_map->next; + } +} + +/** + * save_swap_map - 
save the swap map used for tracing the data pages + * stored in the swap + */ + +static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start) +{ + swp_entry_t entry = (swp_entry_t){0}; + int error; + + while (swap_map) { + swap_map->next_swap = entry; + if ((error = write_page((unsigned long)swap_map, &entry))) + return error; + swap_map = swap_map->next; + } + *start = entry; + return 0; +} + +/** + * free_image_entries - free the swap entries allocated to store + * the image data pages (this is only called in case of an error) + */ + +static inline void free_image_entries(struct swap_map_page *swp) +{ + unsigned k; + + while (swp) { + for (k = 0; k < MAP_PAGE_SIZE; k++) + if (swp->entries[k].val) + swap_free(swp->entries[k]); + swp = swp->next; + } +} + +/** + * The swap_map_handle structure is used for handling the swap map in + * a file-alike way + */ + +struct swap_map_handle { + struct swap_map_page *cur; + unsigned int k; +}; + +static inline void init_swap_map_handle(struct swap_map_handle *handle, + struct swap_map_page *map) +{ + handle->cur = map; + handle->k = 0; +} + +static inline int swap_map_write_page(struct swap_map_handle *handle, + unsigned long addr) +{ + int error; + + error = write_page(addr, handle->cur->entries + handle->k); + if (error) + return error; + if (++handle->k >= MAP_PAGE_SIZE) { + handle->cur = handle->cur->next; + handle->k = 0; + } + return 0; +} + +/** + * save_image_data - save the data pages pointed to by the PBEs + * from the list @pblist using the swap map handle @handle + * (assume there are @nr_pages data pages to save) + */ + +static int save_image_data(struct pbe *pblist, + struct swap_map_handle *handle, + unsigned int nr_pages) +{ + unsigned int m; + struct pbe *p; + int error = 0; + + printk("Saving image data pages (%u pages) ... 
", nr_pages); + m = nr_pages / 100; + if (!m) + m = 1; + nr_pages = 0; + for_each_pbe (p, pblist) { + error = swap_map_write_page(handle, p->address); + if (error) + break; + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } + if (!error) + printk("\b\b\b\bdone\n"); + return error; +} + +static void dump_info(void) +{ + pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code); + pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages); + pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname); + pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename); + pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release); + pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version); + pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine); + pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname); + pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus); + pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages); + pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages); +} + +static void init_header(unsigned int nr_pages) +{ + memset(&swsusp_info, 0, sizeof(swsusp_info)); + swsusp_info.version_code = LINUX_VERSION_CODE; + swsusp_info.num_physpages = num_physpages; + memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname)); + + swsusp_info.cpus = num_online_cpus(); + swsusp_info.image_pages = nr_pages; + swsusp_info.pages = nr_pages + + ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; +} + +/** + * pack_orig_addresses - the .orig_address fields of the PBEs from the + * list starting at @pbe are stored in the array @buf[] (1 page) + */ + +static inline struct pbe *pack_orig_addresses(unsigned long *buf, + struct pbe *pbe) +{ + int j; + + for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { + buf[j] = pbe->orig_address; + pbe = pbe->next; + } + if (!pbe) + for (; j < PAGE_SIZE / sizeof(long); j++) + buf[j] = 0; + return pbe; +} + +/** + * save_image_metadata - save the 
.orig_address fields of the PBEs + * from the list @pblist using the swap map handle @handle + */ + +static int save_image_metadata(struct pbe *pblist, + struct swap_map_handle *handle) +{ + unsigned long *buf; + unsigned int n = 0; + struct pbe *p; + int error = 0; + + printk("Saving image metadata ... "); + buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); + if (!buf) + return -ENOMEM; + p = pblist; + while (p) { + p = pack_orig_addresses(buf, p); + error = swap_map_write_page(handle, (unsigned long)buf); + if (error) + break; + n++; + } + free_page((unsigned long)buf); + if (!error) + printk("done (%u pages saved)\n", n); + return error; +} + +/** + * enough_swap - Make sure we have enough swap to save the image. + * + * Returns TRUE or FALSE after checking the total amount of swap + * space available from the resume partition. + */ + +static int enough_swap(unsigned int nr_pages) +{ + unsigned int free_swap = swap_info[root_swap].pages - + swap_info[root_swap].inuse_pages; + + pr_debug("swsusp: free swap pages: %u\n", free_swap); + return free_swap > (nr_pages + PAGES_FOR_IO + + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); +} + +/** + * swsusp_write - Write entire image and metadata. + * + * It is important _NOT_ to umount filesystems at this point. We want + * them synced (in case something goes wrong) but we DO not want to mark + * filesystem clean: it is not. (And it does not matter, if we resume + * correctly, we'll mark system clean, anyway.) 
+ */ + +int swsusp_write(struct pbe *pblist, unsigned int nr_pages) +{ + struct swap_map_page *swap_map; + struct swap_map_handle handle; + swp_entry_t start; + int error; + + if ((error = swsusp_swap_check())) { + printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); + return error; + } + if (!enough_swap(nr_pages)) { + printk(KERN_ERR "swsusp: Not enough free swap\n"); + return -ENOSPC; + } + + init_header(nr_pages); + swap_map = alloc_swap_map(swsusp_info.pages); + if (!swap_map) + return -ENOMEM; + init_swap_map_handle(&handle, swap_map); + + error = swap_map_write_page(&handle, (unsigned long)&swsusp_info); + if (!error) + error = save_image_metadata(pblist, &handle); + if (!error) + error = save_image_data(pblist, &handle, nr_pages); + if (error) + goto Free_image_entries; + + swap_map = reverse_swap_map(swap_map); + error = save_swap_map(swap_map, &start); + if (error) + goto Free_map_entries; + + dump_info(); + printk( "S" ); + error = mark_swapfiles(start); + printk( "|\n" ); + if (error) + goto Free_map_entries; + +Free_swap_map: + free_swap_map(swap_map); + return error; + +Free_map_entries: + free_swap_map_entries(swap_map); +Free_image_entries: + free_image_entries(swap_map); + goto Free_swap_map; +} + +/** + * swsusp_shrink_memory - Try to free as much memory as needed + * + * ... but do not OOM-kill anyone + * + * Notice: all userland should be stopped before it is called, or + * livelock is possible. + */ + +#define SHRINK_BITE 10000 + +int swsusp_shrink_memory(void) +{ + long size, tmp; + struct zone *zone; + unsigned long pages = 0; + unsigned int i = 0; + char *p = "-\\|/"; + + printk("Shrinking memory... 
"); + do { + size = 2 * count_highmem_pages(); + size += size / 50 + count_data_pages(); + size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE + + PAGES_FOR_IO; + tmp = size; + for_each_zone (zone) + if (!is_highmem(zone)) + tmp -= zone->free_pages; + if (tmp > 0) { + tmp = shrink_all_memory(SHRINK_BITE); + if (!tmp) + return -ENOMEM; + pages += tmp; + } else if (size > image_size / PAGE_SIZE) { + tmp = shrink_all_memory(SHRINK_BITE); + pages += tmp; + } + printk("\b%c", p[i++%4]); + } while (tmp > 0); + printk("\bdone (%lu pages freed)\n", pages); + + return 0; +} + +int swsusp_suspend(void) +{ + int error; + + if ((error = arch_prepare_suspend())) + return error; + local_irq_disable(); + /* At this point, device_suspend() has been called, but *not* + * device_power_down(). We *must* device_power_down() now. + * Otherwise, drivers for some devices (e.g. interrupt controllers) + * become desynchronized with the actual state of the hardware + * at resume time, and evil weirdness ensues. + */ + if ((error = device_power_down(PMSG_FREEZE))) { + printk(KERN_ERR "Some devices failed to power down, aborting suspend\n"); + goto Enable_irqs; + } + + if ((error = save_highmem())) { + printk(KERN_ERR "swsusp: Not enough free pages for highmem\n"); + goto Restore_highmem; + } + + save_processor_state(); + if ((error = swsusp_arch_suspend())) + printk(KERN_ERR "Error %d suspending\n", error); + /* Restore control flow magically appears here */ + restore_processor_state(); +Restore_highmem: + restore_highmem(); + device_power_up(); +Enable_irqs: + local_irq_enable(); + return error; +} + +int swsusp_resume(void) +{ + int error; + local_irq_disable(); + if (device_power_down(PMSG_FREEZE)) + printk(KERN_ERR "Some devices failed to power down, very bad\n"); + /* We'll ignore saved state, but this gets preempt count (etc) right */ + save_processor_state(); + error = swsusp_arch_resume(); + /* Code below is only ever reached in case of failure. 
Otherwise + * execution continues at place where swsusp_arch_suspend was called + */ + BUG_ON(!error); + /* The only reason why swsusp_arch_resume() can fail is memory being + * very tight, so we have to free it as soon as we can to avoid + * subsequent failures + */ + swsusp_free(); + restore_processor_state(); + restore_highmem(); + touch_softlockup_watchdog(); + device_power_up(); + local_irq_enable(); + return error; +} + +/** + * mark_unsafe_pages - mark the pages that cannot be used for storing + * the image during resume, because they conflict with the pages that + * had been used before suspend + */ + +static void mark_unsafe_pages(struct pbe *pblist) +{ + struct zone *zone; + unsigned long zone_pfn; + struct pbe *p; + + if (!pblist) /* a sanity check */ + return; + + /* Clear page flags */ + for_each_zone (zone) { + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + if (pfn_valid(zone_pfn + zone->zone_start_pfn)) + ClearPageNosaveFree(pfn_to_page(zone_pfn + + zone->zone_start_pfn)); + } + + /* Mark orig addresses */ + for_each_pbe (p, pblist) + SetPageNosaveFree(virt_to_page(p->orig_address)); + +} + +static void copy_page_backup_list(struct pbe *dst, struct pbe *src) +{ + /* We assume both lists contain the same number of elements */ + while (src) { + dst->orig_address = src->orig_address; + dst = dst->next; + src = src->next; + } +} + +/* + * Using bio to read from swap. + * This code requires a bit more work than just using buffer heads + * but, it is the recommended way for 2.5/2.6. + * The following are to signal the beginning and end of I/O. Bios + * finish asynchronously, while we want them to happen synchronously. + * A simple atomic_t, and a wait loop take care of this problem. 
+ */ + +static atomic_t io_done = ATOMIC_INIT(0); + +static int end_io(struct bio *bio, unsigned int num, int err) +{ + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + panic("I/O error reading memory image"); + atomic_set(&io_done, 0); + return 0; +} + +static struct block_device *resume_bdev; + +/** + * submit - submit BIO request. + * @rw: READ or WRITE. + * @off physical offset of page. + * @page: page we're reading or writing. + * + * Straight from the textbook - allocate and initialize the bio. + * If we're writing, make sure the page is marked as dirty. + * Then submit it and wait. + */ + +static int submit(int rw, pgoff_t page_off, void *page) +{ + int error = 0; + struct bio *bio; + + bio = bio_alloc(GFP_ATOMIC, 1); + if (!bio) + return -ENOMEM; + bio->bi_sector = page_off * (PAGE_SIZE >> 9); + bio->bi_bdev = resume_bdev; + bio->bi_end_io = end_io; + + if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) { + printk("swsusp: ERROR: adding page to bio at %ld\n",page_off); + error = -EFAULT; + goto Done; + } + + + atomic_set(&io_done, 1); + submit_bio(rw | (1 << BIO_RW_SYNC), bio); + while (atomic_read(&io_done)) + yield(); + if (rw == READ) + bio_set_pages_dirty(bio); + Done: + bio_put(bio); + return error; +} + +static int bio_read_page(pgoff_t page_off, void *page) +{ + return submit(READ, page_off, page); +} + +static int bio_write_page(pgoff_t page_off, void *page) +{ + return submit(WRITE, page_off, page); +} + +/** + * The following functions allow us to read data using a swap map + * in a file-alike way + */ + +static inline void release_swap_map_reader(struct swap_map_handle *handle) +{ + if (handle->cur) + free_page((unsigned long)handle->cur); + handle->cur = NULL; +} + +static inline int get_swap_map_reader(struct swap_map_handle *handle, + swp_entry_t start) +{ + int error; + + if (!swp_offset(start)) + return -EINVAL; + handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); + if (!handle->cur) + return -ENOMEM; + error 
= bio_read_page(swp_offset(start), handle->cur); + if (error) { + release_swap_map_reader(handle); + return error; + } + handle->k = 0; + return 0; +} + +static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf) +{ + unsigned long offset; + int error; + + if (!handle->cur) + return -EINVAL; + offset = swp_offset(handle->cur->entries[handle->k]); + if (!offset) + return -EINVAL; + error = bio_read_page(offset, buf); + if (error) + return error; + if (++handle->k >= MAP_PAGE_SIZE) { + handle->k = 0; + offset = swp_offset(handle->cur->next_swap); + if (!offset) + release_swap_map_reader(handle); + else + error = bio_read_page(offset, handle->cur); + } + return error; +} + +static int check_header(void) +{ + char *reason = NULL; + + dump_info(); + if (swsusp_info.version_code != LINUX_VERSION_CODE) + reason = "kernel version"; + if (swsusp_info.num_physpages != num_physpages) + reason = "memory size"; + if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) + reason = "system type"; + if (strcmp(swsusp_info.uts.release,system_utsname.release)) + reason = "kernel release"; + if (strcmp(swsusp_info.uts.version,system_utsname.version)) + reason = "version"; + if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) + reason = "machine"; + if (reason) { + printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); + return -EPERM; + } + return 0; +} + +/** + * load_image_data - load the image data using the swap map handle + * @handle and store them using the page backup list @pblist + * (assume there are @nr_pages pages to load) + */ + +static int load_image_data(struct pbe *pblist, + struct swap_map_handle *handle, + unsigned int nr_pages) +{ + int error; + unsigned int m; + struct pbe *p; + + if (!pblist) + return -EINVAL; + printk("Loading image data pages (%u pages) ... 
", nr_pages); + m = nr_pages / 100; + if (!m) + m = 1; + nr_pages = 0; + p = pblist; + while (p) { + error = swap_map_read_page(handle, (void *)p->address); + if (error) + break; + p = p->next; + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } + if (!error) + printk("\b\b\b\bdone\n"); + return error; +} + +/** + * unpack_orig_addresses - copy the elements of @buf[] (1 page) to + * the PBEs in the list starting at @pbe + */ + +static inline struct pbe *unpack_orig_addresses(unsigned long *buf, + struct pbe *pbe) +{ + int j; + + for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { + pbe->orig_address = buf[j]; + pbe = pbe->next; + } + return pbe; +} + +/** + * load_image_metadata - load the image metadata using the swap map + * handle @handle and put them into the PBEs in the list @pblist + */ + +static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle) +{ + struct pbe *p; + unsigned long *buf; + unsigned int n = 0; + int error = 0; + + printk("Loading image metadata ... 
"); + buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); + if (!buf) + return -ENOMEM; + p = pblist; + while (p) { + error = swap_map_read_page(handle, buf); + if (error) + break; + p = unpack_orig_addresses(buf, p); + n++; + } + free_page((unsigned long)buf); + if (!error) + printk("done (%u pages loaded)\n", n); + return error; +} + +int swsusp_read(struct pbe **pblist_ptr) +{ + int error; + struct pbe *p, *pblist; + struct swap_map_handle handle; + unsigned int nr_pages; + + if (IS_ERR(resume_bdev)) { + pr_debug("swsusp: block device not initialised\n"); + return PTR_ERR(resume_bdev); + } + + error = get_swap_map_reader(&handle, swsusp_header.image); + if (!error) + error = swap_map_read_page(&handle, &swsusp_info); + if (!error) + error = check_header(); + if (error) + return error; + nr_pages = swsusp_info.image_pages; + p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0); + if (!p) + return -ENOMEM; + error = load_image_metadata(p, &handle); + if (!error) { + mark_unsafe_pages(p); + pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); + if (pblist) + copy_page_backup_list(pblist, p); + free_pagedir(p); + if (!pblist) + error = -ENOMEM; + + /* Allocate memory for the image and read the data from swap */ + if (!error) + error = alloc_data_pages(pblist, GFP_ATOMIC, 1); + if (!error) { + release_eaten_pages(); + error = load_image_data(pblist, &handle, nr_pages); + } + if (!error) + *pblist_ptr = pblist; + } + release_swap_map_reader(&handle); + + blkdev_put(resume_bdev); + + if (!error) + pr_debug("swsusp: Reading resume file was successful\n"); + else + pr_debug("swsusp: Error %d resuming\n", error); + return error; +} + +/** + * swsusp_check - Check for swsusp signature in the resume device + */ + +int swsusp_check(void) +{ + int error; + + resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); + if (!IS_ERR(resume_bdev)) { + set_blocksize(resume_bdev, PAGE_SIZE); + memset(&swsusp_header, 0, sizeof(swsusp_header)); + if ((error = bio_read_page(0, 
&swsusp_header))) + return error; + if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { + memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); + /* Reset swap signature now */ + error = bio_write_page(0, &swsusp_header); + } else { + return -EINVAL; + } + if (error) + blkdev_put(resume_bdev); + else + pr_debug("swsusp: Signature found, resuming\n"); + } else { + error = PTR_ERR(resume_bdev); + } + + if (error) + pr_debug("swsusp: Error %d check for resume file\n", error); + + return error; +} + +/** + * swsusp_close - close swap device. + */ + +void swsusp_close(void) +{ + if (IS_ERR(resume_bdev)) { + pr_debug("swsusp: block device not initialised\n"); + return; + } + + blkdev_put(resume_bdev); +} diff -urN oldtree/kernel/power/swsusp.h newtree/kernel/power/swsusp.h --- oldtree/kernel/power/swsusp.h 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/swsusp.h 2006-02-18 15:24:31.428811024 +0000 @@ -0,0 +1,24 @@ + +struct suspend_header { + u32 version_code; + unsigned long num_physpages; + unsigned long orig_mem_free; + char machine[65]; + char version[65]; + int num_cpus; + int page_size; + int pageset_2_size; + int param0; + int param1; + int param2; + int param3; + int progress0; + int progress1; + int progress2; + int progress3; + int io_time[2][2]; + + suspend_pagedir_t *suspend_pagedir; + unsigned int num_pbes; +}; + diff -urN oldtree/kernel/power/ui.c newtree/kernel/power/ui.c --- oldtree/kernel/power/ui.c 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/ui.c 2006-02-18 15:24:31.430810720 +0000 @@ -0,0 +1,853 @@ +/* + * kernel/power/ui.c + * + * Copyright (C) 1998-2001 Gabor Kuti + * Copyright (C) 1998,2001,2002 Pavel Machek + * Copyright (C) 2002-2003 Florent Chabaud + * Copyright (C) 2002-2005 Nigel Cunningham + * + * This file is released under the GPLv2. + * + * Routines for Suspend2's user interface. + * + * The user interface code talks to a userspace program via a + * netlink socket. 
+ * + * The kernel side: + * - starts the userui program; + * - sends text messages and progress bar status; + * + * The user space side: + * - passes messages regarding user requests (abort, toggle reboot etc) + * + */ + +#define __KERNEL_SYSCALLS__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "proc.h" +#include "modules.h" +#include "suspend2.h" +#include "suspend2_common.h" +#include "ui.h" +#include "version.h" +#include "netlink.h" +#include "power.h" + +static char local_printf_buf[1024]; /* Same as printk - should be safe */ + +#ifdef CONFIG_NET +static struct user_helper_data ui_helper_data; +static struct suspend_module_ops userui_ops; +static int orig_loglevel; +static int orig_default_message_loglevel; +static int orig_kmsg; + +static char lastheader[512]; +static int lastheader_message_len = 0; + +/* Number of distinct progress amounts that userspace can display */ +static int progress_granularity = 50; + +DECLARE_WAIT_QUEUE_HEAD(userui_wait_for_key); + +static void ui_nl_set_state(int n) +{ + /* Only let them change certain settings */ + static const int suspend_action_mask = + (1 << SUSPEND_REBOOT) | (1 << SUSPEND_PAUSE) | (1 << SUSPEND_SLOW) | + (1 << SUSPEND_LOGALL) | (1 << SUSPEND_SINGLESTEP) | + (1 << SUSPEND_PAUSE_NEAR_PAGESET_END); + + suspend_action = (suspend_action & (~suspend_action_mask)) | + (n & suspend_action_mask); + + if (!test_action_state(SUSPEND_PAUSE) && + !test_action_state(SUSPEND_SINGLESTEP)) + wake_up_interruptible(&userui_wait_for_key); +} + +void userui_redraw(void) +{ + if (ui_helper_data.pid == -1) + return; + + suspend_send_netlink_message(&ui_helper_data, + USERUI_MSG_REDRAW, NULL, 0); +} + +/* request_abort_suspend + * + * Description: Handle the user requesting the cancellation of a suspend by + * pressing escape. + * Callers: Invoked from a netlink packet from userspace when the user presses + * escape. 
+ */ +void request_abort_suspend(void) +{ + if (test_suspend_state(SUSPEND_NOW_RESUMING) || (test_result_state(SUSPEND_ABORT_REQUESTED))) + return; + + suspend_prepare_status(CLEAR_BAR, "--- ESCAPE PRESSED :" + " ABORTING PROCESS ---"); + set_result_state(SUSPEND_ABORTED); + set_result_state(SUSPEND_ABORT_REQUESTED); + + wake_up_interruptible(&userui_wait_for_key); +} + +static int userui_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + int type; + int *data; + + type = nlh->nlmsg_type; + + /* A control message: ignore them */ + if (type < NETLINK_MSG_BASE) + return 0; + + /* Unknown message: reply with EINVAL */ + if (type >= USERUI_MSG_MAX) + return -EINVAL; + + /* All operations require privileges, even GET */ + if (security_netlink_recv(skb)) + return -EPERM; + + /* Only allow one task to receive NOFREEZE privileges */ + if (type == NETLINK_MSG_NOFREEZE_ME && ui_helper_data.pid != -1) + return -EBUSY; + + data = (int*)NLMSG_DATA(nlh); + + switch (type) { + case USERUI_MSG_ABORT: + request_abort_suspend(); + break; + case USERUI_MSG_GET_STATE: + suspend_send_netlink_message(&ui_helper_data, + USERUI_MSG_GET_STATE, &suspend_action, + sizeof(suspend_action)); + break; + case USERUI_MSG_GET_DEBUG_STATE: + suspend_send_netlink_message(&ui_helper_data, + USERUI_MSG_GET_DEBUG_STATE, + &suspend_debug_state, + sizeof(suspend_debug_state)); + break; + case USERUI_MSG_SET_STATE: + if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) + return -EINVAL; + ui_nl_set_state(*data); + break; + case USERUI_MSG_SET_DEBUG_STATE: + if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) + return -EINVAL; + suspend_debug_state = (*data); + break; + case USERUI_MSG_SPACE: + wake_up_interruptible(&userui_wait_for_key); + break; + } + + return 1; +} + +static unsigned long userui_storage_needed(void) +{ + return sizeof(ui_helper_data.program); +} + +static int userui_save_config_info(char *buf) +{ + *((int *) buf) = progress_granularity; + memcpy(buf + sizeof(int), 
ui_helper_data.program, sizeof(ui_helper_data.program)); + return sizeof(ui_helper_data.program) + sizeof(int); +} + +static void userui_load_config_info(char *buf, int size) +{ + /* Don't load the saved path if one has already been set */ + if (ui_helper_data.program[0]) + return; + + progress_granularity = *((int *) buf); + size -= sizeof(int); + + if (size > sizeof(ui_helper_data.program)) + size = sizeof(ui_helper_data.program); + + memcpy(ui_helper_data.program, buf + sizeof(int), size); + ui_helper_data.program[sizeof(ui_helper_data.program)-1] = '\0'; +} + +static unsigned long userui_memory_needed(void) +{ + /* ball park figure of 128 pages */ + return (128 * PAGE_SIZE); +} + +unsigned long userui_update_progress(unsigned long value, unsigned long maximum, + const char *fmt, va_list args) +{ + static int last_step = -1; + struct userui_msg_params msg; + int bitshift; + int this_step; + unsigned long next_update; + + if (ui_helper_data.pid == -1) + return 0; + + if ((!maximum) || (!progress_granularity)) + return maximum; + + if (value < 0) + value = 0; + + if (value > maximum) + value = maximum; + + /* Try to avoid math problems - we can't do 64 bit math here + * (and shouldn't need it - anyone got screen resolution + * of 65536 pixels or more?) 
*/ + bitshift = generic_fls(maximum) - 16; + if (bitshift > 0) { + unsigned long temp_maximum = maximum >> bitshift; + unsigned long temp_value = value >> bitshift; + this_step = (int) + (temp_value * progress_granularity / temp_maximum); + next_update = (((this_step + 1) * temp_maximum / + progress_granularity) + 1) << bitshift; + } else { + this_step = (int) (value * progress_granularity / maximum); + next_update = ((this_step + 1) * maximum / + progress_granularity) + 1; + } + + if (this_step == last_step) + return next_update; + + memset(&msg, 0, sizeof(msg)); + + msg.a = this_step; + msg.b = progress_granularity; + + if (fmt) { + vsnprintf(msg.text, sizeof(msg.text), fmt, args); + msg.text[sizeof(msg.text)-1] = '\0'; + } + + suspend_send_netlink_message(&ui_helper_data, USERUI_MSG_PROGRESS, + &msg, sizeof(msg)); + last_step = this_step; + + return next_update; +} + +/* __suspend_message. + * + * Description: This function is intended to do the same job as printk, but + * without normally logging what is printed. The point is to be + * able to get debugging info on screen without filling the logs + * with "1/534. ^M 2/534^M. 3/534^M" + * + * It may be called from an interrupt context - can't sleep! + * + * Arguments: int mask: The debugging section(s) this message belongs to. + * int level: The level of verbosity of this message. + * int restartline: Whether to output a \r or \n with this line + * (\n if we're logging all output). + * const char *fmt, ...: Message to be displayed a la printk. + */ +void __suspend_message(unsigned long section, unsigned long level, + int normally_logged, + const char *fmt, ...) 
+{ + struct userui_msg_params msg; + + va_list args; + + if ((level) && (level > console_loglevel)) + return; + + memset(&msg, 0, sizeof(msg)); + + msg.a = section; + msg.b = level; + msg.c = normally_logged; + + if (fmt) { + va_start(args, fmt); + vsnprintf(msg.text, sizeof(msg.text), fmt, args); + va_end(args); + msg.text[sizeof(msg.text)-1] = '\0'; + } + + if (test_action_state(SUSPEND_LOGALL)) + printk("%s\n", msg.text); + + if (ui_helper_data.pid == -1) + return; + + suspend_send_netlink_message(&ui_helper_data, USERUI_MSG_MESSAGE, + &msg, sizeof(msg)); +} + +static void wait_for_key_via_userui(void) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&userui_wait_for_key, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + interruptible_sleep_on(&userui_wait_for_key); + + set_current_state(TASK_RUNNING); + remove_wait_queue(&userui_wait_for_key, &wait); +} + +char suspend_wait_for_keypress(int timeout) +{ + int fd; + char key = '\0'; + struct termios t, t_backup; + + if (ui_helper_data.pid != -1) { + wait_for_key_via_userui(); + key = ' '; + goto out; + } + + /* We should be guaranteed /dev/console exists after populate_rootfs() in + * init/main.c + */ + if ((fd = sys_open("/dev/console", O_RDONLY, 0)) < 0) { + printk("Couldn't open /dev/console.\n"); + goto out; + } + + if (sys_ioctl(fd, TCGETS, (long)&t) < 0) + goto out_close; + + memcpy(&t_backup, &t, sizeof(t)); + + t.c_lflag &= ~(ISIG|ICANON|ECHO); + t.c_cc[VMIN] = 0; + if (timeout) + t.c_cc[VTIME] = timeout*10; + + if (sys_ioctl(fd, TCSETS, (long)&t) < 0) + goto out_restore; + + while (1) { + if (sys_read(fd, &key, 1) <= 0) { + key = '\0'; + break; + } + key = tolower(key); + if (test_suspend_state(SUSPEND_SANITY_CHECK_PROMPT)) { + if (key == 'c') { + set_suspend_state(SUSPEND_CONTINUE_REQ); + break; + } else if (key == ' ') + break; + } else + break; + } + +out_restore: + sys_ioctl(fd, TCSETS, (long)&t_backup); + +out_close: + sys_close(fd); +out: + return key; +} + +/* abort_suspend + * + * 
Description: Begin to abort a cycle. If this wasn't at the user's request + * (and we're displaying output), tell the user why and wait for + * them to acknowledge the message. + * Arguments: A parameterised string (imagine this is printk) to display, + * telling the user why we're aborting. + */ + +void abort_suspend(const char *fmt, ...) +{ + va_list args; + int printed_len = 0; + + if (!test_result_state(SUSPEND_ABORTED)) { + if (!test_result_state(SUSPEND_ABORT_REQUESTED)) { + va_start(args, fmt); + printed_len = vsnprintf(local_printf_buf, + sizeof(local_printf_buf), fmt, args); + va_end(args); + if (ui_helper_data.pid != -1) + printed_len = sprintf(local_printf_buf + printed_len, + " (Press SPACE to continue)"); + suspend_prepare_status(CLEAR_BAR, local_printf_buf); + + /* + * Make sure message seen - wait for shift to be + * released if being pressed + */ + if (ui_helper_data.pid != -1) + suspend_wait_for_keypress(0); + } + /* Turn on aborting flag */ + set_result_state(SUSPEND_ABORTED); + } +} + +/* suspend_prepare_status + * Description: Prepare the 'nice display', drawing the header and version, + * along with the current action and perhaps also resetting the + * progress bar. + * Arguments: + * int clearbar: Whether to reset the progress bar. + * const char *fmt, ...: The action to be displayed. + */ +void suspend_prepare_status(int clearbar, const char *fmt, ...) +{ + va_list args; + + if (fmt) { + va_start(args, fmt); + lastheader_message_len = vsnprintf(lastheader, 512, fmt, args); + va_end(args); + } + + if (clearbar) + userui_update_progress(0, 1, NULL, NULL); + + __suspend_message(0, SUSPEND_STATUS, 1, lastheader, NULL); + + if (ui_helper_data.pid == -1) + printk(KERN_EMERG "%s\n", lastheader); +} + +/* update_status + * + * Description: Update the progress bar and (if on) in-bar message. + * Arguments: UL value, maximum: Current progress percentage (value/max). 
+ * const char *fmt, ...: Message to be displayed in the middle + * of the progress bar. + * Note that a NULL message does not mean that any previous + * message is erased! For that, you need suspend_prepare_status with + * clearbar on. + * Returns: Unsigned long: The next value where status needs to be updated. + * This is to reduce unnecessary calls to update_status. + */ +unsigned long suspend_update_status(unsigned long value, unsigned long maximum, + const char *fmt, ...) +{ + unsigned long next_update = maximum; + va_list args; + + if (!maximum) + return maximum; + + if (value < 0) + value = 0; + + if (value > maximum) + value = maximum; + + va_start(args, fmt); + + next_update = userui_update_progress(value, maximum, fmt, args); + + va_end(args); + + return next_update; +} + +/* check_shift_keys + * + * Description: Potentially pause and wait for the user to tell us to continue. + * We normally only pause when @pause is set. + * Arguments: int pause: Whether we normally pause. + * char *message: The message to display. Not parameterised + * because it's normally a constant. + */ + +void check_shift_keys(int pause, char *message) +{ +#ifdef CONFIG_PM_DEBUG + int displayed_message = 0, last_key = 0; + + while (last_key != 32 && + ui_helper_data.pid != -1 && + (!test_result_state(SUSPEND_ABORTED)) && + ((test_action_state(SUSPEND_PAUSE) && pause) || + (test_action_state(SUSPEND_SINGLESTEP)))) { + if (!displayed_message) { + suspend_prepare_status(DONT_CLEAR_BAR, + "%s Press SPACE to continue.%s", + message ? message : "", + (test_action_state(SUSPEND_SINGLESTEP)) ? + " Single step on." : ""); + displayed_message = 1; + } + last_key = suspend_wait_for_keypress(0); + } +#endif + schedule(); +} + +extern asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, + unsigned long arg); + +/* suspend_prepare_console + * + * Description: Prepare a console for use, save current settings. + * Returns: Boolean: Whether an error occured. 
Errors aren't + * treated as fatal, but a warning is printed. + */ +void suspend_prepare_console(void) +{ + orig_loglevel = console_loglevel; + orig_default_message_loglevel = default_message_loglevel; + orig_kmsg = kmsg_redirect; + kmsg_redirect = fg_console + 1; + default_message_loglevel = 1; + console_loglevel = suspend_default_console_level; + + ui_helper_data.pid = -1; + + if (userui_ops.disabled) + return; + + if (!*ui_helper_data.program) { + printk("suspend_userui: program not configured. suspend_userui disabled.\n"); + return; + } + + suspend_netlink_setup(&ui_helper_data); + + return; +} + +/* suspend_restore_console + * + * Description: Restore the settings we saved above. + */ + +void suspend_cleanup_console(void) +{ + suspend_default_console_level = console_loglevel; + + if (ui_helper_data.pid > -1) { + struct task_struct *t; + + suspend_send_netlink_message(&ui_helper_data, + NETLINK_MSG_CLEANUP, NULL, 0); + + read_lock(&tasklist_lock); + if ((t = find_task_by_pid(ui_helper_data.pid))) + t->flags &= ~PF_NOFREEZE; + read_unlock(&tasklist_lock); + + suspend_netlink_close(&ui_helper_data); + + ui_helper_data.pid = -1; + } + + console_loglevel = orig_loglevel; + kmsg_redirect = orig_kmsg; + default_message_loglevel = orig_default_message_loglevel; +} +#else +static char suspend_wait_for_keypress(int timeout) +{ + return 0; +} + +unsigned long suspend_update_status(unsigned long value, unsigned long maximum, + const char *fmt, ...) +{ + return maximum; +} + +void __suspend_message(unsigned long section, unsigned long level, + int normally_logged, + const char *fmt, ...) { } +void suspend_prepare_status(int clearbar, const char *fmt, ...) { } +void check_shift_keys(int pause, char *message) { } +void abort_suspend(const char *fmt, ...) { } +void suspend_prepare_console(void) { } +void suspend_cleanup_console(void) { } +void userui_redraw(void) { } +#endif + +/* suspend_early_boot_message() + * Description: Handle errors early in the process of booting. 
+ * The user may press C to continue booting, perhaps + * invalidating the image, or space to reboot. + * This works from either the serial console or normally + * attached keyboard. + * + * Note that we come in here from init, while the kernel is + * locked. If we want to get events from the serial console, + * we need to temporarily unlock the kernel. + * + * suspend_early_boot_message may also be called post-boot. + * In this case, it simply printks the message and returns. + * + * Arguments: int Whether we are able to erase the image. + * int default_answer. What to do when we timeout. This + * will normally be continue, but the user might + * provide command line options (__setup) to override + * particular cases. + * Char *. Pointer to a string explaining why we're moaning. + */ + +#define say(message, a...) printk(KERN_EMERG message, ##a) +#define message_timeout 25 /* message_timeout * 10 must fit in 8 bits */ + +int suspend_early_boot_message(int message_detail, int default_answer, char *warning_reason, ...) +{ + unsigned long orig_state = get_suspend_state(), continue_req = 0; + va_list args; + int printed_len; + + if (warning_reason) { + va_start(args, warning_reason); + printed_len = vsnprintf(local_printf_buf, + sizeof(local_printf_buf), + warning_reason, + args); + va_end(args); + } + + if (!test_suspend_state(SUSPEND_BOOT_TIME)) { + printk(name_suspend "%s\n", local_printf_buf); + return default_answer; + } + + /* We might be called directly from do_mounts_initrd if the + * user fails to set up their initrd properly. We need to + * enable the keyboard handler by setting the running flag */ + set_suspend_state(SUSPEND_RUNNING); + +#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE) + console_loglevel = 7; + + say("=== Suspend2 ===\n\n"); + if (warning_reason) { + say("BIG FAT WARNING!! 
%s\n\n", local_printf_buf); + switch (message_detail) { + case 0: + say("If you continue booting, note that any image WILL NOT BE REMOVED.\n"); + say("Suspend is unable to do so because the appropriate modules aren't\n"); + say("loaded. You should manually remove the image to avoid any\n"); + say("possibility of corrupting your filesystem(s) later.\n"); + break; + case 1: + say("If you want to use the current suspend image, reboot and try\n"); + say("again with the same kernel that you suspended from. If you want\n"); + say("to forget that image, continue and the image will be erased.\n"); + break; + } + say("Press SPACE to reboot or C to continue booting with this kernel\n\n"); + say("Default action if you don't select one in %d seconds is: %s.\n", + message_timeout, + default_answer == SUSPEND_CONTINUE_REQ ? + "continue booting" : "reboot"); + } else { + say("BIG FAT WARNING!!\n\n"); + say("You have tried to resume from this image before.\n"); + say("If it failed once, it may well fail again.\n"); + say("Would you like to remove the image and boot normally?\n"); + say("This will be equivalent to entering noresume2 on the\n"); + say("kernel command line.\n\n"); + say("Press SPACE to remove the image or C to continue resuming.\n\n"); + say("Default action if you don't select one in %d seconds is: %s.\n", + message_timeout, + !!default_answer ? + "continue resuming" : "remove the image"); + } + + set_suspend_state(SUSPEND_SANITY_CHECK_PROMPT); + clear_suspend_state(SUSPEND_CONTINUE_REQ); + + if (suspend_wait_for_keypress(message_timeout) == 0) /* We timed out */ + continue_req = !!default_answer; + else + continue_req = test_suspend_state(SUSPEND_CONTINUE_REQ); + + if ((warning_reason) && (!continue_req)) + machine_restart(NULL); + + restore_suspend_state(orig_state); + if (continue_req) + set_suspend_state(SUSPEND_CONTINUE_REQ); + +#endif // CONFIG_VT or CONFIG_SERIAL_CONSOLE + return -EPERM; +} +#undef say + +/* + * User interface specific /proc/suspend entries. 
+ */ + +static struct suspend_proc_data proc_params[] = { +#ifdef CONFIG_NET +#ifdef CONFIG_PROC_FS + { .filename = "default_console_level", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_INTEGER, + .data = { + .integer = { + .variable = &suspend_default_console_level, + .minimum = 0, +#ifdef CONFIG_PM_DEBUG + .maximum = 7, +#else + .maximum = 1, +#endif + + } + } + }, + + { .filename = "enable_escape", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_BIT, + .data = { + .bit = { + .bit_vector = &suspend_action, + .bit = SUSPEND_CAN_CANCEL, + } + } + }, + +#ifdef CONFIG_PM_DEBUG + { .filename = "debug_sections", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_UL, + .data = { + .ul = { + .variable = &suspend_debug_state, + .minimum = 0, + .maximum = 2 << 30, + } + } + }, + + { .filename = "log_everything", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_BIT, + .data = { + .bit = { + .bit_vector = &suspend_action, + .bit = SUSPEND_LOGALL, + } + } + }, + + { .filename = "pause_between_steps", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_BIT, + .data = { + .bit = { + .bit_vector = &suspend_action, + .bit = SUSPEND_PAUSE, + } + } + }, +#endif + { .filename = "disable_userui_support", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_INTEGER, + .data = { + .integer = { + .variable = &userui_ops.disabled, + .minimum = 0, + .maximum = 1, + } + } + }, + { .filename = "userui_progress_granularity", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_INTEGER, + .data = { + .integer = { + .variable = &progress_granularity, + .minimum = 1, + .maximum = 2048, + } + } + }, + { .filename = "userui_program", + .permissions = PROC_RW, + .type = SUSPEND_PROC_DATA_STRING, + .data = { + .string = { + .variable = ui_helper_data.program, + .max_length = 255, + } + } + } +#endif +#endif +}; + +static struct suspend_module_ops userui_ops = { + .type = MISC_PLUGIN, + .name = "Userspace UI Support", + .module = THIS_MODULE, +#ifdef CONFIG_NET + 
.storage_needed = userui_storage_needed, + .save_config_info = userui_save_config_info, + .load_config_info = userui_load_config_info, + .memory_needed = userui_memory_needed, +#endif +}; + +/* suspend_console_proc_init + * Description: Boot time initialisation for user interface. + */ +static __init int suspend_console_proc_init(void) +{ + int result, i, numfiles = sizeof(proc_params) / sizeof(struct suspend_proc_data); + + if (!(result = suspend_register_module(&userui_ops))) + for (i=0; i< numfiles; i++) + suspend_register_procfile(&proc_params[i]); + +#ifdef CONFIG_NET + ui_helper_data.nl = NULL; + ui_helper_data.program[0] = '\0'; +#endif + ui_helper_data.pid = -1; + ui_helper_data.skb_size = sizeof(struct userui_msg_params); + ui_helper_data.pool_limit = 6; + ui_helper_data.netlink_id = NETLINK_SUSPEND2_USERUI; + ui_helper_data.name = "userspace ui"; + ui_helper_data.rcv_msg = userui_user_rcv_msg; + ui_helper_data.interface_version = 6; + ui_helper_data.must_init = 0; + ui_helper_data.not_ready = suspend_cleanup_console; + init_completion(&ui_helper_data.wait_for_process); + + return result; +} + +late_initcall(suspend_console_proc_init); diff -urN oldtree/kernel/power/ui.h newtree/kernel/power/ui.h --- oldtree/kernel/power/ui.h 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/ui.h 2006-02-18 15:24:31.431810568 +0000 @@ -0,0 +1,44 @@ +/* + * + */ + +extern void suspend_prepare_console(void); +extern void suspend_cleanup_console(void); + +extern void check_shift_keys(int pause, char *message); +extern unsigned long suspend_update_status(unsigned long value, unsigned long maximum, + const char *fmt, ...); + +extern void abort_suspend(const char *fmt, ...); + +extern void userui_redraw(void); + +enum { + DONT_CLEAR_BAR, + CLEAR_BAR +}; + +enum { + /* Userspace -> Kernel */ + USERUI_MSG_ABORT = 0x11, + USERUI_MSG_SET_STATE = 0x12, + USERUI_MSG_GET_STATE = 0x13, + USERUI_MSG_GET_DEBUG_STATE = 0x14, + USERUI_MSG_SET_DEBUG_STATE = 0x15, + 
USERUI_MSG_SET_PROGRESS_GRANULARITY = 0x17, + USERUI_MSG_SPACE = 0x18, + + /* Kernel -> Userspace */ + USERUI_MSG_MESSAGE = 0x21, + USERUI_MSG_PROGRESS = 0x22, + USERUI_MSG_REDRAW = 0x25, + USERUI_MSG_KEYPRESS = 0x26, + USERUI_MSG_DEBUG_STATE = 0x29, + + USERUI_MSG_MAX, +}; + +struct userui_msg_params { + unsigned long a, b, c, d; + char text[255]; +}; diff -urN oldtree/kernel/power/version.h newtree/kernel/power/version.h --- oldtree/kernel/power/version.h 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/power/version.h 2006-02-18 15:24:31.431810568 +0000 @@ -0,0 +1,2 @@ +#define SUSPEND_CORE_VERSION "2.2.0.1" +#define name_suspend "Suspend2 " SUSPEND_CORE_VERSION ": " diff -urN oldtree/kernel/sched.c newtree/kernel/sched.c --- oldtree/kernel/sched.c 2006-02-18 15:18:30.095742000 +0000 +++ newtree/kernel/sched.c 2006-02-18 15:24:31.435809960 +0000 @@ -4704,7 +4704,6 @@ p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); if (IS_ERR(p)) return NOTIFY_BAD; - p->flags |= PF_NOFREEZE; kthread_bind(p, cpu); /* Must be high prio: stop_machine expects to yield to it. */ rq = task_rq_lock(p, &flags); diff -urN oldtree/kernel/sched.c.orig newtree/kernel/sched.c.orig --- oldtree/kernel/sched.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ newtree/kernel/sched.c.orig 2006-02-18 15:18:30.000000000 +0000 @@ -0,0 +1,6166 @@ +/* + * kernel/sched.c + * + * Kernel scheduler and related syscalls + * + * Copyright (C) 1991-2002 Linus Torvalds + * + * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and + * make semaphores SMP safe + * 1998-11-19 Implemented schedule_timeout() and related stuff + * by Andrea Arcangeli + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. 
+ * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2004-04-02 Scheduler domains code by Nick Piggin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * and back. + */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) + +/* + * Some helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) + +/* + * These are the 'tuning knobs' of the scheduler: + * + * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), + * default timeslice is 100 msecs, maximum timeslice is 800 msecs. + * Timeslices get refilled after they expire. 
+ */ +#define MIN_TIMESLICE max(5 * HZ / 1000, 1) +#define DEF_TIMESLICE (100 * HZ / 1000) +#define ON_RUNQUEUE_WEIGHT 30 +#define CHILD_PENALTY 95 +#define PARENT_PENALTY 100 +#define EXIT_WEIGHT 3 +#define PRIO_BONUS_RATIO 25 +#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) +#define INTERACTIVE_DELTA 2 +#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) +#define STARVATION_LIMIT (MAX_SLEEP_AVG) +#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) + +/* + * If a task is 'interactive' then we reinsert it in the active + * array after it has expired its current timeslice. (it will not + * continue to run immediately, it will still roundrobin with + * other interactive tasks.) + * + * This part scales the interactivity limit depending on niceness. + * + * We scale it linearly, offset by the INTERACTIVE_DELTA delta. + * Here are a few examples of different nice levels: + * + * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] + * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] + * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] + * + * (the X axis represents the possible -5 ... 0 ... +5 dynamic + * priority range a task can explore, a value of '1' means the + * task is rated interactive.) + * + * Ie. nice +19 tasks can never get 'interactive' enough to be + * reinserted into the active array. And only heavily CPU-hog nice -20 + * tasks will be expired. Default nice 0 tasks are somewhere between, + * it takes some effort for them to get interactive, but it's not + * too hard. + */ + +#define CURRENT_BONUS(p) \ + (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ + MAX_SLEEP_AVG) + +#define GRANULARITY (10 * HZ / 1000 ? : 1) + +#ifdef CONFIG_SMP +#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? 
: 1) - 1)) * \ + num_online_cpus()) +#else +#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) +#endif + +#define SCALE(v1,v1_max,v2_max) \ + (v1) * (v2_max) / (v1_max) + +#define DELTA(p) \ + (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) + +#define TASK_INTERACTIVE(p) \ + ((p)->prio <= (p)->static_prio - DELTA(p)) + +#define INTERACTIVE_SLEEP(p) \ + (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ + (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) + +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) + +/* + * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] + * to time slice values: [800ms ... 100ms ... 5ms] + * + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. + */ + +#define SCALE_PRIO(x, prio) \ + max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) + +static unsigned int task_timeslice(task_t *p) +{ + if (p->static_prio < NICE_TO_PRIO(0)) + return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); + else + return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); +} +#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ + < (long long) (sd)->cache_hot_time) + +void __put_task_struct_cb(struct rcu_head *rhp) +{ + __put_task_struct(container_of(rhp, struct task_struct, rcu)); +} + +EXPORT_SYMBOL_GPL(__put_task_struct_cb); + +/* + * These are the runqueue data structures: + */ + +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) + +typedef struct runqueue runqueue_t; + +struct prio_array { + unsigned int nr_active; + unsigned long bitmap[BITMAP_SIZE]; + struct list_head queue[MAX_PRIO]; +}; + +/* + * This is the main, per-CPU runqueue data structure. 
+ * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct runqueue { + spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. + */ + unsigned long nr_running; +#ifdef CONFIG_SMP + unsigned long cpu_load[3]; +#endif + unsigned long long nr_switches; + + /* + * This is part of a global counter where only the total sum + * over all CPUs matters. A task can increase this counter on + * one CPU and if it got migrated afterwards it may decrease + * it on another CPU. Always updated under the runqueue lock: + */ + unsigned long nr_uninterruptible; + + unsigned long expired_timestamp; + unsigned long long timestamp_last_tick; + task_t *curr, *idle; + struct mm_struct *prev_mm; + prio_array_t *active, *expired, arrays[2]; + int best_expired_prio; + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct sched_domain *sd; + + /* For active balancing */ + int active_balance; + int push_cpu; + + task_t *migration_thread; + struct list_head migration_queue; +#endif + +#ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + + /* sys_sched_yield() stats */ + unsigned long yld_exp_empty; + unsigned long yld_act_empty; + unsigned long yld_both_empty; + unsigned long yld_cnt; + + /* schedule() stats */ + unsigned long sched_switch; + unsigned long sched_cnt; + unsigned long sched_goidle; + + /* try_to_wake_up() stats */ + unsigned long ttwu_cnt; + unsigned long ttwu_local; +#endif +}; + +static DEFINE_PER_CPU(struct runqueue, runqueues); + +/* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See detach_destroy_domains: synchronize_sched for details. + * + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. 
+ */ +#define for_each_domain(cpu, domain) \ +for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) + +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() (&__get_cpu_var(runqueues)) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) + +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev) do { } while (0) +#endif + +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline int task_running(runqueue_t *rq, task_t *p) +{ + return rq->curr == p; +} + +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +{ +} + +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +{ +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif + spin_unlock_irq(&rq->lock); +} + +#else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline int task_running(runqueue_t *rq, task_t *p) +{ +#ifdef CONFIG_SMP + return p->oncpu; +#else + return rq->curr == p; +#endif +} + +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->oncpu = 1; +#endif +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + spin_unlock_irq(&rq->lock); +#else + spin_unlock(&rq->lock); +#endif +} + +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->oncpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. 
+ */ + smp_wmb(); + prev->oncpu = 0; +#endif +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_enable(); +#endif +} +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ + +/* + * task_rq_lock - lock the runqueue a given task resides on and disable + * interrupts. Note the ordering: we can safely lookup the task_rq without + * explicitly disabling preemption. + */ +static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) + __acquires(rq->lock) +{ + struct runqueue *rq; + +repeat_lock_task: + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); + if (unlikely(rq != task_rq(p))) { + spin_unlock_irqrestore(&rq->lock, *flags); + goto repeat_lock_task; + } + return rq; +} + +static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) + __releases(rq->lock) +{ + spin_unlock_irqrestore(&rq->lock, *flags); +} + +#ifdef CONFIG_SCHEDSTATS +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 12 + +static int show_schedstat(struct seq_file *seq, void *v) +{ + int cpu; + + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); + seq_printf(seq, "timestamp %lu\n", jiffies); + for_each_online_cpu(cpu) { + runqueue_t *rq = cpu_rq(cpu); +#ifdef CONFIG_SMP + struct sched_domain *sd; + int dcnt = 0; +#endif + + /* runqueue-specific stats */ + seq_printf(seq, + "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", + cpu, rq->yld_both_empty, + rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, + rq->sched_switch, rq->sched_cnt, rq->sched_goidle, + rq->ttwu_cnt, rq->ttwu_local, + rq->rq_sched_info.cpu_time, + rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); + + seq_printf(seq, "\n"); + +#ifdef CONFIG_SMP + /* domain-specific stats */ + preempt_disable(); + for_each_domain(cpu, sd) { + enum idle_type itype; + char mask_str[NR_CPUS]; + + cpumask_scnprintf(mask_str, NR_CPUS, sd->span); + seq_printf(seq, "domain%d %s", dcnt++, mask_str); + for 
(itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; + itype++) { + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", + sd->lb_cnt[itype], + sd->lb_balanced[itype], + sd->lb_failed[itype], + sd->lb_imbalance[itype], + sd->lb_gained[itype], + sd->lb_hot_gained[itype], + sd->lb_nobusyq[itype], + sd->lb_nobusyg[itype]); + } + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + sd->alb_cnt, sd->alb_failed, sd->alb_pushed, + sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, + sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, + sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); + } + preempt_enable(); +#endif + } + return 0; +} + +static int schedstat_open(struct inode *inode, struct file *file) +{ + unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); + char *buf = kmalloc(size, GFP_KERNEL); + struct seq_file *m; + int res; + + if (!buf) + return -ENOMEM; + res = single_open(file, show_schedstat, NULL); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = size; + } else + kfree(buf); + return res; +} + +struct file_operations proc_schedstat_operations = { + .open = schedstat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) +# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) +#else /* !CONFIG_SCHEDSTATS */ +# define schedstat_inc(rq, field) do { } while (0) +# define schedstat_add(rq, field, amt) do { } while (0) +#endif + +/* + * rq_lock - lock a given runqueue and disable interrupts. + */ +static inline runqueue_t *this_rq_lock(void) + __acquires(rq->lock) +{ + runqueue_t *rq; + + local_irq_disable(); + rq = this_rq(); + spin_lock(&rq->lock); + + return rq; +} + +#ifdef CONFIG_SCHEDSTATS +/* + * Called when a process is dequeued from the active array and given + * the cpu. 
We should note that with the exception of interactive + * tasks, the expired queue will become the active queue after the active + * queue is empty, without explicitly dequeuing and requeuing tasks in the + * expired queue. (Interactive tasks may be requeued directly to the + * active queue, thus delaying tasks in the expired queue from running; + * see scheduler_tick()). + * + * This function is only called from sched_info_arrive(), rather than + * dequeue_task(). Even though a task may be queued and dequeued multiple + * times as it is shuffled about, we're really interested in knowing how + * long it was from the *first* time it was queued to the time that it + * finally hit a cpu. + */ +static inline void sched_info_dequeued(task_t *t) +{ + t->sched_info.last_queued = 0; +} + +/* + * Called when a task finally hits the cpu. We can now calculate how + * long it was waiting to run. We also note when it began so that we + * can keep stats on how long its timeslice is. + */ +static void sched_info_arrive(task_t *t) +{ + unsigned long now = jiffies, diff = 0; + struct runqueue *rq = task_rq(t); + + if (t->sched_info.last_queued) + diff = now - t->sched_info.last_queued; + sched_info_dequeued(t); + t->sched_info.run_delay += diff; + t->sched_info.last_arrival = now; + t->sched_info.pcnt++; + + if (!rq) + return; + + rq->rq_sched_info.run_delay += diff; + rq->rq_sched_info.pcnt++; +} + +/* + * Called when a process is queued into either the active or expired + * array. The time is noted and later used to determine how long we + * had to wait for us to reach the cpu. Since the expired queue will + * become the active queue after active queue is empty, without dequeuing + * and requeuing any tasks, we are interested in queuing to either. 
It + * is unusual but not impossible for tasks to be dequeued and immediately + * requeued in the same or another array: this can happen in sched_yield(), + * set_user_nice(), and even load_balance() as it moves tasks from runqueue + * to runqueue. + * + * This function is only called from enqueue_task(), but also only updates + * the timestamp if it is already not set. It's assumed that + * sched_info_dequeued() will clear that stamp when appropriate. + */ +static inline void sched_info_queued(task_t *t) +{ + if (!t->sched_info.last_queued) + t->sched_info.last_queued = jiffies; +} + +/* + * Called when a process ceases being the active-running process, either + * voluntarily or involuntarily. Now we can calculate how long we ran. + */ +static inline void sched_info_depart(task_t *t) +{ + struct runqueue *rq = task_rq(t); + unsigned long diff = jiffies - t->sched_info.last_arrival; + + t->sched_info.cpu_time += diff; + + if (rq) + rq->rq_sched_info.cpu_time += diff; +} + +/* + * Called when tasks are switched involuntarily due, typically, to expiring + * their time slice. (This may also be called when switching to or from + * the idle task.) We are only called when prev != next. + */ +static inline void sched_info_switch(task_t *prev, task_t *next) +{ + struct runqueue *rq = task_rq(prev); + + /* + * prev now departs the cpu. It's not interesting to record + * stats about how efficient we were at scheduling the idle + * process, however. 
+ */ + if (prev != rq->idle) + sched_info_depart(prev); + + if (next != rq->idle) + sched_info_arrive(next); +} +#else +#define sched_info_queued(t) do { } while (0) +#define sched_info_switch(t, next) do { } while (0) +#endif /* CONFIG_SCHEDSTATS */ + +/* + * Adding/removing a task to/from a priority array: + */ +static void dequeue_task(struct task_struct *p, prio_array_t *array) +{ + array->nr_active--; + list_del(&p->run_list); + if (list_empty(array->queue + p->prio)) + __clear_bit(p->prio, array->bitmap); +} + +static void enqueue_task(struct task_struct *p, prio_array_t *array) +{ + sched_info_queued(p); + list_add_tail(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; +} + +/* + * Put task to the end of the run list without the overhead of dequeue + * followed by enqueue. + */ +static void requeue_task(struct task_struct *p, prio_array_t *array) +{ + list_move_tail(&p->run_list, array->queue + p->prio); +} + +static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) +{ + list_add(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; +} + +/* + * effective_prio - return the priority that is based on the static + * priority but is modified by bonuses/penalties. + * + * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] + * into the -5 ... 0 ... +5 bonus/penalty range. + * + * We use 25% of the full 0...39 priority range so that: + * + * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. + * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. + * + * Both properties are important to certain workloads. 
+ */ +static int effective_prio(task_t *p) +{ + int bonus, prio; + + if (rt_task(p)) + return p->prio; + + bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; + + prio = p->static_prio - bonus; + if (prio < MAX_RT_PRIO) + prio = MAX_RT_PRIO; + if (prio > MAX_PRIO-1) + prio = MAX_PRIO-1; + return prio; +} + +/* + * __activate_task - move a task to the runqueue. + */ +static inline void __activate_task(task_t *p, runqueue_t *rq) +{ + enqueue_task(p, rq->active); + rq->nr_running++; +} + +/* + * __activate_idle_task - move idle task to the _front_ of runqueue. + */ +static inline void __activate_idle_task(task_t *p, runqueue_t *rq) +{ + enqueue_task_head(p, rq->active); + rq->nr_running++; +} + +static int recalc_task_prio(task_t *p, unsigned long long now) +{ + /* Caller must always ensure 'now >= p->timestamp' */ + unsigned long long __sleep_time = now - p->timestamp; + unsigned long sleep_time; + + if (unlikely(p->policy == SCHED_BATCH)) + sleep_time = 0; + else { + if (__sleep_time > NS_MAX_SLEEP_AVG) + sleep_time = NS_MAX_SLEEP_AVG; + else + sleep_time = (unsigned long)__sleep_time; + } + + if (likely(sleep_time > 0)) { + /* + * User tasks that sleep a long time are categorised as + * idle and will get just interactive status to stay active & + * prevent them suddenly becoming cpu hogs and starving + * other processes. + */ + if (p->mm && p->activated != -1 && + sleep_time > INTERACTIVE_SLEEP(p)) { + p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - + DEF_TIMESLICE); + } else { + /* + * The lower the sleep avg a task has the more + * rapidly it will rise with sleep time. + */ + sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? 
: 1; + + /* + * Tasks waking from uninterruptible sleep are + * limited in their sleep_avg rise as they + * are likely to be waiting on I/O + */ + if (p->activated == -1 && p->mm) { + if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) + sleep_time = 0; + else if (p->sleep_avg + sleep_time >= + INTERACTIVE_SLEEP(p)) { + p->sleep_avg = INTERACTIVE_SLEEP(p); + sleep_time = 0; + } + } + + /* + * This code gives a bonus to interactive tasks. + * + * The boost works by updating the 'average sleep time' + * value here, based on ->timestamp. The more time a + * task spends sleeping, the higher the average gets - + * and the higher the priority boost gets as well. + */ + p->sleep_avg += sleep_time; + + if (p->sleep_avg > NS_MAX_SLEEP_AVG) + p->sleep_avg = NS_MAX_SLEEP_AVG; + } + } + + return effective_prio(p); +} + +/* + * activate_task - move a task to the runqueue and do priority recalculation + * + * Update all the scheduling statistics stuff. (sleep average + * calculation, priority modifiers, etc.) + */ +static void activate_task(task_t *p, runqueue_t *rq, int local) +{ + unsigned long long now; + + now = sched_clock(); +#ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ + runqueue_t *this_rq = this_rq(); + now = (now - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; + } +#endif + + if (!rt_task(p)) + p->prio = recalc_task_prio(p, now); + + /* + * This checks to make sure it's not an uninterruptible task + * that is now waking up. + */ + if (!p->activated) { + /* + * Tasks which were woken up by interrupts (ie. hw events) + * are most likely of interactive nature. 
So we give them + * the credit of extending their sleep time to the period + * of time they spend on the runqueue, waiting for execution + * on a CPU, first time around: + */ + if (in_interrupt()) + p->activated = 2; + else { + /* + * Normal first-time wakeups get a credit too for + * on-runqueue time, but it will be weighted down: + */ + p->activated = 1; + } + } + p->timestamp = now; + + __activate_task(p, rq); +} + +/* + * deactivate_task - remove a task from the runqueue. + */ +static void deactivate_task(struct task_struct *p, runqueue_t *rq) +{ + rq->nr_running--; + dequeue_task(p, p->array); + p->array = NULL; +} + +/* + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. + */ +#ifdef CONFIG_SMP +static void resched_task(task_t *p) +{ + int cpu; + + assert_spin_locked(&task_rq(p)->lock); + + if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) + return; + + set_tsk_thread_flag(p, TIF_NEED_RESCHED); + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) + return; + + /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ + smp_mb(); + if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) + smp_send_reschedule(cpu); +} +#else +static inline void resched_task(task_t *p) +{ + assert_spin_locked(&task_rq(p)->lock); + set_tsk_need_resched(p); +} +#endif + +/** + * task_curr - is this task currently executing on a CPU? + * @p: the task in question. + */ +inline int task_curr(const task_t *p) +{ + return cpu_curr(task_cpu(p)) == p; +} + +#ifdef CONFIG_SMP +typedef struct { + struct list_head list; + + task_t *task; + int dest_cpu; + + struct completion done; +} migration_req_t; + +/* + * The task's runqueue lock must be held. + * Returns true if you have to wait for migration thread. 
+ */ +static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) +{ + runqueue_t *rq = task_rq(p); + + /* + * If the task is not on a runqueue (and not running), then + * it is sufficient to simply update the task's cpu field. + */ + if (!p->array && !task_running(rq, p)) { + set_task_cpu(p, dest_cpu); + return 0; + } + + init_completion(&req->done); + req->task = p; + req->dest_cpu = dest_cpu; + list_add(&req->list, &rq->migration_queue); + return 1; +} + +/* + * wait_task_inactive - wait for a thread to unschedule. + * + * The caller must ensure that the task *will* unschedule sometime soon, + * else this function might spin for a *long* time. This function can't + * be called with interrupts off, or it may introduce deadlock with + * smp_call_function() if an IPI is sent by the same process we are + * waiting to become inactive. + */ +void wait_task_inactive(task_t *p) +{ + unsigned long flags; + runqueue_t *rq; + int preempted; + +repeat: + rq = task_rq_lock(p, &flags); + /* Must be off runqueue entirely, not preempted. */ + if (unlikely(p->array || task_running(rq, p))) { + /* If it's preempted, we yield. It could be a while. */ + preempted = !task_running(rq, p); + task_rq_unlock(rq, &flags); + cpu_relax(); + if (preempted) + yield(); + goto repeat; + } + task_rq_unlock(rq, &flags); +} + +/*** + * kick_process - kick a running thread to enter/exit the kernel + * @p: the to-be-kicked thread + * + * Cause a process which is running on another CPU to enter + * kernel-mode, without any delay. (to get signals handled.) + * + * NOTE: this function doesn't have to take the runqueue lock, + * because all it wants to ensure is that the remote task enters + * the kernel. If the IPI races and the task has been migrated + * to another CPU then no harm is done and the purpose has been + * achieved as well.
+ */ +void kick_process(task_t *p) +{ + int cpu; + + preempt_disable(); + cpu = task_cpu(p); + if ((cpu != smp_processor_id()) && task_curr(p)) + smp_send_reschedule(cpu); + preempt_enable(); +} + +/* + * Return a low guess at the load of a migration-source cpu. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively. + */ +static inline unsigned long source_load(int cpu, int type) +{ + runqueue_t *rq = cpu_rq(cpu); + unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) + return load_now; + + return min(rq->cpu_load[type-1], load_now); +} + +/* + * Return a high guess at the load of a migration-target cpu + */ +static inline unsigned long target_load(int cpu, int type) +{ + runqueue_t *rq = cpu_rq(cpu); + unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) + return load_now; + + return max(rq->cpu_load[type-1], load_now); +} + +/* + * find_idlest_group finds and returns the least busy CPU group within the + * domain. 
+ */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +{ + struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; + unsigned long min_load = ULONG_MAX, this_load = 0; + int load_idx = sd->forkexec_idx; + int imbalance = 100 + (sd->imbalance_pct-100)/2; + + do { + unsigned long load, avg_load; + int local_group; + int i; + + /* Skip over this group if it has no CPUs allowed */ + if (!cpus_intersects(group->cpumask, p->cpus_allowed)) + goto nextgroup; + + local_group = cpu_isset(this_cpu, group->cpumask); + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + + for_each_cpu_mask(i, group->cpumask) { + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = source_load(i, load_idx); + else + load = target_load(i, load_idx); + + avg_load += load; + } + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + this = group; + } else if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } +nextgroup: + group = group->next; + } while (group != sd->groups); + + if (!idlest || 100*this_load < imbalance*min_load) + return NULL; + return idlest; +} + +/* + * find_idlest_cpu - find the idlest allowed cpu among the cpus in group. + */ +static int +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +{ + cpumask_t tmp; + unsigned long load, min_load = ULONG_MAX; + int idlest = -1; + int i; + + /* Traverse only the allowed CPUs */ + cpus_and(tmp, group->cpumask, p->cpus_allowed); + + for_each_cpu_mask(i, tmp) { + load = source_load(i, 0); + + if (load < min_load || (load == min_load && i == this_cpu)) { + min_load = load; + idlest = i; + } + } + + return idlest; +} + +/* + * sched_balance_self: balance the current task (running on cpu) in domains + * that have the 'flag' flag set.
In practice, this is SD_BALANCE_FORK and + * SD_BALANCE_EXEC. + * + * Balance, ie. select the least loaded group. + * + * Returns the target CPU number, or the same CPU if no balancing is needed. + * + * preempt must be disabled. + */ +static int sched_balance_self(int cpu, int flag) +{ + struct task_struct *t = current; + struct sched_domain *tmp, *sd = NULL; + + for_each_domain(cpu, tmp) + if (tmp->flags & flag) + sd = tmp; + + while (sd) { + cpumask_t span; + struct sched_group *group; + int new_cpu; + int weight; + + span = sd->span; + group = find_idlest_group(sd, t, cpu); + if (!group) + goto nextlevel; + + new_cpu = find_idlest_cpu(group, t, cpu); + if (new_cpu == -1 || new_cpu == cpu) + goto nextlevel; + + /* Now try balancing at a lower domain level */ + cpu = new_cpu; +nextlevel: + sd = NULL; + weight = cpus_weight(span); + for_each_domain(cpu, tmp) { + if (weight <= cpus_weight(tmp->span)) + break; + if (tmp->flags & flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } + + return cpu; +} + +#endif /* CONFIG_SMP */ + +/* + * wake_idle() will wake a task on an idle cpu if task->cpu is + * not idle and an idle cpu is available. The span of cpus to + * search starts with cpus closest then further out as needed, + * so we always favor a closer, idle cpu. + * + * Returns the CPU we should wake onto. + */ +#if defined(ARCH_HAS_SCHED_WAKE_IDLE) +static int wake_idle(int cpu, task_t *p) +{ + cpumask_t tmp; + struct sched_domain *sd; + int i; + + if (idle_cpu(cpu)) + return cpu; + + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_IDLE) { + cpus_and(tmp, sd->span, p->cpus_allowed); + for_each_cpu_mask(i, tmp) { + if (idle_cpu(i)) + return i; + } + } + else + break; + } + return cpu; +} +#else +static inline int wake_idle(int cpu, task_t *p) +{ + return cpu; +} +#endif + +/*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread + * @state: the mask of task states that can be woken + * @sync: do a synchronous wakeup? 
+ * + * Put it on the run-queue if it's not already there. The "current" + * thread is always on the run-queue (except when the actual + * re-schedule is in progress), and as such you're allowed to do + * the simpler "current->state = TASK_RUNNING" to mark yourself + * runnable without the overhead of this. + * + * returns failure only if the task is already active. + */ +static int try_to_wake_up(task_t *p, unsigned int state, int sync) +{ + int cpu, this_cpu, success = 0; + unsigned long flags; + long old_state; + runqueue_t *rq; +#ifdef CONFIG_SMP + unsigned long load, this_load; + struct sched_domain *sd, *this_sd = NULL; + int new_cpu; +#endif + + rq = task_rq_lock(p, &flags); + old_state = p->state; + if (!(old_state & state)) + goto out; + + if (p->array) + goto out_running; + + cpu = task_cpu(p); + this_cpu = smp_processor_id(); + +#ifdef CONFIG_SMP + if (unlikely(task_running(rq, p))) + goto out_activate; + + new_cpu = cpu; + + schedstat_inc(rq, ttwu_cnt); + if (cpu == this_cpu) { + schedstat_inc(rq, ttwu_local); + goto out_set_cpu; + } + + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + schedstat_inc(sd, ttwu_wake_remote); + this_sd = sd; + break; + } + } + + if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + goto out_set_cpu; + + /* + * Check for affine wakeup and passive balancing possibilities. 
+ */ + if (this_sd) { + int idx = this_sd->wake_idx; + unsigned int imbalance; + + imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; + + load = source_load(cpu, idx); + this_load = target_load(this_cpu, idx); + + new_cpu = this_cpu; /* Wake to this CPU if we can */ + + if (this_sd->flags & SD_WAKE_AFFINE) { + unsigned long tl = this_load; + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: + */ + if (sync) + tl -= SCHED_LOAD_SCALE; + + if ((tl <= load && + tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || + 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { + /* + * This domain has SD_WAKE_AFFINE and + * p is cache cold in this domain, and + * there is no bad imbalance. + */ + schedstat_inc(this_sd, ttwu_move_affine); + goto out_set_cpu; + } + } + + /* + * Start passive balancing when half the imbalance_pct + * limit is reached. + */ + if (this_sd->flags & SD_WAKE_BALANCE) { + if (imbalance*this_load <= 100*load) { + schedstat_inc(this_sd, ttwu_move_balance); + goto out_set_cpu; + } + } + } + + new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ +out_set_cpu: + new_cpu = wake_idle(new_cpu, p); + if (new_cpu != cpu) { + set_task_cpu(p, new_cpu); + task_rq_unlock(rq, &flags); + /* might preempt at this point */ + rq = task_rq_lock(p, &flags); + old_state = p->state; + if (!(old_state & state)) + goto out; + if (p->array) + goto out_running; + + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + } + +out_activate: +#endif /* CONFIG_SMP */ + if (old_state == TASK_UNINTERRUPTIBLE) { + rq->nr_uninterruptible--; + /* + * Tasks on involuntary sleep don't earn + * sleep_avg beyond just interactive state. + */ + p->activated = -1; + } + + /* + * Tasks that have marked their sleep as noninteractive get + * woken up without updating their sleep average. (i.e. their + * sleep is handled in a priority-neutral manner, no priority + * boost and no penalty.) 
+ */ + if (old_state & TASK_NONINTERACTIVE) + __activate_task(p, rq); + else + activate_task(p, rq, cpu == this_cpu); + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * don't trigger a preemption, if the woken up task will run on + * this cpu. (in this case the 'I will reschedule' promise of + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) + */ + if (!sync || cpu != this_cpu) { + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } + success = 1; + +out_running: + p->state = TASK_RUNNING; +out: + task_rq_unlock(rq, &flags); + + return success; +} + +int fastcall wake_up_process(task_t *p) +{ + return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); +} + +EXPORT_SYMBOL(wake_up_process); + +int fastcall wake_up_state(task_t *p, unsigned int state) +{ + return try_to_wake_up(p, state, 0); +} + +/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. + */ +void fastcall sched_fork(task_t *p, int clone_flags) +{ + int cpu = get_cpu(); + +#ifdef CONFIG_SMP + cpu = sched_balance_self(cpu, SD_BALANCE_FORK); +#endif + set_task_cpu(p, cpu); + + /* + * We mark the process as running here, but have not actually + * inserted it onto the runqueue yet. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_RUNNING; + INIT_LIST_HEAD(&p->run_list); + p->array = NULL; +#ifdef CONFIG_SCHEDSTATS + memset(&p->sched_info, 0, sizeof(p->sched_info)); +#endif +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + p->oncpu = 0; +#endif +#ifdef CONFIG_PREEMPT + /* Want to start with kernel preemption disabled. 
*/ + task_thread_info(p)->preempt_count = 1; +#endif + /* + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. + */ + local_irq_disable(); + p->time_slice = (current->time_slice + 1) >> 1; + /* + * The remainder of the first timeslice might be recovered by + * the parent if the child exits early enough. + */ + p->first_time_slice = 1; + current->time_slice >>= 1; + p->timestamp = sched_clock(); + if (unlikely(!current->time_slice)) { + /* + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the + * runqueue lock is not a problem. + */ + current->time_slice = 1; + scheduler_tick(); + } + local_irq_enable(); + put_cpu(); +} + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. + */ +void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) +{ + unsigned long flags; + int this_cpu, cpu; + runqueue_t *rq, *this_rq; + + rq = task_rq_lock(p, &flags); + BUG_ON(p->state != TASK_RUNNING); + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + + /* + * We decrease the sleep average of forking parents + * and children as well, to keep max-interactive tasks + * from forking tasks that are max-interactive. The parent + * (current) is done further down, under its lock. + */ + p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * + CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + + p->prio = effective_prio(p); + + if (likely(cpu == this_cpu)) { + if (!(clone_flags & CLONE_VM)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. 
+ */ + if (unlikely(!current->array)) + __activate_task(p, rq); + else { + p->prio = current->prio; + list_add_tail(&p->run_list, &current->run_list); + p->array = current->array; + p->array->nr_active++; + rq->nr_running++; + } + set_need_resched(); + } else + /* Run child last */ + __activate_task(p, rq); + /* + * We skip the following code due to cpu == this_cpu + * + * task_rq_unlock(rq, &flags); + * this_rq = task_rq_lock(current, &flags); + */ + this_rq = rq; + } else { + this_rq = cpu_rq(this_cpu); + + /* + * Not the local CPU - must adjust timestamp. This should + * get optimised away in the !CONFIG_SMP case. + */ + p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; + __activate_task(p, rq); + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + + /* + * Parent and child are on different CPUs, now get the + * parent runqueue to update the parent's ->sleep_avg: + */ + task_rq_unlock(rq, &flags); + this_rq = task_rq_lock(current, &flags); + } + current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * + PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + task_rq_unlock(this_rq, &flags); +} + +/* + * Potentially available exiting-child timeslices are + * retrieved here - this way the parent does not get + * penalized for creating too many threads. + * + * (this cannot be used to 'generate' timeslices + * artificially, because any timeslice recovered here + * was given away by the parent in the first place.) + */ +void fastcall sched_exit(task_t *p) +{ + unsigned long flags; + runqueue_t *rq; + + /* + * If the child was a (relative-) CPU hog then decrease + * the sleep_avg of the parent as well.
+ */ + rq = task_rq_lock(p->parent, &flags); + if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { + p->parent->time_slice += p->time_slice; + if (unlikely(p->parent->time_slice > task_timeslice(p))) + p->parent->time_slice = task_timeslice(p); + } + if (p->sleep_avg < p->parent->sleep_avg) + p->parent->sleep_avg = p->parent->sleep_avg / + (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / + (EXIT_WEIGHT + 1); + task_rq_unlock(rq, &flags); +} + +/** + * prepare_task_switch - prepare to switch tasks + * @rq: the runqueue preparing to switch + * @next: the task we are going to switch to. + * + * This is called with the rq lock held and interrupts off. It must + * be paired with a subsequent finish_task_switch after the context + * switch. + * + * prepare_task_switch sets up locking and calls architecture specific + * hooks. + */ +static inline void prepare_task_switch(runqueue_t *rq, task_t *next) +{ + prepare_lock_switch(rq, next); + prepare_arch_switch(next); +} + +/** + * finish_task_switch - clean up after a task-switch + * @rq: runqueue associated with task-switch + * @prev: the thread we just switched away from. + * + * finish_task_switch must be called after the context switch, paired + * with a prepare_task_switch call before the context switch. + * finish_task_switch will reconcile locking set up by prepare_task_switch, + * and do any other architecture-specific cleanup actions. + * + * Note that we may have delayed dropping an mm in context_switch(). If + * so, we finish that here outside of the runqueue lock. (Doing it + * with the lock held can cause deadlocks; see schedule() for + * details.) + */ +static inline void finish_task_switch(runqueue_t *rq, task_t *prev) + __releases(rq->lock) +{ + struct mm_struct *mm = rq->prev_mm; + unsigned long prev_task_flags; + + rq->prev_mm = NULL; + + /* + * A task struct has one reference for the use as "current". 
+ * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and + * calls schedule one last time. The schedule call will never return, + * and the scheduled task must drop that reference. + * The test for EXIT_ZOMBIE must occur while the runqueue locks are + * still held, otherwise prev could be scheduled on another cpu, die + * there before we look at prev->state, and then the reference would + * be dropped twice. + * Manfred Spraul + */ + prev_task_flags = prev->flags; + finish_arch_switch(prev); + finish_lock_switch(rq, prev); + if (mm) + mmdrop(mm); + if (unlikely(prev_task_flags & PF_DEAD)) + put_task_struct(prev); +} + +/** + * schedule_tail - first thing a freshly forked thread must call. + * @prev: the thread we just switched away from. + */ +asmlinkage void schedule_tail(task_t *prev) + __releases(rq->lock) +{ + runqueue_t *rq = this_rq(); + finish_task_switch(rq, prev); +#ifdef __ARCH_WANT_UNLOCKED_CTXSW + /* In this case, finish_task_switch does not reenable preemption */ + preempt_enable(); +#endif + if (current->set_child_tid) + put_user(current->pid, current->set_child_tid); +} + +/* + * context_switch - switch to the new MM and the new + * thread's register state. + */ +static inline +task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) +{ + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; + + if (unlikely(!mm)) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next); + } else + switch_mm(oldmm, mm, next); + + if (unlikely(!prev->mm)) { + prev->active_mm = NULL; + WARN_ON(rq->prev_mm); + rq->prev_mm = oldmm; + } + + /* Here we just switch the register state and the stack. 
*/ + switch_to(prev, next, prev); + + return prev; +} + +/* + * nr_running, nr_uninterruptible and nr_context_switches: + * + * externally visible scheduler statistics: current number of runnable + * threads, current number of uninterruptible-sleeping threads, total + * number of context switches performed since bootup. + */ +unsigned long nr_running(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->nr_running; + + return sum; +} + +unsigned long nr_uninterruptible(void) +{ + unsigned long i, sum = 0; + + for_each_cpu(i) + sum += cpu_rq(i)->nr_uninterruptible; + + /* + * Since we read the counters lockless, it might be slightly + * inaccurate. Do not allow it to go below zero though: + */ + if (unlikely((long)sum < 0)) + sum = 0; + + return sum; +} + +unsigned long long nr_context_switches(void) +{ + unsigned long long i, sum = 0; + + for_each_cpu(i) + sum += cpu_rq(i)->nr_switches; + + return sum; +} + +unsigned long nr_iowait(void) +{ + unsigned long i, sum = 0; + + for_each_cpu(i) + sum += atomic_read(&cpu_rq(i)->nr_iowait); + + return sum; +} + +#ifdef CONFIG_SMP + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + if (rq1 == rq2) { + spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else { + if (rq1 < rq2) { + spin_lock(&rq1->lock); + spin_lock(&rq2->lock); + } else { + spin_lock(&rq2->lock); + spin_lock(&rq1->lock); + } + } +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. 
+ */ +static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + spin_unlock(&rq1->lock); + if (rq1 != rq2) + spin_unlock(&rq2->lock); + else + __release(rq2->lock); +} + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + if (unlikely(!spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + spin_unlock(&this_rq->lock); + spin_lock(&busiest->lock); + spin_lock(&this_rq->lock); + } else + spin_lock(&busiest->lock); + } +} + +/* + * If dest_cpu is allowed for this process and online, migrate the task to it. + * A task that is neither queued nor running simply has its cpu field + * updated; otherwise a request is queued for the runqueue's migration + * thread and we wait for that request to complete. + */ +static void sched_migrate_task(task_t *p, int dest_cpu) +{ + migration_req_t req; + runqueue_t *rq; + unsigned long flags; + + rq = task_rq_lock(p, &flags); + if (!cpu_isset(dest_cpu, p->cpus_allowed) + || unlikely(cpu_is_offline(dest_cpu))) + goto out; + + /* force the process onto the specified CPU */ + if (migrate_task(p, dest_cpu, &req)) { + /* Need to wait for migration thread (might exit: take ref). */ + struct task_struct *mt = rq->migration_thread; + get_task_struct(mt); + task_rq_unlock(rq, &flags); + wake_up_process(mt); + put_task_struct(mt); + wait_for_completion(&req.done); + return; + } +out: + task_rq_unlock(rq, &flags); +} + +/* + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint.
+ */ +void sched_exec(void) +{ + int new_cpu, this_cpu = get_cpu(); + new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); + put_cpu(); + if (new_cpu != this_cpu) + sched_migrate_task(current, new_cpu); +} + +/* + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +static +void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, + runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) +{ + dequeue_task(p, src_array); + src_rq->nr_running--; + set_task_cpu(p, this_cpu); + this_rq->nr_running++; + enqueue_task(p, this_array); + p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + + this_rq->timestamp_last_tick; + /* + * Note that idle threads have a prio of MAX_PRIO, for this test + * to be always true for them. + */ + if (TASK_PREEMPTS_CURR(p, this_rq)) + resched_task(this_rq->curr); +} + +/* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + */ +static +int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, + struct sched_domain *sd, enum idle_type idle, + int *all_pinned) +{ + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 3) are cache-hot on their current CPU. + */ + if (!cpu_isset(this_cpu, p->cpus_allowed)) + return 0; + *all_pinned = 0; + + if (task_running(rq, p)) + return 0; + + /* + * Aggressive migration if: + * 1) task is cache cold, or + * 2) too many balance attempts have failed. + */ + + if (sd->nr_balance_failed > sd->cache_nice_tries) + return 1; + + if (task_hot(p, rq->timestamp_last_tick, sd)) + return 0; + return 1; +} + +/* + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. + * + * Called with both runqueues locked. 
+ */ +static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle, int *all_pinned) +{ + prio_array_t *array, *dst_array; + struct list_head *head, *curr; + int idx, pulled = 0, pinned = 0; + task_t *tmp; + + if (max_nr_move == 0) + goto out; + + pinned = 1; /* assume everything is pinned until can_migrate_task() clears it */ + + /* + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. + */ + if (busiest->expired->nr_active) { + array = busiest->expired; + dst_array = this_rq->expired; + } else { + array = busiest->active; + dst_array = this_rq->active; + } + +new_array: + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx >= MAX_PRIO) { + if (array == busiest->expired && busiest->active->nr_active) { + array = busiest->active; + dst_array = this_rq->active; + goto new_array; + } + goto out; + } + + head = array->queue + idx; + curr = head->prev; /* scan each priority queue from its tail */ +skip_queue: + tmp = list_entry(curr, task_t, run_list); + + curr = curr->prev; + + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + +#ifdef CONFIG_SCHEDSTATS + if (task_hot(tmp, busiest->timestamp_last_tick, sd)) + schedstat_inc(sd, lb_hot_gained[idle]); +#endif + + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pulled++; + + /* We only want to steal up to the prescribed number of tasks. */ + if (pulled < max_nr_move) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } +out: + /* + * Right now, this is the only place pull_task() is called, + * so we can safely collect pull_task() stats here rather than + * inside pull_task().
+ */ + schedstat_add(sd, lb_gained[idle], pulled); + + if (all_pinned) /* callers may pass NULL */ + *all_pinned = pinned; + return pulled; +} + +/* + * find_busiest_group finds and returns the busiest CPU group within the + * domain, or NULL (with *imbalance set to 0) if it is balanced. The number of tasks which should be + * moved to restore balance is returned via the imbalance parameter; *sd_idle is cleared if a non-idle CPU is seen. + */ +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, + unsigned long *imbalance, enum idle_type idle, int *sd_idle) +{ + struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; + unsigned long max_load, avg_load, total_load, this_load, total_pwr; + unsigned long max_pull; + int load_idx; + + max_load = this_load = total_load = total_pwr = 0; + if (idle == NOT_IDLE) + load_idx = sd->busy_idx; + else if (idle == NEWLY_IDLE) + load_idx = sd->newidle_idx; + else + load_idx = sd->idle_idx; + + do { + unsigned long load; + int local_group; + int i; + + local_group = cpu_isset(this_cpu, group->cpumask); + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + + for_each_cpu_mask(i, group->cpumask) { + if (*sd_idle && !idle_cpu(i)) + *sd_idle = 0; + + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = target_load(i, load_idx); + else + load = source_load(i, load_idx); + + avg_load += load; + } + + total_load += avg_load; + total_pwr += group->cpu_power; + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + this = group; + } else if (avg_load > max_load) { + max_load = avg_load; + busiest = group; + } + group = group->next; + } while (group != sd->groups); + + if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) + goto out_balanced; + + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + + if (this_load >= avg_load || + 100*max_load <= sd->imbalance_pct*this_load) + goto out_balanced; + + /* + * We're trying to get all the cpus to
the average load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load, as either of these + * actions would just result in more rebalancing later, and ping-pong + * tasks around. Thus we look for the minimum possible imbalance. + * Negative imbalances (*we* are more loaded than anyone else) will + * be counted as no imbalance for these purposes -- we can't fix that + * by pulling tasks to us. Be careful of negative numbers as they'll + * appear as very large values with unsigned longs. + */ + + /* Don't want to pull so many tasks that a group would go idle */ + max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); + + /* How much load to actually move to equalise the imbalance */ + *imbalance = min(max_pull * busiest->cpu_power, + (avg_load - this_load) * this->cpu_power) + / SCHED_LOAD_SCALE; + + if (*imbalance < SCHED_LOAD_SCALE) { + unsigned long pwr_now = 0, pwr_move = 0; + unsigned long tmp; + + if (max_load - this_load >= SCHED_LOAD_SCALE*2) { + *imbalance = 1; + return busiest; + } + + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them.
+ */ + + pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); + pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); + pwr_now /= SCHED_LOAD_SCALE; + + /* Amount of load we'd subtract */ + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; + if (max_load > tmp) + pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, + max_load - tmp); + + /* Amount of load we'd add */ + if (max_load*busiest->cpu_power < + SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) + tmp = max_load*busiest->cpu_power/this->cpu_power; + else + tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; + pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); + pwr_move /= SCHED_LOAD_SCALE; + + /* Move if we gain throughput */ + if (pwr_move <= pwr_now) + goto out_balanced; + + *imbalance = 1; + return busiest; + } + + /* Get rid of the scaling factor, rounding down as we divide */ + *imbalance = *imbalance / SCHED_LOAD_SCALE; + return busiest; + +out_balanced: + + *imbalance = 0; + return NULL; +} + +/* + * find_busiest_queue - find the busiest runqueue among the cpus in group. Returns NULL if no cpu in the group has a nonzero source_load(); the idle argument is not used here. + */ +static runqueue_t *find_busiest_queue(struct sched_group *group, + enum idle_type idle) +{ + unsigned long load, max_load = 0; + runqueue_t *busiest = NULL; + int i; + + for_each_cpu_mask(i, group->cpumask) { + load = source_load(i, 0); + + if (load > max_load) { + max_load = load; + busiest = cpu_rq(i); + } + } + + return busiest; +} + +/* + * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but + * fine so long as it is large enough. + */ +#define MAX_PINNED_INTERVAL 512 + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + * + * Called with this_rq unlocked.
+ */ +static int load_balance(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, enum idle_type idle) +{ + struct sched_group *group; + runqueue_t *busiest; + unsigned long imbalance; + int nr_moved, all_pinned = 0; + int active_balance = 0; + int sd_idle = 0; + + if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) + sd_idle = 1; + + schedstat_inc(sd, lb_cnt[idle]); + + group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); + if (!group) { + schedstat_inc(sd, lb_nobusyg[idle]); + goto out_balanced; + } + + busiest = find_busiest_queue(group, idle); + if (!busiest) { + schedstat_inc(sd, lb_nobusyq[idle]); + goto out_balanced; + } + + BUG_ON(busiest == this_rq); + + schedstat_add(sd, lb_imbalance[idle], imbalance); + + nr_moved = 0; + if (busiest->nr_running > 1) { + /* + * Attempt to move tasks. If find_busiest_group has found + * an imbalance but busiest->nr_running <= 1, the group is + * still unbalanced. nr_moved simply stays zero, so it is + * correctly treated as an imbalance.
+ */ + double_rq_lock(this_rq, busiest); + nr_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, idle, &all_pinned); + double_rq_unlock(this_rq, busiest); + + /* All tasks examined on the busiest runqueue were pinned by CPU affinity */ + if (unlikely(all_pinned)) + goto out_balanced; + } + + if (!nr_moved) { + schedstat_inc(sd, lb_failed[idle]); + sd->nr_balance_failed++; + + if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { + + spin_lock(&busiest->lock); + + /* don't kick the migration_thread, if the curr + * task on busiest cpu can't be moved to this_cpu + */ + if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + spin_unlock(&busiest->lock); + all_pinned = 1; + goto out_one_pinned; + } + + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + active_balance = 1; + } + spin_unlock(&busiest->lock); + if (active_balance) + wake_up_process(busiest->migration_thread); + + /* + * We've kicked active balancing, reset the failure + * counter. + */ + sd->nr_balance_failed = sd->cache_nice_tries+1; + } + } else + sd->nr_balance_failed = 0; + + if (likely(!active_balance)) { + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; + } else { + /* + * If we've begun active balancing, start to back off. This + * case may not be covered by the all_pinned logic if there + * is only 1 task on the busy runqueue (because we don't call + * move_tasks).
+ */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; + } + + if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) + return -1; + return nr_moved; + +out_balanced: + schedstat_inc(sd, lb_balanced[idle]); + + sd->nr_balance_failed = 0; + +out_one_pinned: + /* back off: double the balancing interval while it is below its limit */ + if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || + (sd->balance_interval < sd->max_interval)) + sd->balance_interval *= 2; + + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + return -1; + return 0; +} + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + * + * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). + * this_rq is locked. Returns the number of tasks moved, or -1 if none were moved, sd_idle is clear and sd has SD_SHARE_CPUPOWER set. + */ +static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, + struct sched_domain *sd) +{ + struct sched_group *group; + runqueue_t *busiest = NULL; + unsigned long imbalance; + int nr_moved = 0; + int sd_idle = 0; + + if (sd->flags & SD_SHARE_CPUPOWER) + sd_idle = 1; + + schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); + group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); + if (!group) { + schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); + goto out_balanced; + } + + busiest = find_busiest_queue(group, NEWLY_IDLE); + if (!busiest) { + schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); + goto out_balanced; + } + + BUG_ON(busiest == this_rq); + + schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); + + nr_moved = 0; + if (busiest->nr_running > 1) { + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); + nr_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, NEWLY_IDLE, NULL); + spin_unlock(&busiest->lock); + } + + if (!nr_moved) { + schedstat_inc(sd, lb_failed[NEWLY_IDLE]); + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + return -1; + } else + sd->nr_balance_failed = 0; + + return nr_moved; + +out_balanced: + schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + if (!sd_idle
&& sd->flags & SD_SHARE_CPUPOWER) + return -1; + sd->nr_balance_failed = 0; + return 0; +} + +/* + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. + */ +static void idle_balance(int this_cpu, runqueue_t *this_rq) +{ + struct sched_domain *sd; + + for_each_domain(this_cpu, sd) { + if (sd->flags & SD_BALANCE_NEWIDLE) { + if (load_balance_newidle(this_cpu, this_rq, sd)) { + /* Nonzero: tasks were pulled over (or -1 was returned), so stop searching */ + break; + } + } + } +} + +/* + * active_load_balance is run by migration threads. It pushes running tasks + * off the busiest CPU onto idle CPUs. It requires at least 1 task to be + * running on each physical CPU where possible, and avoids physical / + * logical imbalances. + * + * Called with busiest_rq locked; the lock of the push_cpu runqueue is taken and dropped here. + */ +static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) +{ + struct sched_domain *sd; + runqueue_t *target_rq; + int target_cpu = busiest_rq->push_cpu; + + if (busiest_rq->nr_running <= 1) + /* no task to move */ + return; + + target_rq = cpu_rq(target_cpu); + + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by + * Bjorn Helgaas on a 128-cpu setup. + */ + BUG_ON(busiest_rq == target_rq); + + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); + + /* Search for an sd spanning us and the target CPU. */ + for_each_domain(target_cpu, sd) + if ((sd->flags & SD_LOAD_BALANCE) && + cpu_isset(busiest_cpu, sd->span)) + break; + + if (unlikely(sd == NULL)) + goto out; + + schedstat_inc(sd, alb_cnt); + + if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) + schedstat_inc(sd, alb_pushed); + else + schedstat_inc(sd, alb_failed); +out: + spin_unlock(&target_rq->lock); +} + +/* + * rebalance_tick will get called every timer tick, on every CPU.
+ * + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in arch_init_sched_domains. + */ + +/* Don't have all balancing operations going off at once: each cpu compares against jiffies skewed by this offset */ +#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) + +static void rebalance_tick(int this_cpu, runqueue_t *this_rq, + enum idle_type idle) +{ + unsigned long old_load, this_load; + unsigned long j = jiffies + CPU_OFFSET(this_cpu); + struct sched_domain *sd; + int i; + + this_load = this_rq->nr_running * SCHED_LOAD_SCALE; + /* Update our load: cpu_load[i] averages in the new load with weight 1/2^i */ + for (i = 0; i < 3; i++) { + unsigned long new_load = this_load; + int scale = 1 << i; + old_load = this_rq->cpu_load[i]; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale-1; + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; + } + + for_each_domain(this_cpu, sd) { + unsigned long interval; + + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + interval = sd->balance_interval; + if (idle != SCHED_IDLE) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = msecs_to_jiffies(interval); + if (unlikely(!interval)) + interval = 1; + + if (j - sd->last_balance >= interval) { + if (load_balance(this_cpu, this_rq, sd, idle)) { + /* + * We've pulled tasks over so either we're no + * longer idle, or one of our SMT siblings is + * not idle.
+ */ + idle = NOT_IDLE; + } + sd->last_balance += interval; + } + } +} +#else +/* + * on UP we do not need to balance between CPUs: + */ +static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) +{ +} +static inline void idle_balance(int cpu, runqueue_t *rq) +{ +} +#endif + +static inline int wake_priority_sleeper(runqueue_t *rq) +{ + int ret = 0; /* becomes 1 if the idle task was asked to reschedule */ +#ifdef CONFIG_SCHED_SMT + spin_lock(&rq->lock); + /* + * If an SMT sibling task has been put to sleep for priority + * reasons reschedule the idle task to see if it can now run. + */ + if (rq->nr_running) { + resched_task(rq->idle); + ret = 1; + } + spin_unlock(&rq->lock); +#endif + return ret; +} + +DEFINE_PER_CPU(struct kernel_stat, kstat); + +EXPORT_PER_CPU_SYMBOL(kstat); + +/* + * This is called on clock ticks and on context switches. + * Bank in p->sched_time the ns elapsed since the last tick or switch. + */ +static inline void update_cpu_clock(task_t *p, runqueue_t *rq, + unsigned long long now) +{ + unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); + p->sched_time += now - last; +} + +/* + * Return tsk->sched_time plus any more ns on the sched_clock + * that have not yet been banked. + */ +unsigned long long current_sched_time(const task_t *tsk) +{ + unsigned long long ns; + unsigned long flags; + local_irq_save(flags); + ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick); + ns = tsk->sched_time + (sched_clock() - ns); + local_irq_restore(flags); + return ns; +} + +/* + * We place interactive tasks back into the active array, if possible. + * + * To guarantee that this does not starve expired tasks we ignore the + * interactivity of a task if the first expired task had to wait more + * than a 'reasonable' amount of time. This deadline timeout is + * load-dependent, as the frequency of array switched decreases with + * increasing number of running tasks.
We also ignore the interactivity + * if a better static_prio task has expired: + */ +#define EXPIRED_STARVING(rq) \ + ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ + (jiffies - (rq)->expired_timestamp >= \ + STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ + ((rq)->curr->static_prio > (rq)->best_expired_prio)) + +/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user space since the last update + * The time is added to cpustat->nice when TASK_NICE(p) > 0, else to cpustat->user. + */ +void account_user_time(struct task_struct *p, cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t tmp; + + p->utime = cputime_add(p->utime, cputime); + + /* Add user time to cpustat. */ + tmp = cputime_to_cputime64(cputime); + if (TASK_NICE(p) > 0) + cpustat->nice = cputime64_add(cpustat->nice, tmp); + else + cpustat->user = cputime64_add(cpustat->user, tmp); +} + +/* + * Account system cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + */ +void account_system_time(struct task_struct *p, int hardirq_offset, + cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + runqueue_t *rq = this_rq(); + cputime64_t tmp; + + p->stime = cputime_add(p->stime, cputime); + + /* Add system time to cpustat.
*/ + tmp = cputime_to_cputime64(cputime); + if (hardirq_count() - hardirq_offset) + cpustat->irq = cputime64_add(cpustat->irq, tmp); + else if (softirq_count()) + cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + else if (p != rq->idle) + cpustat->system = cputime64_add(cpustat->system, tmp); + else if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, tmp); + else + cpustat->idle = cputime64_add(cpustat->idle, tmp); + /* Account for system time used */ + acct_update_integrals(p); +} + +/* + * Account for involuntary wait time. + * @p: the process from which the cpu time has been stolen + * @steal: the cpu time spent in involuntary wait + */ +void account_steal_time(struct task_struct *p, cputime_t steal) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t tmp = cputime_to_cputime64(steal); + runqueue_t *rq = this_rq(); + + if (p == rq->idle) { /* stolen from the idle task: book it as iowait or idle time instead */ + p->stime = cputime_add(p->stime, steal); + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, tmp); + else + cpustat->idle = cputime64_add(cpustat->idle, tmp); + } else + cpustat->steal = cputime64_add(cpustat->steal, tmp); +} + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + * + * It also gets called by the fork code, when changing the parent's + * timeslices.
+ */ +void scheduler_tick(void) +{ + int cpu = smp_processor_id(); + runqueue_t *rq = this_rq(); + task_t *p = current; + unsigned long long now = sched_clock(); + + update_cpu_clock(p, rq, now); + + rq->timestamp_last_tick = now; + + if (p == rq->idle) { + if (wake_priority_sleeper(rq)) + goto out; /* the idle task was asked to reschedule: rebalance as NOT_IDLE */ + rebalance_tick(cpu, rq, SCHED_IDLE); + return; + } + + /* Task might have expired already, but not scheduled off yet */ + if (p->array != rq->active) { + set_tsk_need_resched(p); + goto out; + } + spin_lock(&rq->lock); + /* + * The task was running during this tick - update the + * time slice counter. Note: we do not update a thread's + * priority until it either goes to sleep or uses up its + * timeslice. This makes it possible for interactive tasks + * to use up their timeslices at their highest priority levels. + */ + if (rt_task(p)) { + /* + * RR tasks need a special form of timeslice management. + * FIFO tasks have no timeslices. + */ + if ((p->policy == SCHED_RR) && !--p->time_slice) { + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; + set_tsk_need_resched(p); + + /* put it at the end of the queue: */ + requeue_task(p, rq->active); + } + goto out_unlock; + } + if (!--p->time_slice) { + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; + + if (!rq->expired_timestamp) + rq->expired_timestamp = jiffies; + if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { + enqueue_task(p, rq->expired); + if (p->static_prio < rq->best_expired_prio) + rq->best_expired_prio = p->static_prio; + } else + enqueue_task(p, rq->active); + } else { + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority.
(one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. + * + * This only applies to tasks in the interactive + * delta range with at least TIMESLICE_GRANULARITY to requeue. + */ + if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - + p->time_slice) % TIMESLICE_GRANULARITY(p)) && + (p->time_slice >= TIMESLICE_GRANULARITY(p)) && + (p->array == rq->active)) { + + requeue_task(p, rq->active); + set_tsk_need_resched(p); + } + } +out_unlock: + spin_unlock(&rq->lock); +out: + rebalance_tick(cpu, rq, NOT_IDLE); +} + +#ifdef CONFIG_SCHED_SMT +static inline void wakeup_busy_runqueue(runqueue_t *rq) +{ + /* If an SMT runqueue is sleeping due to priority reasons wake it up */ + if (rq->curr == rq->idle && rq->nr_running) + resched_task(rq->idle); +} + +static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) +{ + struct sched_domain *tmp, *sd = NULL; + cpumask_t sibling_map; + int i; + + for_each_domain(this_cpu, tmp) + if (tmp->flags & SD_SHARE_CPUPOWER) + sd = tmp; + + if (!sd) + return; + + /* + * Unlock the current runqueue because we have to lock in + * CPU order to avoid deadlocks. Caller knows that we might + * unlock. We keep IRQs disabled. + */ + spin_unlock(&this_rq->lock); + + sibling_map = sd->span; + + for_each_cpu_mask(i, sibling_map) + spin_lock(&cpu_rq(i)->lock); + /* + * We clear this CPU from the mask.
This both simplifies the + * inner loop and keeps this_rq locked when we exit: + */ + cpu_clear(this_cpu, sibling_map); + + for_each_cpu_mask(i, sibling_map) { + runqueue_t *smt_rq = cpu_rq(i); + + wakeup_busy_runqueue(smt_rq); + } + + for_each_cpu_mask(i, sibling_map) + spin_unlock(&cpu_rq(i)->lock); + /* + * We exit with this_cpu's rq still held and IRQs + * still disabled: + */ +} + +/* + * number of 'lost' timeslices this task won't be able to fully + * utilize, if another task runs on a sibling. This models the + * slowdown effect of other tasks running on siblings: + */ +static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) +{ + return p->time_slice * (100 - sd->per_cpu_gain) / 100; +} + +static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) +{ + struct sched_domain *tmp, *sd = NULL; + cpumask_t sibling_map; + prio_array_t *array; + int ret = 0, i; + task_t *p; + + for_each_domain(this_cpu, tmp) + if (tmp->flags & SD_SHARE_CPUPOWER) + sd = tmp; + + if (!sd) + return 0; + + /* + * The same locking rules and details apply as for + * wake_sleeping_dependent(): + */ + spin_unlock(&this_rq->lock); + sibling_map = sd->span; + for_each_cpu_mask(i, sibling_map) + spin_lock(&cpu_rq(i)->lock); + cpu_clear(this_cpu, sibling_map); + + /* + * Establish next task to be run - it might have gone away because + * we released the runqueue lock above: + */ + if (!this_rq->nr_running) + goto out_unlock; + array = this_rq->active; + if (!array->nr_active) + array = this_rq->expired; + BUG_ON(!array->nr_active); + + p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, + task_t, run_list); + + for_each_cpu_mask(i, sibling_map) { + runqueue_t *smt_rq = cpu_rq(i); + task_t *smt_curr = smt_rq->curr; + + /* Kernel threads do not participate in dependent sleeping */ + if (!p->mm || !smt_curr->mm || rt_task(p)) + goto check_smt_task; + + /* + * If a user task with lower static priority than the + * running task on the SMT sibling is trying to
schedule, + * delay it till there is proportionately less timeslice + * left of the sibling task to prevent a lower priority + * task from using an unfair proportion of the + * physical cpu's resources. -ck + */ + if (rt_task(smt_curr)) { + /* + * With real time tasks we run non-rt tasks only + * per_cpu_gain% of the time. + */ + if ((jiffies % DEF_TIMESLICE) > + (sd->per_cpu_gain * DEF_TIMESLICE / 100)) + ret = 1; + } else + if (smt_curr->static_prio < p->static_prio && + !TASK_PREEMPTS_CURR(p, smt_rq) && + smt_slice(smt_curr, sd) > task_timeslice(p)) + ret = 1; + +check_smt_task: + if ((!smt_curr->mm && smt_curr != smt_rq->idle) || + rt_task(smt_curr)) + continue; + if (!p->mm) { + wakeup_busy_runqueue(smt_rq); + continue; + } + + /* + * Reschedule a lower priority task on the SMT sibling for + * it to be put to sleep, or wake it up if it has been put to + * sleep for priority reasons to see if it should run now. + */ + if (rt_task(p)) { + if ((jiffies % DEF_TIMESLICE) > + (sd->per_cpu_gain * DEF_TIMESLICE / 100)) + resched_task(smt_curr); + } else { + if (TASK_PREEMPTS_CURR(p, smt_rq) && + smt_slice(p, sd) > task_timeslice(smt_curr)) + resched_task(smt_curr); + else + wakeup_busy_runqueue(smt_rq); + } + } +out_unlock: + for_each_cpu_mask(i, sibling_map) + spin_unlock(&cpu_rq(i)->lock); + return ret; +} +#else +static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) +{ +} + +static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) +{ + return 0; +} +#endif + +#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) + +void fastcall add_preempt_count(int val) +{ + /* + * Underflow? + */ + BUG_ON((preempt_count() < 0)); + preempt_count() += val; + /* + * Spinlock count overflowing soon? + */ + BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); +} +EXPORT_SYMBOL(add_preempt_count); + +void fastcall sub_preempt_count(int val) +{ + /* + * Underflow?
+ */ + BUG_ON(val > preempt_count()); + /* + * Is the spinlock portion underflowing? + */ + BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); + preempt_count() -= val; +} +EXPORT_SYMBOL(sub_preempt_count); + +#endif + +/* + * schedule() is the main scheduler function. + */ +asmlinkage void __sched schedule(void) +{ + long *switch_count; + task_t *prev, *next; + runqueue_t *rq; + prio_array_t *array; + struct list_head *queue; + unsigned long long now; + unsigned long run_time; + int cpu, idx, new_prio; + + /* + * Test if we are atomic. Since do_exit() needs to call into + * schedule() atomically, we ignore that path for now. + * Otherwise, whine if we are scheduling when we should not be. + */ + if (likely(!current->exit_state)) { + if (unlikely(in_atomic())) { + printk(KERN_ERR "scheduling while atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + dump_stack(); + } + } + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); + +need_resched: + preempt_disable(); + prev = current; + release_kernel_lock(prev); +need_resched_nonpreemptible: + rq = this_rq(); + + /* + * The idle thread is not allowed to schedule! + * Remove this check after it has been exercised a bit. + */ + if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + dump_stack(); + } + + schedstat_inc(rq, sched_cnt); + now = sched_clock(); + if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { + run_time = now - prev->timestamp; + if (unlikely((long long)(now - prev->timestamp) < 0)) + run_time = 0; + } else + run_time = NS_MAX_SLEEP_AVG; + + /* + * Tasks charged proportionately less run_time at high sleep_avg to + * delay them losing their interactive status + */ + run_time /= (CURRENT_BONUS(prev) ?
: 1); + + spin_lock_irq(&rq->lock); + + if (unlikely(prev->flags & PF_DEAD)) + prev->state = EXIT_DEAD; + + switch_count = &prev->nivcsw; + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + switch_count = &prev->nvcsw; + if (unlikely((prev->state & TASK_INTERRUPTIBLE) && + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else { + if (prev->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + deactivate_task(prev, rq); + } + } + + cpu = smp_processor_id(); + if (unlikely(!rq->nr_running)) { +go_idle: + idle_balance(cpu, rq); + if (!rq->nr_running) { + next = rq->idle; + rq->expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + /* + * wake_sleeping_dependent() might have released + * the runqueue, so break out if we got new + * tasks meanwhile: + */ + if (!rq->nr_running) + goto switch_tasks; + } + } else { + if (dependent_sleeper(cpu, rq)) { + next = rq->idle; + goto switch_tasks; + } + /* + * dependent_sleeper() releases and reacquires the runqueue + * lock, hence go into the idle loop if the rq went + * empty meanwhile: + */ + if (unlikely(!rq->nr_running)) + goto go_idle; + } + + array = rq->active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays.
+ */ + schedstat_inc(rq, sched_switch); + rq->active = rq->expired; + rq->expired = array; + array = rq->active; + rq->expired_timestamp = 0; + rq->best_expired_prio = MAX_PRIO; + } + + idx = sched_find_first_bit(array->bitmap); + queue = array->queue + idx; + next = list_entry(queue->next, task_t, run_list); + + if (!rt_task(next) && next->activated > 0) { + unsigned long long delta = now - next->timestamp; + if (unlikely((long long)(now - next->timestamp) < 0)) + delta = 0; + + if (next->activated == 1) + delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; + + array = next->array; + new_prio = recalc_task_prio(next, next->timestamp + delta); + + if (unlikely(next->prio != new_prio)) { + dequeue_task(next, array); + next->prio = new_prio; + enqueue_task(next, array); + } else + requeue_task(next, array); + } + next->activated = 0; +switch_tasks: + if (next == rq->idle) + schedstat_inc(rq, sched_goidle); + prefetch(next); + prefetch_stack(next); + clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); + + update_cpu_clock(prev, rq, now); + + prev->sleep_avg -= run_time; + if ((long)prev->sleep_avg <= 0) + prev->sleep_avg = 0; + prev->timestamp = prev->last_ran = now; + + sched_info_switch(prev, next); + if (likely(prev != next)) { + next->timestamp = now; + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + + prepare_task_switch(rq, next); + prev = context_switch(rq, prev, next); + barrier(); + /* + * this_rq must be evaluated again because prev may have moved + * CPUs since it called schedule(), thus the 'rq' on its stack + * frame will be invalid.
+ */ + finish_task_switch(this_rq(), prev); + } else + spin_unlock_irq(&rq->lock); + + prev = current; + if (unlikely(reacquire_kernel_lock(prev) < 0)) + goto need_resched_nonpreemptible; + preempt_enable_no_resched(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; +} + +EXPORT_SYMBOL(schedule); + +#ifdef CONFIG_PREEMPT +/* + * this is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. Kernel preemptions off return from interrupt + * occur there and call schedule directly. + */ +asmlinkage void __sched preempt_schedule(void) +{ + struct thread_info *ti = current_thread_info(); +#ifdef CONFIG_PREEMPT_BKL + struct task_struct *task = current; + int saved_lock_depth; +#endif + /* + * If there is a non-zero preempt_count or interrupts are disabled, + * we do not want to preempt the current task. Just return. + */ + if (unlikely(ti->preempt_count || irqs_disabled())) + return; + +need_resched: + add_preempt_count(PREEMPT_ACTIVE); + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesn't + * auto-release the semaphore: + */ +#ifdef CONFIG_PREEMPT_BKL + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; +#endif + schedule(); +#ifdef CONFIG_PREEMPT_BKL + task->lock_depth = saved_lock_depth; +#endif + sub_preempt_count(PREEMPT_ACTIVE); + + /* we could miss a preemption opportunity between schedule and now */ + barrier(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; +} + +EXPORT_SYMBOL(preempt_schedule); + +/* + * this is is the entry point to schedule() from kernel preemption + * off of irq context. + * Note, that this is called and return with irqs disabled. This will + * protect us against recursive calling from irq.
+ */ +asmlinkage void __sched preempt_schedule_irq(void) +{ + struct thread_info *ti = current_thread_info(); +#ifdef CONFIG_PREEMPT_BKL + struct task_struct *task = current; + int saved_lock_depth; +#endif + /* Catch callers which need to be fixed */ + BUG_ON(ti->preempt_count || !irqs_disabled()); + +need_resched: + add_preempt_count(PREEMPT_ACTIVE); + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesn't + * auto-release the semaphore: + */ +#ifdef CONFIG_PREEMPT_BKL + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; +#endif + /* irqs are enabled only for the duration of the schedule() call */ local_irq_enable(); + schedule(); + local_irq_disable(); +#ifdef CONFIG_PREEMPT_BKL + task->lock_depth = saved_lock_depth; +#endif + sub_preempt_count(PREEMPT_ACTIVE); + + /* we could miss a preemption opportunity between schedule and now */ + barrier(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; +} + +#endif /* CONFIG_PREEMPT */ + /* + * default_wake_function - wake the task stored in curr->private. + * Hands @mode and @sync to try_to_wake_up() and returns its result; + * @key is not used. + */ +int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, + void *key) +{ + task_t *p = curr->private; + return try_to_wake_up(p, mode, sync); +} + +EXPORT_SYMBOL(default_wake_function); + +/* + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. + * + * There are circumstances in which we can try to wake a task which has already + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */ +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int sync, void *key) +{ + struct list_head *tmp, *next; + + list_for_each_safe(tmp, next, &q->task_list) { + wait_queue_t *curr; + unsigned flags; + curr = list_entry(tmp, wait_queue_t, task_list); + /* sample the flags before ->func() is invoked */ flags = curr->flags; + /* stop after the nr_exclusive-th successful exclusive wakeup */ if (curr->func(curr, mode, sync, key) && + (flags & WQ_FLAG_EXCLUSIVE) && + !--nr_exclusive) + break; + } +} + +/** + * __wake_up - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: is directly passed to the wakeup function + * + * Takes q->lock (irq-saving) around the scan and passes sync == 0. + */ +void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, 0, key); + spin_unlock_irqrestore(&q->lock, flags); +} + +EXPORT_SYMBOL(__wake_up); + +/* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. + * Always passes nr_exclusive == 1, sync == 0 and a NULL key. + */ +void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) +{ + __wake_up_common(q, mode, 1, 0, NULL); +} + +/** + * __wake_up_sync - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * + * The sync wakeup differs that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption.
+ */ +void fastcall +__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + unsigned long flags; + int sync = 1; + + /* a NULL queue is silently ignored */ if (unlikely(!q)) + return; + + /* nr_exclusive == 0 turns the sync hint off */ if (unlikely(!nr_exclusive)) + sync = 0; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, sync, NULL); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ + /* + * complete - increment x->done and call __wake_up_common() with + * nr_exclusive == 1, all under x->wait.lock. + */ +void fastcall complete(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done++; + __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 1, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete); + /* + * complete_all - add UINT_MAX/2 to x->done and call __wake_up_common() + * with nr_exclusive == 0, all under x->wait.lock. + */ +void fastcall complete_all(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += UINT_MAX/2; + __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 0, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_all); + /* + * wait_for_completion - sleep in TASK_UNINTERRUPTIBLE until x->done is + * non-zero, then decrement it. The waiter is queued as an exclusive + * entry at the tail of x->wait. + */ +void fastcall __sched wait_for_completion(struct completion *x) +{ + might_sleep(); + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + __set_current_state(TASK_UNINTERRUPTIBLE); + /* the lock is dropped while sleeping and retaken before ->done is re-read */ spin_unlock_irq(&x->wait.lock); + schedule(); + spin_lock_irq(&x->wait.lock); + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; + spin_unlock_irq(&x->wait.lock); +} +EXPORT_SYMBOL(wait_for_completion); + +unsigned long fastcall __sched +wait_for_completion_timeout(struct completion *x, unsigned long timeout) +{ + might_sleep(); + + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + timeout = schedule_timeout(timeout); +
spin_lock_irq(&x->wait.lock); + if (!timeout) { + __remove_wait_queue(&x->wait, &wait); + goto out; + } + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; +out: + spin_unlock_irq(&x->wait.lock); + return timeout; +} +EXPORT_SYMBOL(wait_for_completion_timeout); + /* + * wait_for_completion_interruptible - like wait_for_completion(), but + * sleeps in TASK_INTERRUPTIBLE. + * + * Returns 0 after consuming one x->done count, or -ERESTARTSYS (without + * touching x->done) if a signal is pending while x->done is still zero. + */ +int fastcall __sched wait_for_completion_interruptible(struct completion *x) +{ + int ret = 0; + + might_sleep(); + + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + __remove_wait_queue(&x->wait, &wait); + goto out; + } + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + schedule(); + spin_lock_irq(&x->wait.lock); + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; +out: + spin_unlock_irq(&x->wait.lock); + + return ret; +} +EXPORT_SYMBOL(wait_for_completion_interruptible); + /* + * wait_for_completion_interruptible_timeout - interruptible wait bounded + * by @timeout (handed to schedule_timeout()). + * + * Returns the remaining timeout after consuming one x->done count, 0 if + * the timeout ran out first, or -ERESTARTSYS on a pending signal. Note + * that -ERESTARTSYS is stored in the unsigned long return value. + */ +unsigned long fastcall __sched +wait_for_completion_interruptible_timeout(struct completion *x, + unsigned long timeout) +{ + might_sleep(); + + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + if (signal_pending(current)) { + timeout = -ERESTARTSYS; + __remove_wait_queue(&x->wait, &wait); + goto out; + } + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + timeout = schedule_timeout(timeout); + spin_lock_irq(&x->wait.lock); + if (!timeout) { + __remove_wait_queue(&x->wait, &wait); + goto out; + } + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; +out: + spin_unlock_irq(&x->wait.lock); + return timeout; +} +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); + + /* Building blocks used by the sleep_on() family: declarations, enqueue, dequeue. */ +#define SLEEP_ON_VAR \ + unsigned long flags; \ + wait_queue_t wait; \ + init_waitqueue_entry(&wait, current); + +#define SLEEP_ON_HEAD \
+ spin_lock_irqsave(&q->lock,flags); \ + __add_wait_queue(q, &wait); \ + spin_unlock(&q->lock); + +#define SLEEP_ON_TAIL \ + spin_lock_irq(&q->lock); \ + __remove_wait_queue(q, &wait); \ + spin_unlock_irqrestore(&q->lock, flags); + /* + * interruptible_sleep_on - set TASK_INTERRUPTIBLE, queue the current task + * on @q, call schedule(), then remove it from @q again. + */ +void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) +{ + SLEEP_ON_VAR + + current->state = TASK_INTERRUPTIBLE; + + SLEEP_ON_HEAD + schedule(); + SLEEP_ON_TAIL +} + +EXPORT_SYMBOL(interruptible_sleep_on); + /* + * interruptible_sleep_on_timeout - as interruptible_sleep_on(), but sleeps + * via schedule_timeout(@timeout) and returns its result. + */ +long fastcall __sched +interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + SLEEP_ON_VAR + + current->state = TASK_INTERRUPTIBLE; + + SLEEP_ON_HEAD + timeout = schedule_timeout(timeout); + SLEEP_ON_TAIL + + return timeout; +} + +EXPORT_SYMBOL(interruptible_sleep_on_timeout); + /* + * sleep_on - set TASK_UNINTERRUPTIBLE, queue the current task on @q, + * call schedule(), then remove it from @q again. + */ +void fastcall __sched sleep_on(wait_queue_head_t *q) +{ + SLEEP_ON_VAR + + current->state = TASK_UNINTERRUPTIBLE; + + SLEEP_ON_HEAD + schedule(); + SLEEP_ON_TAIL +} + +EXPORT_SYMBOL(sleep_on); + /* + * sleep_on_timeout - as sleep_on(), but sleeps via + * schedule_timeout(@timeout) and returns its result. + */ +long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + SLEEP_ON_VAR + + current->state = TASK_UNINTERRUPTIBLE; + + SLEEP_ON_HEAD + timeout = schedule_timeout(timeout); + SLEEP_ON_TAIL + + return timeout; +} + +EXPORT_SYMBOL(sleep_on_timeout); + /* + * set_user_nice - change the nice value of task @p to @nice. + * Does nothing if @nice equals the current value or lies outside [-20, 19]. + */ +void set_user_nice(task_t *p, long nice) +{ + unsigned long flags; + prio_array_t *array; + runqueue_t *rq; + int old_prio, new_prio, delta; + + if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU.
+ */ + rq = task_rq_lock(p, &flags); + /* + * The RT priorities are set via sched_setscheduler(), but we still + * allow the 'normal' nice value to be set. For a task that is + * currently rt_task() only ->static_prio is updated here; ->prio + * and the runqueue position are left alone: + */ + if (rt_task(p)) { + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } + array = p->array; + /* a queued task is taken off its array while its priority changes */ if (array) + dequeue_task(p, array); + + old_prio = p->prio; + new_prio = NICE_TO_PRIO(nice); + delta = new_prio - old_prio; + p->static_prio = NICE_TO_PRIO(nice); + p->prio += delta; + + if (array) { + enqueue_task(p, array); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + if (delta < 0 || (delta > 0 && task_running(rq, p))) + resched_task(rq->curr); + } +out_unlock: + task_rq_unlock(rq, &flags); +} + +EXPORT_SYMBOL(set_user_nice); + +/* + * can_nice - check if a task can reduce its nice value + * @p: task + * @nice: nice value + * + * Returns non-zero if 20 - @nice does not exceed the task's RLIMIT_NICE + * soft limit, or if the caller is capable(CAP_SYS_NICE). + */ +int can_nice(const task_t *p, const int nice) +{ + /* convert nice value [19,-20] to rlimit style value [1,40] */ + int nice_rlim = 20 - nice; + return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || + capable(CAP_SYS_NICE)); +} + +#ifdef __ARCH_WANT_SYS_NICE + +/* + * sys_nice - change the priority of the current process. + * @increment: priority increment + * + * sys_setpriority is a more generic, but much slower function that + * does similar things. + */ +asmlinkage long sys_nice(int increment) +{ + int retval; + long nice; + + /* + * Setpriority might change our priority at the same moment. + * We don't have to worry. Conceptually one call occurs first + * and we have a single winner.
+ */ + /* clamp the increment to [-40, 40] */ if (increment < -40) + increment = -40; + if (increment > 40) + increment = 40; + + nice = PRIO_TO_NICE(current->static_prio) + increment; + /* clamp the resulting nice value to [-20, 19] */ if (nice < -20) + nice = -20; + if (nice > 19) + nice = 19; + + /* only a negative increment is permission-checked */ if (increment < 0 && !can_nice(current, nice)) + return -EPERM; + + retval = security_task_setnice(current, nice); + if (retval) + return retval; + + set_user_nice(current, nice); + return 0; +} + +#endif + +/** + * task_prio - return the priority value of a given task. + * @p: the task in question. + * + * This is the priority value as seen by users in /proc. + * RT tasks are offset by -200. Normal tasks are centered + * around 0, value goes from -16 to +15. + * + * Computed as p->prio - MAX_RT_PRIO. + */ +int task_prio(const task_t *p) +{ + return p->prio - MAX_RT_PRIO; +} + +/** + * task_nice - return the nice value of a given task. + * @p: the task in question. + */ +int task_nice(const task_t *p) +{ + return TASK_NICE(p); +} +EXPORT_SYMBOL_GPL(task_nice); + +/** + * idle_cpu - is a given cpu idle currently? + * @cpu: the processor in question. + * + * True when the cpu's current task is that runqueue's idle task. + */ +int idle_cpu(int cpu) +{ + return cpu_curr(cpu) == cpu_rq(cpu)->idle; +} + +/** + * idle_task - return the idle task for a given cpu. + * @cpu: the processor in question. + */ +task_t *idle_task(int cpu) +{ + return cpu_rq(cpu)->idle; +} + +/** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. + * + * A @pid of 0 selects the current task; any other value is looked up + * with find_task_by_pid(). + */ +static inline task_t *find_process_by_pid(pid_t pid) +{ + return pid ? find_task_by_pid(pid) : current; +} + +/* Actually do priority change: must hold rq lock.
*/ +static void __setscheduler(struct task_struct *p, int policy, int prio) +{ + /* the task must not be on a priority array at this point */ BUG_ON(p->array); + p->policy = policy; + p->rt_priority = prio; + if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { + /* a larger rt_priority yields a numerically smaller ->prio */ p->prio = MAX_RT_PRIO-1 - p->rt_priority; + } else { + p->prio = p->static_prio; + /* + * SCHED_BATCH tasks are treated as perpetual CPU hogs: + */ + if (policy == SCHED_BATCH) + p->sleep_avg = 0; + } +} + +/** + * sched_setscheduler - change the scheduling policy and/or RT priority of + * a thread. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * A negative @policy keeps the task's current policy. + * Returns 0 on success, -EINVAL for an invalid policy/priority pair, + * -EPERM if the permission checks fail, or the error returned by + * security_task_setscheduler(). + */ +int sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param) +{ + int retval; + int oldprio, oldpolicy = -1; + prio_array_t *array; + unsigned long flags; + runqueue_t *rq; + +recheck: + /* double check policy once rq lock held */ + if (policy < 0) + policy = oldpolicy = p->policy; + else if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_NORMAL && policy != SCHED_BATCH) + return -EINVAL; + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and + * SCHED_BATCH is 0.
+ */ + /* tasks without an mm are allowed priorities up to MAX_RT_PRIO-1 */ if (param->sched_priority < 0 || + (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || + (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) + return -EINVAL; + /* priority 0 if and only if the policy is SCHED_NORMAL or SCHED_BATCH */ if ((policy == SCHED_NORMAL || policy == SCHED_BATCH) + != (param->sched_priority == 0)) + return -EINVAL; + + /* + * Allow unprivileged RT tasks to decrease priority: + */ + if (!capable(CAP_SYS_NICE)) { + /* + * can't change policy, except between SCHED_NORMAL + * and SCHED_BATCH (this check only rejects when the + * RLIMIT_RTPRIO soft limit is zero): + */ + if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) && + (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) && + !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) + return -EPERM; + /* can't raise the priority above both the current rt_priority and RLIMIT_RTPRIO */ + if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) && + param->sched_priority > p->rt_priority && + param->sched_priority > + p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) + return -EPERM; + /* can't change other user's priorities */ + if ((current->euid != p->euid) && + (current->euid != p->uid)) + return -EPERM; + } + + retval = security_task_setscheduler(p, policy, param); + if (retval) + return retval; + /* + * To be able to change p->policy safely, the apropriate + * runqueue lock must be held.
+ */ + rq = task_rq_lock(p, &flags); + /* + * recheck policy now with rq lock held: if it changed since it was + * sampled, drop the lock and redo the checks with the policy re-read + */ + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { + policy = oldpolicy = -1; + task_rq_unlock(rq, &flags); + goto recheck; + } + array = p->array; + /* a queued task is taken off the runqueue while its priority changes */ if (array) + deactivate_task(p, rq); + oldprio = p->prio; + __setscheduler(p, policy, param->sched_priority); + if (array) { + __activate_task(p, rq); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } + task_rq_unlock(rq, &flags); + return 0; +} +EXPORT_SYMBOL_GPL(sched_setscheduler); + /* + * do_sched_setscheduler - copy @param in from user space, look up @pid + * under tasklist_lock and call sched_setscheduler() on the task. + * + * Returns -EINVAL for a NULL @param or negative @pid, -EFAULT if the copy + * fails, -ESRCH if no task matches, else the sched_setscheduler() result. + */ +static int +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +{ + int retval; + struct sched_param lparam; + struct task_struct *p; + + if (!param || pid < 0) + return -EINVAL; + if (copy_from_user(&lparam, param, sizeof(struct sched_param))) + return -EFAULT; + read_lock_irq(&tasklist_lock); + p = find_process_by_pid(pid); + if (!p) { + read_unlock_irq(&tasklist_lock); + return -ESRCH; + } + retval = sched_setscheduler(p, policy, &lparam); + read_unlock_irq(&tasklist_lock); + return retval; +} + +/** + * sys_sched_setscheduler - set/change the scheduler policy and RT priority + * @pid: the pid in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + */ +asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, + struct sched_param __user *param) +{ + /* negative values for policy are not valid */ + if (policy < 0) + return -EINVAL; + + return do_sched_setscheduler(pid, policy, param); +} + +/** + * sys_sched_setparam - set/change the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the new RT priority.
+ */ +asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) +{ + /* a policy of -1 is passed down together with the new parameters */ return do_sched_setscheduler(pid, -1, param); +} + +/** + * sys_sched_getscheduler - get the policy (scheduling class) of a thread + * @pid: the pid in question. + * + * Returns the task's policy, -EINVAL for a negative @pid, -ESRCH if no + * task matches, or the error from security_task_getscheduler(). + */ +asmlinkage long sys_sched_getscheduler(pid_t pid) +{ + int retval = -EINVAL; + task_t *p; + + if (pid < 0) + goto out_nounlock; + + retval = -ESRCH; + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + if (p) { + retval = security_task_getscheduler(p); + if (!retval) + retval = p->policy; + } + read_unlock(&tasklist_lock); + +out_nounlock: + return retval; +} + +/** + * sys_sched_getparam - get the RT priority of a thread + * @pid: the pid in question. + * @param: user-space structure that receives the RT priority. + * + * Returns 0 on success, -EINVAL for a NULL @param or negative @pid, + * -ESRCH if no task matches, -EFAULT if the copy to user space fails, + * or the error from security_task_getscheduler(). + */ +asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) +{ + struct sched_param lp; + int retval = -EINVAL; + task_t *p; + + if (!param || pid < 0) + goto out_nounlock; + + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + retval = -ESRCH; + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + lp.sched_priority = p->rt_priority; + read_unlock(&tasklist_lock); + + /* + * This one might sleep, we cannot do it with a spinlock held ... + */ + retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; + +out_nounlock: + return retval; + +out_unlock: + read_unlock(&tasklist_lock); + return retval; +} + /* + * sched_setaffinity - set the allowed CPUs of task @pid to @new_mask + * restricted to what cpuset_cpus_allowed() permits for that task. + * + * Returns -ESRCH if no task matches, -EPERM if the caller's euid matches + * neither the task's euid nor uid and it lacks CAP_SYS_NICE, else the + * result of set_cpus_allowed(). + */ +long sched_setaffinity(pid_t pid, cpumask_t new_mask) +{ + task_t *p; + int retval; + cpumask_t cpus_allowed; + + lock_cpu_hotplug(); + read_lock(&tasklist_lock); + + p = find_process_by_pid(pid); + if (!p) { + read_unlock(&tasklist_lock); + unlock_cpu_hotplug(); + return -ESRCH; + } + + /* + * It is not safe to call set_cpus_allowed with the + * tasklist_lock held. We will bump the task_struct's + * usage count and then drop tasklist_lock.
+ */ + get_task_struct(p); + read_unlock(&tasklist_lock); + + retval = -EPERM; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + + /* only CPUs that the task's cpuset allows are kept in the request */ cpus_allowed = cpuset_cpus_allowed(p); + cpus_and(new_mask, new_mask, cpus_allowed); + retval = set_cpus_allowed(p, new_mask); + +out_unlock: + /* drops the reference taken with get_task_struct() above */ put_task_struct(p); + unlock_cpu_hotplug(); + return retval; +} + /* + * get_user_cpu_mask - copy a cpu mask of @len bytes from user space. + * + * A user buffer shorter than cpumask_t is zero-extended (the kernel mask + * is cleared first); a longer one is truncated to sizeof(cpumask_t). + * Returns 0 on success or -EFAULT if the copy fails. + */ +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, + cpumask_t *new_mask) +{ + if (len < sizeof(cpumask_t)) { + memset(new_mask, 0, sizeof(cpumask_t)); + } else if (len > sizeof(cpumask_t)) { + len = sizeof(cpumask_t); + } + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; +} + +/** + * sys_sched_setaffinity - set the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to the new cpu mask + * + * Returns the get_user_cpu_mask() error if the mask cannot be read, + * otherwise the result of sched_setaffinity(). + */ +asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, + unsigned long __user *user_mask_ptr) +{ + cpumask_t new_mask; + int retval; + + retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); + if (retval) + return retval; + + return sched_setaffinity(pid, new_mask); +} + +/* + * Represents all cpu's present in the system + * In systems capable of hotplug, this map could dynamically grow + * as new cpu's are detected in the system via any platform specific + * method, such as ACPI for e.g.
+ */ + +cpumask_t cpu_present_map __read_mostly; +EXPORT_SYMBOL(cpu_present_map); + /* Without CONFIG_SMP the online and possible maps are defined here as CPU_MASK_ALL. */ +#ifndef CONFIG_SMP +cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; +cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; +#endif + /* + * sched_getaffinity - store in *@mask the allowed CPUs of task @pid, + * restricted to cpu_online_map. + * + * Returns 0 on success or -ESRCH if no task matches (in which case + * *@mask is not written). + */ +long sched_getaffinity(pid_t pid, cpumask_t *mask) +{ + int retval; + task_t *p; + + lock_cpu_hotplug(); + read_lock(&tasklist_lock); + + retval = -ESRCH; + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = 0; + cpus_and(*mask, p->cpus_allowed, cpu_online_map); + +out_unlock: + read_unlock(&tasklist_lock); + unlock_cpu_hotplug(); + if (retval) + return retval; + + return 0; +} + +/** + * sys_sched_getaffinity - get the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current cpu mask + * + * Returns sizeof(cpumask_t) on success, -EINVAL if @len is smaller than + * that, -EFAULT if the copy to user space fails, or the error from + * sched_getaffinity(). + */ +asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, + unsigned long __user *user_mask_ptr) +{ + int ret; + cpumask_t mask; + + if (len < sizeof(cpumask_t)) + return -EINVAL; + + ret = sched_getaffinity(pid, &mask); + if (ret < 0) + return ret; + + if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) + return -EFAULT; + + return sizeof(cpumask_t); +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * this function yields the current CPU by moving the calling thread + * to the expired array. If there are no other threads running on this + * CPU then this function will return. + */ +asmlinkage long sys_sched_yield(void) +{ + runqueue_t *rq = this_rq_lock(); + prio_array_t *array = current->array; + prio_array_t *target = rq->expired; + + schedstat_inc(rq, yld_cnt); + /* + * We implement yielding by moving the task into the expired + * queue. + * + * (special rule: RT tasks will just roundrobin in the active + * array.)
+ */ + if (rt_task(current)) + target = rq->active; + + /* schedstat bookkeeping only; it does not influence where the task goes */ if (array->nr_active == 1) { + schedstat_inc(rq, yld_act_empty); + if (!rq->expired->nr_active) + schedstat_inc(rq, yld_both_empty); + } else if (!rq->expired->nr_active) + schedstat_inc(rq, yld_exp_empty); + + if (array != target) { + dequeue_task(current, array); + enqueue_task(current, target); + } else + /* + * requeue_task is cheaper so perform that if possible. + */ + requeue_task(current, array); + + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: + */ + __release(rq->lock); + _raw_spin_unlock(&rq->lock); + preempt_enable_no_resched(); + + schedule(); + + return 0; +} + /* + * __cond_resched - call schedule() with PREEMPT_ACTIVE added to the + * preempt count, repeating while need_resched() is true. Returns + * without scheduling when preempt_count() is already non-zero. + */ +static inline void __cond_resched(void) +{ + /* + * The BKS might be reacquired before we have dropped + * PREEMPT_ACTIVE, which could trigger a second + * cond_resched() call. + */ + if (unlikely(preempt_count())) + return; + do { + add_preempt_count(PREEMPT_ACTIVE); + schedule(); + sub_preempt_count(PREEMPT_ACTIVE); + } while (need_resched()); +} + /* + * cond_resched - call __cond_resched() if need_resched() is set. + * Returns 1 in that case and 0 otherwise. + */ +int __sched cond_resched(void) +{ + if (need_resched()) { + __cond_resched(); + return 1; + } + return 0; +} + +EXPORT_SYMBOL(cond_resched); + +/* + * cond_resched_lock() - if a reschedule is pending, drop the given lock, + * call schedule, and on return reacquire the lock. + * + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * operations here to prevent schedule() from being called twice (once via + * spin_unlock(), once by hand).
+ */ +int cond_resched_lock(spinlock_t *lock) +{ + int ret = 0; + + /* lock break: release the lock briefly, then take it again */ if (need_lockbreak(lock)) { + spin_unlock(lock); + cpu_relax(); + ret = 1; + spin_lock(lock); + } + if (need_resched()) { + _raw_spin_unlock(lock); + preempt_enable_no_resched(); + __cond_resched(); + ret = 1; + spin_lock(lock); + } + /* 1 if the lock was dropped on either path above, 0 otherwise */ return ret; +} + +EXPORT_SYMBOL(cond_resched_lock); + /* + * cond_resched_softirq - must be called from softirq context (BUG otherwise). + * If need_resched() is set, calls __cond_resched() between + * __local_bh_enable() and local_bh_disable() and returns 1; else returns 0. + */ +int __sched cond_resched_softirq(void) +{ + BUG_ON(!in_softirq()); + + if (need_resched()) { + __local_bh_enable(); + __cond_resched(); + local_bh_disable(); + return 1; + } + return 0; +} + +EXPORT_SYMBOL(cond_resched_softirq); + + +/** + * yield - yield the current processor to other threads. + * + * this is a shortcut for kernel-space yielding - it marks the + * thread runnable and calls sys_sched_yield(). + */ +void __sched yield(void) +{ + set_current_state(TASK_RUNNING); + sys_sched_yield(); +} + +EXPORT_SYMBOL(yield); + +/* + * This task is about to go to sleep on IO. Increment rq->nr_iowait so + * that process accounting knows that this is a task in IO wait state. + * + * But don't do that if it is a deliberate, throttling IO wait (this task + * has set its backing_dev_info: the queue against which it should throttle) + * + * NOTE(review): the function below increments nr_iowait unconditionally; + * no backing_dev_info check is visible here - confirm whether the + * paragraph above is still accurate. + */ +void __sched io_schedule(void) +{ + struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); + + atomic_inc(&rq->nr_iowait); + schedule(); + /* decremented on the runqueue that was sampled before sleeping */ atomic_dec(&rq->nr_iowait); +} + +EXPORT_SYMBOL(io_schedule); + /* + * io_schedule_timeout - as io_schedule(), but sleeps via + * schedule_timeout(@timeout) and returns its result. + */ +long __sched io_schedule_timeout(long timeout) +{ + struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); + long ret; + + atomic_inc(&rq->nr_iowait); + ret = schedule_timeout(timeout); + atomic_dec(&rq->nr_iowait); + return ret; +} + +/** + * sys_sched_get_priority_max - return maximum RT priority. + * @policy: scheduling class. + * + * this syscall returns the maximum rt_priority that can be used + * by a given scheduling class.
+ */ +asmlinkage long sys_sched_get_priority_max(int policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = MAX_USER_RT_PRIO-1; + break; + case SCHED_NORMAL: + case SCHED_BATCH: + ret = 0; + break; + } + return ret; +} + +/** + * sys_sched_get_priority_min - return minimum RT priority. + * @policy: scheduling class. + * + * this syscall returns the minimum rt_priority that can be used + * by a given scheduling class. + */ +asmlinkage long sys_sched_get_priority_min(int policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = 1; + break; + case SCHED_NORMAL: + case SCHED_BATCH: + ret = 0; + } + return ret; +} + +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + */ +asmlinkage +long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) +{ + int retval = -EINVAL; + struct timespec t; + task_t *p; + + if (pid < 0) + goto out_nounlock; + + retval = -ESRCH; + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + jiffies_to_timespec(p->policy & SCHED_FIFO ? + 0 : task_timeslice(p), &t); + read_unlock(&tasklist_lock); + retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; +out_nounlock: + return retval; +out_unlock: + read_unlock(&tasklist_lock); + return retval; +} + /* eldest_child - first entry on p->children, or NULL if that list is empty */ +static inline struct task_struct *eldest_child(struct task_struct *p) +{ + if (list_empty(&p->children)) return NULL; + return list_entry(p->children.next,struct task_struct,sibling); +} + /* older_sibling - previous entry on the parent's children list, or NULL at the list head */ +static inline struct task_struct *older_sibling(struct task_struct *p) +{ + if (p->sibling.prev==&p->parent->children) return NULL; + return list_entry(p->sibling.prev,struct task_struct,sibling); +} + /* younger_sibling - next entry on the parent's children list, or NULL at the list head */ +static inline struct task_struct *younger_sibling(struct task_struct *p) +{ + if (p->sibling.next==&p->parent->children) return NULL; + return list_entry(p->sibling.next,struct task_struct,sibling); +} + /* + * show_task - print one line describing @p: name, state letter, saved pc + * (or "running"), free stack, pid, parent pid and the pids of its eldest + * child and neighbouring siblings; then dump its stack unless it is + * running. + */ +static void show_task(task_t *p) +{ + task_t *relative; + unsigned state; + /* stays 0 unless CONFIG_DEBUG_STACK_USAGE is set */ unsigned long free = 0; + static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; + + printk("%-13.13s ", p->comm); + /* index into stat_nam: 0 for state 0, else lowest set state bit + 1 */ state = p->state ? __ffs(p->state) + 1 : 0; + if (state < ARRAY_SIZE(stat_nam)) + printk(stat_nam[state]); + else + printk("?"); +#if (BITS_PER_LONG == 32) + if (state == TASK_RUNNING) + printk(" running "); + else + printk(" %08lX ", thread_saved_pc(p)); +#else + if (state == TASK_RUNNING) + printk(" running task "); + else + printk(" %016lx ", thread_saved_pc(p)); +#endif +#ifdef CONFIG_DEBUG_STACK_USAGE + { + /* count the bytes from end_of_stack() up to the first non-zero word */ unsigned long *n = end_of_stack(p); + while (!*n) + n++; + free = (unsigned long)n - (unsigned long)end_of_stack(p); + } +#endif + printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); + if ((relative = eldest_child(p))) + printk("%5d ", relative->pid); + else + printk(" "); + if ((relative = younger_sibling(p))) + printk("%7d", relative->pid); + else + printk(" "); + if ((relative = older_sibling(p))) + printk(" %5d", relative->pid); + else + printk(" "); + /* tasks without an mm are tagged L-TLB, all others NOTLB */ if (!p->mm) + printk(" (L-TLB)\n"); + else + printk(" (NOTLB)\n"); + + if (state != TASK_RUNNING) + show_stack(p, NULL); +} + +void show_state(void) +{ + task_t *g, *p; + +#if (BITS_PER_LONG == 32) +
printk("\n" + " sibling\n"); + printk(" task PC pid father child younger older\n"); +#else + printk("\n" + " sibling\n"); + printk(" task PC pid father child younger older\n"); +#endif + read_lock(&tasklist_lock); + do_each_thread(g, p) { + /* + * reset the NMI-timeout, listing all tasks on a slow + * console might take a lot of time: + */ + touch_nmi_watchdog(); + show_task(p); + } while_each_thread(g, p); + + read_unlock(&tasklist_lock); + mutex_debug_show_all_locks(); +} + +/** + * init_idle - set up an idle thread for a given CPU + * @idle: task in question + * @cpu: cpu the idle task belongs to + * + * NOTE: this function does not set the idle thread's NEED_RESCHED + * flag, to make booting more robust. + */ +void __devinit init_idle(task_t *idle, int cpu) +{ + runqueue_t *rq = cpu_rq(cpu); + unsigned long flags; + + /* the idle task sits on no priority array and is pinned to @cpu */ idle->sleep_avg = 0; + idle->array = NULL; + idle->prio = MAX_PRIO; + idle->state = TASK_RUNNING; + idle->cpus_allowed = cpumask_of_cpu(cpu); + set_task_cpu(idle, cpu); + + spin_lock_irqsave(&rq->lock, flags); + /* it becomes both the runqueue's idle task and its current task */ rq->curr = rq->idle = idle; +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + idle->oncpu = 1; +#endif + spin_unlock_irqrestore(&rq->lock, flags); + + /* Set the preempt count _outside_ the spinlocks! */ +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) + task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); +#else + task_thread_info(idle)->preempt_count = 0; +#endif +} + +/* + * In a system that switches off the HZ timer nohz_cpu_mask + * indicates which cpus entered this state. This is used + * in the rcu update to wait only for active cpus. For systems + * which do not switch off the HZ timer nohz_cpu_mask should + * always be CPU_MASK_NONE. + */ +cpumask_t nohz_cpu_mask = CPU_MASK_NONE; + +#ifdef CONFIG_SMP +/* + * This is how migration works: + * + * 1) we queue a migration_req_t structure in the source CPU's + * runqueue and wake up that CPU's migration thread.
+ * 2) we down() the locked semaphore => thread blocks. + * 3) migration thread wakes up (implicitly it forces the migrated + * thread off the CPU) + * 4) it gets the migration request and checks whether the migrated + * task is still in the wrong runqueue. + * 5) if it's in the wrong runqueue then the migration thread removes + * it and puts it into the right queue. + * 6) migration thread up()s the semaphore. + * 7) we wake up and the migration is done. + */ + +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + * + * Returns 0 on success, or -EINVAL (leaving ->cpus_allowed untouched) + * if @new_mask contains no online CPU. When a migration is needed the + * caller sleeps in wait_for_completion() on the request's ->done. + */ +int set_cpus_allowed(task_t *p, cpumask_t new_mask) +{ + unsigned long flags; + int ret = 0; + migration_req_t req; + runqueue_t *rq; + + rq = task_rq_lock(p, &flags); + if (!cpus_intersects(new_mask, cpu_online_map)) { + ret = -EINVAL; + goto out; + } + + p->cpus_allowed = new_mask; + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpu_isset(task_cpu(p), new_mask)) + goto out; + + if (migrate_task(p, any_online_cpu(new_mask), &req)) { + /* Need help from migration thread: drop lock and wait. */ + task_rq_unlock(rq, &flags); + wake_up_process(rq->migration_thread); + wait_for_completion(&req.done); + tlb_migrate_finish(p->mm); + return 0; + } +out: + task_rq_unlock(rq, &flags); + return ret; +} + +EXPORT_SYMBOL_GPL(set_cpus_allowed); + +/* + * Move (not current) task off this cpu, onto dest cpu. We're doing + * this because either it can't run here any more (set_cpus_allowed() + * away from this CPU, or CPU going down), or because we're + * attempting to rebalance this task on exec (sched_exec). + * + * So we race with normal scheduler movements, but that's OK, as long + * as the task is no longer on this CPU.
+ */ +static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) +{ + runqueue_t *rq_dest, *rq_src; + + if (unlikely(cpu_is_offline(dest_cpu))) + return; + + rq_src = cpu_rq(src_cpu); + rq_dest = cpu_rq(dest_cpu); + + double_rq_lock(rq_src, rq_dest); + /* Already moved. */ + if (task_cpu(p) != src_cpu) + goto out; + /* Affinity changed (again). */ + if (!cpu_isset(dest_cpu, p->cpus_allowed)) + goto out; + + set_task_cpu(p, dest_cpu); + if (p->array) { + /* + * Sync timestamp with rq_dest's before activating. + * The same thing could be achieved by doing this step + * afterwards, and pretending it was a local activate. + * This way is cleaner and logically correct. + */ + p->timestamp = p->timestamp - rq_src->timestamp_last_tick + + rq_dest->timestamp_last_tick; + deactivate_task(p, rq_src); + activate_task(p, rq_dest, 0); + if (TASK_PREEMPTS_CURR(p, rq_dest)) + resched_task(rq_dest->curr); + } + +out: + double_rq_unlock(rq_src, rq_dest); +} + +/* + * migration_thread - this is a highprio system thread that performs + * thread migration by bumping thread off CPU then 'pushing' onto + * another runqueue. 
+ */ +static int migration_thread(void *data) +{ + runqueue_t *rq; + int cpu = (long)data; + + rq = cpu_rq(cpu); + BUG_ON(rq->migration_thread != current); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + struct list_head *head; + migration_req_t *req; + + try_to_freeze(); + + spin_lock_irq(&rq->lock); + + if (cpu_is_offline(cpu)) { + spin_unlock_irq(&rq->lock); + goto wait_to_die; + } + + if (rq->active_balance) { + active_load_balance(rq, cpu); + rq->active_balance = 0; + } + + head = &rq->migration_queue; + + if (list_empty(head)) { + spin_unlock_irq(&rq->lock); + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + continue; + } + req = list_entry(head->next, migration_req_t, list); + list_del_init(head->next); + + spin_unlock(&rq->lock); + __migrate_task(req->task, cpu, req->dest_cpu); + local_irq_enable(); + + complete(&req->done); + } + __set_current_state(TASK_RUNNING); + return 0; + +wait_to_die: + /* Wait for kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +/* Figure out where task on dead CPU should go, use force if neccessary. */ +static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) +{ + int dest_cpu; + cpumask_t mask; + + /* On same node? */ + mask = node_to_cpumask(cpu_to_node(dead_cpu)); + cpus_and(mask, mask, tsk->cpus_allowed); + dest_cpu = any_online_cpu(mask); + + /* On any allowed CPU? */ + if (dest_cpu == NR_CPUS) + dest_cpu = any_online_cpu(tsk->cpus_allowed); + + /* No more Mr. Nice Guy. */ + if (dest_cpu == NR_CPUS) { + cpus_setall(tsk->cpus_allowed); + dest_cpu = any_online_cpu(tsk->cpus_allowed); + + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. 
+ */ + if (tsk->mm && printk_ratelimit()) + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + tsk->pid, tsk->comm, dead_cpu); + } + __migrate_task(tsk, dead_cpu, dest_cpu); +} + +/* + * While a dead CPU has no uninterruptible tasks queued at this point, + * it might still have a nonzero ->nr_uninterruptible counter, because + * for performance reasons the counter is not stricly tracking tasks to + * their home CPUs. So we just add the counter to another CPU's counter, + * to keep the global sum constant after CPU-down: + */ +static void migrate_nr_uninterruptible(runqueue_t *rq_src) +{ + runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); + unsigned long flags; + + local_irq_save(flags); + double_rq_lock(rq_src, rq_dest); + rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; + rq_src->nr_uninterruptible = 0; + double_rq_unlock(rq_src, rq_dest); + local_irq_restore(flags); +} + +/* Run through task list and migrate tasks from the dead cpu. */ +static void migrate_live_tasks(int src_cpu) +{ + struct task_struct *tsk, *t; + + write_lock_irq(&tasklist_lock); + + do_each_thread(t, tsk) { + if (tsk == current) + continue; + + if (task_cpu(tsk) == src_cpu) + move_task_off_dead_cpu(src_cpu, tsk); + } while_each_thread(t, tsk); + + write_unlock_irq(&tasklist_lock); +} + +/* Schedules idle task to be the next runnable task on current CPU. + * It does so by boosting its priority to highest possible and adding it to + * the _front_ of runqueue. Used by CPU offline code. + */ +void sched_idle_next(void) +{ + int cpu = smp_processor_id(); + runqueue_t *rq = this_rq(); + struct task_struct *p = rq->idle; + unsigned long flags; + + /* cpu has to be offline */ + BUG_ON(cpu_online(cpu)); + + /* Strictly not necessary since rest of the CPUs are stopped by now + * and interrupts disabled on current cpu. 
+ */ + spin_lock_irqsave(&rq->lock, flags); + + __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); + /* Add idle task to _front_ of it's priority queue */ + __activate_idle_task(p, rq); + + spin_unlock_irqrestore(&rq->lock, flags); +} + +/* Ensures that the idle task is using init_mm right before its cpu goes + * offline. + */ +void idle_task_exit(void) +{ + struct mm_struct *mm = current->active_mm; + + BUG_ON(cpu_online(smp_processor_id())); + + if (mm != &init_mm) + switch_mm(mm, &init_mm, current); + mmdrop(mm); +} + +static void migrate_dead(unsigned int dead_cpu, task_t *tsk) +{ + struct runqueue *rq = cpu_rq(dead_cpu); + + /* Must be exiting, otherwise would be on tasklist. */ + BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); + + /* Cannot have done final schedule yet: would have vanished. */ + BUG_ON(tsk->flags & PF_DEAD); + + get_task_struct(tsk); + + /* + * Drop lock around migration; if someone else moves it, + * that's OK. No task can be added to this CPU, so iteration is + * fine. + */ + spin_unlock_irq(&rq->lock); + move_task_off_dead_cpu(dead_cpu, tsk); + spin_lock_irq(&rq->lock); + + put_task_struct(tsk); +} + +/* release_task() removes task from tasklist, so we won't find dead tasks. */ +static void migrate_dead_tasks(unsigned int dead_cpu) +{ + unsigned arr, i; + struct runqueue *rq = cpu_rq(dead_cpu); + + for (arr = 0; arr < 2; arr++) { + for (i = 0; i < MAX_PRIO; i++) { + struct list_head *list = &rq->arrays[arr].queue[i]; + while (!list_empty(list)) + migrate_dead(dead_cpu, + list_entry(list->next, task_t, + run_list)); + } + } +} +#endif /* CONFIG_HOTPLUG_CPU */ + +/* + * migration_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary migration thread for the new CPU. 
+ */ +static int migration_call(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + struct task_struct *p; + struct runqueue *rq; + unsigned long flags; + + switch (action) { + case CPU_UP_PREPARE: + p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + p->flags |= PF_NOFREEZE; + kthread_bind(p, cpu); + /* Must be high prio: stop_machine expects to yield to it. */ + rq = task_rq_lock(p, &flags); + __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); + task_rq_unlock(rq, &flags); + cpu_rq(cpu)->migration_thread = p; + break; + case CPU_ONLINE: + /* Strictly unneccessary, as first user will wake it. */ + wake_up_process(cpu_rq(cpu)->migration_thread); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind it from offline cpu so it can run. Fall thru. */ + kthread_bind(cpu_rq(cpu)->migration_thread, + any_online_cpu(cpu_online_map)); + kthread_stop(cpu_rq(cpu)->migration_thread); + cpu_rq(cpu)->migration_thread = NULL; + break; + case CPU_DEAD: + migrate_live_tasks(cpu); + rq = cpu_rq(cpu); + kthread_stop(rq->migration_thread); + rq->migration_thread = NULL; + /* Idle task back to normal (off runqueue, low prio) */ + rq = task_rq_lock(rq->idle, &flags); + deactivate_task(rq->idle, rq); + rq->idle->static_prio = MAX_PRIO; + __setscheduler(rq->idle, SCHED_NORMAL, 0); + migrate_dead_tasks(cpu); + task_rq_unlock(rq, &flags); + migrate_nr_uninterruptible(rq); + BUG_ON(rq->nr_running != 0); + + /* No need to migrate the tasks: it was best-effort if + * they didn't do lock_cpu_hotplug(). Just wake up + * the requestors. 
*/ + spin_lock_irq(&rq->lock); + while (!list_empty(&rq->migration_queue)) { + migration_req_t *req; + req = list_entry(rq->migration_queue.next, + migration_req_t, list); + list_del_init(&req->list); + complete(&req->done); + } + spin_unlock_irq(&rq->lock); + break; +#endif + } + return NOTIFY_OK; +} + +/* Register at highest priority so that task migration (migrate_all_tasks) + * happens before everything else. + */ +static struct notifier_block __devinitdata migration_notifier = { + .notifier_call = migration_call, + .priority = 10 +}; + +int __init migration_init(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + /* Start one for boot CPU. */ + migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); + migration_call(&migration_notifier, CPU_ONLINE, cpu); + register_cpu_notifier(&migration_notifier); + return 0; +} +#endif + +#ifdef CONFIG_SMP +#undef SCHED_DOMAIN_DEBUG +#ifdef SCHED_DOMAIN_DEBUG +static void sched_domain_debug(struct sched_domain *sd, int cpu) +{ + int level = 0; + + if (!sd) { + printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); + return; + } + + printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + + do { + int i; + char str[NR_CPUS]; + struct sched_group *group = sd->groups; + cpumask_t groupmask; + + cpumask_scnprintf(str, NR_CPUS, sd->span); + cpus_clear(groupmask); + + printk(KERN_DEBUG); + for (i = 0; i < level + 1; i++) + printk(" "); + printk("domain %d: ", level); + + if (!(sd->flags & SD_LOAD_BALANCE)) { + printk("does not load-balance\n"); + if (sd->parent) + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); + break; + } + + printk("span %s\n", str); + + if (!cpu_isset(cpu, sd->span)) + printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); + if (!cpu_isset(cpu, group->cpumask)) + printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); + + printk(KERN_DEBUG); + for (i = 0; i < level + 2; i++) + printk(" "); + printk("groups:"); + do { + if (!group) { + 
printk("\n"); + printk(KERN_ERR "ERROR: group is NULL\n"); + break; + } + + if (!group->cpu_power) { + printk("\n"); + printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); + } + + if (!cpus_weight(group->cpumask)) { + printk("\n"); + printk(KERN_ERR "ERROR: empty group\n"); + } + + if (cpus_intersects(groupmask, group->cpumask)) { + printk("\n"); + printk(KERN_ERR "ERROR: repeated CPUs\n"); + } + + cpus_or(groupmask, groupmask, group->cpumask); + + cpumask_scnprintf(str, NR_CPUS, group->cpumask); + printk(" %s", str); + + group = group->next; + } while (group != sd->groups); + printk("\n"); + + if (!cpus_equal(sd->span, groupmask)) + printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + + level++; + sd = sd->parent; + + if (sd) { + if (!cpus_subset(groupmask, sd->span)) + printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); + } + + } while (sd); +} +#else +#define sched_domain_debug(sd, cpu) {} +#endif + +static int sd_degenerate(struct sched_domain *sd) +{ + if (cpus_weight(sd->span) == 1) + return 1; + + /* Following flags need at least 2 groups */ + if (sd->flags & (SD_LOAD_BALANCE | + SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC)) { + if (sd->groups != sd->groups->next) + return 0; + } + + /* Following flags don't use groups */ + if (sd->flags & (SD_WAKE_IDLE | + SD_WAKE_AFFINE | + SD_WAKE_BALANCE)) + return 0; + + return 1; +} + +static int sd_parent_degenerate(struct sched_domain *sd, + struct sched_domain *parent) +{ + unsigned long cflags = sd->flags, pflags = parent->flags; + + if (sd_degenerate(parent)) + return 1; + + if (!cpus_equal(sd->span, parent->span)) + return 0; + + /* Does parent contain flags not in child? 
*/ + /* WAKE_BALANCE is a subset of WAKE_AFFINE */ + if (cflags & SD_WAKE_AFFINE) + pflags &= ~SD_WAKE_BALANCE; + /* Flags needing groups don't count if only 1 group in parent */ + if (parent->groups == parent->groups->next) { + pflags &= ~(SD_LOAD_BALANCE | + SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC); + } + if (~cflags & pflags) + return 0; + + return 1; +} + +/* + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must + * hold the hotplug lock. + */ +static void cpu_attach_domain(struct sched_domain *sd, int cpu) +{ + runqueue_t *rq = cpu_rq(cpu); + struct sched_domain *tmp; + + /* Remove the sched domains which do not contribute to scheduling. */ + for (tmp = sd; tmp; tmp = tmp->parent) { + struct sched_domain *parent = tmp->parent; + if (!parent) + break; + if (sd_parent_degenerate(tmp, parent)) + tmp->parent = parent->parent; + } + + if (sd && sd_degenerate(sd)) + sd = sd->parent; + + sched_domain_debug(sd, cpu); + + rcu_assign_pointer(rq->sd, sd); +} + +/* cpus with isolated domains */ +static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; + +/* Setup the mask of cpus configured for isolated domains */ +static int __init isolated_cpu_setup(char *str) +{ + int ints[NR_CPUS], i; + + str = get_options(str, ARRAY_SIZE(ints), ints); + cpus_clear(cpu_isolated_map); + for (i = 1; i <= ints[0]; i++) + if (ints[i] < NR_CPUS) + cpu_set(ints[i], cpu_isolated_map); + return 1; +} + +__setup ("isolcpus=", isolated_cpu_setup); + +/* + * init_sched_build_groups takes an array of groups, the cpumask we wish + * to span, and a pointer to a function which identifies what group a CPU + * belongs to. The return value of group_fn must be a valid index into the + * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we + * keep track of groups covered with a cpumask_t). 
+ * + * init_sched_build_groups will build a circular linked list of the groups + * covered by the given span, and will set each group's ->cpumask correctly, + * and ->cpu_power to 0. + */ +static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, + int (*group_fn)(int cpu)) +{ + struct sched_group *first = NULL, *last = NULL; + cpumask_t covered = CPU_MASK_NONE; + int i; + + for_each_cpu_mask(i, span) { + int group = group_fn(i); + struct sched_group *sg = &groups[group]; + int j; + + if (cpu_isset(i, covered)) + continue; + + sg->cpumask = CPU_MASK_NONE; + sg->cpu_power = 0; + + for_each_cpu_mask(j, span) { + if (group_fn(j) != group) + continue; + + cpu_set(j, covered); + cpu_set(j, sg->cpumask); + } + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + } + last->next = first; +} + +#define SD_NODES_PER_DOMAIN 16 + +/* + * Self-tuning task migration cost measurement between source and target CPUs. + * + * This is done by measuring the cost of manipulating buffers of varying + * sizes. For a given buffer-size here are the steps that are taken: + * + * 1) the source CPU reads+dirties a shared buffer + * 2) the target CPU reads+dirties the same shared buffer + * + * We measure how long they take, in the following 4 scenarios: + * + * - source: CPU1, target: CPU2 | cost1 + * - source: CPU2, target: CPU1 | cost2 + * - source: CPU1, target: CPU1 | cost3 + * - source: CPU2, target: CPU2 | cost4 + * + * We then calculate the cost3+cost4-cost1-cost2 difference - this is + * the cost of migration. + * + * We then start off from a small buffer-size and iterate up to larger + * buffer sizes, in 5% steps - measuring each buffer-size separately, and + * doing a maximum search for the cost. (The maximum cost for a migration + * normally occurs when the working set size is around the effective cache + * size.) 
+ */ +#define SEARCH_SCOPE 2 +#define MIN_CACHE_SIZE (64*1024U) +#define DEFAULT_CACHE_SIZE (5*1024*1024U) +#define ITERATIONS 1 +#define SIZE_THRESH 130 +#define COST_THRESH 130 + +/* + * The migration cost is a function of 'domain distance'. Domain + * distance is the number of steps a CPU has to iterate down its + * domain tree to share a domain with the other CPU. The farther + * two CPUs are from each other, the larger the distance gets. + * + * Note that we use the distance only to cache measurement results, + * the distance value is not used numerically otherwise. When two + * CPUs have the same distance it is assumed that the migration + * cost is the same. (this is a simplification but quite practical) + */ +#define MAX_DOMAIN_DISTANCE 32 + +static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = + { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = +/* + * Architectures may override the migration cost and thus avoid + * boot-time calibration. Unit is nanoseconds. Mostly useful for + * virtualized hardware: + */ +#ifdef CONFIG_DEFAULT_MIGRATION_COST + CONFIG_DEFAULT_MIGRATION_COST +#else + -1LL +#endif +}; + +/* + * Allow override of migration cost - in units of microseconds. + * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost + * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs: + */ +static int __init migration_cost_setup(char *str) +{ + int ints[MAX_DOMAIN_DISTANCE+1], i; + + str = get_options(str, ARRAY_SIZE(ints), ints); + + printk("#ints: %d\n", ints[0]); + for (i = 1; i <= ints[0]; i++) { + migration_cost[i-1] = (unsigned long long)ints[i]*1000; + printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); + } + return 1; +} + +__setup ("migration_cost=", migration_cost_setup); + +/* + * Global multiplier (divisor) for migration-cutoff values, + * in percentiles. E.g. use a value of 150 to get 1.5 times + * longer cache-hot cutoff times. + * + * (We scale it from 100 to 128 to long long handling easier.) 
+ */ + +#define MIGRATION_FACTOR_SCALE 128 + +static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; + +static int __init setup_migration_factor(char *str) +{ + get_option(&str, &migration_factor); + migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; + return 1; +} + +__setup("migration_factor=", setup_migration_factor); + +/* + * Estimated distance of two CPUs, measured via the number of domains + * we have to pass for the two CPUs to be in the same span: + */ +static unsigned long domain_distance(int cpu1, int cpu2) +{ + unsigned long distance = 0; + struct sched_domain *sd; + + for_each_domain(cpu1, sd) { + WARN_ON(!cpu_isset(cpu1, sd->span)); + if (cpu_isset(cpu2, sd->span)) + return distance; + distance++; + } + if (distance >= MAX_DOMAIN_DISTANCE) { + WARN_ON(1); + distance = MAX_DOMAIN_DISTANCE-1; + } + + return distance; +} + +static unsigned int migration_debug; + +static int __init setup_migration_debug(char *str) +{ + get_option(&str, &migration_debug); + return 1; +} + +__setup("migration_debug=", setup_migration_debug); + +/* + * Maximum cache-size that the scheduler should try to measure. + * Architectures with larger caches should tune this up during + * bootup. Gets used in the domain-setup code (i.e. during SMP + * bootup). + */ +unsigned int max_cache_size; + +static int __init setup_max_cache_size(char *str) +{ + get_option(&str, &max_cache_size); + return 1; +} + +__setup("max_cache_size=", setup_max_cache_size); + +/* + * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. 
This + * is the operation that is timed, so we try to generate unpredictable + * cachemisses that still end up filling the L2 cache: + */ +static void touch_cache(void *__cache, unsigned long __size) +{ + unsigned long size = __size/sizeof(long), chunk1 = size/3, + chunk2 = 2*size/3; + unsigned long *cache = __cache; + int i; + + for (i = 0; i < size/6; i += 8) { + switch (i % 6) { + case 0: cache[i]++; + case 1: cache[size-1-i]++; + case 2: cache[chunk1-i]++; + case 3: cache[chunk1+i]++; + case 4: cache[chunk2-i]++; + case 5: cache[chunk2+i]++; + } + } +} + +/* + * Measure the cache-cost of one task migration. Returns in units of nsec. + */ +static unsigned long long measure_one(void *cache, unsigned long size, + int source, int target) +{ + cpumask_t mask, saved_mask; + unsigned long long t0, t1, t2, t3, cost; + + saved_mask = current->cpus_allowed; + + /* + * Flush source caches to RAM and invalidate them: + */ + sched_cacheflush(); + + /* + * Migrate to the source CPU: + */ + mask = cpumask_of_cpu(source); + set_cpus_allowed(current, mask); + WARN_ON(smp_processor_id() != source); + + /* + * Dirty the working set: + */ + t0 = sched_clock(); + touch_cache(cache, size); + t1 = sched_clock(); + + /* + * Migrate to the target CPU, dirty the L2 cache and access + * the shared buffer. (which represents the working set + * of a migrated task.) + */ + mask = cpumask_of_cpu(target); + set_cpus_allowed(current, mask); + WARN_ON(smp_processor_id() != target); + + t2 = sched_clock(); + touch_cache(cache, size); + t3 = sched_clock(); + + cost = t1-t0 + t3-t2; + + if (migration_debug >= 2) + printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", + source, target, t1-t0, t1-t0, t3-t2, cost); + /* + * Flush target caches to RAM and invalidate them: + */ + sched_cacheflush(); + + set_cpus_allowed(current, saved_mask); + + return cost; +} + +/* + * Measure a series of task migrations and return the average + * result. 
Since this code runs early during bootup the system + * is 'undisturbed' and the average latency makes sense. + * + * The algorithm in essence auto-detects the relevant cache-size, + * so it will properly detect different cachesizes for different + * cache-hierarchies, depending on how the CPUs are connected. + * + * Architectures can prime the upper limit of the search range via + * max_cache_size, otherwise the search range defaults to 20MB...64K. + */ +static unsigned long long +measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) +{ + unsigned long long cost1, cost2; + int i; + + /* + * Measure the migration cost of 'size' bytes, over an + * average of 10 runs: + * + * (We perturb the cache size by a small (0..4k) + * value to compensate size/alignment related artifacts. + * We also subtract the cost of the operation done on + * the same CPU.) + */ + cost1 = 0; + + /* + * dry run, to make sure we start off cache-cold on cpu1, + * and to get any vmalloc pagefaults in advance: + */ + measure_one(cache, size, cpu1, cpu2); + for (i = 0; i < ITERATIONS; i++) + cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); + + measure_one(cache, size, cpu2, cpu1); + for (i = 0; i < ITERATIONS; i++) + cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); + + /* + * (We measure the non-migrating [cached] cost on both + * cpu1 and cpu2, to handle CPUs with different speeds) + */ + cost2 = 0; + + measure_one(cache, size, cpu1, cpu1); + for (i = 0; i < ITERATIONS; i++) + cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); + + measure_one(cache, size, cpu2, cpu2); + for (i = 0; i < ITERATIONS; i++) + cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); + + /* + * Get the per-iteration migration cost: + */ + do_div(cost1, 2*ITERATIONS); + do_div(cost2, 2*ITERATIONS); + + return cost1 - cost2; +} + +static unsigned long long measure_migration_cost(int cpu1, int cpu2) +{ + unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; + unsigned int max_size, size, 
size_found = 0; + long long cost = 0, prev_cost; + void *cache; + + /* + * Search from max_cache_size*5 down to 64K - the real relevant + * cachesize has to lie somewhere inbetween. + */ + if (max_cache_size) { + max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); + size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); + } else { + /* + * Since we have no estimation about the relevant + * search range + */ + max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; + size = MIN_CACHE_SIZE; + } + + if (!cpu_online(cpu1) || !cpu_online(cpu2)) { + printk("cpu %d and %d not both online!\n", cpu1, cpu2); + return 0; + } + + /* + * Allocate the working set: + */ + cache = vmalloc(max_size); + if (!cache) { + printk("could not vmalloc %d bytes for cache!\n", 2*max_size); + return 1000000; // return 1 msec on very small boxen + } + + while (size <= max_size) { + prev_cost = cost; + cost = measure_cost(cpu1, cpu2, cache, size); + + /* + * Update the max: + */ + if (cost > 0) { + if (max_cost < cost) { + max_cost = cost; + size_found = size; + } + } + /* + * Calculate average fluctuation, we use this to prevent + * noise from triggering an early break out of the loop: + */ + fluct = abs(cost - prev_cost); + avg_fluct = (avg_fluct + fluct)/2; + + if (migration_debug) + printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", + cpu1, cpu2, size, + (long)cost / 1000000, + ((long)cost / 100000) % 10, + (long)max_cost / 1000000, + ((long)max_cost / 100000) % 10, + domain_distance(cpu1, cpu2), + cost, avg_fluct); + + /* + * If we iterated at least 20% past the previous maximum, + * and the cost has dropped by more than 20% already, + * (taking fluctuations into account) then we assume to + * have found the maximum and break out of the loop early: + */ + if (size_found && (size*100 > size_found*SIZE_THRESH)) + if (cost+avg_fluct <= 0 || + max_cost*100 > (cost+avg_fluct)*COST_THRESH) { + + if (migration_debug) + printk("-> found max.\n"); + break; + } + /* + * 
Increase the cachesize in 10% steps: + */ + size = size * 10 / 9; + } + + if (migration_debug) + printk("[%d][%d] working set size found: %d, cost: %Ld\n", + cpu1, cpu2, size_found, max_cost); + + vfree(cache); + + /* + * A task is considered 'cache cold' if at least 2 times + * the worst-case cost of migration has passed. + * + * (this limit is only listened to if the load-balancing + * situation is 'nice' - if there is a large imbalance we + * ignore it for the sake of CPU utilization and + * processing fairness.) + */ + return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; +} + +static void calibrate_migration_costs(const cpumask_t *cpu_map) +{ + int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); + unsigned long j0, j1, distance, max_distance = 0; + struct sched_domain *sd; + + j0 = jiffies; + + /* + * First pass - calculate the cacheflush times: + */ + for_each_cpu_mask(cpu1, *cpu_map) { + for_each_cpu_mask(cpu2, *cpu_map) { + if (cpu1 == cpu2) + continue; + distance = domain_distance(cpu1, cpu2); + max_distance = max(max_distance, distance); + /* + * No result cached yet? 
+ */ + if (migration_cost[distance] == -1LL) + migration_cost[distance] = + measure_migration_cost(cpu1, cpu2); + } + } + /* + * Second pass - update the sched domain hierarchy with + * the new cache-hot-time estimations: + */ + for_each_cpu_mask(cpu, *cpu_map) { + distance = 0; + for_each_domain(cpu, sd) { + sd->cache_hot_time = migration_cost[distance]; + distance++; + } + } + /* + * Print the matrix: + */ + if (migration_debug) + printk("migration: max_cache_size: %d, cpu: %d MHz:\n", + max_cache_size, +#ifdef CONFIG_X86 + cpu_khz/1000 +#else + -1 +#endif + ); + if (system_state == SYSTEM_BOOTING) { + printk("migration_cost="); + for (distance = 0; distance <= max_distance; distance++) { + if (distance) + printk(","); + printk("%ld", (long)migration_cost[distance] / 1000); + } + printk("\n"); + } + j1 = jiffies; + if (migration_debug) + printk("migration: %ld seconds\n", (j1-j0)/HZ); + + /* + * Move back to the original CPU. NUMA-Q gets confused + * if we migrate to another quad during bootup. + */ + if (raw_smp_processor_id() != orig_cpu) { + cpumask_t mask = cpumask_of_cpu(orig_cpu), + saved_mask = current->cpus_allowed; + + set_cpus_allowed(current, mask); + set_cpus_allowed(current, saved_mask); + } +} + +#ifdef CONFIG_NUMA + +/** + * find_next_best_node - find the next node to include in a sched_domain + * @node: node whose sched_domain we're building + * @used_nodes: nodes already in the sched_domain + * + * Find the next node to include in a given scheduling domain. Simply + * finds the closest node not already in the @used_nodes map. + * + * Should use nodemask_t. 
+ */ +static int find_next_best_node(int node, unsigned long *used_nodes) +{ + int i, n, val, min_val, best_node = 0; + + min_val = INT_MAX; + + for (i = 0; i < MAX_NUMNODES; i++) { + /* Start at @node */ + n = (node + i) % MAX_NUMNODES; + + if (!nr_cpus_node(n)) + continue; + + /* Skip already used nodes */ + if (test_bit(n, used_nodes)) + continue; + + /* Simple min distance search */ + val = node_distance(node, n); + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + set_bit(best_node, used_nodes); + return best_node; +} + +/** + * sched_domain_node_span - get a cpumask for a node's sched_domain + * @node: node whose cpumask we're constructing + * @size: number of nodes to include in this span + * + * Given a node, construct a good cpumask for its sched_domain to span. It + * should be one that prevents unnecessary balancing, but also spreads tasks + * out optimally. + */ +static cpumask_t sched_domain_node_span(int node) +{ + int i; + cpumask_t span, nodemask; + DECLARE_BITMAP(used_nodes, MAX_NUMNODES); + + cpus_clear(span); + bitmap_zero(used_nodes, MAX_NUMNODES); + + nodemask = node_to_cpumask(node); + cpus_or(span, span, nodemask); + set_bit(node, used_nodes); + + for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { + int next_node = find_next_best_node(node, used_nodes); + nodemask = node_to_cpumask(next_node); + cpus_or(span, span, nodemask); + } + + return span; +} +#endif + +/* + * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we + * can switch it on easily if needed. 
+ */ +#ifdef CONFIG_SCHED_SMT +static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static struct sched_group sched_group_cpus[NR_CPUS]; +static int cpu_to_cpu_group(int cpu) +{ + return cpu; +} +#endif + +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +static struct sched_group sched_group_phys[NR_CPUS]; +static int cpu_to_phys_group(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + return first_cpu(cpu_sibling_map[cpu]); +#else + return cpu; +#endif +} + +#ifdef CONFIG_NUMA +/* + * The init_sched_build_groups can't handle what we want to do with node + * groups, so roll our own. Now each node has its own list of groups which + * gets dynamically allocated. + */ +static DEFINE_PER_CPU(struct sched_domain, node_domains); +static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; + +static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); +static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; + +static int cpu_to_allnodes_group(int cpu) +{ + return cpu_to_node(cpu); +} +#endif + +/* + * Build sched domains for a given set of cpus and attach the sched domains + * to the individual cpus + */ +void build_sched_domains(const cpumask_t *cpu_map) +{ + int i; +#ifdef CONFIG_NUMA + struct sched_group **sched_group_nodes = NULL; + struct sched_group *sched_group_allnodes = NULL; + + /* + * Allocate the per-node list of sched groups + */ + sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, + GFP_ATOMIC); + if (!sched_group_nodes) { + printk(KERN_WARNING "Can not alloc sched group node list\n"); + return; + } + sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; +#endif + + /* + * Set up domains for cpus specified by the cpu_map. 
+ */ + for_each_cpu_mask(i, *cpu_map) { + int group; + struct sched_domain *sd = NULL, *p; + cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); + + cpus_and(nodemask, nodemask, *cpu_map); + +#ifdef CONFIG_NUMA + if (cpus_weight(*cpu_map) + > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { + if (!sched_group_allnodes) { + sched_group_allnodes + = kmalloc(sizeof(struct sched_group) + * MAX_NUMNODES, + GFP_KERNEL); + if (!sched_group_allnodes) { + printk(KERN_WARNING + "Can not alloc allnodes sched group\n"); + break; + } + sched_group_allnodes_bycpu[i] + = sched_group_allnodes; + } + sd = &per_cpu(allnodes_domains, i); + *sd = SD_ALLNODES_INIT; + sd->span = *cpu_map; + group = cpu_to_allnodes_group(i); + sd->groups = &sched_group_allnodes[group]; + p = sd; + } else + p = NULL; + + sd = &per_cpu(node_domains, i); + *sd = SD_NODE_INIT; + sd->span = sched_domain_node_span(cpu_to_node(i)); + sd->parent = p; + cpus_and(sd->span, sd->span, *cpu_map); +#endif + + p = sd; + sd = &per_cpu(phys_domains, i); + group = cpu_to_phys_group(i); + *sd = SD_CPU_INIT; + sd->span = nodemask; + sd->parent = p; + sd->groups = &sched_group_phys[group]; + +#ifdef CONFIG_SCHED_SMT + p = sd; + sd = &per_cpu(cpu_domains, i); + group = cpu_to_cpu_group(i); + *sd = SD_SIBLING_INIT; + sd->span = cpu_sibling_map[i]; + cpus_and(sd->span, sd->span, *cpu_map); + sd->parent = p; + sd->groups = &sched_group_cpus[group]; +#endif + } + +#ifdef CONFIG_SCHED_SMT + /* Set up CPU (sibling) groups */ + for_each_cpu_mask(i, *cpu_map) { + cpumask_t this_sibling_map = cpu_sibling_map[i]; + cpus_and(this_sibling_map, this_sibling_map, *cpu_map); + if (i != first_cpu(this_sibling_map)) + continue; + + init_sched_build_groups(sched_group_cpus, this_sibling_map, + &cpu_to_cpu_group); + } +#endif + + /* Set up physical groups */ + for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t nodemask = node_to_cpumask(i); + + cpus_and(nodemask, nodemask, *cpu_map); + if (cpus_empty(nodemask)) + continue; + + 
init_sched_build_groups(sched_group_phys, nodemask, + &cpu_to_phys_group); + } + +#ifdef CONFIG_NUMA + /* Set up node groups */ + if (sched_group_allnodes) + init_sched_build_groups(sched_group_allnodes, *cpu_map, + &cpu_to_allnodes_group); + + for (i = 0; i < MAX_NUMNODES; i++) { + /* Set up node groups */ + struct sched_group *sg, *prev; + cpumask_t nodemask = node_to_cpumask(i); + cpumask_t domainspan; + cpumask_t covered = CPU_MASK_NONE; + int j; + + cpus_and(nodemask, nodemask, *cpu_map); + if (cpus_empty(nodemask)) { + sched_group_nodes[i] = NULL; + continue; + } + + domainspan = sched_domain_node_span(i); + cpus_and(domainspan, domainspan, *cpu_map); + + sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); + sched_group_nodes[i] = sg; + for_each_cpu_mask(j, nodemask) { + struct sched_domain *sd; + sd = &per_cpu(node_domains, j); + sd->groups = sg; + if (sd->groups == NULL) { + /* Turn off balancing if we have no groups */ + sd->flags = 0; + } + } + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", i); + continue; + } + sg->cpu_power = 0; + sg->cpumask = nodemask; + cpus_or(covered, covered, nodemask); + prev = sg; + + for (j = 0; j < MAX_NUMNODES; j++) { + cpumask_t tmp, notcovered; + int n = (i + j) % MAX_NUMNODES; + + cpus_complement(notcovered, covered); + cpus_and(tmp, notcovered, *cpu_map); + cpus_and(tmp, tmp, domainspan); + if (cpus_empty(tmp)) + break; + + nodemask = node_to_cpumask(n); + cpus_and(tmp, tmp, nodemask); + if (cpus_empty(tmp)) + continue; + + sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", j); + break; + } + sg->cpu_power = 0; + sg->cpumask = tmp; + cpus_or(covered, covered, tmp); + prev->next = sg; + prev = sg; + } + prev->next = sched_group_nodes[i]; + } +#endif + + /* Calculate CPU power for physical packages and nodes */ + for_each_cpu_mask(i, *cpu_map) { + int power; + struct sched_domain *sd; +#ifdef 
CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i); + power = SCHED_LOAD_SCALE; + sd->groups->cpu_power = power; +#endif + + sd = &per_cpu(phys_domains, i); + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + sd->groups->cpu_power = power; + +#ifdef CONFIG_NUMA + sd = &per_cpu(allnodes_domains, i); + if (sd->groups) { + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + sd->groups->cpu_power = power; + } +#endif + } + +#ifdef CONFIG_NUMA + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *sg = sched_group_nodes[i]; + int j; + + if (sg == NULL) + continue; +next_sg: + for_each_cpu_mask(j, sg->cpumask) { + struct sched_domain *sd; + int power; + + sd = &per_cpu(phys_domains, j); + if (j != first_cpu(sd->groups->cpumask)) { + /* + * Only add "power" once for each + * physical package. + */ + continue; + } + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + + sg->cpu_power += power; + } + sg = sg->next; + if (sg != sched_group_nodes[i]) + goto next_sg; + } +#endif + + /* Attach the domains */ + for_each_cpu_mask(i, *cpu_map) { + struct sched_domain *sd; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i); +#else + sd = &per_cpu(phys_domains, i); +#endif + cpu_attach_domain(sd, i); + } + /* + * Tune cache-hot values: + */ + calibrate_migration_costs(cpu_map); +} +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + */ +static void arch_init_sched_domains(const cpumask_t *cpu_map) +{ + cpumask_t cpu_default_map; + + /* + * Setup mask for cpus without special case scheduling requirements. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. 
+ */ + cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); + + build_sched_domains(&cpu_default_map); +} + +static void arch_destroy_sched_domains(const cpumask_t *cpu_map) +{ +#ifdef CONFIG_NUMA + int i; + int cpu; + + for_each_cpu_mask(cpu, *cpu_map) { + struct sched_group *sched_group_allnodes + = sched_group_allnodes_bycpu[cpu]; + struct sched_group **sched_group_nodes + = sched_group_nodes_bycpu[cpu]; + + if (sched_group_allnodes) { + kfree(sched_group_allnodes); + sched_group_allnodes_bycpu[cpu] = NULL; + } + + if (!sched_group_nodes) + continue; + + for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t nodemask = node_to_cpumask(i); + struct sched_group *oldsg, *sg = sched_group_nodes[i]; + + cpus_and(nodemask, nodemask, *cpu_map); + if (cpus_empty(nodemask)) + continue; + + if (sg == NULL) + continue; + sg = sg->next; +next_sg: + oldsg = sg; + sg = sg->next; + kfree(oldsg); + if (oldsg != sched_group_nodes[i]) + goto next_sg; + } + kfree(sched_group_nodes); + sched_group_nodes_bycpu[cpu] = NULL; + } +#endif +} + +/* + * Detach sched domains from a group of cpus specified in cpu_map + * These cpus will now be attached to the NULL domain + */ +static void detach_destroy_domains(const cpumask_t *cpu_map) +{ + int i; + + for_each_cpu_mask(i, *cpu_map) + cpu_attach_domain(NULL, i); + synchronize_sched(); + arch_destroy_sched_domains(cpu_map); +} + +/* + * Partition sched domains as specified by the cpumasks below. 
+ * This attaches all cpus from the cpumasks to the NULL domain, + * waits for a RCU quiescent period, recalculates sched + * domain information and then attaches them back to the + * correct sched domains + * Call with hotplug lock held + */ +void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) +{ + cpumask_t change_map; + + cpus_and(*partition1, *partition1, cpu_online_map); + cpus_and(*partition2, *partition2, cpu_online_map); + cpus_or(change_map, *partition1, *partition2); + + /* Detach sched domains from all of the affected cpus */ + detach_destroy_domains(&change_map); + if (!cpus_empty(*partition1)) + build_sched_domains(partition1); + if (!cpus_empty(*partition2)) + build_sched_domains(partition2); +} + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Force a reinitialization of the sched domains hierarchy. The domains + * and groups cannot be updated in place without racing with the balancing + * code, so we temporarily attach all running cpus to the NULL domain + * which will prevent rebalancing while the sched domains are recalculated. + */ +static int update_sched_domains(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action) { + case CPU_UP_PREPARE: + case CPU_DOWN_PREPARE: + detach_destroy_domains(&cpu_online_map); + return NOTIFY_OK; + + case CPU_UP_CANCELED: + case CPU_DOWN_FAILED: + case CPU_ONLINE: + case CPU_DEAD: + /* + * Fall through and re-initialise the domains. 
+ */ + break; + default: + return NOTIFY_DONE; + } + + /* The hotplug lock is already held by cpu_up/cpu_down */ + arch_init_sched_domains(&cpu_online_map); + + return NOTIFY_OK; +} +#endif + +void __init sched_init_smp(void) +{ + lock_cpu_hotplug(); + arch_init_sched_domains(&cpu_online_map); + unlock_cpu_hotplug(); + /* XXX: Theoretical race here - CPU may be hotplugged now */ + hotcpu_notifier(update_sched_domains, 0); +} +#else +void __init sched_init_smp(void) +{ +} +#endif /* CONFIG_SMP */ + +int in_sched_functions(unsigned long addr) +{ + /* Linker adds these: start and end of __sched functions */ + extern char __sched_text_start[], __sched_text_end[]; + return in_lock_functions(addr) || + (addr >= (unsigned long)__sched_text_start + && addr < (unsigned long)__sched_text_end); +} + +void __init sched_init(void) +{ + runqueue_t *rq; + int i, j, k; + + for_each_cpu(i) { + prio_array_t *array; + + rq = cpu_rq(i); + spin_lock_init(&rq->lock); + rq->nr_running = 0; + rq->active = rq->arrays; + rq->expired = rq->arrays + 1; + rq->best_expired_prio = MAX_PRIO; + +#ifdef CONFIG_SMP + rq->sd = NULL; + for (j = 1; j < 3; j++) + rq->cpu_load[j] = 0; + rq->active_balance = 0; + rq->push_cpu = 0; + rq->migration_thread = NULL; + INIT_LIST_HEAD(&rq->migration_queue); +#endif + atomic_set(&rq->nr_iowait, 0); + + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + } + } + + /* + * The boot idle thread does lazy MMU switching as well: + */ + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current); + + /* + * Make us the idle thread. Technically, schedule() should not be + * called from this thread, however somewhere below it might be, + * but because we are the idle thread, we just pick up running again + * when this runqueue becomes "idle". 
+ */ + init_idle(current, smp_processor_id()); +} + +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +void __might_sleep(char *file, int line) +{ +#if defined(in_atomic) + static unsigned long prev_jiffy; /* ratelimiting */ + + if ((in_atomic() || irqs_disabled()) && + system_state == SYSTEM_RUNNING && !oops_in_progress) { + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + printk(KERN_ERR "Debug: sleeping function called from invalid" + " context at %s:%d\n", file, line); + printk("in_atomic():%d, irqs_disabled():%d\n", + in_atomic(), irqs_disabled()); + dump_stack(); + } +#endif +} +EXPORT_SYMBOL(__might_sleep); +#endif + +#ifdef CONFIG_MAGIC_SYSRQ +void normalize_rt_tasks(void) +{ + struct task_struct *p; + prio_array_t *array; + unsigned long flags; + runqueue_t *rq; + + read_lock_irq(&tasklist_lock); + for_each_process (p) { + if (!rt_task(p)) + continue; + + rq = task_rq_lock(p, &flags); + + array = p->array; + if (array) + deactivate_task(p, task_rq(p)); + __setscheduler(p, SCHED_NORMAL, 0); + if (array) { + __activate_task(p, task_rq(p)); + resched_task(rq->curr); + } + + task_rq_unlock(rq, &flags); + } + read_unlock_irq(&tasklist_lock); +} + +#endif /* CONFIG_MAGIC_SYSRQ */ + +#ifdef CONFIG_IA64 +/* + * These functions are only useful for the IA64 MCA handling. + * + * They can only be called when the whole system has been + * stopped - every CPU needs to be quiescent, and no scheduling + * activity can take place. Using them for anything else would + * be a serious bug, and as a result, they aren't even visible + * under any other configuration. + */ + +/** + * curr_task - return the current task for a given cpu. + * @cpu: the processor in question. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +task_t *curr_task(int cpu) +{ + return cpu_curr(cpu); +} + +/** + * set_curr_task - set the current task for a given cpu. + * @cpu: the processor in question. + * @p: the task pointer to set. 
+ * + * Description: This function must only be used when non-maskable interrupts + * are serviced on a separate stack. It allows the architecture to switch the + * notion of the current task on a cpu in a non-blocking manner. This function + * must be called with all CPU's synchronized, and interrupts disabled, the + * and caller must save the original value of the current task (see + * curr_task() above) and restore that value before reenabling interrupts and + * re-starting the system. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +void set_curr_task(int cpu, task_t *p) +{ + cpu_curr(cpu) = p; +} + +#endif diff -urN oldtree/kernel/softirq.c newtree/kernel/softirq.c --- oldtree/kernel/softirq.c 2006-01-03 03:21:10.000000000 +0000 +++ newtree/kernel/softirq.c 2006-02-18 15:24:31.436809808 +0000 @@ -350,7 +350,6 @@ static int ksoftirqd(void * __bind_cpu) { set_user_nice(current, 19); - current->flags |= PF_NOFREEZE; set_current_state(TASK_INTERRUPTIBLE); @@ -456,7 +455,7 @@ case CPU_UP_PREPARE: BUG_ON(per_cpu(tasklet_vec, hotcpu).list); BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list); - p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); + p = kthread_nofreeze_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); if (IS_ERR(p)) { printk("ksoftirqd for %i failed\n", hotcpu); return NOTIFY_BAD; diff -urN oldtree/kernel/sys.c newtree/kernel/sys.c --- oldtree/kernel/sys.c 2006-02-18 15:18:30.099741392 +0000 +++ newtree/kernel/sys.c 2006-02-18 15:24:31.438809504 +0000 @@ -174,15 +174,18 @@ { int ret=NOTIFY_DONE; struct notifier_block *nb = *n; + struct notifier_block *next; while(nb) { - ret=nb->notifier_call(nb,val,v); + /* Determining next here allows the notifier to unregister itself */ + next = nb->next; + ret = nb->notifier_call(nb,val,v); if(ret&NOTIFY_STOP_MASK) { return ret; } - nb=nb->next; + nb = next; } return ret; } @@ -544,12 +547,12 @@ unlock_kernel(); return -EINVAL; -#ifdef CONFIG_SOFTWARE_SUSPEND +#ifdef CONFIG_SUSPEND2 case 
LINUX_REBOOT_CMD_SW_SUSPEND: { - int ret = software_suspend(); + suspend2_try_suspend(); unlock_kernel(); - return ret; + return 0; } #endif diff -urN oldtree/kernel/workqueue.c newtree/kernel/workqueue.c --- oldtree/kernel/workqueue.c 2006-02-18 15:18:30.103740784 +0000 +++ newtree/kernel/workqueue.c 2006-02-18 15:24:31.439809352 +0000 @@ -191,8 +191,6 @@ struct k_sigaction sa; sigset_t blocked; - current->flags |= PF_NOFREEZE; - set_user_nice(current, -5); /* Block and flush all signals */ @@ -213,6 +211,7 @@ schedule(); else __set_current_state(TASK_RUNNING); + try_to_freeze(); remove_wait_queue(&cwq->more_work, &wait); if (!list_empty(&cwq->worklist)) @@ -282,7 +281,8 @@ } static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, - int cpu) + int cpu, + unsigned long freezer_flags) { struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); struct task_struct *p; @@ -296,10 +296,21 @@ init_waitqueue_head(&cwq->more_work); init_waitqueue_head(&cwq->work_done); - if (is_single_threaded(wq)) - p = kthread_create(worker_thread, cwq, "%s", wq->name); - else - p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu); + if (is_single_threaded(wq)) { + if (freezer_flags) + p = kthread_nofreeze_create(worker_thread, cwq, + "%s", wq->name); + else + p = kthread_create(worker_thread, cwq, + "%s", wq->name); + } else { + if (freezer_flags) + p = kthread_nofreeze_create(worker_thread, cwq, + "%s/%d", wq->name, cpu); + else + p = kthread_create(worker_thread, cwq, + "%s/%d", wq->name, cpu); + } if (IS_ERR(p)) return NULL; cwq->thread = p; @@ -307,7 +318,8 @@ } struct workqueue_struct *__create_workqueue(const char *name, - int singlethread) + int singlethread, + unsigned long freezer_flags) { int cpu, destroy = 0; struct workqueue_struct *wq; @@ -328,7 +340,7 @@ lock_cpu_hotplug(); if (singlethread) { INIT_LIST_HEAD(&wq->list); - p = create_workqueue_thread(wq, singlethread_cpu); + p = create_workqueue_thread(wq, singlethread_cpu, 
freezer_flags); if (!p) destroy = 1; else @@ -338,7 +350,7 @@ list_add(&wq->list, &workqueues); spin_unlock(&workqueue_lock); for_each_online_cpu(cpu) { - p = create_workqueue_thread(wq, cpu); + p = create_workqueue_thread(wq, cpu, freezer_flags); if (p) { kthread_bind(p, cpu); wake_up_process(p); @@ -529,7 +541,7 @@ case CPU_UP_PREPARE: /* Create a new workqueue thread for it. */ list_for_each_entry(wq, &workqueues, list) { - if (!create_workqueue_thread(wq, hotcpu)) { + if (!create_workqueue_thread(wq, hotcpu, 0)) { printk("workqueue for %i failed\n", hotcpu); return NOTIFY_BAD; } @@ -572,7 +584,7 @@ { singlethread_cpu = first_cpu(cpu_possible_map); hotcpu_notifier(workqueue_cpu_callback, 0); - keventd_wq = create_workqueue("events"); + keventd_wq = create_nofreeze_workqueue("events"); BUG_ON(!keventd_wq); } diff -urN oldtree/lib/Kconfig newtree/lib/Kconfig --- oldtree/lib/Kconfig 2006-01-03 03:21:10.000000000 +0000 +++ newtree/lib/Kconfig 2006-02-18 15:24:31.440809200 +0000 @@ -38,6 +38,9 @@ require M here. See Castagnoli93. Module will be libcrc32c. +config DYN_PAGEFLAGS + bool + # # compression support is select'ed if needed # diff -urN oldtree/lib/Makefile newtree/lib/Makefile --- oldtree/lib/Makefile 2006-02-18 15:18:30.104740632 +0000 +++ newtree/lib/Makefile 2006-02-18 15:24:31.441809048 +0000 @@ -28,6 +28,8 @@ lib-y += dec_and_lock.o endif +obj-$(CONFIG_DYN_PAGEFLAGS) += dyn_pageflags.o + obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o obj-$(CONFIG_CRC16) += crc16.o obj-$(CONFIG_CRC32) += crc32.o diff -urN oldtree/lib/dyn_pageflags.c newtree/lib/dyn_pageflags.c --- oldtree/lib/dyn_pageflags.c 1970-01-01 00:00:00.000000000 +0000 +++ newtree/lib/dyn_pageflags.c 2006-02-18 15:24:31.442808896 +0000 @@ -0,0 +1,330 @@ +/* + * lib/dyn_pageflags.c + * + * Copyright (C) 2004-2006 Nigel Cunningham + * + * This file is released under the GPLv2. + * + * Routines for dynamically allocating and releasing bitmaps + * used as pseudo-pageflags. + * + * Arrays are not contiguous. 
Each zone has an array of + * pointers to its individually allocated bitmap pages. This allows us to + * work under low memory conditions where order 0 might be all + * that's available. In their original use (suspend2), it also + * lets us save the pages at suspend time, reload and relocate them + * as necessary at resume time without much effort. + * + */ + +#include +#include +#include + +#define page_to_zone_offset(pg) (page_to_pfn(pg) - page_zone(pg)->zone_start_pfn) + +/* + * num_zones + * + * How many zones does for_each_zone() visit? + * + */ + +static int num_zones(void) +{ + int result = 0; + struct zone *zone; + + for_each_zone(zone) + result++; + + return result; +} + +/* + * pages_for_zone(struct zone *zone) + * + * How many pages do we need for a bitmap for this zone? + * One bit per spanned page, rounded up to whole bitmap pages. + */ + +static int pages_for_zone(struct zone *zone) +{ + return (zone->spanned_pages + (PAGE_SIZE << 3) - 1) >> + (PAGE_SHIFT + 3); +} + +/* + * page_zone_number(struct page *page) + * + * Which zone index does the page match? + * Indices follow for_each_zone() order; BUGs if the zone is not found. + */ + +static int page_zone_number(struct page *page) +{ + struct zone *zone, *zone_sought = page_zone(page); + int zone_num = 0; + + for_each_zone(zone) + if (zone == zone_sought) + return zone_num; + else + zone_num++; + + printk("Was looking for a zone for page %p.\n", page); + BUG_ON(1); + + return 0; +} + +/* + * dyn_pageflags_pages_per_bitmap + * + * Number of pages needed for a bitmap covering all zones. + * This is the sum of pages_for_zone() over every zone. + */ +int dyn_pageflags_pages_per_bitmap(void) +{ + int total = 0; + struct zone *zone; + + for_each_zone(zone) + total += pages_for_zone(zone); + + return total; +} + +/* + * clear_dyn_pageflags(dyn_pageflags_t pagemap) + * + * Clear an array used to store local page flags.
+ * + */ + +void clear_dyn_pageflags(dyn_pageflags_t pagemap) +{ + int i = 0, zone_num = 0; + struct zone *zone; + + BUG_ON(!pagemap); + + for_each_zone(zone) { + for (i = 0; i < pages_for_zone(zone); i++) + memset((pagemap[zone_num][i]), 0, PAGE_SIZE); + zone_num++; + } +} + +/* + * allocate_dyn_pageflags(dyn_pageflags_t *pagemap) + * + * Allocate a bitmap for dynamic page flags. Returns 0, or -ENOMEM with + * every partial allocation released and *pagemap left NULL. + */ +int allocate_dyn_pageflags(dyn_pageflags_t *pagemap) +{ + int i, zone_num = 0; + struct zone *zone; + + BUG_ON(*pagemap); + /* Zeroed arrays let free_dyn_pageflags() unwind a partial map. */ + *pagemap = kzalloc(sizeof(void *) * num_zones(), GFP_ATOMIC); + + if (!*pagemap) + return -ENOMEM; + + for_each_zone(zone) { + int zone_pages = pages_for_zone(zone); + (*pagemap)[zone_num] = kzalloc(sizeof(void *) * zone_pages, + GFP_ATOMIC); + + if (!(*pagemap)[zone_num]) { + free_dyn_pageflags(pagemap); + return -ENOMEM; + } + + for (i = 0; i < zone_pages; i++) { + unsigned long address = get_zeroed_page(GFP_ATOMIC); + (*pagemap)[zone_num][i] = (unsigned long *) address; + if (!(*pagemap)[zone_num][i]) { + printk("Error. Unable to allocate memory for " + "dynamic pageflags.\n"); + free_dyn_pageflags(pagemap); + return -ENOMEM; + } + } + zone_num++; + } + + return 0; +} + +/* + * free_dyn_pageflags(dyn_pageflags_t *pagemap) + * + * Free a dynamically allocated pageflags bitmap. For Suspend2 usage, we + * support data being relocated from slab to pages that don't conflict + * with the image that will be copied back. This is the reason for the + * PageSlab tests below.
+ * Zone slots that hold no page array are skipped; *pagemap ends up NULL. + */ +void free_dyn_pageflags(dyn_pageflags_t *pagemap) +{ + int i = 0, zone_num = 0; + struct zone *zone; + + if (!*pagemap) + return; + + for_each_zone(zone) { + int zone_pages = pages_for_zone(zone); + /* zone_num must advance for every zone, mapped or not. */ + if ((*pagemap)[zone_num]) { + for (i = 0; i < zone_pages; i++) + if ((*pagemap)[zone_num][i]) + free_page((unsigned long) (*pagemap)[zone_num][i]); + + if (PageSlab(virt_to_page((*pagemap)[zone_num]))) + kfree((*pagemap)[zone_num]); + else + free_page((unsigned long) (*pagemap)[zone_num]); + } + + zone_num++; + } + + if (PageSlab(virt_to_page((*pagemap)))) + kfree(*pagemap); + else + free_page((unsigned long) (*pagemap)); + + *pagemap = NULL; + return; +} + +/* + * dyn_pageflags_ul_ptr(dyn_pageflags_t *bitmap, struct page *pg) + * + * Get a pointer to the unsigned long containing the flag in the bitmap + * for the given page. + * + */ + +unsigned long *dyn_pageflags_ul_ptr(dyn_pageflags_t *bitmap, struct page *pg) +{ + int zone_pfn = page_to_zone_offset(pg); + int zone_num = page_zone_number(pg); + int pagenum = PAGENUMBER(zone_pfn); + int page_offset = PAGEINDEX(zone_pfn); + return ((*bitmap)[zone_num][pagenum]) + page_offset; +} + +/* + * test_dynpageflag(dyn_pageflags_t *bitmap, struct page *page) + * + * Is the page flagged in the given bitmap? + * + */ + +int test_dynpageflag(dyn_pageflags_t *bitmap, struct page *page) +{ + unsigned long *ul = dyn_pageflags_ul_ptr(bitmap, page); + int zone_offset = page_to_zone_offset(page); + int bit = PAGEBIT(zone_offset); + + return test_bit(bit, ul); +} + +/* + * set_dynpageflag(dyn_pageflags_t *bitmap, struct page *page) + * + * Set the flag for the page in the given bitmap.
+ * + */ + +void set_dynpageflag(dyn_pageflags_t *bitmap, struct page *page) +{ + unsigned long *ul = dyn_pageflags_ul_ptr(bitmap, page); + int zone_offset = page_to_zone_offset(page); + int bit = PAGEBIT(zone_offset); + set_bit(bit, ul); +} + +/* + * clear_dynpageflag(dyn_pageflags_t *bitmap, struct page *page) + * + * Clear the flag for the page in the given bitmap. + * + */ + +void clear_dynpageflag(dyn_pageflags_t *bitmap, struct page *page) +{ + unsigned long *ul = dyn_pageflags_ul_ptr(bitmap, page); + int zone_offset = page_to_zone_offset(page); + int bit = PAGEBIT(zone_offset); + clear_bit(bit, ul); +} + +/* + * get_next_bit_on(dyn_pageflags_t bitmap, int counter) + * + * Given a pfn (possibly -1), find the next pfn in the bitmap that + * is set. If there are no more flags set, return max_pfn. + * -1 starts at the first zone's first pfn; otherwise at counter + 1. + */ + +int get_next_bit_on(dyn_pageflags_t bitmap, int counter) +{ + struct page *page; + struct zone *zone; + unsigned long *ul; + int zone_offset, pagebit, zone_num, first; + + BUG_ON(counter == max_pfn); + + first = (counter == -1); + + if (first) + counter = pgdat_list->node_zones->zone_start_pfn; + + page = pfn_to_page(counter); + zone = page_zone(page); + zone_num = page_zone_number(page); + + if (!first) + counter++; + + zone_offset = counter - zone->zone_start_pfn; + + do { + if (zone_offset >= zone->spanned_pages) { + do { + zone = next_zone(zone); + if (!zone) + return max_pfn; + zone_num++; + } while(!zone->spanned_pages); + + /* Restart at the first pfn of the next populated zone. */ + counter = zone->zone_start_pfn; + zone_offset = 0; + } + + /* + * This could be optimised, but there are more + * important things and the code is simple at + * the moment + */ + ul = (bitmap[zone_num][PAGENUMBER(zone_offset)]) + PAGEINDEX(zone_offset); + + pagebit = PAGEBIT(zone_offset); + + /* Step past the pfn just examined; hence "counter - 1" below. */ + counter++; + zone_offset++; + + } while((counter <= max_pfn) && (!test_bit(pagebit, ul))); + return counter - 1; +} + diff -urN oldtree/lib/vsprintf.c newtree/lib/vsprintf.c ---
oldtree/lib/vsprintf.c 2006-01-03 03:21:10.000000000 +0000 +++ newtree/lib/vsprintf.c 2006-02-18 15:24:31.443808744 +0000 @@ -236,6 +236,34 @@ return buf; } +/* + * snprintf_used + * + * Functionality : Print a string with parameters to a buffer of a + * limited size. Unlike vsnprintf, we return the number + * of characters actually stored (excluding the NUL), not + * the number that would have been stored if it was big enough. + */ +int snprintf_used(char *buffer, int buffer_size, const char *fmt, ...) +{ + int result; + va_list args; + + if (buffer_size <= 0) { + return 0; + } + + va_start(args, fmt); + result = vsnprintf(buffer, buffer_size, fmt, args); + va_end(args); + + if (result >= buffer_size) { + return buffer_size - 1; + } + + return result; +} + /** * vsnprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into diff -urN oldtree/mm/memory.c newtree/mm/memory.c --- oldtree/mm/memory.c 2006-02-18 15:18:30.117738656 +0000 +++ newtree/mm/memory.c 2006-02-18 15:24:31.445808440 +0000 @@ -960,6 +960,15 @@ return page; } +/* + * We want the address of the page for Suspend2 to mark as being in pageset1. + */ + +struct page *suspend2_follow_page(struct mm_struct *mm, unsigned long address) +{ + return follow_page(mm->mmap, address, 0); +} + int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) diff -urN oldtree/mm/memory.c.orig newtree/mm/memory.c.orig --- oldtree/mm/memory.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ newtree/mm/memory.c.orig 2006-02-18 15:18:30.000000000 +0000 @@ -0,0 +1,2446 @@ +/* + * linux/mm/memory.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * demand-loading started 01.12.91 - seems it is high on the list of + * things wanted, and it should be easy to implement. - Linus + */ + +/* + * Ok, demand-loading was easy, shared pages a little bit tricker.
Shared + * pages started 02.12.91, seems to work. - Linus. + * + * Tested sharing by executing about 30 /bin/sh: under the old kernel it + * would have taken more than the 6M I have free, but it worked well as + * far as I could see. + * + * Also corrected some "invalidate()"s - I wasn't doing enough of them. + */ + +/* + * Real VM (paging to/from disk) started 18.12.91. Much more work and + * thought has to go into this. Oh, well.. + * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. + * Found it. Everything seems to work now. + * 20.12.91 - Ok, making the swap-device changeable like the root. + */ + +/* + * 05.04.94 - Multi-page memory management added for v1.1. + * Idea by Alex Bligh (alex@cconcepts.co.uk) + * + * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG + * (Gerhard.Wichert@pdb.siemens.de) + * + * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#ifndef CONFIG_NEED_MULTIPLE_NODES +/* use the per-pgdat data instead for discontigmem - mbligh */ +unsigned long max_mapnr; +struct page *mem_map; + +EXPORT_SYMBOL(max_mapnr); +EXPORT_SYMBOL(mem_map); +#endif + +unsigned long num_physpages; +/* + * A number of key systems in x86 including ioremap() rely on the assumption + * that high_memory defines the upper bound on direct map memory, then end + * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and + * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL + * and ZONE_HIGHMEM. 
+ */ +void * high_memory; +unsigned long vmalloc_earlyreserve; + +EXPORT_SYMBOL(num_physpages); +EXPORT_SYMBOL(high_memory); +EXPORT_SYMBOL(vmalloc_earlyreserve); + +int randomize_va_space __read_mostly = 1; + +static int __init disable_randmaps(char *s) +{ + randomize_va_space = 0; + return 0; +} +__setup("norandmaps", disable_randmaps); + + +/* + * If a p?d_bad entry is found while walking page tables, report + * the error, before resetting entry to p?d_none. Usually (but + * very seldom) called out from the p?d_none_or_clear_bad macros. + */ + +void pgd_clear_bad(pgd_t *pgd) +{ + pgd_ERROR(*pgd); + pgd_clear(pgd); +} + +void pud_clear_bad(pud_t *pud) +{ + pud_ERROR(*pud); + pud_clear(pud); +} + +void pmd_clear_bad(pmd_t *pmd) +{ + pmd_ERROR(*pmd); + pmd_clear(pmd); +} + +/* + * Note: this doesn't free the actual pages themselves. That + * has been handled earlier when unmapping all the memory regions. + */ +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) +{ + struct page *page = pmd_page(*pmd); + pmd_clear(pmd); + pte_lock_deinit(page); + pte_free_tlb(tlb, page); + dec_page_state(nr_page_table_pages); + tlb->mm->nr_ptes--; +} + +static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pmd_t *pmd; + unsigned long next; + unsigned long start; + + start = addr; + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + free_pte_range(tlb, pmd); + } while (pmd++, addr = next, addr != end); + + start &= PUD_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= PUD_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + pmd = pmd_offset(pud, start); + pud_clear(pud); + pmd_free_tlb(tlb, pmd); +} + +static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ 
+ pud_t *pud; + unsigned long next; + unsigned long start; + + start = addr; + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + free_pmd_range(tlb, pud, addr, next, floor, ceiling); + } while (pud++, addr = next, addr != end); + + start &= PGDIR_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= PGDIR_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + pud = pud_offset(pgd, start); + pgd_clear(pgd); + pud_free_tlb(tlb, pud); +} + +/* + * This function frees user-level page tables of a process. + * + * Must be called with pagetable lock held. + */ +void free_pgd_range(struct mmu_gather **tlb, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pgd_t *pgd; + unsigned long next; + unsigned long start; + + /* + * The next few lines have given us lots of grief... + * + * Why are we testing PMD* at this top level? Because often + * there will be no work to do at all, and we'd prefer not to + * go all the way down to the bottom just to discover that. + * + * Why all these "- 1"s? Because 0 represents both the bottom + * of the address space and the top of it (using -1 for the + * top wouldn't help much: the masks would do the wrong thing). + * The rule is that addr 0 and floor 0 refer to the bottom of + * the address space, but end 0 and ceiling 0 refer to the top + * Comparisons need to use "end - 1" and "ceiling - 1" (though + * that end 0 case should be mythical). + * + * Wherever addr is brought up or ceiling brought down, we must + * be careful to reject "the opposite 0" before it confuses the + * subsequent tests. But what about where end is brought down + * by PMD_SIZE below? no, end can't go down to 0 there. 
+ * + * Whereas we round start (addr) and ceiling down, by different + * masks at different levels, in order to test whether a table + * now has no other vmas using it, so can be freed, we don't + * bother to round floor or end up - the tests don't need that. + */ + + addr &= PMD_MASK; + if (addr < floor) { + addr += PMD_SIZE; + if (!addr) + return; + } + if (ceiling) { + ceiling &= PMD_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + end -= PMD_SIZE; + if (addr > end - 1) + return; + + start = addr; + pgd = pgd_offset((*tlb)->mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + free_pud_range(*tlb, pgd, addr, next, floor, ceiling); + } while (pgd++, addr = next, addr != end); + + if (!(*tlb)->fullmm) + flush_tlb_pgtables((*tlb)->mm, start, end); +} + +void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, + unsigned long floor, unsigned long ceiling) +{ + while (vma) { + struct vm_area_struct *next = vma->vm_next; + unsigned long addr = vma->vm_start; + + /* + * Hide vma from rmap and vmtruncate before freeing pgtables + */ + anon_vma_unlink(vma); + unlink_file_vma(vma); + + if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { + hugetlb_free_pgd_range(tlb, addr, vma->vm_end, + floor, next? next->vm_start: ceiling); + } else { + /* + * Optimization: gather nearby vmas into one call down + */ + while (next && next->vm_start <= vma->vm_end + PMD_SIZE + && !is_hugepage_only_range(vma->vm_mm, next->vm_start, + HPAGE_SIZE)) { + vma = next; + next = vma->vm_next; + anon_vma_unlink(vma); + unlink_file_vma(vma); + } + free_pgd_range(tlb, addr, vma->vm_end, + floor, next? 
next->vm_start: ceiling); + } + vma = next; + } +} + +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + struct page *new = pte_alloc_one(mm, address); + if (!new) + return -ENOMEM; + + pte_lock_init(new); + spin_lock(&mm->page_table_lock); + if (pmd_present(*pmd)) { /* Another has populated it */ + pte_lock_deinit(new); + pte_free(new); + } else { + mm->nr_ptes++; + inc_page_state(nr_page_table_pages); + pmd_populate(mm, pmd, new); + } + spin_unlock(&mm->page_table_lock); + return 0; +} + +int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) +{ + pte_t *new = pte_alloc_one_kernel(&init_mm, address); + if (!new) + return -ENOMEM; + + spin_lock(&init_mm.page_table_lock); + if (pmd_present(*pmd)) /* Another has populated it */ + pte_free_kernel(new); + else + pmd_populate_kernel(&init_mm, pmd, new); + spin_unlock(&init_mm.page_table_lock); + return 0; +} + +static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) +{ + if (file_rss) + add_mm_counter(mm, file_rss, file_rss); + if (anon_rss) + add_mm_counter(mm, anon_rss, anon_rss); +} + +/* + * This function is called to print an error when a bad pte + * is found. For example, we might have a PFN-mapped pte in + * a region that doesn't allow it. + * + * The calling function must still handle the error. + */ +void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) +{ + printk(KERN_ERR "Bad pte = %08llx, process = %s, " + "vm_flags = %lx, vaddr = %lx\n", + (long long)pte_val(pte), + (vma->vm_mm == current->mm ? current->comm : "???"), + vma->vm_flags, vaddr); + dump_stack(); +} + +static inline int is_cow_mapping(unsigned int flags) +{ + return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; +} + +/* + * This function gets the "struct page" associated with a pte. + * + * NOTE! Some mappings do not have "struct pages". 
A raw PFN mapping + * will have each page table entry just pointing to a raw page frame + * number, and as far as the VM layer is concerned, those do not have + * pages associated with them - even if the PFN might point to memory + * that otherwise is perfectly fine and has a "struct page". + * + * The way we recognize those mappings is through the rules set up + * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set, + * and the vm_pgoff will point to the first PFN mapped: thus every + * page that is a raw mapping will always honor the rule + * + * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) + * + * and if that isn't true, the page has been COW'ed (in which case it + * _does_ have a "struct page" associated with it even if it is in a + * VM_PFNMAP range). + */ +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + + if (vma->vm_flags & VM_PFNMAP) { + unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } + + /* + * Add some paranoid sanity checks for now. Eventually, + * we should just do "return pfn_to_page(pfn)", but + * in the meantime we check that we get a valid pfn, + * and report a bad pte and return NULL if we do not. + * + * Remove this test eventually! + */ + if (unlikely(!pfn_valid(pfn))) { + print_bad_pte(vma, pte, addr); + return NULL; + } + + /* + * NOTE! We still have PageReserved() pages in the page + * tables. + * + * The ZERO_PAGE() pages and various VDSO mappings can + * cause them to exist. + */ + return pfn_to_page(pfn); +} + +/* + * copy one vm_area from one task to the other. Assumes the page tables + * already present in the new task to be cleared in the whole range + * covered by this vma.
+ */ + +static inline void +copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, + unsigned long addr, int *rss) +{ + unsigned long vm_flags = vma->vm_flags; + pte_t pte = *src_pte; + struct page *page; + + /* pte contains position in swap or file, so copy. */ + if (unlikely(!pte_present(pte))) { + if (!pte_file(pte)) { + swap_duplicate(pte_to_swp_entry(pte)); + /* make sure dst_mm is on swapoff's mmlist. */ + if (unlikely(list_empty(&dst_mm->mmlist))) { + spin_lock(&mmlist_lock); + if (list_empty(&dst_mm->mmlist)) + list_add(&dst_mm->mmlist, + &src_mm->mmlist); + spin_unlock(&mmlist_lock); + } + } + goto out_set_pte; + } + + /* + * If it's a COW mapping, write protect it both + * in the parent and the child + */ + if (is_cow_mapping(vm_flags)) { + ptep_set_wrprotect(src_mm, addr, src_pte); + pte = *src_pte; + } + + /* + * If it's a shared mapping, mark it clean in + * the child + */ + if (vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + + page = vm_normal_page(vma, addr, pte); + if (page) { + get_page(page); + page_dup_rmap(page); + rss[!!PageAnon(page)]++; + } + +out_set_pte: + set_pte_at(dst_mm, addr, dst_pte, pte); +} + +static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pte_t *src_pte, *dst_pte; + spinlock_t *src_ptl, *dst_ptl; + int progress = 0; + int rss[2]; + +again: + rss[1] = rss[0] = 0; + dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); + if (!dst_pte) + return -ENOMEM; + src_pte = pte_offset_map_nested(src_pmd, addr); + src_ptl = pte_lockptr(src_mm, src_pmd); + spin_lock(src_ptl); + + do { + /* + * We are holding two locks at this point - either of them + * could generate latencies in another task on another CPU. 
+ */ + if (progress >= 32) { + progress = 0; + if (need_resched() || + need_lockbreak(src_ptl) || + need_lockbreak(dst_ptl)) + break; + } + if (pte_none(*src_pte)) { + progress++; + continue; + } + copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); + progress += 8; + } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + + spin_unlock(src_ptl); + pte_unmap_nested(src_pte - 1); + add_mm_rss(dst_mm, rss[0], rss[1]); + pte_unmap_unlock(dst_pte - 1, dst_ptl); + cond_resched(); + if (addr != end) + goto again; + return 0; +} + +static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pmd_t *src_pmd, *dst_pmd; + unsigned long next; + + dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); + if (!dst_pmd) + return -ENOMEM; + src_pmd = pmd_offset(src_pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(src_pmd)) + continue; + if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, + vma, addr, next)) + return -ENOMEM; + } while (dst_pmd++, src_pmd++, addr = next, addr != end); + return 0; +} + +static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + pud_t *src_pud, *dst_pud; + unsigned long next; + + dst_pud = pud_alloc(dst_mm, dst_pgd, addr); + if (!dst_pud) + return -ENOMEM; + src_pud = pud_offset(src_pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(src_pud)) + continue; + if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, + vma, addr, next)) + return -ENOMEM; + } while (dst_pud++, src_pud++, addr = next, addr != end); + return 0; +} + +int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + struct vm_area_struct *vma) +{ + pgd_t *src_pgd, *dst_pgd; + unsigned long next; + unsigned long addr = vma->vm_start; + 
unsigned long end = vma->vm_end; + + /* + * Don't copy ptes where a page fault will fill them correctly. + * Fork becomes much lighter when there are big shared or private + * readonly mappings. The tradeoff is that copy_page_range is more + * efficient than faulting. + */ + if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { + if (!vma->anon_vma) + return 0; + } + + if (is_vm_hugetlb_page(vma)) + return copy_hugetlb_page_range(dst_mm, src_mm, vma); + + dst_pgd = pgd_offset(dst_mm, addr); + src_pgd = pgd_offset(src_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(src_pgd)) + continue; + if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, + vma, addr, next)) + return -ENOMEM; + } while (dst_pgd++, src_pgd++, addr = next, addr != end); + return 0; +} + +static unsigned long zap_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + long *zap_work, struct zap_details *details) +{ + struct mm_struct *mm = tlb->mm; + pte_t *pte; + spinlock_t *ptl; + int file_rss = 0; + int anon_rss = 0; + + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + do { + pte_t ptent = *pte; + if (pte_none(ptent)) { + (*zap_work)--; + continue; + } + if (pte_present(ptent)) { + struct page *page; + + (*zap_work) -= PAGE_SIZE; + + page = vm_normal_page(vma, addr, ptent); + if (unlikely(details) && page) { + /* + * unmap_shared_mapping_pages() wants to + * invalidate cache without truncating: + * unmap shared but keep private pages. + */ + if (details->check_mapping && + details->check_mapping != page->mapping) + continue; + /* + * Each page->index must be checked when + * invalidating or truncating nonlinear. 
+ */ + if (details->nonlinear_vma && + (page->index < details->first_index || + page->index > details->last_index)) + continue; + } + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + tlb_remove_tlb_entry(tlb, pte, addr); + if (unlikely(!page)) + continue; + if (unlikely(details) && details->nonlinear_vma + && linear_page_index(details->nonlinear_vma, + addr) != page->index) + set_pte_at(mm, addr, pte, + pgoff_to_pte(page->index)); + if (PageAnon(page)) + anon_rss--; + else { + if (pte_dirty(ptent)) + set_page_dirty(page); + if (pte_young(ptent)) + mark_page_accessed(page); + file_rss--; + } + page_remove_rmap(page); + tlb_remove_page(tlb, page); + continue; + } + /* + * If details->check_mapping, we leave swap entries; + * if details->nonlinear_vma, we leave file entries. + */ + if (unlikely(details)) + continue; + if (!pte_file(ptent)) + free_swap_and_cache(pte_to_swp_entry(ptent)); + pte_clear_full(mm, addr, pte, tlb->fullmm); + } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); + + add_mm_rss(mm, file_rss, anon_rss); + pte_unmap_unlock(pte - 1, ptl); + + return addr; +} + +static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end, + long *zap_work, struct zap_details *details) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) { + (*zap_work)--; + continue; + } + next = zap_pte_range(tlb, vma, pmd, addr, next, + zap_work, details); + } while (pmd++, addr = next, (addr != end && *zap_work > 0)); + + return addr; +} + +static inline unsigned long zap_pud_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, + long *zap_work, struct zap_details *details) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if 
(pud_none_or_clear_bad(pud)) { + (*zap_work)--; + continue; + } + next = zap_pmd_range(tlb, vma, pud, addr, next, + zap_work, details); + } while (pud++, addr = next, (addr != end && *zap_work > 0)); + + return addr; +} + +static unsigned long unmap_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + long *zap_work, struct zap_details *details) +{ + pgd_t *pgd; + unsigned long next; + + if (details && !details->check_mapping && !details->nonlinear_vma) + details = NULL; + + BUG_ON(addr >= end); + tlb_start_vma(tlb, vma); + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) { + (*zap_work)--; + continue; + } + next = zap_pud_range(tlb, vma, pgd, addr, next, + zap_work, details); + } while (pgd++, addr = next, (addr != end && *zap_work > 0)); + tlb_end_vma(tlb, vma); + + return addr; +} + +#ifdef CONFIG_PREEMPT +# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) +#else +/* No preempt: go for improved straight-line efficiency */ +# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) +#endif + +/** + * unmap_vmas - unmap a range of memory covered by a list of vma's + * @tlbp: address of the caller's struct mmu_gather + * @vma: the starting vma + * @start_addr: virtual address at which to start unmapping + * @end_addr: virtual address at which to end unmapping + * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here + * @details: details of nonlinear truncation or shared cache invalidation + * + * Returns the end address of the unmapping (restart addr if interrupted). + * + * Unmap all pages in the vma list. + * + * We aim to not hold locks for too long (for scheduling latency reasons). + * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to + * return the ending mmu_gather to the caller. + * + * Only addresses between `start_addr' and `end_addr' will be unmapped. + * + * The VMA list must be sorted in ascending virtual address order.
+ * + * unmap_vmas() assumes that the caller will flush the whole unmapped address + * range after unmap_vmas() returns. So the only responsibility here is to + * ensure that any thus-far unmapped pages are flushed before unmap_vmas() + * drops the lock and schedules. + */ +unsigned long unmap_vmas(struct mmu_gather **tlbp, + struct vm_area_struct *vma, unsigned long start_addr, + unsigned long end_addr, unsigned long *nr_accounted, + struct zap_details *details) +{ + long zap_work = ZAP_BLOCK_SIZE; + unsigned long tlb_start = 0; /* For tlb_finish_mmu */ + int tlb_start_valid = 0; + unsigned long start = start_addr; + spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; + int fullmm = (*tlbp)->fullmm; + + for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { + unsigned long end; + + start = max(vma->vm_start, start_addr); + if (start >= vma->vm_end) + continue; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + continue; + + if (vma->vm_flags & VM_ACCOUNT) + *nr_accounted += (end - start) >> PAGE_SHIFT; + + while (start != end) { + if (!tlb_start_valid) { + tlb_start = start; + tlb_start_valid = 1; + } + + if (unlikely(is_vm_hugetlb_page(vma))) { + unmap_hugepage_range(vma, start, end); + zap_work -= (end - start) / + (HPAGE_SIZE / PAGE_SIZE); + start = end; + } else + start = unmap_page_range(*tlbp, vma, + start, end, &zap_work, details); + + if (zap_work > 0) { + BUG_ON(start != end); + break; + } + + tlb_finish_mmu(*tlbp, tlb_start, start); + + if (need_resched() || + (i_mmap_lock && need_lockbreak(i_mmap_lock))) { + if (i_mmap_lock) { + *tlbp = NULL; + goto out; + } + cond_resched(); + } + + *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); + tlb_start_valid = 0; + zap_work = ZAP_BLOCK_SIZE; + } + } +out: + return start; /* which is now the end (or restart) address */ +} + +/** + * zap_page_range - remove user pages in a given range + * @vma: vm_area_struct holding the applicable pages + * @address: starting address of pages to zap + 
* @size: number of bytes to zap + * @details: details of nonlinear truncation or shared cache invalidation + */ +unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, + unsigned long size, struct zap_details *details) +{ + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather *tlb; + unsigned long end = address + size; + unsigned long nr_accounted = 0; + + lru_add_drain(); + tlb = tlb_gather_mmu(mm, 0); + update_hiwater_rss(mm); + end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); + if (tlb) + tlb_finish_mmu(tlb, address, end); + return end; +} + +/* + * Do a quick page-table lookup for a single page. + */ +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, + unsigned int flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + BUG_ON(flags & FOLL_GET); + goto out; + } + + page = NULL; + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto no_page_table; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto no_page_table; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto no_page_table; + + if (pmd_huge(*pmd)) { + BUG_ON(flags & FOLL_GET); + page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); + goto out; + } + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!ptep) + goto out; + + pte = *ptep; + if (!pte_present(pte)) + goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) + goto unlock; + page = vm_normal_page(vma, address, pte); + if (unlikely(!page)) + goto unlock; + + if (flags & FOLL_GET) + get_page(page); + if (flags & FOLL_TOUCH) { + if ((flags & FOLL_WRITE) && + !pte_dirty(pte) && !PageDirty(page)) + set_page_dirty(page); + mark_page_accessed(page); + } +unlock: + 
pte_unmap_unlock(ptep, ptl); +out: + return page; + +no_page_table: + /* + * When core dumping an enormous anonymous area that nobody + * has touched so far, we don't want to allocate page tables. + */ + if (flags & FOLL_ANON) { + page = ZERO_PAGE(address); + if (flags & FOLL_GET) + get_page(page); + BUG_ON(flags & FOLL_WRITE); + } + return page; +} + +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas) +{ + int i; + unsigned int vm_flags; + + /* + * Require read or write permissions. + * If 'force' is set, we only require the "MAY" flags. + */ + vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + i = 0; + + do { + struct vm_area_struct *vma; + unsigned int foll_flags; + + vma = find_extend_vma(mm, start); + if (!vma && in_gate_area(tsk, start)) { + unsigned long pg = start & PAGE_MASK; + struct vm_area_struct *gate_vma = get_gate_vma(tsk); + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + if (write) /* user gate pages are read-only */ + return i ? : -EFAULT; + if (pg > TASK_SIZE) + pgd = pgd_offset_k(pg); + else + pgd = pgd_offset_gate(mm, pg); + BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd, pg); + BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, pg); + if (pmd_none(*pmd)) + return i ? : -EFAULT; + pte = pte_offset_map(pmd, pg); + if (pte_none(*pte)) { + pte_unmap(pte); + return i ? : -EFAULT; + } + if (pages) { + struct page *page = vm_normal_page(gate_vma, start, *pte); + pages[i] = page; + if (page) + get_page(page); + } + pte_unmap(pte); + if (vmas) + vmas[i] = gate_vma; + i++; + start += PAGE_SIZE; + len--; + continue; + } + + if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) + || !(vm_flags & vma->vm_flags)) + return i ? 
: -EFAULT; + + if (is_vm_hugetlb_page(vma)) { + i = follow_hugetlb_page(mm, vma, pages, vmas, + &start, &len, i); + continue; + } + + foll_flags = FOLL_TOUCH; + if (pages) + foll_flags |= FOLL_GET; + if (!write && !(vma->vm_flags & VM_LOCKED) && + (!vma->vm_ops || !vma->vm_ops->nopage)) + foll_flags |= FOLL_ANON; + + do { + struct page *page; + + if (write) + foll_flags |= FOLL_WRITE; + + cond_resched(); + while (!(page = follow_page(vma, start, foll_flags))) { + int ret; + ret = __handle_mm_fault(mm, vma, start, + foll_flags & FOLL_WRITE); + /* + * The VM_FAULT_WRITE bit tells us that do_wp_page has + * broken COW when necessary, even if maybe_mkwrite + * decided not to set pte_write. We can thus safely do + * subsequent page lookups as if they were reads. + */ + if (ret & VM_FAULT_WRITE) + foll_flags &= ~FOLL_WRITE; + + switch (ret & ~VM_FAULT_WRITE) { + case VM_FAULT_MINOR: + tsk->min_flt++; + break; + case VM_FAULT_MAJOR: + tsk->maj_flt++; + break; + case VM_FAULT_SIGBUS: + return i ? i : -EFAULT; + case VM_FAULT_OOM: + return i ? 
i : -ENOMEM; + default: + BUG(); + } + } + if (pages) { + pages[i] = page; + flush_dcache_page(page); + } + if (vmas) + vmas[i] = vma; + i++; + start += PAGE_SIZE; + len--; + } while (len && start < vma->vm_end); + } while (len); + return i; +} +EXPORT_SYMBOL(get_user_pages); + +static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, pgprot_t prot) +{ + pte_t *pte; + spinlock_t *ptl; + + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -ENOMEM; + do { + struct page *page = ZERO_PAGE(addr); + pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); + page_cache_get(page); + page_add_file_rmap(page); + inc_mm_counter(mm, file_rss); + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, addr, pte, zero_pte); + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap_unlock(pte - 1, ptl); + return 0; +} + +static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, pgprot_t prot) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + if (zeromap_pte_range(mm, pmd, addr, next, prot)) + return -ENOMEM; + } while (pmd++, addr = next, addr != end); + return 0; +} + +static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, pgprot_t prot) +{ + pud_t *pud; + unsigned long next; + + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + if (zeromap_pmd_range(mm, pud, addr, next, prot)) + return -ENOMEM; + } while (pud++, addr = next, addr != end); + return 0; +} + +int zeromap_page_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long size, pgprot_t prot) +{ + pgd_t *pgd; + unsigned long next; + unsigned long end = addr + size; + struct mm_struct *mm = vma->vm_mm; + int err; + + BUG_ON(addr >= end); + pgd = pgd_offset(mm, addr); + flush_cache_range(vma, addr, 
end); + do { + next = pgd_addr_end(addr, end); + err = zeromap_pud_range(mm, pgd, addr, next, prot); + if (err) + break; + } while (pgd++, addr = next, addr != end); + return err; +} + +pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) +{ + pgd_t * pgd = pgd_offset(mm, addr); + pud_t * pud = pud_alloc(mm, pgd, addr); + if (pud) { + pmd_t * pmd = pmd_alloc(mm, pud, addr); + if (pmd) + return pte_alloc_map_lock(mm, pmd, addr, ptl); + } + return NULL; +} + +/* + * This is the old fallback for page remapping. + * + * For historical reasons, it only allows reserved pages. Only + * old drivers should use this, and they needed to mark their + * pages reserved for the old functions anyway. + * + * NOTE(review): the code below only rejects anonymous pages + * (PageAnon); it does not test whether the page is reserved, so + * the paragraph above looks stale - confirm before relying on it. + * + * Returns 0 on success, -EINVAL for an anonymous page, -ENOMEM if + * no pte could be obtained from get_locked_pte(), or -EBUSY if the + * pte at addr is already populated. + */ +static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot) +{ + int retval; + pte_t *pte; + spinlock_t *ptl; + + retval = -EINVAL; + if (PageAnon(page)) + goto out; + retval = -ENOMEM; + flush_dcache_page(page); + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + goto out; + retval = -EBUSY; + if (!pte_none(*pte)) + goto out_unlock; + + /* Ok, finally just insert the thing.. */ + get_page(page); + inc_mm_counter(mm, file_rss); + page_add_file_rmap(page); + set_pte_at(mm, addr, pte, mk_pte(page, prot)); + + retval = 0; +out_unlock: + pte_unmap_unlock(pte, ptl); +out: + return retval; +} + +/* + * This allows drivers to insert individual pages they've allocated + * into a user vma. + * + * The page has to be a nice clean _individual_ kernel allocation. + * If you allocate a compound page, you need to have marked it as + * such (__GFP_COMP), or manually just split the page up yourself + * (which is mainly an issue of doing "set_page_count(page, 1)" for + * each sub-page, and then freeing them one by one when you free + * them rather than freeing it as a compound page). + * + * NOTE! Traditionally this was done with "remap_pfn_range()" which + * took an arbitrary page protection parameter.
This doesn't allow + * that. Your vma protection will have to be set up correctly, which + * means that if you want a shared writable mapping, you'd better + * ask for a shared writable mapping! + * + * The page does not need to be reserved. + */ +int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) +{ + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + if (!page_count(page)) + return -EINVAL; + vma->vm_flags |= VM_INSERTPAGE; + return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_page); + +/* + * maps a range of physical memory into the requested pages. the old + * mappings are removed. any references to nonexistent pages result + * in null mappings (currently treated as "copy-on-access") + * + * NOTE(review): the "old mappings are removed" wording looks stale - + * remap_pte_range() below does BUG_ON(!pte_none(*pte)), i.e. it + * expects every pte in the range to be empty already. + */ +static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pte_t *pte; + spinlock_t *ptl; + + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -ENOMEM; + do { + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); + pfn++; + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap_unlock(pte - 1, ptl); + return 0; +} + +static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pmd_t *pmd; + unsigned long next; + + pfn -= addr >> PAGE_SHIFT; + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + if (remap_pte_range(mm, pmd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot)) + return -ENOMEM; + } while (pmd++, addr = next, addr != end); + return 0; +} + +static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pud_t *pud; + unsigned long next; + + pfn -= addr >> PAGE_SHIFT; + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return
-ENOMEM; + do { + next = pud_addr_end(addr, end); + if (remap_pmd_range(mm, pud, addr, next, + pfn + (addr >> PAGE_SHIFT), prot)) + return -ENOMEM; + } while (pud++, addr = next, addr != end); + return 0; +} + +/* Note: this is only safe if the mm semaphore is held when called. */ +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + pgd_t *pgd; + unsigned long next; + unsigned long end = addr + PAGE_ALIGN(size); + struct mm_struct *mm = vma->vm_mm; + int err; + + /* + * Physically remapped pages are special. Tell the + * rest of the world about it: + * VM_IO tells people not to look at these pages + * (accesses can have side effects). + * VM_RESERVED is specified all over the place, because + * in 2.4 it kept swapout's vma scan off this vma; but + * in 2.6 the LRU scan won't even find its pages, so this + * flag means no more than count its pages in reserved_vm, + * and omit it from core dump, even when VM_IO turned off. + * VM_PFNMAP tells the core MM that the base pages are just + * raw PFN mappings, and do not have a "struct page" associated + * with them. + * + * There's a horrible special case to handle copy-on-write + * behaviour that some programs depend on. We mark the "original" + * un-COW'ed pages by matching them up with "vma->vm_pgoff". 
+ */ + if (is_cow_mapping(vma->vm_flags)) { + if (addr != vma->vm_start || end != vma->vm_end) + return -EINVAL; + vma->vm_pgoff = pfn; + } + + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + + BUG_ON(addr >= end); + pfn -= addr >> PAGE_SHIFT; + pgd = pgd_offset(mm, addr); + flush_cache_range(vma, addr, end); + do { + next = pgd_addr_end(addr, end); + err = remap_pud_range(mm, pgd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) + break; + } while (pgd++, addr = next, addr != end); + return err; +} +EXPORT_SYMBOL(remap_pfn_range); + +/* + * handle_pte_fault chooses page fault handler according to an entry + * which was read non-atomically. Before making any commitment, on + * those architectures or configurations (e.g. i386 with PAE) which + * might give a mix of unmatched parts, do_swap_page and do_file_page + * must check under lock before unmapping the pte and proceeding + * (but do_wp_page is only called after already making such a check; + * and do_anonymous_page and do_no_page can safely check later on). + * + * Returns 1 if the pte still matches orig_pte (or if no recheck was + * compiled in or needed because pte_t is no wider than a long), and + * 0 otherwise. page_table is unmapped in either case. + */ +static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, + pte_t *page_table, pte_t orig_pte) +{ + int same = 1; +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) + if (sizeof(pte_t) > sizeof(unsigned long)) { + spinlock_t *ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + same = pte_same(*page_table, orig_pte); + spin_unlock(ptl); + } +#endif + pte_unmap(page_table); + return same; +} + +/* + * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when + * servicing faults for write access. In the normal case, we always want + * pte_mkwrite. But get_user_pages can cause write faults for mappings + * that do not have writing enabled, when used by access_process_vm.
+ */ +static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pte = pte_mkwrite(pte); + return pte; +} + +static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va) +{ + /* + * If the source page was a PFN mapping, we don't have + * a "struct page" for it. We do a best-effort copy by + * just copying from the original user address. If that + * fails, we just zero-fill it. Live with it. + */ + if (unlikely(!src)) { + void *kaddr = kmap_atomic(dst, KM_USER0); + void __user *uaddr = (void __user *)(va & PAGE_MASK); + + /* + * This really shouldn't fail, because the page is there + * in the page tables. But it might just be unreadable, + * in which case we just give up and fill the result with + * zeroes. + */ + if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) + memset(kaddr, 0, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + return; + + } + copy_user_highpage(dst, src, va); +} + +/* + * This routine handles present pages, when users try to write + * to a shared page. It is done by copying the page to a new address + * and decrementing the shared-page counter for the old page. + * + * Note that this routine assumes that the protection checks have been + * done by the caller (the low-level page fault routine in most cases). + * Thus we can safely just mark it writable once we've done any necessary + * COW. + * + * We also mark the page dirty at this point even though the page will + * change only once the write actually happens. This avoids a few races, + * and potentially makes it more efficient. + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), with pte both mapped and locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. 
+ */ +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + spinlock_t *ptl, pte_t orig_pte) +{ + struct page *old_page, *new_page; + pte_t entry; + int ret = VM_FAULT_MINOR; + + old_page = vm_normal_page(vma, address, orig_pte); + if (!old_page) + goto gotten; + + if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { + int reuse = can_share_swap_page(old_page); + unlock_page(old_page); + if (reuse) { + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = pte_mkyoung(orig_pte); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + ptep_set_access_flags(vma, address, page_table, entry, 1); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); + ret |= VM_FAULT_WRITE; + goto unlock; + } + } + + /* + * Ok, we need to copy. Oh, well.. + */ + page_cache_get(old_page); +gotten: + pte_unmap_unlock(page_table, ptl); + + if (unlikely(anon_vma_prepare(vma))) + goto oom; + if (old_page == ZERO_PAGE(address)) { + new_page = alloc_zeroed_user_highpage(vma, address); + if (!new_page) + goto oom; + } else { + new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); + if (!new_page) + goto oom; + cow_user_page(new_page, old_page, address); + } + + /* + * Re-check the pte - we dropped the lock + */ + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (likely(pte_same(*page_table, orig_pte))) { + if (old_page) { + page_remove_rmap(old_page); + if (!PageAnon(old_page)) { + dec_mm_counter(mm, file_rss); + inc_mm_counter(mm, anon_rss); + } + } else + inc_mm_counter(mm, anon_rss); + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = mk_pte(new_page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + ptep_establish(vma, address, page_table, entry); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); + lru_cache_add_active(new_page); + page_add_new_anon_rmap(new_page, vma, address); + + /* Free the old page.. 
*/ + new_page = old_page; + ret |= VM_FAULT_WRITE; + } + if (new_page) + page_cache_release(new_page); + if (old_page) + page_cache_release(old_page); +unlock: + pte_unmap_unlock(page_table, ptl); + return ret; +oom: + if (old_page) + page_cache_release(old_page); + return VM_FAULT_OOM; +} + +/* + * Helper functions for unmap_mapping_range(). + * + * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ + * + * We have to restart searching the prio_tree whenever we drop the lock, + * since the iterator is only valid while the lock is held, and anyway + * a later vma might be split and reinserted earlier while lock dropped. + * + * The list of nonlinear vmas could be handled more efficiently, using + * a placeholder, but handle it in the same way until a need is shown. + * It is important to search the prio_tree before nonlinear list: a vma + * may become nonlinear and be shifted from prio_tree to nonlinear list + * while the lock is dropped; but never shifted from list to prio_tree. + * + * In order to make forward progress despite restarting the search, + * vm_truncate_count is used to mark a vma as now dealt with, so we can + * quickly skip it next time around. Since the prio_tree search only + * shows us those vmas affected by unmapping the range in question, we + * can't efficiently keep all vmas in step with mapping->truncate_count: + * so instead reset them all whenever it wraps back to 0 (then go to 1). + * mapping->truncate_count and vma->vm_truncate_count are protected by + * i_mmap_lock. + * + * In order to make forward progress despite repeatedly restarting some + * large vma, note the restart_addr from unmap_vmas when it breaks out: + * and restart from that address when we reach that vma again. It might + * have been split or merged, shrunk or extended, but never shifted: so + * restart_addr remains valid so long as it remains in the vma's range. 
+ * unmap_mapping_range forces truncate_count to leap over page-aligned + * values so we can save vma's restart_addr in its truncate_count field. + */ +#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK)) + +static void reset_vma_truncate_counts(struct address_space *mapping) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) + vma->vm_truncate_count = 0; + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + vma->vm_truncate_count = 0; +} + +static int unmap_mapping_range_vma(struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr, + struct zap_details *details) +{ + unsigned long restart_addr; + int need_break; + +again: + restart_addr = vma->vm_truncate_count; + if (is_restart_addr(restart_addr) && start_addr < restart_addr) { + start_addr = restart_addr; + if (start_addr >= end_addr) { + /* Top of vma has been split off since last time */ + vma->vm_truncate_count = details->truncate_count; + return 0; + } + } + + restart_addr = zap_page_range(vma, start_addr, + end_addr - start_addr, details); + need_break = need_resched() || + need_lockbreak(details->i_mmap_lock); + + if (restart_addr >= end_addr) { + /* We have now completed this vma: mark it so */ + vma->vm_truncate_count = details->truncate_count; + if (!need_break) + return 0; + } else { + /* Note restart_addr in vma's truncate_count field */ + vma->vm_truncate_count = restart_addr; + if (!need_break) + goto again; + } + + spin_unlock(details->i_mmap_lock); + cond_resched(); + spin_lock(details->i_mmap_lock); + return -EINTR; +} + +static inline void unmap_mapping_range_tree(struct prio_tree_root *root, + struct zap_details *details) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + pgoff_t vba, vea, zba, zea; + +restart: + vma_prio_tree_foreach(vma, &iter, root, + details->first_index, details->last_index) { + /* Skip quickly over those we have 
already dealt with */ + if (vma->vm_truncate_count == details->truncate_count) + continue; + + vba = vma->vm_pgoff; + vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; + /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ + zba = details->first_index; + if (zba < vba) + zba = vba; + zea = details->last_index; + if (zea > vea) + zea = vea; + + if (unmap_mapping_range_vma(vma, + ((zba - vba) << PAGE_SHIFT) + vma->vm_start, + ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, + details) < 0) + goto restart; + } +} + +static inline void unmap_mapping_range_list(struct list_head *head, + struct zap_details *details) +{ + struct vm_area_struct *vma; + + /* + * In nonlinear VMAs there is no correspondence between virtual address + * offset and file offset. So we must perform an exhaustive search + * across *all* the pages in each nonlinear VMA, not just the pages + * whose virtual address lies outside the file truncation point. + */ +restart: + list_for_each_entry(vma, head, shared.vm_set.list) { + /* Skip quickly over those we have already dealt with */ + if (vma->vm_truncate_count == details->truncate_count) + continue; + details->nonlinear_vma = vma; + if (unmap_mapping_range_vma(vma, vma->vm_start, + vma->vm_end, details) < 0) + goto restart; + } +} + +/** + * unmap_mapping_range - unmap the portion of all mmaps + * in the specified address_space corresponding to the specified + * page range in the underlying file. + * @mapping: the address space containing mmaps to be unmapped. + * @holebegin: byte in first page to unmap, relative to the start of + * the underlying file. This will be rounded down to a PAGE_SIZE + * boundary. Note that this is different from vmtruncate(), which + * must keep the partial page. In contrast, we must get rid of + * partial pages. + * @holelen: size of prospective hole in bytes. This will be rounded + * up to a PAGE_SIZE boundary. A holelen of zero truncates to the + * end of the file. 
+ * @even_cows: 1 when truncating a file, unmap even private COWed pages; + * but 0 when invalidating pagecache, don't throw away private data. + */ +void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows) +{ + struct zap_details details; + pgoff_t hba = holebegin >> PAGE_SHIFT; + pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; + + /* Check for overflow. */ + if (sizeof(holelen) > sizeof(hlen)) { + long long holeend = + (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (holeend & ~(long long)ULONG_MAX) + hlen = ULONG_MAX - hba + 1; + } + + details.check_mapping = even_cows? NULL: mapping; + details.nonlinear_vma = NULL; + details.first_index = hba; + details.last_index = hba + hlen - 1; + if (details.last_index < details.first_index) + details.last_index = ULONG_MAX; + details.i_mmap_lock = &mapping->i_mmap_lock; + + spin_lock(&mapping->i_mmap_lock); + + /* serialize i_size write against truncate_count write */ + smp_wmb(); + /* Protect against page faults, and endless unmapping loops */ + mapping->truncate_count++; + /* + * For archs where spin_lock has inclusive semantics like ia64 + * this smp_mb() will prevent to read pagetable contents + * before the truncate_count increment is visible to + * other cpus. + */ + smp_mb(); + if (unlikely(is_restart_addr(mapping->truncate_count))) { + if (mapping->truncate_count == 0) + reset_vma_truncate_counts(mapping); + mapping->truncate_count++; + } + details.truncate_count = mapping->truncate_count; + + if (unlikely(!prio_tree_empty(&mapping->i_mmap))) + unmap_mapping_range_tree(&mapping->i_mmap, &details); + if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) + unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); + spin_unlock(&mapping->i_mmap_lock); +} +EXPORT_SYMBOL(unmap_mapping_range); + +/* + * Handle all mappings that got truncated by a "truncate()" + * system call. + * + * NOTE! 
We have to be ready to update the memory sharing + * between the file and the memory map for a potential last + * incomplete page. Ugly, but necessary. + */ +int vmtruncate(struct inode * inode, loff_t offset) +{ + struct address_space *mapping = inode->i_mapping; + unsigned long limit; + + if (inode->i_size < offset) + goto do_expand; + /* + * truncation of in-use swapfiles is disallowed - it would cause + * subsequent swapout to scribble on the now-freed blocks. + */ + if (IS_SWAPFILE(inode)) + goto out_busy; + i_size_write(inode, offset); + unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, offset); + goto out_truncate; + +do_expand: + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out_big; + i_size_write(inode, offset); + +out_truncate: + if (inode->i_op && inode->i_op->truncate) + inode->i_op->truncate(inode); + return 0; +out_sig: + send_sig(SIGXFSZ, current, 0); +out_big: + return -EFBIG; +out_busy: + return -ETXTBSY; +} +EXPORT_SYMBOL(vmtruncate); + +int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + + /* + * If the underlying filesystem is not going to provide + * a way to truncate a range of blocks (punch a hole) - + * we should return failure right now. + */ + if (!inode->i_op || !inode->i_op->truncate_range) + return -ENOSYS; + + mutex_lock(&inode->i_mutex); + down_write(&inode->i_alloc_sem); + unmap_mapping_range(mapping, offset, (end - offset), 1); + truncate_inode_pages_range(mapping, offset, end); + inode->i_op->truncate_range(inode, offset, end); + up_write(&inode->i_alloc_sem); + mutex_unlock(&inode->i_mutex); + + return 0; +} +EXPORT_SYMBOL(vmtruncate_range); + +/* + * Primitive swap readahead code. We simply read an aligned block of + * (1 << page_cluster) entries in the swap area. 
This method is chosen + * because it doesn't cost us any seek time. We also make sure to queue + * the 'original' request together with the readahead ones... + * + * This has been extended to use the NUMA policies from the mm triggering + * the readahead. + * + * Caller must hold down_read on the vma->vm_mm if vma is not NULL. + */ +void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma) +{ +#ifdef CONFIG_NUMA + struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL; +#endif + int i, num; + struct page *new_page; + unsigned long offset; + + /* + * Get the number of handles we should do readahead io to. + */ + num = valid_swaphandles(entry, &offset); + for (i = 0; i < num; offset++, i++) { + /* Ok, do the async read-ahead now */ + new_page = read_swap_cache_async(swp_entry(swp_type(entry), + offset), vma, addr); + if (!new_page) + break; + page_cache_release(new_page); +#ifdef CONFIG_NUMA + /* + * Find the next applicable VMA for the NUMA policy. + */ + addr += PAGE_SIZE; + if (addr == 0) + vma = NULL; + if (vma) { + if (addr >= vma->vm_end) { + vma = next_vma; + next_vma = vma ? vma->vm_next : NULL; + } + if (vma && addr < vma->vm_start) + vma = NULL; + } else { + if (next_vma && addr >= next_vma->vm_start) { + vma = next_vma; + next_vma = vma->vm_next; + } + } +#endif + } + lru_add_drain(); /* Push any new pages onto the LRU now */ +} + +/* + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. 
+ */ +static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access, pte_t orig_pte) +{ + spinlock_t *ptl; + struct page *page; + swp_entry_t entry; + pte_t pte; + int ret = VM_FAULT_MINOR; + + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) + goto out; + + entry = pte_to_swp_entry(orig_pte); +again: + page = lookup_swap_cache(entry); + if (!page) { + swapin_readahead(entry, address, vma); + page = read_swap_cache_async(entry, vma, address); + if (!page) { + /* + * Back out if somebody else faulted in this pte + * while we released the pte lock. + */ + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (likely(pte_same(*page_table, orig_pte))) + ret = VM_FAULT_OOM; + goto unlock; + } + + /* Had to read the page from swap area: Major fault */ + ret = VM_FAULT_MAJOR; + inc_page_state(pgmajfault); + grab_swap_token(); + } + + mark_page_accessed(page); + lock_page(page); + if (!PageSwapCache(page)) { + /* Page migration has occurred */ + unlock_page(page); + page_cache_release(page); + goto again; + } + + /* + * Back out if somebody else already faulted in this pte. + */ + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*page_table, orig_pte))) + goto out_nomap; + + if (unlikely(!PageUptodate(page))) { + ret = VM_FAULT_SIGBUS; + goto out_nomap; + } + + /* The page isn't present yet, go ahead with the fault.
*/ + + inc_mm_counter(mm, anon_rss); + pte = mk_pte(page, vma->vm_page_prot); + if (write_access && can_share_swap_page(page)) { + pte = maybe_mkwrite(pte_mkdirty(pte), vma); + write_access = 0; + } + + flush_icache_page(vma, page); + set_pte_at(mm, address, page_table, pte); + page_add_anon_rmap(page, vma, address); + + swap_free(entry); + if (vm_swap_full()) + remove_exclusive_swap_page(page); + unlock_page(page); + + if (write_access) { + if (do_wp_page(mm, vma, address, + page_table, pmd, ptl, pte) == VM_FAULT_OOM) + ret = VM_FAULT_OOM; + goto out; + } + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, pte); + lazy_mmu_prot_update(pte); +unlock: + pte_unmap_unlock(page_table, ptl); +out: + return ret; +out_nomap: + pte_unmap_unlock(page_table, ptl); + unlock_page(page); + page_cache_release(page); + return ret; +} + +/* + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + */ +static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access) +{ + struct page *page; + spinlock_t *ptl; + pte_t entry; + + if (write_access) { + /* Allocate our own private page. 
*/ + pte_unmap(page_table); + + if (unlikely(anon_vma_prepare(vma))) + goto oom; + page = alloc_zeroed_user_highpage(vma, address); + if (!page) + goto oom; + + entry = mk_pte(page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!pte_none(*page_table)) + goto release; + inc_mm_counter(mm, anon_rss); + lru_cache_add_active(page); + page_add_new_anon_rmap(page, vma, address); + } else { + /* Map the ZERO_PAGE - vm_page_prot is readonly */ + page = ZERO_PAGE(address); + page_cache_get(page); + entry = mk_pte(page, vma->vm_page_prot); + + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + if (!pte_none(*page_table)) + goto release; + inc_mm_counter(mm, file_rss); + page_add_file_rmap(page); + } + + set_pte_at(mm, address, page_table, entry); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); +unlock: + pte_unmap_unlock(page_table, ptl); + return VM_FAULT_MINOR; +release: + page_cache_release(page); + goto unlock; +oom: + return VM_FAULT_OOM; +} + +/* + * do_no_page() tries to create a new page mapping. It aggressively + * tries to share with existing pages, but makes a separate copy if + * the "write_access" parameter is true in order to avoid the next + * page fault. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. 
+ */ +static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access) +{ + spinlock_t *ptl; + struct page *new_page; + struct address_space *mapping = NULL; + pte_t entry; + unsigned int sequence = 0; + int ret = VM_FAULT_MINOR; + int anon = 0; + + pte_unmap(page_table); + BUG_ON(vma->vm_flags & VM_PFNMAP); + + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + sequence = mapping->truncate_count; + smp_rmb(); /* serializes i_size against truncate_count */ + } +retry: + new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); + /* + * No smp_rmb is needed here as long as there's a full + * spin_lock/unlock sequence inside the ->nopage callback + * (for the pagecache lookup) that acts as an implicit + * smp_mb() and prevents the i_size read to happen + * after the next truncate_count read. + */ + + /* no page was available -- either SIGBUS or OOM */ + if (new_page == NOPAGE_SIGBUS) + return VM_FAULT_SIGBUS; + if (new_page == NOPAGE_OOM) + return VM_FAULT_OOM; + + /* + * Should we do an early C-O-W break? + */ + if (write_access && !(vma->vm_flags & VM_SHARED)) { + struct page *page; + + if (unlikely(anon_vma_prepare(vma))) + goto oom; + page = alloc_page_vma(GFP_HIGHUSER, vma, address); + if (!page) + goto oom; + copy_user_highpage(page, new_page, address); + page_cache_release(new_page); + new_page = page; + anon = 1; + } + + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + /* + * For a file-backed vma, someone could have truncated or otherwise + * invalidated this page. If unmap_mapping_range got called, + * retry getting the page. + */ + if (mapping && unlikely(sequence != mapping->truncate_count)) { + pte_unmap_unlock(page_table, ptl); + page_cache_release(new_page); + cond_resched(); + sequence = mapping->truncate_count; + smp_rmb(); + goto retry; + } + + /* + * This silly early PAGE_DIRTY setting removes a race + * due to the bad i386 page protection. 
But it's valid + * for other architectures too. + * + * Note that if write_access is true, we either now have + * an exclusive copy of the page, or this is a shared mapping, + * so we can make it writable and dirty to avoid having to + * handle that later. + */ + /* Only go through if we didn't race with anybody else... */ + if (pte_none(*page_table)) { + flush_icache_page(vma, new_page); + entry = mk_pte(new_page, vma->vm_page_prot); + if (write_access) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + set_pte_at(mm, address, page_table, entry); + if (anon) { + inc_mm_counter(mm, anon_rss); + lru_cache_add_active(new_page); + page_add_new_anon_rmap(new_page, vma, address); + } else { + inc_mm_counter(mm, file_rss); + page_add_file_rmap(new_page); + } + } else { + /* One of our sibling threads was faster, back out. */ + page_cache_release(new_page); + goto unlock; + } + + /* no need to invalidate: a not-present page shouldn't be cached */ + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); +unlock: + pte_unmap_unlock(page_table, ptl); + return ret; +oom: + page_cache_release(new_page); + return VM_FAULT_OOM; +} + +/* + * Fault of a previously existing named mapping. Repopulate the pte + * from the encoded file_pte if possible. This enables swappable + * nonlinear vmas. + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + */ +static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access, pte_t orig_pte) +{ + pgoff_t pgoff; + int err; + + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) + return VM_FAULT_MINOR; + + if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { + /* + * Page table corrupted: show pte and kill process. 
+ */ + print_bad_pte(vma, orig_pte, address); + return VM_FAULT_OOM; + } + /* We can then assume vma->vm_ops && vma->vm_ops->populate */ + + pgoff = pte_to_pgoff(orig_pte); + err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, + vma->vm_page_prot, pgoff, 0); + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err) + return VM_FAULT_SIGBUS; + return VM_FAULT_MAJOR; +} + +/* + * These routines also need to handle stuff like marking pages dirty + * and/or accessed for architectures that don't do it in hardware (most + * RISC architectures). The early dirtying is also good on the i386. + * + * There is also a hook called "update_mmu_cache()" that architectures + * with external mmu caches can use to update those (ie the Sparc or + * PowerPC hashed page tables that act as extended TLBs). + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */ +static inline int handle_pte_fault(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pmd_t *pmd, int write_access) +{ + pte_t entry; + pte_t old_entry; + spinlock_t *ptl; + + old_entry = entry = *pte; + if (!pte_present(entry)) { + if (pte_none(entry)) { + if (!vma->vm_ops || !vma->vm_ops->nopage) + return do_anonymous_page(mm, vma, address, + pte, pmd, write_access); + return do_no_page(mm, vma, address, + pte, pmd, write_access); + } + if (pte_file(entry)) + return do_file_page(mm, vma, address, + pte, pmd, write_access, entry); + return do_swap_page(mm, vma, address, + pte, pmd, write_access, entry); + } + + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + if (unlikely(!pte_same(*pte, entry))) + goto unlock; + if (write_access) { + if (!pte_write(entry)) + return do_wp_page(mm, vma, address, + pte, pmd, ptl, entry); + entry = pte_mkdirty(entry); + } + entry = pte_mkyoung(entry); + if (!pte_same(old_entry, entry)) { + ptep_set_access_flags(vma, address, pte, entry, write_access); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); + } else { + /* + * This is needed only for protection faults but the arch code + * is not yet telling us if this is a protection fault or not. + * This still avoids useless tlb flushes for .text page faults + * with threads. 
+ */ + if (write_access) + flush_tlb_page(vma, address); + } +unlock: + pte_unmap_unlock(pte, ptl); + return VM_FAULT_MINOR; +} + +/* + * By the time we get here, we already hold the mm semaphore + */ +int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, int write_access) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + __set_current_state(TASK_RUNNING); + + inc_page_state(pgfault); + + if (unlikely(is_vm_hugetlb_page(vma))) + return hugetlb_fault(mm, vma, address, write_access); + + pgd = pgd_offset(mm, address); + pud = pud_alloc(mm, pgd, address); + if (!pud) + return VM_FAULT_OOM; + pmd = pmd_alloc(mm, pud, address); + if (!pmd) + return VM_FAULT_OOM; + pte = pte_alloc_map(mm, pmd, address); + if (!pte) + return VM_FAULT_OOM; + + return handle_pte_fault(mm, vma, address, pte, pmd, write_access); +} + +EXPORT_SYMBOL_GPL(__handle_mm_fault); + +#ifndef __PAGETABLE_PUD_FOLDED +/* + * Allocate page upper directory. + * We've already handled the fast-path in-line. + */ +int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + pud_t *new = pud_alloc_one(mm, address); + if (!new) + return -ENOMEM; + + spin_lock(&mm->page_table_lock); + if (pgd_present(*pgd)) /* Another has populated it */ + pud_free(new); + else + pgd_populate(mm, pgd, new); + spin_unlock(&mm->page_table_lock); + return 0; +} +#else +/* Workaround for gcc 2.96 */ +int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + return 0; +} +#endif /* __PAGETABLE_PUD_FOLDED */ + +#ifndef __PAGETABLE_PMD_FOLDED +/* + * Allocate page middle directory. + * We've already handled the fast-path in-line. 
+ */ +int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) +{ + pmd_t *new = pmd_alloc_one(mm, address); + if (!new) + return -ENOMEM; + + spin_lock(&mm->page_table_lock); +#ifndef __ARCH_HAS_4LEVEL_HACK + if (pud_present(*pud)) /* Another has populated it */ + pmd_free(new); + else + pud_populate(mm, pud, new); +#else + if (pgd_present(*pud)) /* Another has populated it */ + pmd_free(new); + else + pgd_populate(mm, pud, new); +#endif /* __ARCH_HAS_4LEVEL_HACK */ + spin_unlock(&mm->page_table_lock); + return 0; +} +#else +/* Workaround for gcc 2.96 */ +int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) +{ + return 0; +} +#endif /* __PAGETABLE_PMD_FOLDED */ + +int make_pages_present(unsigned long addr, unsigned long end) +{ + int ret, len, write; + struct vm_area_struct * vma; + + vma = find_vma(current->mm, addr); + if (!vma) + return -1; + write = (vma->vm_flags & VM_WRITE) != 0; + if (addr >= end) + BUG(); + if (end > vma->vm_end) + BUG(); + len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; + ret = get_user_pages(current, current->mm, addr, + len, write, 0, NULL, NULL); + if (ret < 0) + return ret; + return ret == len ? 0 : -1; +} + +/* + * Map a vmalloc()-space virtual address to the physical page. + */ +struct page * vmalloc_to_page(void * vmalloc_addr) +{ + unsigned long addr = (unsigned long) vmalloc_addr; + struct page *page = NULL; + pgd_t *pgd = pgd_offset_k(addr); + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + + if (!pgd_none(*pgd)) { + pud = pud_offset(pgd, addr); + if (!pud_none(*pud)) { + pmd = pmd_offset(pud, addr); + if (!pmd_none(*pmd)) { + ptep = pte_offset_map(pmd, addr); + pte = *ptep; + if (pte_present(pte)) + page = pte_page(pte); + pte_unmap(ptep); + } + } + } + return page; +} + +EXPORT_SYMBOL(vmalloc_to_page); + +/* + * Map a vmalloc()-space virtual address to the physical page frame number. 
+ */ +unsigned long vmalloc_to_pfn(void * vmalloc_addr) +{ + return page_to_pfn(vmalloc_to_page(vmalloc_addr)); +} + +EXPORT_SYMBOL(vmalloc_to_pfn); + +#if !defined(__HAVE_ARCH_GATE_AREA) + +#if defined(AT_SYSINFO_EHDR) +static struct vm_area_struct gate_vma; + +static int __init gate_vma_init(void) +{ + gate_vma.vm_mm = NULL; + gate_vma.vm_start = FIXADDR_USER_START; + gate_vma.vm_end = FIXADDR_USER_END; + gate_vma.vm_page_prot = PAGE_READONLY; + gate_vma.vm_flags = 0; + return 0; +} +__initcall(gate_vma_init); +#endif + +struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +{ +#ifdef AT_SYSINFO_EHDR + return &gate_vma; +#else + return NULL; +#endif +} + +int in_gate_area_no_task(unsigned long addr) +{ +#ifdef AT_SYSINFO_EHDR + if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) + return 1; +#endif + return 0; +} + +#endif /* __HAVE_ARCH_GATE_AREA */ diff -urN oldtree/mm/page_alloc.c newtree/mm/page_alloc.c --- oldtree/mm/page_alloc.c 2006-02-18 15:18:30.125737440 +0000 +++ newtree/mm/page_alloc.c 2006-02-18 15:24:31.449807832 +0000 @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -447,6 +448,7 @@ { if (order == 0) { __ClearPageReserved(page); + ClearPageNosave(page); set_page_count(page, 0); free_hot_cold_page(page, 0); @@ -460,6 +462,7 @@ if (loop + 16 < BITS_PER_LONG) prefetchw(p + 16); __ClearPageReserved(p); + ClearPageNosave(p); set_page_count(p, 0); } @@ -961,8 +964,8 @@ /* This allocation should allow future memory freeing. 
*/ - if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) - && !in_interrupt()) { + if ((((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && + !in_interrupt()) || (test_freezer_state(FREEZER_ON))) { if (!(gfp_mask & __GFP_NOMEMALLOC)) { nofail_alloc: /* go through the zonelist yet again, ignoring mins */ diff -urN oldtree/mm/page_alloc.c.orig newtree/mm/page_alloc.c.orig --- oldtree/mm/page_alloc.c.orig 1970-01-01 00:00:00.000000000 +0000 +++ newtree/mm/page_alloc.c.orig 2006-02-18 15:18:30.000000000 +0000 @@ -0,0 +1,2732 @@ +/* + * linux/mm/page_alloc.c + * + * Manages the free list, the system allocates free pages here. + * Note that kmalloc() lives in slab.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 + * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 + * Per cpu hot/cold page lists, bulk allocation, Martin J. 
Bligh, Sept 2002 + * (lots of bits borrowed from Ingo Molnar & Andrew Morton) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "internal.h" + +/* + * MCD - HACK: Find somewhere to initialize this EARLY, or make this + * initializer cleaner + */ +nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; +EXPORT_SYMBOL(node_online_map); +nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; +EXPORT_SYMBOL(node_possible_map); +struct pglist_data *pgdat_list __read_mostly; +unsigned long totalram_pages __read_mostly; +unsigned long totalhigh_pages __read_mostly; +long nr_swap_pages; +int percpu_pagelist_fraction; + +static void fastcall free_hot_cold_page(struct page *page, int cold); +static void __free_pages_ok(struct page *page, unsigned int order); + +/* + * results with 256, 32 in the lowmem_reserve sysctl: + * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) + * 1G machine -> (16M dma, 784M normal, 224M high) + * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA + * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL + * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA + * + * TBD: should special case ZONE_DMA32 machines here - in those we normally + * don't need any ZONE_NORMAL reservation + */ +int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 }; + +EXPORT_SYMBOL(totalram_pages); + +/* + * Used by page_zone() to look up the address of the struct zone whose + * id is encoded in the upper bits of page->flags + */ +struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; +EXPORT_SYMBOL(zone_table); + +static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; +int min_free_kbytes = 1024; + +unsigned long __initdata nr_kernel_pages; +unsigned long 
__initdata nr_all_pages; + +#ifdef CONFIG_DEBUG_VM +static int page_outside_zone_boundaries(struct zone *zone, struct page *page) +{ + int ret = 0; + unsigned seq; + unsigned long pfn = page_to_pfn(page); + + do { + seq = zone_span_seqbegin(zone); + if (pfn >= zone->zone_start_pfn + zone->spanned_pages) + ret = 1; + else if (pfn < zone->zone_start_pfn) + ret = 1; + } while (zone_span_seqretry(zone, seq)); + + return ret; +} + +static int page_is_consistent(struct zone *zone, struct page *page) +{ +#ifdef CONFIG_HOLES_IN_ZONE + if (!pfn_valid(page_to_pfn(page))) + return 0; +#endif + if (zone != page_zone(page)) + return 0; + + return 1; +} +/* + * Temporary debugging check for pages not lying within a given zone. + */ +static int bad_range(struct zone *zone, struct page *page) +{ + if (page_outside_zone_boundaries(zone, page)) + return 1; + if (!page_is_consistent(zone, page)) + return 1; + + return 0; +} + +#else +static inline int bad_range(struct zone *zone, struct page *page) +{ + return 0; +} +#endif + +static void bad_page(struct page *page) +{ + printk(KERN_EMERG "Bad page state in process '%s'\n" + KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" + KERN_EMERG "Trying to fix it up, but a reboot is needed\n" + KERN_EMERG "Backtrace:\n", + current->comm, page, (int)(2*sizeof(unsigned long)), + (unsigned long)page->flags, page->mapping, + page_mapcount(page), page_count(page)); + dump_stack(); + page->flags &= ~(1 << PG_lru | + 1 << PG_private | + 1 << PG_locked | + 1 << PG_active | + 1 << PG_dirty | + 1 << PG_reclaim | + 1 << PG_slab | + 1 << PG_swapcache | + 1 << PG_writeback ); + set_page_count(page, 0); + reset_page_mapcount(page); + page->mapping = NULL; + add_taint(TAINT_BAD_PAGE); +} + +/* + * Higher-order pages are called "compound pages". They are structured thusly: + * + * The first PAGE_SIZE page is called the "head page". + * + * The remaining PAGE_SIZE pages are called "tail pages". + * + * All pages have PG_compound set. 
All pages have their ->private pointing at + * the head page (even the head page has this). + * + * The first tail page's ->lru.next holds the address of the compound page's + * put_page() function. Its ->lru.prev holds the order of allocation. + * This usage means that zero-order pages may not be compound. + */ + +static void free_compound_page(struct page *page) +{ + __free_pages_ok(page, (unsigned long)page[1].lru.prev); +} + +static void prep_compound_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + + page[1].lru.next = (void *)free_compound_page; /* set dtor */ + page[1].lru.prev = (void *)order; + for (i = 0; i < nr_pages; i++) { + struct page *p = page + i; + + SetPageCompound(p); + set_page_private(p, (unsigned long)page); + } +} + +static void destroy_compound_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + + if (unlikely((unsigned long)page[1].lru.prev != order)) + bad_page(page); + + for (i = 0; i < nr_pages; i++) { + struct page *p = page + i; + + if (unlikely(!PageCompound(p) | + (page_private(p) != (unsigned long)page))) + bad_page(page); + ClearPageCompound(p); + } +} + +/* + * function for dealing with page's order in buddy system. + * zone->lock is already acquired when we use these. + * So, we don't need atomic page->flags operations here. + */ +static inline unsigned long page_order(struct page *page) { + return page_private(page); +} + +static inline void set_page_order(struct page *page, int order) { + set_page_private(page, order); + __SetPagePrivate(page); +} + +static inline void rmv_page_order(struct page *page) +{ + __ClearPagePrivate(page); + set_page_private(page, 0); +} + +/* + * Locate the struct page for both the matching buddy in our + * pair (buddy1) and the combined O(n+1) page they form (page). 
+ * + * 1) Any buddy B1 will have an order O twin B2 which satisfies + * the following equation: + * B2 = B1 ^ (1 << O) + * For example, if the starting buddy (buddy2) is #8 its order + * 1 buddy is #10: + * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 + * + * 2) Any buddy B will have an order O+1 parent P which + * satisfies the following equation: + * P = B & ~(1 << O) + * + * Assumption: *_mem_map is contigious at least up to MAX_ORDER + */ +static inline struct page * +__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) +{ + unsigned long buddy_idx = page_idx ^ (1 << order); + + return page + (buddy_idx - page_idx); +} + +static inline unsigned long +__find_combined_index(unsigned long page_idx, unsigned int order) +{ + return (page_idx & ~(1 << order)); +} + +/* + * This function checks whether a page is free && is the buddy + * we can do coalesce a page and its buddy if + * (a) the buddy is not in a hole && + * (b) the buddy is free && + * (c) the buddy is on the buddy system && + * (d) a page and its buddy have the same order. + * for recording page's order, we use page_private(page) and PG_private. + * + */ +static inline int page_is_buddy(struct page *page, int order) +{ +#ifdef CONFIG_HOLES_IN_ZONE + if (!pfn_valid(page_to_pfn(page))) + return 0; +#endif + + if (PagePrivate(page) && + (page_order(page) == order) && + page_count(page) == 0) + return 1; + return 0; +} + +/* + * Freeing function for a buddy system allocator. + * + * The concept of a buddy system is to maintain direct-mapped table + * (containing bit values) for memory blocks of various "orders". + * The bottom level table contains the map for the smallest allocatable + * units of memory (here, pages), and each level above it describes + * pairs of units from the levels below, hence, "buddies". 
+ * At a high level, all that happens here is marking the table entry + * at the bottom level available, and propagating the changes upward + * as necessary, plus some accounting needed to play nicely with other + * parts of the VM system. + * At each level, we keep a list of pages, which are heads of continuous + * free pages of length of (1 << order) and marked with PG_Private.Page's + * order is recorded in page_private(page) field. + * So when we are allocating or freeing one, we can derive the state of the + * other. That is, if we allocate a small block, and both were + * free, the remainder of the region must be split into blocks. + * If a block is freed, and its buddy is also free, then this + * triggers coalescing into a block of larger size. + * + * -- wli + */ + +static inline void __free_one_page(struct page *page, + struct zone *zone, unsigned int order) +{ + unsigned long page_idx; + int order_size = 1 << order; + + if (unlikely(PageCompound(page))) + destroy_compound_page(page, order); + + page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); + + BUG_ON(page_idx & (order_size - 1)); + BUG_ON(bad_range(zone, page)); + + zone->free_pages += order_size; + while (order < MAX_ORDER-1) { + unsigned long combined_idx; + struct free_area *area; + struct page *buddy; + + buddy = __page_find_buddy(page, page_idx, order); + if (!page_is_buddy(buddy, order)) + break; /* Move the buddy up one level. 
*/ + + list_del(&buddy->lru); + area = zone->free_area + order; + area->nr_free--; + rmv_page_order(buddy); + combined_idx = __find_combined_index(page_idx, order); + page = page + (combined_idx - page_idx); + page_idx = combined_idx; + order++; + } + set_page_order(page, order); + list_add(&page->lru, &zone->free_area[order].free_list); + zone->free_area[order].nr_free++; +} + +static inline int free_pages_check(struct page *page) +{ + if (unlikely(page_mapcount(page) | + (page->mapping != NULL) | + (page_count(page) != 0) | + (page->flags & ( + 1 << PG_lru | + 1 << PG_private | + 1 << PG_locked | + 1 << PG_active | + 1 << PG_reclaim | + 1 << PG_slab | + 1 << PG_swapcache | + 1 << PG_writeback | + 1 << PG_reserved )))) + bad_page(page); + if (PageDirty(page)) + __ClearPageDirty(page); + /* + * For now, we report if PG_reserved was found set, but do not + * clear it, and do not free the page. But we shall soon need + * to do more, for when the ZERO_PAGE count wraps negative. + */ + return PageReserved(page); +} + +/* + * Frees a list of pages. + * Assumes all pages on list are in same zone, and of same order. + * count is the number of pages to free. + * + * If the zone was previously in an "all pages pinned" state then look to + * see if this freeing clears that state. + * + * And clear the zone's pages_scanned counter, to hold off the "all pages are + * pinned" detection logic. 
+ */ +static void free_pages_bulk(struct zone *zone, int count, + struct list_head *list, int order) +{ + spin_lock(&zone->lock); + zone->all_unreclaimable = 0; + zone->pages_scanned = 0; + while (count--) { + struct page *page; + + BUG_ON(list_empty(list)); + page = list_entry(list->prev, struct page, lru); + /* have to delete it as __free_one_page list manipulates */ + list_del(&page->lru); + __free_one_page(page, zone, order); + } + spin_unlock(&zone->lock); +} + +static void free_one_page(struct zone *zone, struct page *page, int order) +{ + LIST_HEAD(list); + list_add(&page->lru, &list); + free_pages_bulk(zone, 1, &list, order); +} + +static void __free_pages_ok(struct page *page, unsigned int order) +{ + unsigned long flags; + int i; + int reserved = 0; + + arch_free_page(page, order); + if (!PageHighMem(page)) + mutex_debug_check_no_locks_freed(page_address(page), + PAGE_SIZE<lru, &list); + kernel_map_pages(page, 1 << order, 0); + free_pages_bulk(page_zone(page), 1, &list, order); + } +} + + +/* + * The order of subdivision here is critical for the IO subsystem. + * Please do not alter this order without good reasons and regression + * testing. Specifically, as large blocks of memory are subdivided, + * the order in which smaller blocks are delivered depends on the order + * they're subdivided in this function. This is the primary factor + * influencing the order in which pages are delivered to the IO + * subsystem according to empirical testing, and this is also justified + * by considering the behavior of a buddy system containing a single + * large block of memory acted on by a series of small allocations. + * This behavior is a critical factor in sglist merging's success. 
+ * + * -- wli + */ +static inline void expand(struct zone *zone, struct page *page, + int low, int high, struct free_area *area) +{ + unsigned long size = 1 << high; + + while (high > low) { + area--; + high--; + size >>= 1; + BUG_ON(bad_range(zone, &page[size])); + list_add(&page[size].lru, &area->free_list); + area->nr_free++; + set_page_order(&page[size], high); + } +} + +/* + * This page is about to be returned from the page allocator + */ +static int prep_new_page(struct page *page, int order) +{ + if (unlikely(page_mapcount(page) | + (page->mapping != NULL) | + (page_count(page) != 0) | + (page->flags & ( + 1 << PG_lru | + 1 << PG_private | + 1 << PG_locked | + 1 << PG_active | + 1 << PG_dirty | + 1 << PG_reclaim | + 1 << PG_slab | + 1 << PG_swapcache | + 1 << PG_writeback | + 1 << PG_reserved )))) + bad_page(page); + + /* + * For now, we report if PG_reserved was found set, but do not + * clear it, and do not allocate the page: as a safety net. + */ + if (PageReserved(page)) + return 1; + + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | + 1 << PG_referenced | 1 << PG_arch_1 | + 1 << PG_checked | 1 << PG_mappedtodisk); + set_page_private(page, 0); + set_page_refs(page, order); + kernel_map_pages(page, 1 << order, 1); + return 0; +} + +/* + * Do the hard work of removing an element from the buddy allocator. + * Call me with the zone->lock already held. 
+ */ +static struct page *__rmqueue(struct zone *zone, unsigned int order) +{ + struct free_area * area; + unsigned int current_order; + struct page *page; + + for (current_order = order; current_order < MAX_ORDER; ++current_order) { + area = zone->free_area + current_order; + if (list_empty(&area->free_list)) + continue; + + page = list_entry(area->free_list.next, struct page, lru); + list_del(&page->lru); + rmv_page_order(page); + area->nr_free--; + zone->free_pages -= 1UL << order; + expand(zone, page, order, current_order, area); + return page; + } + + return NULL; +} + +/* + * Obtain a specified number of elements from the buddy allocator, all under + * a single hold of the lock, for efficiency. Add them to the supplied list. + * Returns the number of new pages which were placed at *list. + */ +static int rmqueue_bulk(struct zone *zone, unsigned int order, + unsigned long count, struct list_head *list) +{ + int i; + + spin_lock(&zone->lock); + for (i = 0; i < count; ++i) { + struct page *page = __rmqueue(zone, order); + if (unlikely(page == NULL)) + break; + list_add_tail(&page->lru, list); + } + spin_unlock(&zone->lock); + return i; +} + +#ifdef CONFIG_NUMA +/* Called from the slab reaper to drain remote pagesets */ +void drain_remote_pages(void) +{ + struct zone *zone; + int i; + unsigned long flags; + + local_irq_save(flags); + for_each_zone(zone) { + struct per_cpu_pageset *pset; + + /* Do not drain local pagesets */ + if (zone->zone_pgdat->node_id == numa_node_id()) + continue; + + pset = zone_pcp(zone, smp_processor_id()); + for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { + struct per_cpu_pages *pcp; + + pcp = &pset->pcp[i]; + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; + } + } + local_irq_restore(flags); +} +#endif + +#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) +static void __drain_pages(unsigned int cpu) +{ + unsigned long flags; + struct zone *zone; + int i; + + for_each_zone(zone) { + struct per_cpu_pageset *pset; + + 
pset = zone_pcp(zone, cpu); + for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { + struct per_cpu_pages *pcp; + + pcp = &pset->pcp[i]; + local_irq_save(flags); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; + local_irq_restore(flags); + } + } +} +#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ + +#ifdef CONFIG_PM + +void mark_free_pages(struct zone *zone) +{ + unsigned long zone_pfn, flags; + int order; + struct list_head *curr; + + if (!zone->spanned_pages) + return; + + spin_lock_irqsave(&zone->lock, flags); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) + ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); + + for (order = MAX_ORDER - 1; order >= 0; --order) + list_for_each(curr, &zone->free_area[order].free_list) { + unsigned long start_pfn, i; + + start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); + + for (i=0; i < (1<lock, flags); +} + +/* + * Spill all of this CPU's per-cpu pages back into the buddy allocator. + */ +void drain_local_pages(void) +{ + unsigned long flags; + + local_irq_save(flags); + __drain_pages(smp_processor_id()); + local_irq_restore(flags); +} +#endif /* CONFIG_PM */ + +static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu) +{ +#ifdef CONFIG_NUMA + pg_data_t *pg = z->zone_pgdat; + pg_data_t *orig = zonelist->zones[0]->zone_pgdat; + struct per_cpu_pageset *p; + + p = zone_pcp(z, cpu); + if (pg == orig) { + p->numa_hit++; + } else { + p->numa_miss++; + zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; + } + if (pg == NODE_DATA(numa_node_id())) + p->local_node++; + else + p->other_node++; +#endif +} + +/* + * Free a 0-order page + */ +static void fastcall free_hot_cold_page(struct page *page, int cold) +{ + struct zone *zone = page_zone(page); + struct per_cpu_pages *pcp; + unsigned long flags; + + arch_free_page(page, 0); + + if (PageAnon(page)) + page->mapping = NULL; + if (free_pages_check(page)) + return; + + kernel_map_pages(page, 1, 0); + + pcp = 
&zone_pcp(zone, get_cpu())->pcp[cold]; + local_irq_save(flags); + __inc_page_state(pgfree); + list_add(&page->lru, &pcp->list); + pcp->count++; + if (pcp->count >= pcp->high) { + free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + pcp->count -= pcp->batch; + } + local_irq_restore(flags); + put_cpu(); +} + +void fastcall free_hot_page(struct page *page) +{ + free_hot_cold_page(page, 0); +} + +void fastcall free_cold_page(struct page *page) +{ + free_hot_cold_page(page, 1); +} + +static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) +{ + int i; + + BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); + for(i = 0; i < (1 << order); i++) + clear_highpage(page + i); +} + +/* + * Really, prep_compound_page() should be called from __rmqueue_bulk(). But + * we cheat by calling it from here, in the order > 0 path. Saves a branch + * or two. + */ +static struct page *buffered_rmqueue(struct zonelist *zonelist, + struct zone *zone, int order, gfp_t gfp_flags) +{ + unsigned long flags; + struct page *page; + int cold = !!(gfp_flags & __GFP_COLD); + int cpu; + +again: + cpu = get_cpu(); + if (likely(order == 0)) { + struct per_cpu_pages *pcp; + + pcp = &zone_pcp(zone, cpu)->pcp[cold]; + local_irq_save(flags); + if (!pcp->count) { + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, &pcp->list); + if (unlikely(!pcp->count)) + goto failed; + } + page = list_entry(pcp->list.next, struct page, lru); + list_del(&page->lru); + pcp->count--; + } else { + spin_lock_irqsave(&zone->lock, flags); + page = __rmqueue(zone, order); + spin_unlock(&zone->lock); + if (!page) + goto failed; + } + + __mod_page_state_zone(zone, pgalloc, 1 << order); + zone_statistics(zonelist, zone, cpu); + local_irq_restore(flags); + put_cpu(); + + BUG_ON(bad_range(zone, page)); + if (prep_new_page(page, order)) + goto again; + + if (gfp_flags & __GFP_ZERO) + prep_zero_page(page, order, gfp_flags); + + if (order && (gfp_flags & __GFP_COMP)) + prep_compound_page(page, 
order); + return page; + +failed: + local_irq_restore(flags); + put_cpu(); + return NULL; +} + +#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ +#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ +#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ +#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ +#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ + +/* + * Return 1 if free pages are above 'mark'. This takes into account the order + * of the allocation. + */ +int zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags) +{ + /* free_pages my go negative - that's OK */ + long min = mark, free_pages = z->free_pages - (1 << order) + 1; + int o; + + if (alloc_flags & ALLOC_HIGH) + min -= min / 2; + if (alloc_flags & ALLOC_HARDER) + min -= min / 4; + + if (free_pages <= min + z->lowmem_reserve[classzone_idx]) + return 0; + for (o = 0; o < order; o++) { + /* At the next order, this order's pages become unavailable */ + free_pages -= z->free_area[o].nr_free << o; + + /* Require fewer higher order pages to be free */ + min >>= 1; + + if (free_pages <= min) + return 0; + } + return 1; +} + +/* + * get_page_from_freeliest goes through the zonelist trying to allocate + * a page. + */ +static struct page * +get_page_from_freelist(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, int alloc_flags) +{ + struct zone **z = zonelist->zones; + struct page *page = NULL; + int classzone_idx = zone_idx(*z); + + /* + * Go through the zonelist once, looking for a zone with enough free. + * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 
+ */ + do { + if ((alloc_flags & ALLOC_CPUSET) && + !cpuset_zone_allowed(*z, gfp_mask)) + continue; + + if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { + unsigned long mark; + if (alloc_flags & ALLOC_WMARK_MIN) + mark = (*z)->pages_min; + else if (alloc_flags & ALLOC_WMARK_LOW) + mark = (*z)->pages_low; + else + mark = (*z)->pages_high; + if (!zone_watermark_ok(*z, order, mark, + classzone_idx, alloc_flags)) + if (!zone_reclaim_mode || + !zone_reclaim(*z, gfp_mask, order)) + continue; + } + + page = buffered_rmqueue(zonelist, *z, order, gfp_mask); + if (page) { + break; + } + } while (*(++z) != NULL); + return page; +} + +/* + * This is the 'heart' of the zoned buddy allocator. + */ +struct page * fastcall +__alloc_pages(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist) +{ + const gfp_t wait = gfp_mask & __GFP_WAIT; + struct zone **z; + struct page *page; + struct reclaim_state reclaim_state; + struct task_struct *p = current; + int do_retry; + int alloc_flags; + int did_some_progress; + + might_sleep_if(wait); + +restart: + z = zonelist->zones; /* the list of zones suitable for gfp_mask */ + + if (unlikely(*z == NULL)) { + /* Should this ever happen?? */ + return NULL; + } + + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, + zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); + if (page) + goto got_pg; + + do { + wakeup_kswapd(*z, order); + } while (*(++z)); + + /* + * OK, we're below the kswapd watermark and have kicked background + * reclaim. Now things get more complex, so set up alloc_flags according + * to how we want to proceed. + * + * The caller may dip into page reserves a bit more if the caller + * cannot run direct reclaim, or if the caller has realtime scheduling + * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will + * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 
+ */ + alloc_flags = ALLOC_WMARK_MIN; + if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) + alloc_flags |= ALLOC_HARDER; + if (gfp_mask & __GFP_HIGH) + alloc_flags |= ALLOC_HIGH; + alloc_flags |= ALLOC_CPUSET; + + /* + * Go through the zonelist again. Let __GFP_HIGH and allocations + * coming from realtime tasks go deeper into reserves. + * + * This is the last chance, in general, before the goto nopage. + * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. + * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + */ + page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); + if (page) + goto got_pg; + + /* This allocation should allow future memory freeing. */ + + if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) + && !in_interrupt()) { + if (!(gfp_mask & __GFP_NOMEMALLOC)) { +nofail_alloc: + /* go through the zonelist yet again, ignoring mins */ + page = get_page_from_freelist(gfp_mask, order, + zonelist, ALLOC_NO_WATERMARKS); + if (page) + goto got_pg; + if (gfp_mask & __GFP_NOFAIL) { + blk_congestion_wait(WRITE, HZ/50); + goto nofail_alloc; + } + } + goto nopage; + } + + /* Atomic allocations - we can't balance anything */ + if (!wait) + goto nopage; + +rebalance: + cond_resched(); + + /* We now go into synchronous reclaim */ + cpuset_memory_pressure_bump(); + p->flags |= PF_MEMALLOC; + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + + did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); + + p->reclaim_state = NULL; + p->flags &= ~PF_MEMALLOC; + + cond_resched(); + + if (likely(did_some_progress)) { + page = get_page_from_freelist(gfp_mask, order, + zonelist, alloc_flags); + if (page) + goto got_pg; + } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { + /* + * Go through the zonelist yet one more time, keep + * very high watermark here, this is only to catch + * a parallel oom killing, we must fail if we're still + * under heavy pressure. 
+ */ + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, + zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); + if (page) + goto got_pg; + + out_of_memory(gfp_mask, order); + goto restart; + } + + /* + * Don't let big-order allocations loop unless the caller explicitly + * requests that. Wait for some write requests to complete then retry. + * + * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order + * <= 3, but that may not be true in other implementations. + */ + do_retry = 0; + if (!(gfp_mask & __GFP_NORETRY)) { + if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) + do_retry = 1; + if (gfp_mask & __GFP_NOFAIL) + do_retry = 1; + } + if (do_retry) { + blk_congestion_wait(WRITE, HZ/50); + goto rebalance; + } + +nopage: + if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { + printk(KERN_WARNING "%s: page allocation failure." + " order:%d, mode:0x%x\n", + p->comm, order, gfp_mask); + dump_stack(); + show_mem(); + } +got_pg: + return page; +} + +EXPORT_SYMBOL(__alloc_pages); + +/* + * Common helper functions. 
+ */ +fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) +{ + struct page * page; + page = alloc_pages(gfp_mask, order); + if (!page) + return 0; + return (unsigned long) page_address(page); +} + +EXPORT_SYMBOL(__get_free_pages); + +fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) +{ + struct page * page; + + /* + * get_zeroed_page() returns a 32-bit address, which cannot represent + * a highmem page + */ + BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); + + page = alloc_pages(gfp_mask | __GFP_ZERO, 0); + if (page) + return (unsigned long) page_address(page); + return 0; +} + +EXPORT_SYMBOL(get_zeroed_page); + +void __pagevec_free(struct pagevec *pvec) +{ + int i = pagevec_count(pvec); + + while (--i >= 0) + free_hot_cold_page(pvec->pages[i], pvec->cold); +} + +fastcall void __free_pages(struct page *page, unsigned int order) +{ + if (put_page_testzero(page)) { + if (order == 0) + free_hot_page(page); + else + __free_pages_ok(page, order); + } +} + +EXPORT_SYMBOL(__free_pages); + +fastcall void free_pages(unsigned long addr, unsigned int order) +{ + if (addr != 0) { + BUG_ON(!virt_addr_valid((void *)addr)); + __free_pages(virt_to_page((void *)addr), order); + } +} + +EXPORT_SYMBOL(free_pages); + +/* + * Total amount of free (allocatable) RAM: + */ +unsigned int nr_free_pages(void) +{ + unsigned int sum = 0; + struct zone *zone; + + for_each_zone(zone) + sum += zone->free_pages; + + return sum; +} + +EXPORT_SYMBOL(nr_free_pages); + +#ifdef CONFIG_NUMA +unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) +{ + unsigned int i, sum = 0; + + for (i = 0; i < MAX_NR_ZONES; i++) + sum += pgdat->node_zones[i].free_pages; + + return sum; +} +#endif + +static unsigned int nr_free_zone_pages(int offset) +{ + /* Just pick one node, since fallback list is circular */ + pg_data_t *pgdat = NODE_DATA(numa_node_id()); + unsigned int sum = 0; + + struct zonelist *zonelist = pgdat->node_zonelists + offset; + struct zone **zonep = zonelist->zones; + struct zone 
*zone; + + for (zone = *zonep++; zone; zone = *zonep++) { + unsigned long size = zone->present_pages; + unsigned long high = zone->pages_high; + if (size > high) + sum += size - high; + } + + return sum; +} + +/* + * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL + */ +unsigned int nr_free_buffer_pages(void) +{ + return nr_free_zone_pages(gfp_zone(GFP_USER)); +} + +/* + * Amount of free RAM allocatable within all zones + */ +unsigned int nr_free_pagecache_pages(void) +{ + return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); +} + +#ifdef CONFIG_HIGHMEM +unsigned int nr_free_highpages (void) +{ + pg_data_t *pgdat; + unsigned int pages = 0; + + for_each_pgdat(pgdat) + pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; + + return pages; +} +#endif + +#ifdef CONFIG_NUMA +static void show_node(struct zone *zone) +{ + printk("Node %d ", zone->zone_pgdat->node_id); +} +#else +#define show_node(zone) do { } while (0) +#endif + +/* + * Accumulate the page_state information across all CPUs. + * The result is unavoidably approximate - it can change + * during and after execution of this function. 
+ */ +static DEFINE_PER_CPU(struct page_state, page_states) = {0}; + +atomic_t nr_pagecache = ATOMIC_INIT(0); +EXPORT_SYMBOL(nr_pagecache); +#ifdef CONFIG_SMP +DEFINE_PER_CPU(long, nr_pagecache_local) = 0; +#endif + +static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) +{ + int cpu = 0; + + memset(ret, 0, nr * sizeof(unsigned long)); + cpus_and(*cpumask, *cpumask, cpu_online_map); + + cpu = first_cpu(*cpumask); + while (cpu < NR_CPUS) { + unsigned long *in, *out, off; + + if (!cpu_isset(cpu, *cpumask)) + continue; + + in = (unsigned long *)&per_cpu(page_states, cpu); + + cpu = next_cpu(cpu, *cpumask); + + if (likely(cpu < NR_CPUS)) + prefetch(&per_cpu(page_states, cpu)); + + out = (unsigned long *)ret; + for (off = 0; off < nr; off++) + *out++ += *in++; + } +} + +void get_page_state_node(struct page_state *ret, int node) +{ + int nr; + cpumask_t mask = node_to_cpumask(node); + + nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); + nr /= sizeof(unsigned long); + + __get_page_state(ret, nr+1, &mask); +} + +void get_page_state(struct page_state *ret) +{ + int nr; + cpumask_t mask = CPU_MASK_ALL; + + nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); + nr /= sizeof(unsigned long); + + __get_page_state(ret, nr + 1, &mask); +} + +void get_full_page_state(struct page_state *ret) +{ + cpumask_t mask = CPU_MASK_ALL; + + __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); +} + +unsigned long read_page_state_offset(unsigned long offset) +{ + unsigned long ret = 0; + int cpu; + + for_each_online_cpu(cpu) { + unsigned long in; + + in = (unsigned long)&per_cpu(page_states, cpu) + offset; + ret += *((unsigned long *)in); + } + return ret; +} + +void __mod_page_state_offset(unsigned long offset, unsigned long delta) +{ + void *ptr; + + ptr = &__get_cpu_var(page_states); + *(unsigned long *)(ptr + offset) += delta; +} +EXPORT_SYMBOL(__mod_page_state_offset); + +void mod_page_state_offset(unsigned long offset, unsigned long delta) 
+{ + unsigned long flags; + void *ptr; + + local_irq_save(flags); + ptr = &__get_cpu_var(page_states); + *(unsigned long *)(ptr + offset) += delta; + local_irq_restore(flags); +} +EXPORT_SYMBOL(mod_page_state_offset); + +void __get_zone_counts(unsigned long *active, unsigned long *inactive, + unsigned long *free, struct pglist_data *pgdat) +{ + struct zone *zones = pgdat->node_zones; + int i; + + *active = 0; + *inactive = 0; + *free = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { + *active += zones[i].nr_active; + *inactive += zones[i].nr_inactive; + *free += zones[i].free_pages; + } +} + +void get_zone_counts(unsigned long *active, + unsigned long *inactive, unsigned long *free) +{ + struct pglist_data *pgdat; + + *active = 0; + *inactive = 0; + *free = 0; + for_each_pgdat(pgdat) { + unsigned long l, m, n; + __get_zone_counts(&l, &m, &n, pgdat); + *active += l; + *inactive += m; + *free += n; + } +} + +void si_meminfo(struct sysinfo *val) +{ + val->totalram = totalram_pages; + val->sharedram = 0; + val->freeram = nr_free_pages(); + val->bufferram = nr_blockdev_pages(); +#ifdef CONFIG_HIGHMEM + val->totalhigh = totalhigh_pages; + val->freehigh = nr_free_highpages(); +#else + val->totalhigh = 0; + val->freehigh = 0; +#endif + val->mem_unit = PAGE_SIZE; +} + +EXPORT_SYMBOL(si_meminfo); + +#ifdef CONFIG_NUMA +void si_meminfo_node(struct sysinfo *val, int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + + val->totalram = pgdat->node_present_pages; + val->freeram = nr_free_pages_pgdat(pgdat); + val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; + val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; + val->mem_unit = PAGE_SIZE; +} +#endif + +#define K(x) ((x) << (PAGE_SHIFT-10)) + +/* + * Show free area list (used inside shift_scroll-lock stuff) + * We also calculate the percentage fragmentation. We do this by counting the + * memory on each free list with the exception of the first item on the list. 
+ */ +void show_free_areas(void) +{ + struct page_state ps; + int cpu, temperature; + unsigned long active; + unsigned long inactive; + unsigned long free; + struct zone *zone; + + for_each_zone(zone) { + show_node(zone); + printk("%s per-cpu:", zone->name); + + if (!populated_zone(zone)) { + printk(" empty\n"); + continue; + } else + printk("\n"); + + for_each_online_cpu(cpu) { + struct per_cpu_pageset *pageset; + + pageset = zone_pcp(zone, cpu); + + for (temperature = 0; temperature < 2; temperature++) + printk("cpu %d %s: high %d, batch %d used:%d\n", + cpu, + temperature ? "cold" : "hot", + pageset->pcp[temperature].high, + pageset->pcp[temperature].batch, + pageset->pcp[temperature].count); + } + } + + get_page_state(&ps); + get_zone_counts(&active, &inactive, &free); + + printk("Free pages: %11ukB (%ukB HighMem)\n", + K(nr_free_pages()), + K(nr_free_highpages())); + + printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " + "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", + active, + inactive, + ps.nr_dirty, + ps.nr_writeback, + ps.nr_unstable, + nr_free_pages(), + ps.nr_slab, + ps.nr_mapped, + ps.nr_page_table_pages); + + for_each_zone(zone) { + int i; + + show_node(zone); + printk("%s" + " free:%lukB" + " min:%lukB" + " low:%lukB" + " high:%lukB" + " active:%lukB" + " inactive:%lukB" + " present:%lukB" + " pages_scanned:%lu" + " all_unreclaimable? %s" + "\n", + zone->name, + K(zone->free_pages), + K(zone->pages_min), + K(zone->pages_low), + K(zone->pages_high), + K(zone->nr_active), + K(zone->nr_inactive), + K(zone->present_pages), + zone->pages_scanned, + (zone->all_unreclaimable ? 
"yes" : "no") + ); + printk("lowmem_reserve[]:"); + for (i = 0; i < MAX_NR_ZONES; i++) + printk(" %lu", zone->lowmem_reserve[i]); + printk("\n"); + } + + for_each_zone(zone) { + unsigned long nr, flags, order, total = 0; + + show_node(zone); + printk("%s: ", zone->name); + if (!populated_zone(zone)) { + printk("empty\n"); + continue; + } + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + nr = zone->free_area[order].nr_free; + total += nr << order; + printk("%lu*%lukB ", nr, K(1UL) << order); + } + spin_unlock_irqrestore(&zone->lock, flags); + printk("= %lukB\n", K(total)); + } + + show_swap_cache_info(); +} + +/* + * Builds allocation fallback zone lists. + * + * Add all populated zones of a node to the zonelist. + */ +static int __init build_zonelists_node(pg_data_t *pgdat, + struct zonelist *zonelist, int nr_zones, int zone_type) +{ + struct zone *zone; + + BUG_ON(zone_type > ZONE_HIGHMEM); + + do { + zone = pgdat->node_zones + zone_type; + if (populated_zone(zone)) { +#ifndef CONFIG_HIGHMEM + BUG_ON(zone_type > ZONE_NORMAL); +#endif + zonelist->zones[nr_zones++] = zone; + check_highest_zone(zone_type); + } + zone_type--; + + } while (zone_type >= 0); + return nr_zones; +} + +static inline int highest_zone(int zone_bits) +{ + int res = ZONE_NORMAL; + if (zone_bits & (__force int)__GFP_HIGHMEM) + res = ZONE_HIGHMEM; + if (zone_bits & (__force int)__GFP_DMA32) + res = ZONE_DMA32; + if (zone_bits & (__force int)__GFP_DMA) + res = ZONE_DMA; + return res; +} + +#ifdef CONFIG_NUMA +#define MAX_NODE_LOAD (num_online_nodes()) +static int __initdata node_load[MAX_NUMNODES]; +/** + * find_next_best_node - find the next node that should appear in a given node's fallback list + * @node: node whose fallback list we're appending + * @used_node_mask: nodemask_t of already used nodes + * + * We use a number of factors to determine which is the next node that should + * appear on a given node's fallback list. 
The node should not have appeared + * already in @node's fallback list, and it should be the next closest node + * according to the distance array (which contains arbitrary distance values + * from each node to each node in the system), and should also prefer nodes + * with no CPUs, since presumably they'll have very little allocation pressure + * on them otherwise. + * It returns -1 if no node is found. + */ +static int __init find_next_best_node(int node, nodemask_t *used_node_mask) +{ + int n, val; + int min_val = INT_MAX; + int best_node = -1; + + /* Use the local node if we haven't already */ + if (!node_isset(node, *used_node_mask)) { + node_set(node, *used_node_mask); + return node; + } + + for_each_online_node(n) { + cpumask_t tmp; + + /* Don't want a node to appear more than once */ + if (node_isset(n, *used_node_mask)) + continue; + + /* Use the distance array to find the distance */ + val = node_distance(node, n); + + /* Penalize nodes under us ("prefer the next node") */ + val += (n < node); + + /* Give preference to headless and unused nodes */ + tmp = node_to_cpumask(n); + if (!cpus_empty(tmp)) + val += PENALTY_FOR_NODE_WITH_CPUS; + + /* Slight preference for less loaded node */ + val *= (MAX_NODE_LOAD*MAX_NUMNODES); + val += node_load[n]; + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + if (best_node >= 0) + node_set(best_node, *used_node_mask); + + return best_node; +} + +static void __init build_zonelists(pg_data_t *pgdat) +{ + int i, j, k, node, local_node; + int prev_node, load; + struct zonelist *zonelist; + nodemask_t used_mask; + + /* initialize zonelists */ + for (i = 0; i < GFP_ZONETYPES; i++) { + zonelist = pgdat->node_zonelists + i; + zonelist->zones[0] = NULL; + } + + /* NUMA-aware ordering of nodes */ + local_node = pgdat->node_id; + load = num_online_nodes(); + prev_node = local_node; + nodes_clear(used_mask); + while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { + int distance = 
node_distance(local_node, node); + + /* + * If another node is sufficiently far away then it is better + * to reclaim pages in a zone before going off node. + */ + if (distance > RECLAIM_DISTANCE) + zone_reclaim_mode = 1; + + /* + * We don't want to pressure a particular node. + * So adding penalty to the first node in same + * distance group to make it round-robin. + */ + + if (distance != node_distance(local_node, prev_node)) + node_load[node] += load; + prev_node = node; + load--; + for (i = 0; i < GFP_ZONETYPES; i++) { + zonelist = pgdat->node_zonelists + i; + for (j = 0; zonelist->zones[j] != NULL; j++); + + k = highest_zone(i); + + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + zonelist->zones[j] = NULL; + } + } +} + +#else /* CONFIG_NUMA */ + +static void __init build_zonelists(pg_data_t *pgdat) +{ + int i, j, k, node, local_node; + + local_node = pgdat->node_id; + for (i = 0; i < GFP_ZONETYPES; i++) { + struct zonelist *zonelist; + + zonelist = pgdat->node_zonelists + i; + + j = 0; + k = highest_zone(i); + j = build_zonelists_node(pgdat, zonelist, j, k); + /* + * Now we build the zonelist so that it contains the zones + * of all the other nodes. 
+ * We don't want to pressure a particular node, so when + * building the zones for node N, we make sure that the + * zones coming right after the local ones are those from + * node N+1 (modulo N) + */ + for (node = local_node + 1; node < MAX_NUMNODES; node++) { + if (!node_online(node)) + continue; + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + } + for (node = 0; node < local_node; node++) { + if (!node_online(node)) + continue; + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + } + + zonelist->zones[j] = NULL; + } +} + +#endif /* CONFIG_NUMA */ + +void __init build_all_zonelists(void) +{ + int i; + + for_each_online_node(i) + build_zonelists(NODE_DATA(i)); + printk("Built %i zonelists\n", num_online_nodes()); + cpuset_init_current_mems_allowed(); +} + +/* + * Helper functions to size the waitqueue hash table. + * Essentially these want to choose hash table sizes sufficiently + * large so that collisions trying to wait on pages are rare. + * But in fact, the number of active page waitqueues on typical + * systems is ridiculously low, less than 200. So this is even + * conservative, even though it seems large. + * + * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to + * waitqueues, i.e. the size of the waitq table given the number of pages. + */ +#define PAGES_PER_WAITQUEUE 256 + +static inline unsigned long wait_table_size(unsigned long pages) +{ + unsigned long size = 1; + + pages /= PAGES_PER_WAITQUEUE; + + while (size < pages) + size <<= 1; + + /* + * Once we have dozens or even hundreds of threads sleeping + * on IO we've got bigger problems than wait queue collision. + * Limit the size of the wait table to a reasonable size. + */ + size = min(size, 4096UL); + + return max(size, 4UL); +} + +/* + * This is an integer logarithm so that shifts can be used later + * to extract the more random high bits from the multiplicative + * hash function before the remainder is taken. 
+ */ +static inline unsigned long wait_table_bits(unsigned long size) +{ + return ffz(~size); +} + +#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) + +static void __init calculate_zone_totalpages(struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long *zholes_size) +{ + unsigned long realtotalpages, totalpages = 0; + int i; + + for (i = 0; i < MAX_NR_ZONES; i++) + totalpages += zones_size[i]; + pgdat->node_spanned_pages = totalpages; + + realtotalpages = totalpages; + if (zholes_size) + for (i = 0; i < MAX_NR_ZONES; i++) + realtotalpages -= zholes_size[i]; + pgdat->node_present_pages = realtotalpages; + printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); +} + + +/* + * Initially all pages are reserved - free ones are freed + * up by free_all_bootmem() once the early boot process is + * done. Non-atomic initialization, single-pass. + */ +void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, + unsigned long start_pfn) +{ + struct page *page; + unsigned long end_pfn = start_pfn + size; + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + if (!early_pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + set_page_links(page, zone, nid, pfn); + set_page_count(page, 1); + reset_page_mapcount(page); + SetPageReserved(page); + INIT_LIST_HEAD(&page->lru); +#ifdef WANT_PAGE_VIRTUAL + /* The shift won't overflow because ZONE_NORMAL is below 4G. 
*/ + if (!is_highmem_idx(zone)) + set_page_address(page, __va(pfn << PAGE_SHIFT)); +#endif + } +} + +void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, + unsigned long size) +{ + int order; + for (order = 0; order < MAX_ORDER ; order++) { + INIT_LIST_HEAD(&zone->free_area[order].free_list); + zone->free_area[order].nr_free = 0; + } +} + +#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) +void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, + unsigned long size) +{ + unsigned long snum = pfn_to_section_nr(pfn); + unsigned long end = pfn_to_section_nr(pfn + size); + + if (FLAGS_HAS_NODE) + zone_table[ZONETABLE_INDEX(nid, zid)] = zone; + else + for (; snum <= end; snum++) + zone_table[ZONETABLE_INDEX(snum, zid)] = zone; +} + +#ifndef __HAVE_ARCH_MEMMAP_INIT +#define memmap_init(size, nid, zone, start_pfn) \ + memmap_init_zone((size), (nid), (zone), (start_pfn)) +#endif + +static int __cpuinit zone_batchsize(struct zone *zone) +{ + int batch; + + /* + * The per-cpu-pages pools are set to around 1000th of the + * size of the zone. But no more than 1/2 of a meg. + * + * OK, so we don't know how big the cache is. So guess. + */ + batch = zone->present_pages / 1024; + if (batch * PAGE_SIZE > 512 * 1024) + batch = (512 * 1024) / PAGE_SIZE; + batch /= 4; /* We effectively *= 4 below */ + if (batch < 1) + batch = 1; + + /* + * Clamp the batch to a 2^n - 1 value. Having a power + * of 2 value was found to be more likely to have + * suboptimal cache aliasing properties in some cases. + * + * For example if 2 tasks are alternately allocating + * batches of pages, one task can end up with a lot + * of pages of one half of the possible page colors + * and the other with pages of the other colors. 
+	 */
+	batch = (1 << (fls(batch + batch/2)-1)) - 1;
+
+	return batch;
+}
+/*
+ * setup_pageset() zeroes the pageset @p and initialises both of its
+ * per-cpu page lists from @batch. Both lists start out empty:
+ *   pcp[0] (hot):  high = 6 * batch, batch = max(1, batch)
+ *   pcp[1] (cold): high = 2 * batch, batch = max(1, batch / 2)
+ */
+inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+	struct per_cpu_pages *pcp;
+
+	memset(p, 0, sizeof(*p));
+
+	pcp = &p->pcp[0];		/* hot */
+	pcp->count = 0;
+	pcp->high = 6 * batch;
+	pcp->batch = max(1UL, 1 * batch);
+	INIT_LIST_HEAD(&pcp->list);
+
+	pcp = &p->pcp[1];		/* cold */
+	pcp->count = 0;
+	pcp->high = 2 * batch;
+	pcp->batch = max(1UL, batch/2);
+	INIT_LIST_HEAD(&pcp->list);
+}
+
+/*
+ * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
+ * to the value high for the pageset p.
+ *
+ * The hot list's batch is derived from it as high/4, kept within the
+ * range [1, PAGE_SHIFT * 8]. The cold list (pcp[1]) is not modified.
+ */
+
+static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+				unsigned long high)
+{
+	struct per_cpu_pages *pcp;
+
+	pcp = &p->pcp[0]; /* hot list */
+	pcp->high = high;
+	pcp->batch = max(1UL, high/4);
+	if ((high/4) > (PAGE_SHIFT * 8))
+		pcp->batch = PAGE_SHIFT * 8;	/* upper clamp on the batch */
+}
+
+
+#ifdef CONFIG_NUMA
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * Some NUMA counter updates may also be caught by the boot pagesets.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static struct per_cpu_pageset boot_pageset[NR_CPUS];
+
+/*
+ * Dynamically allocate memory for the
+ * per cpu pageset array in struct zone.
+ */ +static int __cpuinit process_zones(int cpu) +{ + struct zone *zone, *dzone; + + for_each_zone(zone) { + + zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), + GFP_KERNEL, cpu_to_node(cpu)); + if (!zone_pcp(zone, cpu)) + goto bad; + + setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); + + if (percpu_pagelist_fraction) + setup_pagelist_highmark(zone_pcp(zone, cpu), + (zone->present_pages / percpu_pagelist_fraction)); + } + + return 0; +bad: + for_each_zone(dzone) { + if (dzone == zone) + break; + kfree(zone_pcp(dzone, cpu)); + zone_pcp(dzone, cpu) = NULL; + } + return -ENOMEM; +} + +static inline void free_zone_pagesets(int cpu) +{ + struct zone *zone; + + for_each_zone(zone) { + struct per_cpu_pageset *pset = zone_pcp(zone, cpu); + + zone_pcp(zone, cpu) = NULL; + kfree(pset); + } +} + +static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + int ret = NOTIFY_OK; + + switch (action) { + case CPU_UP_PREPARE: + if (process_zones(cpu)) + ret = NOTIFY_BAD; + break; + case CPU_UP_CANCELED: + case CPU_DEAD: + free_zone_pagesets(cpu); + break; + default: + break; + } + return ret; +} + +static struct notifier_block pageset_notifier = + { &pageset_cpuup_callback, NULL, 0 }; + +void __init setup_per_cpu_pageset(void) +{ + int err; + + /* Initialize per_cpu_pageset for cpu 0. + * A cpuup callback will do this for every cpu + * as it comes online + */ + err = process_zones(smp_processor_id()); + BUG_ON(err); + register_cpu_notifier(&pageset_notifier); +} + +#endif + +static __meminit +void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) +{ + int i; + struct pglist_data *pgdat = zone->zone_pgdat; + + /* + * The per-page waitqueue mechanism uses hashed waitqueues + * per zone. 
+ */ + zone->wait_table_size = wait_table_size(zone_size_pages); + zone->wait_table_bits = wait_table_bits(zone->wait_table_size); + zone->wait_table = (wait_queue_head_t *) + alloc_bootmem_node(pgdat, zone->wait_table_size + * sizeof(wait_queue_head_t)); + + for(i = 0; i < zone->wait_table_size; ++i) + init_waitqueue_head(zone->wait_table + i); +} + +static __meminit void zone_pcp_init(struct zone *zone) +{ + int cpu; + unsigned long batch = zone_batchsize(zone); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { +#ifdef CONFIG_NUMA + /* Early boot. Slab allocator not functional yet */ + zone_pcp(zone, cpu) = &boot_pageset[cpu]; + setup_pageset(&boot_pageset[cpu],0); +#else + setup_pageset(zone_pcp(zone,cpu), batch); +#endif + } + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", + zone->name, zone->present_pages, batch); +} + +static __meminit void init_currently_empty_zone(struct zone *zone, + unsigned long zone_start_pfn, unsigned long size) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + + zone_wait_table_init(zone, size); + pgdat->nr_zones = zone_idx(zone) + 1; + + zone->zone_mem_map = pfn_to_page(zone_start_pfn); + zone->zone_start_pfn = zone_start_pfn; + + memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); + + zone_init_free_lists(pgdat, zone, zone->spanned_pages); +} + +/* + * Set up the zone data structures: + * - mark all pages reserved + * - mark all memory queues empty + * - clear the memory bitmaps + */ +static void __init free_area_init_core(struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long *zholes_size) +{ + unsigned long j; + int nid = pgdat->node_id; + unsigned long zone_start_pfn = pgdat->node_start_pfn; + + pgdat_resize_init(pgdat); + pgdat->nr_zones = 0; + init_waitqueue_head(&pgdat->kswapd_wait); + pgdat->kswapd_max_order = 0; + + for (j = 0; j < MAX_NR_ZONES; j++) { + struct zone *zone = pgdat->node_zones + j; + unsigned long size, realsize; + + realsize = size = zones_size[j]; + if (zholes_size) + 
realsize -= zholes_size[j]; + + if (j < ZONE_HIGHMEM) + nr_kernel_pages += realsize; + nr_all_pages += realsize; + + zone->spanned_pages = size; + zone->present_pages = realsize; + zone->name = zone_names[j]; + spin_lock_init(&zone->lock); + spin_lock_init(&zone->lru_lock); + zone_seqlock_init(zone); + zone->zone_pgdat = pgdat; + zone->free_pages = 0; + + zone->temp_priority = zone->prev_priority = DEF_PRIORITY; + + zone_pcp_init(zone); + INIT_LIST_HEAD(&zone->active_list); + INIT_LIST_HEAD(&zone->inactive_list); + zone->nr_scan_active = 0; + zone->nr_scan_inactive = 0; + zone->nr_active = 0; + zone->nr_inactive = 0; + atomic_set(&zone->reclaim_in_progress, 0); + if (!size) + continue; + + zonetable_add(zone, nid, j, zone_start_pfn, size); + init_currently_empty_zone(zone, zone_start_pfn, size); + zone_start_pfn += size; + } +} + +static void __init alloc_node_mem_map(struct pglist_data *pgdat) +{ + /* Skip empty nodes */ + if (!pgdat->node_spanned_pages) + return; + +#ifdef CONFIG_FLAT_NODE_MEM_MAP + /* ia64 gets its own node_mem_map, before this, without bootmem */ + if (!pgdat->node_mem_map) { + unsigned long size; + struct page *map; + + size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); + map = alloc_remap(pgdat->node_id, size); + if (!map) + map = alloc_bootmem_node(pgdat, size); + pgdat->node_mem_map = map; + } +#ifdef CONFIG_FLATMEM + /* + * With no DISCONTIG, the global mem_map is just set as node 0's + */ + if (pgdat == NODE_DATA(0)) + mem_map = NODE_DATA(0)->node_mem_map; +#endif +#endif /* CONFIG_FLAT_NODE_MEM_MAP */ +} + +void __init free_area_init_node(int nid, struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long node_start_pfn, + unsigned long *zholes_size) +{ + pgdat->node_id = nid; + pgdat->node_start_pfn = node_start_pfn; + calculate_zone_totalpages(pgdat, zones_size, zholes_size); + + alloc_node_mem_map(pgdat); + + free_area_init_core(pgdat, zones_size, zholes_size); +} + +#ifndef CONFIG_NEED_MULTIPLE_NODES +static 
bootmem_data_t contig_bootmem_data; +struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; + +EXPORT_SYMBOL(contig_page_data); +#endif + +void __init free_area_init(unsigned long *zones_size) +{ + free_area_init_node(0, NODE_DATA(0), zones_size, + __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); +} + +#ifdef CONFIG_PROC_FS + +#include + +static void *frag_start(struct seq_file *m, loff_t *pos) +{ + pg_data_t *pgdat; + loff_t node = *pos; + + for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) + --node; + + return pgdat; +} + +static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + (*pos)++; + return pgdat->pgdat_next; +} + +static void frag_stop(struct seq_file *m, void *arg) +{ +} + +/* + * This walks the free areas for each zone. + */ +static int frag_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + int order; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!populated_zone(zone)) + continue; + + spin_lock_irqsave(&zone->lock, flags); + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].nr_free); + spin_unlock_irqrestore(&zone->lock, flags); + seq_putc(m, '\n'); + } + return 0; +} + +struct seq_operations fragmentation_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = frag_show, +}; + +/* + * Output information about zones in @pgdat. 
+ */ +static int zoneinfo_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = arg; + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { + int i; + + if (!populated_zone(zone)) + continue; + + spin_lock_irqsave(&zone->lock, flags); + seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); + seq_printf(m, + "\n pages free %lu" + "\n min %lu" + "\n low %lu" + "\n high %lu" + "\n active %lu" + "\n inactive %lu" + "\n scanned %lu (a: %lu i: %lu)" + "\n spanned %lu" + "\n present %lu", + zone->free_pages, + zone->pages_min, + zone->pages_low, + zone->pages_high, + zone->nr_active, + zone->nr_inactive, + zone->pages_scanned, + zone->nr_scan_active, zone->nr_scan_inactive, + zone->spanned_pages, + zone->present_pages); + seq_printf(m, + "\n protection: (%lu", + zone->lowmem_reserve[0]); + for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) + seq_printf(m, ", %lu", zone->lowmem_reserve[i]); + seq_printf(m, + ")" + "\n pagesets"); + for_each_online_cpu(i) { + struct per_cpu_pageset *pageset; + int j; + + pageset = zone_pcp(zone, i); + for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { + if (pageset->pcp[j].count) + break; + } + if (j == ARRAY_SIZE(pageset->pcp)) + continue; + for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { + seq_printf(m, + "\n cpu: %i pcp: %i" + "\n count: %i" + "\n high: %i" + "\n batch: %i", + i, j, + pageset->pcp[j].count, + pageset->pcp[j].high, + pageset->pcp[j].batch); + } +#ifdef CONFIG_NUMA + seq_printf(m, + "\n numa_hit: %lu" + "\n numa_miss: %lu" + "\n numa_foreign: %lu" + "\n interleave_hit: %lu" + "\n local_node: %lu" + "\n other_node: %lu", + pageset->numa_hit, + pageset->numa_miss, + pageset->numa_foreign, + pageset->interleave_hit, + pageset->local_node, + pageset->other_node); +#endif + } + seq_printf(m, + "\n all_unreclaimable: %u" + "\n prev_priority: %i" + "\n temp_priority: %i" + "\n start_pfn: %lu", + 
zone->all_unreclaimable, + zone->prev_priority, + zone->temp_priority, + zone->zone_start_pfn); + spin_unlock_irqrestore(&zone->lock, flags); + seq_putc(m, '\n'); + } + return 0; +} + +struct seq_operations zoneinfo_op = { + .start = frag_start, /* iterate over all zones. The same as in + * fragmentation. */ + .next = frag_next, + .stop = frag_stop, + .show = zoneinfo_show, +}; + +static char *vmstat_text[] = { + "nr_dirty", + "nr_writeback", + "nr_unstable", + "nr_page_table_pages", + "nr_mapped", + "nr_slab", + + "pgpgin", + "pgpgout", + "pswpin", + "pswpout", + + "pgalloc_high", + "pgalloc_normal", + "pgalloc_dma32", + "pgalloc_dma", + + "pgfree", + "pgactivate", + "pgdeactivate", + + "pgfault", + "pgmajfault", + + "pgrefill_high", + "pgrefill_normal", + "pgrefill_dma32", + "pgrefill_dma", + + "pgsteal_high", + "pgsteal_normal", + "pgsteal_dma32", + "pgsteal_dma", + + "pgscan_kswapd_high", + "pgscan_kswapd_normal", + "pgscan_kswapd_dma32", + "pgscan_kswapd_dma", + + "pgscan_direct_high", + "pgscan_direct_normal", + "pgscan_direct_dma32", + "pgscan_direct_dma", + + "pginodesteal", + "slabs_scanned", + "kswapd_steal", + "kswapd_inodesteal", + "pageoutrun", + "allocstall", + + "pgrotated", + "nr_bounce", +}; + +static void *vmstat_start(struct seq_file *m, loff_t *pos) +{ + struct page_state *ps; + + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + + ps = kmalloc(sizeof(*ps), GFP_KERNEL); + m->private = ps; + if (!ps) + return ERR_PTR(-ENOMEM); + get_full_page_state(ps); + ps->pgpgin /= 2; /* sectors -> kbytes */ + ps->pgpgout /= 2; + return (unsigned long *)ps + *pos; +} + +static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) +{ + (*pos)++; + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + return (unsigned long *)m->private + *pos; +} + +static int vmstat_show(struct seq_file *m, void *arg) +{ + unsigned long *l = arg; + unsigned long off = l - (unsigned long *)m->private; + + seq_printf(m, "%s %lu\n", vmstat_text[off], *l); + return 0; 
+}
+/*
+ * vmstat_stop - seq_file ->stop callback used by vmstat_op.
+ * Frees the buffer held in m->private (the page_state copy that
+ * vmstat_start() allocates) and clears the pointer.
+ */
+static void vmstat_stop(struct seq_file *m, void *arg)
+{
+	kfree(m->private);
+	m->private = NULL;
+}
+
+struct seq_operations vmstat_op = {
+	.start	= vmstat_start,
+	.next	= vmstat_next,
+	.stop	= vmstat_stop,
+	.show	= vmstat_show,
+};
+
+#endif /* CONFIG_PROC_FS */
+/*
+ * page_alloc_cpu_notify - CPU notifier callback, built only with
+ * CONFIG_HOTPLUG_CPU.
+ *
+ * Acts on CPU_DEAD only: the dead cpu's nr_pagecache_local is folded
+ * into the global nr_pagecache, then, with local interrupts disabled,
+ * __drain_pages() is called for that cpu and each of its page_states
+ * counters is added to the current cpu's counters and zeroed.
+ * Every other action is ignored. Always returns NOTIFY_OK.
+ */
+#ifdef CONFIG_HOTPLUG_CPU
+static int page_alloc_cpu_notify(struct notifier_block *self,
+				 unsigned long action, void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+	long *count;
+	unsigned long *src, *dest;
+
+	if (action == CPU_DEAD) {
+		int i;
+
+		/* Drain local pagecache count. */
+		count = &per_cpu(nr_pagecache_local, cpu);
+		atomic_add(*count, &nr_pagecache);
+		*count = 0;
+		local_irq_disable();
+		__drain_pages(cpu);
+
+		/*
+		 * Add dead cpu's page_states to our own. struct page_state
+		 * is walked as a flat array of unsigned long counters.
+		 */
+		dest = (unsigned long *)&__get_cpu_var(page_states);
+		src = (unsigned long *)&per_cpu(page_states, cpu);
+
+		for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long);
+				i++) {
+			dest[i] += src[i];
+			src[i] = 0;
+		}
+
+		local_irq_enable();
+	}
+	return NOTIFY_OK;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+/*
+ * page_alloc_init - hook page_alloc_cpu_notify() up through
+ * hotcpu_notifier().
+ */
+void __init page_alloc_init(void)
+{
+	hotcpu_notifier(page_alloc_cpu_notify, 0);
+}
+
+/*
+ * setup_per_zone_lowmem_reserve - called whenever
+ *	sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
+ *	has a correct pages reserved value, so an adequate number of
+ *	pages are left in the zone after a successful __alloc_pages().
+ */ +static void setup_per_zone_lowmem_reserve(void) +{ + struct pglist_data *pgdat; + int j, idx; + + for_each_pgdat(pgdat) { + for (j = 0; j < MAX_NR_ZONES; j++) { + struct zone *zone = pgdat->node_zones + j; + unsigned long present_pages = zone->present_pages; + + zone->lowmem_reserve[j] = 0; + + for (idx = j-1; idx >= 0; idx--) { + struct zone *lower_zone; + + if (sysctl_lowmem_reserve_ratio[idx] < 1) + sysctl_lowmem_reserve_ratio[idx] = 1; + + lower_zone = pgdat->node_zones + idx; + lower_zone->lowmem_reserve[j] = present_pages / + sysctl_lowmem_reserve_ratio[idx]; + present_pages += lower_zone->present_pages; + } + } + } +} + +/* + * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures + * that the pages_{min,low,high} values for each zone are set correctly + * with respect to min_free_kbytes. + */ +void setup_per_zone_pages_min(void) +{ + unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); + unsigned long lowmem_pages = 0; + struct zone *zone; + unsigned long flags; + + /* Calculate total number of !ZONE_HIGHMEM pages */ + for_each_zone(zone) { + if (!is_highmem(zone)) + lowmem_pages += zone->present_pages; + } + + for_each_zone(zone) { + unsigned long tmp; + spin_lock_irqsave(&zone->lru_lock, flags); + tmp = (pages_min * zone->present_pages) / lowmem_pages; + if (is_highmem(zone)) { + /* + * __GFP_HIGH and PF_MEMALLOC allocations usually don't + * need highmem pages, so cap pages_min to a small + * value here. + * + * The (pages_high-pages_low) and (pages_low-pages_min) + * deltas controls asynch page reclaim, and so should + * not be capped for highmem. + */ + int min_pages; + + min_pages = zone->present_pages / 1024; + if (min_pages < SWAP_CLUSTER_MAX) + min_pages = SWAP_CLUSTER_MAX; + if (min_pages > 128) + min_pages = 128; + zone->pages_min = min_pages; + } else { + /* + * If it's a lowmem zone, reserve a number of pages + * proportionate to the zone's size. 
+ */ + zone->pages_min = tmp; + } + + zone->pages_low = zone->pages_min + tmp / 4; + zone->pages_high = zone->pages_min + tmp / 2; + spin_unlock_irqrestore(&zone->lru_lock, flags); + } +} + +/* + * Initialise min_free_kbytes. + * + * For small machines we want it small (128k min). For large machines + * we want it large (64MB max). But it is not linear, because network + * bandwidth does not increase linearly with machine size. We use + * + * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: + * min_free_kbytes = sqrt(lowmem_kbytes * 16) + * + * which yields + * + * 16MB: 512k + * 32MB: 724k + * 64MB: 1024k + * 128MB: 1448k + * 256MB: 2048k + * 512MB: 2896k + * 1024MB: 4096k + * 2048MB: 5792k + * 4096MB: 8192k + * 8192MB: 11584k + * 16384MB: 16384k + */ +static int __init init_per_zone_pages_min(void) +{ + unsigned long lowmem_kbytes; + + lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); + + min_free_kbytes = int_sqrt(lowmem_kbytes * 16); + if (min_free_kbytes < 128) + min_free_kbytes = 128; + if (min_free_kbytes > 65536) + min_free_kbytes = 65536; + setup_per_zone_pages_min(); + setup_per_zone_lowmem_reserve(); + return 0; +} +module_init(init_per_zone_pages_min) + +/* + * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so + * that we can call two helper functions whenever min_free_kbytes + * changes. + */ +int min_free_kbytes_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec(table, write, file, buffer, length, ppos); + setup_per_zone_pages_min(); + return 0; +} + +/* + * lowmem_reserve_ratio_sysctl_handler - just a wrapper around + * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() + * whenever sysctl_lowmem_reserve_ratio changes. + * + * The reserve ratio obviously has absolutely no relation with the + * pages_min watermarks. 
The lowmem reserve ratio can only make sense + * if in function of the boot time zone sizes. + */ +int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec_minmax(table, write, file, buffer, length, ppos); + setup_per_zone_lowmem_reserve(); + return 0; +} + +/* + * percpu_pagelist_fraction - changes the pcp->high for each zone on each + * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist + * can have before it gets flushed back to buddy allocator. + */ + +int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + unsigned int cpu; + int ret; + + ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (!write || (ret == -EINVAL)) + return ret; + for_each_zone(zone) { + for_each_online_cpu(cpu) { + unsigned long high; + high = zone->present_pages / percpu_pagelist_fraction; + setup_pagelist_highmark(zone_pcp(zone, cpu), high); + } + } + return 0; +} + +__initdata int hashdist = HASHDIST_DEFAULT; + +#ifdef CONFIG_NUMA +static int __init set_hashdist(char *str) +{ + if (!str) + return 0; + hashdist = simple_strtoul(str, &str, 0); + return 1; +} +__setup("hashdist=", set_hashdist); +#endif + +/* + * allocate a large system hash table from bootmem + * - it is assumed that the hash table must contain an exact power-of-2 + * quantity of entries + * - limit is the number of hash buckets, not the total allocation size + */ +void *__init alloc_large_system_hash(const char *tablename, + unsigned long bucketsize, + unsigned long numentries, + int scale, + int flags, + unsigned int *_hash_shift, + unsigned int *_hash_mask, + unsigned long limit) +{ + unsigned long long max = limit; + unsigned long log2qty, size; + void *table = NULL; + + /* allow the kernel cmdline to have a say */ + if (!numentries) { + /* round 
applicable memory size up to nearest megabyte */ + numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; + numentries += (1UL << (20 - PAGE_SHIFT)) - 1; + numentries >>= 20 - PAGE_SHIFT; + numentries <<= 20 - PAGE_SHIFT; + + /* limit to 1 bucket per 2^scale bytes of low memory */ + if (scale > PAGE_SHIFT) + numentries >>= (scale - PAGE_SHIFT); + else + numentries <<= (PAGE_SHIFT - scale); + } + /* rounded up to nearest power of 2 in size */ + numentries = 1UL << (long_log2(numentries) + 1); + + /* limit allocation size to 1/16 total memory by default */ + if (max == 0) { + max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; + do_div(max, bucketsize); + } + + if (numentries > max) + numentries = max; + + log2qty = long_log2(numentries); + + do { + size = bucketsize << log2qty; + if (flags & HASH_EARLY) + table = alloc_bootmem(size); + else if (hashdist) + table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); + else { + unsigned long order; + for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) + ; + table = (void*) __get_free_pages(GFP_ATOMIC, order); + } + } while (!table && size > PAGE_SIZE && --log2qty); + + if (!table) + panic("Failed to allocate %s hash table\n", tablename); + + printk("%s hash table entries: %d (order: %d, %lu bytes)\n", + tablename, + (1U << log2qty), + long_log2(size) - PAGE_SHIFT, + size); + + if (_hash_shift) + *_hash_shift = log2qty; + if (_hash_mask) + *_hash_mask = (1 << log2qty) - 1; + + return table; +} diff -urN oldtree/mm/swapfile.c newtree/mm/swapfile.c --- oldtree/mm/swapfile.c 2006-02-18 15:18:30.137735616 +0000 +++ newtree/mm/swapfile.c 2006-02-18 15:24:31.451807528 +0000 @@ -1193,6 +1193,7 @@ swap_file = p->swap_file; p->swap_file = NULL; p->max = 0; + p->bdev = NULL; swap_map = p->swap_map; p->swap_map = NULL; p->flags = 0; diff -urN oldtree/mm/vmscan.c newtree/mm/vmscan.c --- oldtree/mm/vmscan.c 2006-02-18 15:18:30.140735160 +0000 +++ newtree/mm/vmscan.c 2006-02-18 15:24:31.453807224 +0000 @@ 
-1724,7 +1724,8 @@ for ( ; ; ) { unsigned long new_order; - try_to_freeze(); + if (try_to_freeze()) + pgdat->kswapd_max_order = 0; prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); new_order = pgdat->kswapd_max_order;