diff -urN newtree/drivers/mtd/devices/block2mtd.c newtree.2/drivers/mtd/devices/block2mtd.c --- newtree/drivers/mtd/devices/block2mtd.c 2006-07-05 10:06:57.000000000 -0400 +++ newtree.2/drivers/mtd/devices/block2mtd.c 2006-07-11 13:50:10.000000000 -0400 @@ -58,28 +58,27 @@ end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); - read_lock_irq(&mapping->tree_lock); for (i = 0; i < PAGE_READAHEAD; i++) { pagei = index + i; if (pagei > end_index) { INFO("Overrun end of disk in cache readahead\n"); break; } + /* Don't need mapping->tree_lock - lookup can be racy */ + rcu_read_lock(); page = radix_tree_lookup(&mapping->page_tree, pagei); + rcu_read_unlock(); if (page && (!i)) break; if (page) continue; - read_unlock_irq(&mapping->tree_lock); page = page_cache_alloc_cold(mapping); - read_lock_irq(&mapping->tree_lock); if (!page) break; page->index = pagei; list_add(&page->lru, &page_pool); ret++; } - read_unlock_irq(&mapping->tree_lock); if (ret) read_cache_pages(mapping, &page_pool, filler, NULL); } diff -urN newtree/fs/buffer.c newtree.2/fs/buffer.c --- newtree/fs/buffer.c 2006-07-05 10:06:57.000000000 -0400 +++ newtree.2/fs/buffer.c 2006-07-11 13:50:10.000000000 -0400 @@ -848,7 +848,7 @@ spin_unlock(&mapping->private_lock); if (!TestSetPageDirty(page)) { - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? */ if (mapping_cap_account_dirty(mapping)) __inc_zone_page_state(page, NR_FILE_DIRTY); @@ -856,7 +856,7 @@ page_index(page), PAGECACHE_TAG_DIRTY); } - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); return 1; } diff -urN newtree/fs/inode.c newtree.2/fs/inode.c --- newtree/fs/inode.c 2006-07-05 10:06:57.000000000 -0400 +++ newtree.2/fs/inode.c 2006-07-11 13:50:10.000000000 -0400 @@ -194,7 +194,7 @@ mutex_init(&inode->i_mutex); init_rwsem(&inode->i_alloc_sem); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); - rwlock_init(&inode->i_data.tree_lock); + spin_lock_init(&inode->i_data.tree_lock); spin_lock_init(&inode->i_data.i_mmap_lock); INIT_LIST_HEAD(&inode->i_data.private_list); spin_lock_init(&inode->i_data.private_lock); diff -urN newtree/fs/reiser4/as_ops.c newtree.2/fs/reiser4/as_ops.c --- newtree/fs/reiser4/as_ops.c 2006-07-05 10:06:57.000000000 -0400 +++ newtree.2/fs/reiser4/as_ops.c 2006-07-11 13:50:10.000000000 -0400 @@ -77,7 +77,7 @@ struct address_space *mapping = page->mapping; if (mapping) { - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); /* check for race with truncate */ if (page->mapping) { @@ -89,7 +89,7 @@ page->index, PAGECACHE_TAG_REISER4_MOVED); } - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } } diff -urN newtree/fs/reiser4/jnode.c newtree.2/fs/reiser4/jnode.c --- newtree/fs/reiser4/jnode.c 2006-07-05 10:06:57.000000000 -0400 +++ newtree.2/fs/reiser4/jnode.c 2006-07-11 13:50:10.000000000 -0400 @@ -434,9 +434,9 @@ if (rtree->rnode == NULL) { /* prevent inode from being pruned when it has jnodes attached to it */ - write_lock_irq(&inode->i_data.tree_lock); + spin_lock_irq(&inode->i_data.tree_lock); inode->i_data.nrpages++; - write_unlock_irq(&inode->i_data.tree_lock); + spin_unlock_irq(&inode->i_data.tree_lock); } assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0)); check_me("zam-1045", @@ -464,9 +464,9 @@ check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index)); if (rtree->rnode == NULL) { /* inode can be pruned now */ - write_lock_irq(&inode->i_data.tree_lock); + spin_lock_irq(&inode->i_data.tree_lock); inode->i_data.nrpages--; - write_unlock_irq(&inode->i_data.tree_lock); + spin_unlock_irq(&inode->i_data.tree_lock); } } diff -urN newtree/fs/reiser4/plugin/file/cryptcompress.c newtree.2/fs/reiser4/plugin/file/cryptcompress.c --- newtree/fs/reiser4/plugin/file/cryptcompress.c 2006-07-08 09:55:17.000000000 -0400 +++ newtree.2/fs/reiser4/plugin/file/cryptcompress.c 2006-07-11 13:50:10.000000000 -0400 @@ -3415,7 +3415,7 @@ { int i; void * ret; - read_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); for (i = 0; i < clust->nr_pages; i++) { assert("edward-1438", clust->pages[i] != NULL); ret = radix_tree_tag_clear(&mapping->page_tree, @@ -3423,7 +3423,7 @@ PAGECACHE_TAG_REISER4_MOVED); assert("edward-1439", ret == clust->pages[i]); } - read_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); } /* Capture an anonymous pager cluster. (Page cluser is @@ -3448,11 +3448,11 @@ if (unlikely(result)) { /* set cleared tag back, so it will be possible to capture it again later */ - read_lock_irq(&inode->i_mapping->tree_lock); + spin_lock_irq(&inode->i_mapping->tree_lock); radix_tree_tag_set(&inode->i_mapping->page_tree, clust_to_pg(clust->index, inode), PAGECACHE_TAG_REISER4_MOVED); - read_unlock_irq(&inode->i_mapping->tree_lock); + spin_unlock_irq(&inode->i_mapping->tree_lock); release_cluster_pages_and_jnode(clust); } diff -urN newtree/fs/reiser4/plugin/file/file.c newtree.2/fs/reiser4/plugin/file/file.c --- newtree/fs/reiser4/plugin/file/file.c 2006-07-05 10:06:57.000000000 -0400 +++ newtree.2/fs/reiser4/plugin/file/file.c 2006-07-11 13:50:10.000000000 -0400 @@ -830,9 +830,9 @@ { int result; - read_lock_irq(&inode->i_mapping->tree_lock); + spin_lock_irq(&inode->i_mapping->tree_lock); result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED); - read_unlock_irq(&inode->i_mapping->tree_lock); + spin_unlock_irq(&inode->i_mapping->tree_lock); return result; } @@ -978,7 +978,7 @@ nr = 0; /* find pages tagged MOVED */ - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree, (void **)pvec.pages, *index, count, PAGECACHE_TAG_REISER4_MOVED); @@ -987,7 +987,7 @@ * there are no pages tagged MOVED in mapping->page_tree * starting from *index */ - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); *index = (pgoff_t)-1; return 0; } @@ -1001,7 +1001,7 @@ PAGECACHE_TAG_REISER4_MOVED); assert("vs-49", p == pvec.pages[i]); } - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); *index = pvec.pages[i - 1]->index + 1; @@ -1026,13 +1026,13 @@ * set MOVED tag to all pages which left not * captured */ - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); for (; i < pagevec_count(&pvec); i ++) { radix_tree_tag_set(&mapping->page_tree, pvec.pages[i]->index, PAGECACHE_TAG_REISER4_MOVED); } - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); pagevec_release(&pvec); return result; @@ -1042,11 +1042,11 @@ * 0 for Writeback-ed page. Set MOVED tag on * that page */ - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); radix_tree_tag_set(&mapping->page_tree, pvec.pages[i]->index, PAGECACHE_TAG_REISER4_MOVED); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); if (i == 0) *index = pvec.pages[0]->index; else @@ -1122,7 +1122,7 @@ mapping = inode->i_mapping; from = 0; result = 0; - read_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); while (result == 0) { struct page *page; @@ -1136,17 +1136,17 @@ /* page may not leave radix tree because it is protected from truncating by inode->i_mutex locked by sys_fsync */ page_cache_get(page); - read_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); from = page->index + 1; result = sync_page(page); page_cache_release(page); - read_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); } - read_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return result; } diff -urN newtree/include/asm-arm/cacheflush.h newtree.2/include/asm-arm/cacheflush.h --- newtree/include/asm-arm/cacheflush.h 2006-07-05 10:06:57.000000000 -0400 +++ newtree.2/include/asm-arm/cacheflush.h 2006-07-11 13:50:10.000000000 -0400 @@ -326,9 +326,9 @@ extern void flush_dcache_page(struct page *); #define flush_dcache_mmap_lock(mapping) \ - write_lock_irq(&(mapping)->tree_lock) + spin_lock_irq(&(mapping)->tree_lock) #define flush_dcache_mmap_unlock(mapping) \ - write_unlock_irq(&(mapping)->tree_lock) + spin_unlock_irq(&(mapping)->tree_lock) #define flush_icache_user_range(vma,page,addr,len) \ flush_dcache_page(page) diff -urN newtree/include/asm-parisc/cacheflush.h newtree.2/include/asm-parisc/cacheflush.h --- newtree/include/asm-parisc/cacheflush.h 2006-07-05 10:06:57.000000000 -0400 +++ newtree.2/include/asm-parisc/cacheflush.h 2006-07-11 13:50:10.000000000 -0400 @@ -57,9 +57,9 @@ extern void flush_dcache_page(struct page *page); #define flush_dcache_mmap_lock(mapping) \ - write_lock_irq(&(mapping)->tree_lock) + spin_lock_irq(&(mapping)->tree_lock) #define flush_dcache_mmap_unlock(mapping) \ - write_unlock_irq(&(mapping)->tree_lock) + spin_unlock_irq(&(mapping)->tree_lock) #define flush_icache_page(vma,page) do { flush_kernel_dcache_page(page); flush_kernel_icache_page(page_address(page)); } while (0) diff -urN newtree/include/linux/fs.h newtree.2/include/linux/fs.h --- newtree/include/linux/fs.h 2006-07-08 09:55:03.000000000 -0400 +++ newtree.2/include/linux/fs.h 2006-07-11 13:50:10.000000000 -0400 @@ -419,7 +419,7 @@ struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ - rwlock_t tree_lock; /* and rwlock protecting it */ + spinlock_t tree_lock; /* and lock protecting it */ unsigned int i_mmap_writable;/* count VM_SHARED mappings */ struct prio_tree_root i_mmap; /* tree of private and shared mappings */ struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ diff -urN newtree/include/linux/page-flags.h newtree.2/include/linux/page-flags.h --- newtree/include/linux/page-flags.h 2006-07-05 10:06:57.000000000 -0400 +++ newtree.2/include/linux/page-flags.h 2006-07-11 13:50:10.000000000 -0400 @@ -87,6 +87,8 @@ #define PG_buddy 19 /* Page is free, on buddy lists */ #define PG_readahead 20 /* Reminder to do readahead */ +#define PG_nonewrefs 21 /* Block concurrent pagecache lookups + * while testing refcount */ #if (BITS_PER_LONG > 32) @@ -103,16 +105,13 @@ /* * Manipulation of page state flags */ -#define PageLocked(page) \ - test_bit(PG_locked, &(page)->flags) -#define SetPageLocked(page) \ - set_bit(PG_locked, &(page)->flags) -#define TestSetPageLocked(page) \ - test_and_set_bit(PG_locked, &(page)->flags) -#define ClearPageLocked(page) \ - clear_bit(PG_locked, &(page)->flags) -#define TestClearPageLocked(page) \ - test_and_clear_bit(PG_locked, &(page)->flags) +#define PageLocked(page) test_bit(PG_locked, &(page)->flags) +#define SetPageLocked(page) set_bit(PG_locked, &(page)->flags) +#define __SetPageLocked(page) __set_bit(PG_locked, &(page)->flags) +#define TestSetPageLocked(page) test_and_set_bit(PG_locked, &(page)->flags) +#define ClearPageLocked(page) clear_bit(PG_locked, &(page)->flags) +#define __ClearPageLocked(page) __clear_bit(PG_locked, &(page)->flags) +#define TestClearPageLocked(page) test_and_clear_bit(PG_locked, &(page)->flags) #define PageError(page) test_bit(PG_error, &(page)->flags) #define SetPageError(page) set_bit(PG_error, &(page)->flags) @@ -253,6 +252,11 @@ #define SetPageReadahead(page) set_bit(PG_readahead, &(page)->flags) #define TestClearPageReadahead(page) test_and_clear_bit(PG_readahead, &(page)->flags) +#define PageNoNewRefs(page) test_bit(PG_nonewrefs, &(page)->flags) +#define SetPageNoNewRefs(page) set_bit(PG_nonewrefs, &(page)->flags) +#define ClearPageNoNewRefs(page) clear_bit(PG_nonewrefs, &(page)->flags) +#define __ClearPageNoNewRefs(page) __clear_bit(PG_nonewrefs, &(page)->flags) + struct page; /* forward declaration */ int test_clear_page_dirty(struct page *page); diff -urN newtree/include/linux/pagemap.h newtree.2/include/linux/pagemap.h --- newtree/include/linux/pagemap.h 2006-07-05 10:06:57.000000000 -0400 +++ newtree.2/include/linux/pagemap.h 2006-07-11 13:50:10.000000000 -0400 @@ -11,6 +11,8 @@ #include #include #include +#include +#include /* for in_interrupt() */ /* * Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page @@ -51,6 +53,76 @@ #define page_cache_release(page) put_page(page) void release_pages(struct page **pages, int nr, int cold); +static inline struct page *page_cache_get_speculative(struct page *page) +{ + VM_BUG_ON(in_interrupt()); + +#ifndef CONFIG_SMP +# ifdef CONFIG_PREEMPT + VM_BUG_ON(!in_atomic()); +# endif + /* + * Preempt must be disabled here - we rely on rcu_read_lock doing + * this for us. + * + * Pagecache won't be truncated from interrupt context, so if we have + * found a page in the radix tree here, we have pinned its refcount by + * disabling preempt, and hence no need for the "speculative get" that + * SMP requires. + */ + VM_BUG_ON(page_count(page) == 0); + atomic_inc(&page->_count); + +#else + if (unlikely(!get_page_unless_zero(page))) + return NULL; /* page has been freed */ + + /* + * Note that get_page_unless_zero provides a memory barrier. + * This is needed to ensure PageNoNewRefs is evaluated after the + * page refcount has been raised. See below comment. + */ + + /* + * PageNoNewRefs is set in order to prevent new references to the + * page (eg. before it gets removed from pagecache). Wait until it + * becomes clear (and checks below will ensure we still have the + * correct one). + */ + while (unlikely(PageNoNewRefs(page))) + cpu_relax(); + + /* + * smp_rmb is to ensure the load of page->flags (for PageNoNewRefs()) + * is performed before a future load used to ensure the page is + * the correct on (usually: page->mapping and page->index). + * + * Those places that set PageNoNewRefs have the following pattern: + * SetPageNoNewRefs(page) + * wmb(); + * if (page_count(page) == X) + * remove page from pagecache + * wmb(); + * ClearPageNoNewRefs(page) + * + * So PageNoNewRefs() becomes clear _after_ we've elevated page + * refcount, then either the page will be safely pinned in pagecache, + * or it will have been already removed. In the latter case, *pagep + * will be changed in the below test - provided it is loaded after + * testing PageNoNewRefs() (which is what the smp_rmb is for). + * + * If the load was out of order, page->mapping might be loaded before + * the page is removed from pagecache while PageNoNewRefs evaluated + * after the ClearPageNoNewRefs(). + */ + smp_rmb(); + +#endif + VM_BUG_ON(PageCompound(page) && (struct page *)page_private(page) != page); + + return page; +} + #ifdef CONFIG_NUMA extern struct page *page_cache_alloc(struct address_space *x); extern struct page *page_cache_alloc_cold(struct address_space *x); @@ -110,6 +182,8 @@ int add_to_page_cache(struct page *page, struct address_space *mapping, unsigned long index, gfp_t gfp_mask); +int __add_to_page_cache(struct page *page, struct address_space *mapping, + unsigned long index, gfp_t gfp_mask); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, unsigned long index, gfp_t gfp_mask); extern void remove_from_page_cache(struct page *page); diff -urN newtree/include/linux/swap.h newtree.2/include/linux/swap.h --- newtree/include/linux/swap.h 2006-07-08 06:15:08.000000000 -0400 +++ newtree.2/include/linux/swap.h 2006-07-11 13:50:10.000000000 -0400 @@ -228,6 +228,7 @@ struct address_space *); extern void free_page_and_swap_cache(struct page *); extern void free_pages_and_swap_cache(struct page **, int); +extern struct page * find_get_swap_page(swp_entry_t); extern struct page * lookup_swap_cache(swp_entry_t); extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, unsigned long addr); diff -urN newtree/mm/filemap.c newtree.2/mm/filemap.c --- newtree/mm/filemap.c 2006-07-11 12:42:13.000000000 -0400 +++ newtree.2/mm/filemap.c 2006-07-11 13:50:10.000000000 -0400 @@ -117,7 +117,7 @@ /* * Remove a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold a write_lock on the mapping's tree_lock. + * is safe. The caller must hold the mapping's tree_lock. */ void __remove_from_page_cache(struct page *page) { @@ -136,9 +136,9 @@ BUG_ON(!PageLocked(page)); - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); } EXPORT_SYMBOL(remove_from_page_cache); @@ -431,42 +431,6 @@ return err; } -/** - * add_to_page_cache - add newly allocated pagecache pages - * @page: page to add - * @mapping: the page's address_space - * @offset: page index - * @gfp_mask: page allocation mode - * - * This function is used to add newly allocated pagecache pages; - * the page is new, so we can just run SetPageLocked() against it. - * The other page state flags were set by rmqueue(). - * - * This function does not add the page to the LRU. The caller must do that. - */ -int add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t offset, gfp_t gfp_mask) -{ - int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); - - if (error == 0) { - write_lock_irq(&mapping->tree_lock); - error = radix_tree_insert(&mapping->page_tree, offset, page); - if (!error) { - page_cache_get(page); - SetPageLocked(page); - page->mapping = mapping; - page->index = offset; - mapping->nrpages++; - __inc_zone_page_state(page, NR_FILE_PAGES); - } - write_unlock_irq(&mapping->tree_lock); - radix_tree_preload_end(); - } - return error; -} -EXPORT_SYMBOL(add_to_page_cache); - int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { @@ -498,6 +462,96 @@ EXPORT_SYMBOL(page_cache_alloc_cold); #endif +static int add_to_page_cache_nolock(struct page *page, + struct address_space *mapping, pgoff_t offset) +{ + int error; + + /* + * Can get away with less atomic ops and without using + * Set/ClearPageNoNewRefs if we order operations correctly. + */ + page_cache_get(page); + __SetPageLocked(page); + page->mapping = mapping; + page->index = offset; + + /* radix_tree_insert provides a write memory barrier */ + error = radix_tree_insert(&mapping->page_tree, offset, page); + + if (!error) { + mapping->nrpages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + } else { + page->mapping = NULL; + __ClearPageLocked(page); + __put_page(page); + } + + return error; +} + +/** + * add_to_page_cache - add newly allocated pagecache pages + * @page: page to add + * @mapping: the page's address_space + * @offset: page index + * @gfp_mask: page allocation mode + * + * This function is used to add newly allocated pagecache pages; + * the page is new, so we can just run SetPageLocked() against it. + * The other page state flags were set by rmqueue(). + * + * This function does not add the page to the LRU. The caller must do that. + */ +int add_to_page_cache(struct page *page, struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask) +{ + int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + + if (error == 0) { + spin_lock_irq(&mapping->tree_lock); + error = add_to_page_cache_nolock(page, mapping, offset); + spin_unlock_irq(&mapping->tree_lock); + + radix_tree_preload_end(); + } + return error; +} + +/* + * Same as add_to_page_cache, but works on pages that are already in + * swapcache and possibly visible to external lookups. + * (special case for move_from_swap_cache). + */ +int __add_to_page_cache(struct page *page, struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask) +{ + int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + + if (error == 0) { + SetPageNoNewRefs(page); + smp_wmb(); + spin_lock_irq(&mapping->tree_lock); + + error = radix_tree_insert(&mapping->page_tree, offset, page); + if (!error) { + page_cache_get(page); + SetPageLocked(page); + page->mapping = mapping; + page->index = offset; + mapping->nrpages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + } + + spin_unlock_irq(&mapping->tree_lock); + smp_wmb(); + ClearPageNoNewRefs(page); + radix_tree_preload_end(); + } + return error; +} + /* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of @@ -602,30 +656,36 @@ { int exists; - read_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); exists = __probe_page(mapping, offset); - read_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return exists; } -/** - * find_get_page - find and get a page reference - * @mapping: the address_space to search - * @offset: the page index - * - * A rather lightweight function, finding and getting a reference to a - * hashed page atomically. +/* + * find_get_page - find and get a reference to a pagecache page. */ -struct page * find_get_page(struct address_space *mapping, unsigned long offset) +struct page *find_get_page(struct address_space *mapping, unsigned long offset) { struct page *page; - read_lock_irq(&mapping->tree_lock); + rcu_read_lock(); +repeat: page = radix_tree_lookup(&mapping->page_tree, offset); - if (page) - page_cache_get(page); - read_unlock_irq(&mapping->tree_lock); + if (page) { + page = page_cache_get_speculative(page); + if (unlikely(!page)) + goto repeat; + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping + || page->index != offset)) { + page_cache_release(page); + goto repeat; + } + } + rcu_read_unlock(); + return page; } EXPORT_SYMBOL(find_get_page); @@ -641,11 +701,11 @@ { struct page *page; - read_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page && TestSetPageLocked(page)) page = NULL; - read_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return page; } EXPORT_SYMBOL(find_trylock_page); @@ -663,28 +723,28 @@ struct page *find_lock_page(struct address_space *mapping, unsigned long offset) { + struct page *page; - read_lock_irq(&mapping->tree_lock); repeat: + rcu_read_lock(); page = radix_tree_lookup(&mapping->page_tree, offset); if (page) { - page_cache_get(page); - if (TestSetPageLocked(page)) { - read_unlock_irq(&mapping->tree_lock); - __lock_page(page); - read_lock_irq(&mapping->tree_lock); - - /* Has the page been truncated while we slept? */ - if (unlikely(page->mapping != mapping || - page->index != offset)) { - unlock_page(page); - page_cache_release(page); - goto repeat; - } + page = page_cache_get_speculative(page); + rcu_read_unlock(); + if (unlikely(!page)) + goto repeat; + lock_page(page); + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping + || page->index != offset)) { + unlock_page(page); + page_cache_release(page); + goto repeat; } - } - read_unlock_irq(&mapping->tree_lock); + } else + rcu_read_unlock(); + return page; } EXPORT_SYMBOL(find_lock_page); @@ -752,16 +812,41 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages) { + unsigned int i; - unsigned int ret; + unsigned int nr_found; - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, + rcu_read_lock(); +repeat: + nr_found = radix_tree_gang_lookup(&mapping->page_tree, (void **)pages, start, nr_pages); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); - read_unlock_irq(&mapping->tree_lock); - return ret; + for (i = 0; i < nr_found; i++) { + struct page *page; + page = page_cache_get_speculative(pages[i]); + if (unlikely(!page)) { +bail: + /* + * must return at least 1 page, so caller continues + * calling in. + */ + if (i == 0) + goto repeat; + break; + } + + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping + || page->index < start)) { + page_cache_release(page); + goto bail; + } + + pages[i] = page; + /* ensure we don't pick up pages that have moved behind us */ + start = page->index+1; + } + rcu_read_unlock(); + return i; } EXPORT_SYMBOL(find_get_pages); @@ -781,19 +866,36 @@ unsigned int nr_pages, struct page **pages) { unsigned int i; - unsigned int ret; + unsigned int nr_found; - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, + rcu_read_lock(); +repeat: + nr_found = radix_tree_gang_lookup(&mapping->page_tree, (void **)pages, index, nr_pages); - for (i = 0; i < ret; i++) { - if (pages[i]->mapping == NULL || pages[i]->index != index) + for (i = 0; i < nr_found; i++) { + struct page *page; + page = page_cache_get_speculative(pages[i]); + if (unlikely(!page)) { +bail: + /* + * must return at least 1 page, so caller continues + * calling in. + */ + if (i == 0) + goto repeat; break; + } - page_cache_get(pages[i]); + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping + || page->index != index)) { + page_cache_release(page); + goto bail; + } + pages[i] = page; index++; } - read_unlock_irq(&mapping->tree_lock); + rcu_read_unlock(); return i; } EXPORT_SYMBOL(find_get_pages_tag); @@ -813,17 +915,41 @@ int tag, unsigned int nr_pages, struct page **pages) { unsigned int i; - unsigned int ret; + unsigned int nr_found; + pgoff_t start = *index; - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup_tag(&mapping->page_tree, - (void **)pages, *index, nr_pages, tag); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); - if (ret) - *index = pages[ret - 1]->index + 1; - read_unlock_irq(&mapping->tree_lock); - return ret; + rcu_read_lock(); +repeat: + nr_found = radix_tree_gang_lookup_tag(&mapping->page_tree, + (void **)pages, start, nr_pages, tag); + for (i = 0; i < nr_found; i++) { + struct page *page; + page = page_cache_get_speculative(pages[i]); + if (unlikely(!page)) { +bail: + /* + * must return at least 1 page, so caller continues + * calling in. + */ + if (i == 0) + goto repeat; + break; + } + + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping + || page->index < start)) { + page_cache_release(page); + goto bail; + } + + pages[i] = page; + /* ensure we don't pick up pages that have moved behind us */ + start = page->index+1; + } + rcu_read_unlock(); + *index = start; + return i; } /** diff -urN newtree/mm/filemap.c.orig newtree.2/mm/filemap.c.orig --- newtree/mm/filemap.c.orig 2006-07-08 06:14:27.000000000 -0400 +++ newtree.2/mm/filemap.c.orig 1969-12-31 19:00:00.000000000 -0500 @@ -1,2555 +0,0 @@ -/* - * linux/mm/filemap.c - * - * Copyright (C) 1994-1999 Linus Torvalds - */ - -/* - * This file handles the generic file mmap semantics used by - * most "normal" filesystems (but you don't /have/ to use this: - * the NFS filesystem used to do this differently, for example) - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "filemap.h" -#include "internal.h" - -/* - * FIXME: remove all knowledge of the buffer layer from the core VM - */ -#include /* for generic_osync_inode */ - -#include - -static ssize_t -generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs); - -#ifdef CONFIG_DEBUG_READAHEAD -extern u32 readahead_debug_level; -#else -#define readahead_debug_level 0 -#endif /* CONFIG_DEBUG_READAHEAD */ - -/* - * Shared mappings implemented 30.11.1994. It's not fully working yet, - * though. - * - * Shared mappings now work. 15.8.1995 Bruno. - * - * finished 'unifying' the page and buffer cache and SMP-threaded the - * page-cache, 21.05.1999, Ingo Molnar - * - * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli - */ - -/* - * Lock ordering: - * - * ->i_mmap_lock (vmtruncate) - * ->private_lock (__free_pte->__set_page_dirty_buffers) - * ->swap_lock (exclusive_swap_page, others) - * ->mapping->tree_lock - * - * ->i_mutex - * ->i_mmap_lock (truncate->unmap_mapping_range) - * - * ->mmap_sem - * ->i_mmap_lock - * ->page_table_lock or pte_lock (various, mainly in memory.c) - * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) - * - * ->mmap_sem - * ->lock_page (access_process_vm) - * - * ->mmap_sem - * ->i_mutex (msync) - * - * ->i_mutex - * ->i_alloc_sem (various) - * - * ->inode_lock - * ->sb_lock (fs/fs-writeback.c) - * ->mapping->tree_lock (__sync_single_inode) - * - * ->i_mmap_lock - * ->anon_vma.lock (vma_adjust) - * - * ->anon_vma.lock - * ->page_table_lock or pte_lock (anon_vma_prepare and various) - * - * ->page_table_lock or pte_lock - * ->swap_lock (try_to_unmap_one) - * ->private_lock (try_to_unmap_one) - * ->tree_lock (try_to_unmap_one) - * ->zone.lru_lock (follow_page->mark_page_accessed) - * ->zone.lru_lock (check_pte_range->isolate_lru_page) - * ->private_lock (page_remove_rmap->set_page_dirty) - * ->tree_lock (page_remove_rmap->set_page_dirty) - * ->inode_lock (page_remove_rmap->set_page_dirty) - * ->inode_lock (zap_pte_range->set_page_dirty) - * ->private_lock (zap_pte_range->__set_page_dirty_buffers) - * - * ->task->proc_lock - * ->dcache_lock (proc_pid_lookup) - */ - -/* - * Remove a page from the page cache and free it. Caller has to make - * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold a write_lock on the mapping's tree_lock. - */ -void __remove_from_page_cache(struct page *page) -{ - struct address_space *mapping = page->mapping; - - radix_tree_delete(&mapping->page_tree, page->index); - page->mapping = NULL; - mapping->nrpages--; - __dec_zone_page_state(page, NR_FILE_PAGES); -} -EXPORT_SYMBOL(__remove_from_page_cache); - -void remove_from_page_cache(struct page *page) -{ - struct address_space *mapping = page->mapping; - - BUG_ON(!PageLocked(page)); - - write_lock_irq(&mapping->tree_lock); - __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); -} -EXPORT_SYMBOL(remove_from_page_cache); - -static int sync_page(void *word) -{ - struct address_space *mapping; - struct page *page; - - page = container_of((unsigned long *)word, struct page, flags); - - /* - * page_mapping() is being called without PG_locked held. - * Some knowledge of the state and use of the page is used to - * reduce the requirements down to a memory barrier. - * The danger here is of a stale page_mapping() return value - * indicating a struct address_space different from the one it's - * associated with when it is associated with one. - * After smp_mb(), it's either the correct page_mapping() for - * the page, or an old page_mapping() and the page's own - * page_mapping() has gone NULL. - * The ->sync_page() address_space operation must tolerate - * page_mapping() going NULL. By an amazing coincidence, - * this comes about because none of the users of the page - * in the ->sync_page() methods make essential use of the - * page_mapping(), merely passing the page down to the backing - * device's unplug functions when it's non-NULL, which in turn - * ignore it for all cases but swap, where only page_private(page) is - * of interest. When page_mapping() does go NULL, the entire - * call stack gracefully ignores the page and returns. - * -- wli - */ - smp_mb(); - mapping = page_mapping(page); - if (mapping && mapping->a_ops && mapping->a_ops->sync_page) - mapping->a_ops->sync_page(page); - io_schedule(); - return 0; -} - -/** - * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range - * @mapping: address space structure to write - * @start: offset in bytes where the range starts - * @end: offset in bytes where the range ends (inclusive) - * @sync_mode: enable synchronous operation - * - * Start writeback against all of a mapping's dirty pages that lie - * within the byte offsets inclusive. - * - * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as - * opposed to a regular memory cleansing writeback. The difference between - * these two operations is that if a dirty page/buffer is encountered, it must - * be waited upon, and not just skipped over. - */ -int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end, int sync_mode) -{ - int ret; - struct writeback_control wbc = { - .sync_mode = sync_mode, - .nr_to_write = mapping->nrpages * 2, - .range_start = start, - .range_end = end, - }; - - if (!mapping_cap_writeback_dirty(mapping)) - return 0; - - ret = do_writepages(mapping, &wbc); - return ret; -} - -static inline int __filemap_fdatawrite(struct address_space *mapping, - int sync_mode) -{ - return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); -} - -int filemap_fdatawrite(struct address_space *mapping) -{ - return __filemap_fdatawrite(mapping, WB_SYNC_ALL); -} -EXPORT_SYMBOL(filemap_fdatawrite); - -static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end) -{ - return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); -} - -/** - * filemap_flush - mostly a non-blocking flush - * @mapping: target address_space - * - * This is a mostly non-blocking flush. Not suitable for data-integrity - * purposes - I/O may not be started against all dirty pages. - */ -int filemap_flush(struct address_space *mapping) -{ - return __filemap_fdatawrite(mapping, WB_SYNC_NONE); -} -EXPORT_SYMBOL(filemap_flush); - -/** - * wait_on_page_writeback_range - wait for writeback to complete - * @mapping: target address_space - * @start: beginning page index - * @end: ending page index - * - * Wait for writeback to complete against pages indexed by start->end - * inclusive - */ -int wait_on_page_writeback_range(struct address_space *mapping, - pgoff_t start, pgoff_t end) -{ - struct pagevec pvec; - int nr_pages; - int ret = 0; - pgoff_t index; - - if (end < start) - return 0; - - pagevec_init(&pvec, 0); - index = start; - while ((index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { - unsigned i; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - /* until radix tree lookup accepts end_index */ - if (page->index > end) - continue; - - wait_on_page_writeback(page); - if (PageError(page)) - ret = -EIO; - } - pagevec_release(&pvec); - cond_resched(); - } - - /* Check for outstanding write errors */ - if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) - ret = -ENOSPC; - if (test_and_clear_bit(AS_EIO, &mapping->flags)) - ret = -EIO; - - return ret; -} -EXPORT_SYMBOL(add_to_page_cache_lru); - -/** - * sync_page_range - write and wait on all pages in the passed range - * @inode: target inode - * @mapping: target address_space - * @pos: beginning offset in pages to write - * @count: number of bytes to write - * - * Write and wait upon all the pages in the passed range. This is a "data - * integrity" operation. It waits upon in-flight writeout before starting and - * waiting upon new writeout. If there was an IO error, return it. - * - * We need to re-take i_mutex during the generic_osync_inode list walk because - * it is otherwise livelockable. - */ -int sync_page_range(struct inode *inode, struct address_space *mapping, - loff_t pos, loff_t count) -{ - pgoff_t start = pos >> PAGE_CACHE_SHIFT; - pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; - int ret; - - if (!mapping_cap_writeback_dirty(mapping) || !count) - return 0; - ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); - if (ret == 0) { - mutex_lock(&inode->i_mutex); - ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); - mutex_unlock(&inode->i_mutex); - } - if (ret == 0) - ret = wait_on_page_writeback_range(mapping, start, end); - return ret; -} -EXPORT_SYMBOL(sync_page_range); - -/** - * sync_page_range_nolock - * @inode: target inode - * @mapping: target address_space - * @pos: beginning offset in pages to write - * @count: number of bytes to write - * - * Note: Holding i_mutex across sync_page_range_nolock is not a good idea - * as it forces O_SYNC writers to different parts of the same file - * to be serialised right until io completion. - */ -int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, - loff_t pos, loff_t count) -{ - pgoff_t start = pos >> PAGE_CACHE_SHIFT; - pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; - int ret; - - if (!mapping_cap_writeback_dirty(mapping) || !count) - return 0; - ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); - if (ret == 0) - ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); - if (ret == 0) - ret = wait_on_page_writeback_range(mapping, start, end); - return ret; -} -EXPORT_SYMBOL(sync_page_range_nolock); - -/** - * filemap_fdatawait - wait for all under-writeback pages to complete - * @mapping: address space structure to wait for - * - * Walk the list of under-writeback pages of the given address space - * and wait for all of them. - */ -int filemap_fdatawait(struct address_space *mapping) -{ - loff_t i_size = i_size_read(mapping->host); - - if (i_size == 0) - return 0; - - return wait_on_page_writeback_range(mapping, 0, - (i_size - 1) >> PAGE_CACHE_SHIFT); -} -EXPORT_SYMBOL(filemap_fdatawait); - -int filemap_write_and_wait(struct address_space *mapping) -{ - int err = 0; - - if (mapping->nrpages) { - err = filemap_fdatawrite(mapping); - /* - * Even if the above returned error, the pages may be - * written partially (e.g. -ENOSPC), so we wait for it. - * But the -EIO is special case, it may indicate the worst - * thing (e.g. bug) happened, so we avoid waiting for it. - */ - if (err != -EIO) { - int err2 = filemap_fdatawait(mapping); - if (!err) - err = err2; - } - } - return err; -} -EXPORT_SYMBOL(filemap_write_and_wait); - -/** - * filemap_write_and_wait_range - write out & wait on a file range - * @mapping: the address_space for the pages - * @lstart: offset in bytes where the range starts - * @lend: offset in bytes where the range ends (inclusive) - * - * Write out and wait upon file offsets lstart->lend, inclusive. - * - * Note that `lend' is inclusive (describes the last byte to be written) so - * that this function can be used to write to the very end-of-file (end = -1). - */ -int filemap_write_and_wait_range(struct address_space *mapping, - loff_t lstart, loff_t lend) -{ - int err = 0; - - if (mapping->nrpages) { - err = __filemap_fdatawrite_range(mapping, lstart, lend, - WB_SYNC_ALL); - /* See comment of filemap_write_and_wait() */ - if (err != -EIO) { - int err2 = wait_on_page_writeback_range(mapping, - lstart >> PAGE_CACHE_SHIFT, - lend >> PAGE_CACHE_SHIFT); - if (!err) - err = err2; - } - } - return err; -} - -/** - * add_to_page_cache - add newly allocated pagecache pages - * @page: page to add - * @mapping: the page's address_space - * @offset: page index - * @gfp_mask: page allocation mode - * - * This function is used to add newly allocated pagecache pages; - * the page is new, so we can just run SetPageLocked() against it. - * The other page state flags were set by rmqueue(). - * - * This function does not add the page to the LRU. The caller must do that. - */ -int add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t offset, gfp_t gfp_mask) -{ - int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); - - if (error == 0) { - write_lock_irq(&mapping->tree_lock); - error = radix_tree_insert(&mapping->page_tree, offset, page); - if (!error) { - page_cache_get(page); - SetPageLocked(page); - page->mapping = mapping; - page->index = offset; - mapping->nrpages++; - __inc_zone_page_state(page, NR_FILE_PAGES); - } - write_unlock_irq(&mapping->tree_lock); - radix_tree_preload_end(); - } - return error; -} -EXPORT_SYMBOL(add_to_page_cache); - -int add_to_page_cache_lru(struct page *page, struct address_space *mapping, - pgoff_t offset, gfp_t gfp_mask) -{ - int ret = add_to_page_cache(page, mapping, offset, gfp_mask); - if (ret == 0) - lru_cache_add(page); - return ret; -} - -#ifdef CONFIG_NUMA -struct page *page_cache_alloc(struct address_space *x) -{ - if (cpuset_do_page_mem_spread()) { - int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, mapping_gfp_mask(x), 0); - } - return alloc_pages(mapping_gfp_mask(x), 0); -} -EXPORT_SYMBOL(page_cache_alloc); - -struct page *page_cache_alloc_cold(struct address_space *x) -{ - if (cpuset_do_page_mem_spread()) { - int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0); - } - return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0); -} -EXPORT_SYMBOL(page_cache_alloc_cold); -#endif - -/* - * In order to wait for pages to become available there must be - * waitqueues associated with pages. By using a hash table of - * waitqueues where the bucket discipline is to maintain all - * waiters on the same queue and wake all when any of the pages - * become available, and for the woken contexts to check to be - * sure the appropriate page became available, this saves space - * at a cost of "thundering herd" phenomena during rare hash - * collisions. - */ -static wait_queue_head_t *page_waitqueue(struct page *page) -{ - const struct zone *zone = page_zone(page); - - return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; -} - -static inline void wake_up_page(struct page *page, int bit) -{ - __wake_up_bit(page_waitqueue(page), &page->flags, bit); -} - -void fastcall wait_on_page_bit(struct page *page, int bit_nr) -{ - DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); - - if (test_bit(bit_nr, &page->flags)) - __wait_on_bit(page_waitqueue(page), &wait, sync_page, - TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_on_page_bit); - -/** - * unlock_page - unlock a locked page - * @page: the page - * - * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). - * Also wakes sleepers in wait_on_page_writeback() because the wakeup - * mechananism between PageLocked pages and PageWriteback pages is shared. - * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. - * - * The first mb is necessary to safely close the critical section opened by the - * TestSetPageLocked(), the second mb is necessary to enforce ordering between - * the clear_bit and the read of the waitqueue (to avoid SMP races with a - * parallel wait_on_page_locked()). - */ -void fastcall unlock_page(struct page *page) -{ - smp_mb__before_clear_bit(); - if (!TestClearPageLocked(page)) - BUG(); - smp_mb__after_clear_bit(); - wake_up_page(page, PG_locked); -} -EXPORT_SYMBOL(unlock_page); - -/** - * end_page_writeback - end writeback against a page - * @page: the page - */ -void end_page_writeback(struct page *page) -{ - if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) { - if (!test_clear_page_writeback(page)) - BUG(); - } - smp_mb__after_clear_bit(); - wake_up_page(page, PG_writeback); -} -EXPORT_SYMBOL(end_page_writeback); - -/** - * __lock_page - get a lock on the page, assuming we need to sleep to get it - * @page: the page to lock - * - * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some - * random driver's requestfn sets TASK_RUNNING, we could busywait. However - * chances are that on the second loop, the block layer's plug list is empty, - * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. - */ -void fastcall __lock_page(struct page *page) -{ - DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); - - __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page, - TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(__lock_page); - -/* - * Probing page existence. - */ -int __probe_page(struct address_space *mapping, pgoff_t offset) -{ - return !! radix_tree_lookup(&mapping->page_tree, offset); -} - -/* - * Here we just do not bother to grab the page, it's meaningless anyway. - */ -int probe_page(struct address_space *mapping, pgoff_t offset) -{ - int exists; - - read_lock_irq(&mapping->tree_lock); - exists = __probe_page(mapping, offset); - read_unlock_irq(&mapping->tree_lock); - - return exists; -} - -/** - * find_get_page - find and get a page reference - * @mapping: the address_space to search - * @offset: the page index - * - * A rather lightweight function, finding and getting a reference to a - * hashed page atomically. - */ -struct page * find_get_page(struct address_space *mapping, unsigned long offset) -{ - struct page *page; - - read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); - if (page) - page_cache_get(page); - read_unlock_irq(&mapping->tree_lock); - return page; -} -EXPORT_SYMBOL(find_get_page); - -/** - * find_trylock_page - find and lock a page - * @mapping: the address_space to search - * @offset: the page index - * - * Same as find_get_page(), but trylock it instead of incrementing the count. - */ -struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) -{ - struct page *page; - - read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); - if (page && TestSetPageLocked(page)) - page = NULL; - read_unlock_irq(&mapping->tree_lock); - return page; -} -EXPORT_SYMBOL(find_trylock_page); - -/** - * find_lock_page - locate, pin and lock a pagecache page - * @mapping: the address_space to search - * @offset: the page index - * - * Locates the desired pagecache page, locks it, increments its reference - * count and returns its address. - * - * Returns zero if the page was not present. find_lock_page() may sleep. - */ -struct page *find_lock_page(struct address_space *mapping, - unsigned long offset) -{ - struct page *page; - - read_lock_irq(&mapping->tree_lock); -repeat: - page = radix_tree_lookup(&mapping->page_tree, offset); - if (page) { - page_cache_get(page); - if (TestSetPageLocked(page)) { - read_unlock_irq(&mapping->tree_lock); - __lock_page(page); - read_lock_irq(&mapping->tree_lock); - - /* Has the page been truncated while we slept? */ - if (unlikely(page->mapping != mapping || - page->index != offset)) { - unlock_page(page); - page_cache_release(page); - goto repeat; - } - } - } - read_unlock_irq(&mapping->tree_lock); - return page; -} -EXPORT_SYMBOL(find_lock_page); - -/** - * find_or_create_page - locate or add a pagecache page - * @mapping: the page's address_space - * @index: the page's index into the mapping - * @gfp_mask: page allocation mode - * - * Locates a page in the pagecache. If the page is not present, a new page - * is allocated using @gfp_mask and is added to the pagecache and to the VM's - * LRU list. The returned page is locked and has its reference count - * incremented. - * - * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic - * allocation! - * - * find_or_create_page() returns the desired page's address, or zero on - * memory exhaustion. - */ -struct page *find_or_create_page(struct address_space *mapping, - unsigned long index, gfp_t gfp_mask) -{ - struct page *page, *cached_page = NULL; - int err; -repeat: - page = find_lock_page(mapping, index); - if (!page) { - if (!cached_page) { - cached_page = alloc_page(gfp_mask); - if (!cached_page) - return NULL; - } - err = add_to_page_cache_lru(cached_page, mapping, - index, gfp_mask); - if (!err) { - page = cached_page; - cached_page = NULL; - } else if (err == -EEXIST) - goto repeat; - } - if (cached_page) - page_cache_release(cached_page); - return page; -} -EXPORT_SYMBOL(find_or_create_page); - -/** - * find_get_pages - gang pagecache lookup - * @mapping: The address_space to search - * @start: The starting page index - * @nr_pages: The maximum number of pages - * @pages: Where the resulting pages are placed - * - * find_get_pages() will search for and return a group of up to - * @nr_pages pages in the mapping. The pages are placed at @pages. - * find_get_pages() takes a reference against the returned pages. - * - * The search returns a group of mapping-contiguous pages with ascending - * indexes. There may be holes in the indices due to not-present pages. - * - * find_get_pages() returns the number of pages which were found. - */ -unsigned find_get_pages(struct address_space *mapping, pgoff_t start, - unsigned int nr_pages, struct page **pages) -{ - unsigned int i; - unsigned int ret; - - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, - (void **)pages, start, nr_pages); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); - read_unlock_irq(&mapping->tree_lock); - return ret; -} -EXPORT_SYMBOL(find_get_pages); - -/** - * find_get_pages_contig - gang contiguous pagecache lookup - * @mapping: The address_space to search - * @index: The starting page index - * @nr_pages: The maximum number of pages - * @pages: Where the resulting pages are placed - * - * find_get_pages_contig() works exactly like find_get_pages(), except - * that the returned number of pages are guaranteed to be contiguous. - * - * find_get_pages_contig() returns the number of pages which were found. - */ -unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, - unsigned int nr_pages, struct page **pages) -{ - unsigned int i; - unsigned int ret; - - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, - (void **)pages, index, nr_pages); - for (i = 0; i < ret; i++) { - if (pages[i]->mapping == NULL || pages[i]->index != index) - break; - - page_cache_get(pages[i]); - index++; - } - read_unlock_irq(&mapping->tree_lock); - return i; -} -EXPORT_SYMBOL(find_get_pages_tag); - -/** - * find_get_pages_tag - find and return pages that match @tag - * @mapping: the address_space to search - * @index: the starting page index - * @tag: the tag index - * @nr_pages: the maximum number of pages - * @pages: where the resulting pages are placed - * - * Like find_get_pages, except we only return pages which are tagged with - * @tag. We update @index to index the next page for the traversal. - */ -unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, - int tag, unsigned int nr_pages, struct page **pages) -{ - unsigned int i; - unsigned int ret; - - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup_tag(&mapping->page_tree, - (void **)pages, *index, nr_pages, tag); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); - if (ret) - *index = pages[ret - 1]->index + 1; - read_unlock_irq(&mapping->tree_lock); - return ret; -} - -/** - * grab_cache_page_nowait - returns locked page at given index in given cache - * @mapping: target address_space - * @index: the page index - * - * Same as grab_cache_page, but do not wait if the page is unavailable. - * This is intended for speculative data generators, where the data can - * be regenerated if the page couldn't be grabbed. This routine should - * be safe to call while holding the lock for another page. - * - * Clear __GFP_FS when allocating the page to avoid recursion into the fs - * and deadlock against the caller's locked page. - */ -struct page * -grab_cache_page_nowait(struct address_space *mapping, unsigned long index) -{ - struct page *page = find_get_page(mapping, index); - gfp_t gfp_mask; - - if (page) { - if (!TestSetPageLocked(page)) - return page; - page_cache_release(page); - return NULL; - } - gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS; - page = alloc_pages(gfp_mask, 0); - if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) { - page_cache_release(page); - page = NULL; - } - return page; -} -EXPORT_SYMBOL(grab_cache_page_nowait); - -/* - * CD/DVDs are error prone. When a medium error occurs, the driver may fail - * a _large_ part of the i/o request. Imagine the worst scenario: - * - * ---R__________________________________________B__________ - * ^ reading here ^ bad block(assume 4k) - * - * read(R) => miss => readahead(R...B) => media error => frustrating retries - * => failing the whole request => read(R) => read(R+1) => - * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => - * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => - * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... - * - * It is going insane. Fix it by quickly scaling down the readahead size. - */ -static void shrink_readahead_size_eio(struct file *filp, - struct file_ra_state *ra) -{ - if (!ra->ra_pages) - return; - - ra->ra_pages /= 4; - printk(KERN_WARNING "Reducing readahead size to %luK\n", - ra->ra_pages << (PAGE_CACHE_SHIFT - 10)); -} - -/** - * do_generic_mapping_read - generic file read routine - * @mapping: address_space to be read - * @_ra: file's readahead state - * @filp: the file to read - * @ppos: current file position - * @desc: read_descriptor - * @actor: read method - * - * This is a generic file read routine, and uses the - * mapping->a_ops->readpage() function for the actual low-level stuff. - * - * This is really ugly. But the goto's actually try to clarify some - * of the logic when it comes to error handling etc. - * - * Note the struct file* is only passed for the use of readpage. - * It may be NULL. - */ -void do_generic_mapping_read(struct address_space *mapping, - struct file_ra_state *_ra, - struct file *filp, - loff_t *ppos, - read_descriptor_t *desc, - read_actor_t actor) -{ - struct inode *inode = mapping->host; - unsigned long index; - unsigned long end_index; - unsigned long offset; - unsigned long last_index; - unsigned long next_index; - unsigned long prev_index; - loff_t isize; - struct page *cached_page; - struct page *prev_page; - int error; - struct file_ra_state ra = *_ra; - - cached_page = NULL; - prev_page = NULL; - index = *ppos >> PAGE_CACHE_SHIFT; - next_index = index; - prev_index = ra.prev_page; - last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; - offset = *ppos & ~PAGE_CACHE_MASK; - - isize = i_size_read(inode); - if (!isize) - goto out; - - if (readahead_debug_level >= 5) - printk(KERN_DEBUG "read-file(ino=%lu, req=%lu+%lu)\n", - inode->i_ino, index, last_index - index); - - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; - for (;;) { - struct page *page; - unsigned long nr, ret; - - /* nr is the maximum number of bytes to copy from this page */ - nr = PAGE_CACHE_SIZE; - if (index >= end_index) { - if (index > end_index) - goto out; - nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; - if (nr <= offset) { - goto out; - } - } - nr = nr - offset; - - cond_resched(); - - if (!prefer_adaptive_readahead() && index == next_index) - next_index = page_cache_readahead(mapping, &ra, filp, - index, last_index - index); - -find_page: - page = find_get_page(mapping, index); - if (prefer_adaptive_readahead()) { - if (unlikely(page == NULL)) { - ra.prev_page = prev_index; - page_cache_readahead_adaptive(mapping, &ra, - filp, prev_page, NULL, - *ppos >> PAGE_CACHE_SHIFT, - index, last_index); - page = find_get_page(mapping, index); - } else if (PageReadahead(page)) { - ra.prev_page = prev_index; - page_cache_readahead_adaptive(mapping, &ra, - filp, prev_page, page, - *ppos >> PAGE_CACHE_SHIFT, - index, last_index); - } - } - if (unlikely(page == NULL)) { - if (!prefer_adaptive_readahead()) - handle_ra_miss(mapping, &ra, index); - goto no_cached_page; - } - - if (prev_page) - page_cache_release(prev_page); - prev_page = page; - - if (prefer_adaptive_readahead()) - readahead_cache_hit(&ra, page); - - if (readahead_debug_level >= 7) - printk(KERN_DEBUG "read-page(ino=%lu, idx=%lu, io=%s)\n", - inode->i_ino, index, - PageUptodate(page) ? "hit" : "miss"); - - if (!PageUptodate(page)) - goto page_not_up_to_date; -page_ok: - - /* If users can be writing to this page using arbitrary - * virtual addresses, take care about potential aliasing - * before reading the page on the kernel side. - */ - if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); - - /* - * When (part of) the same page is read multiple times - * in succession, only mark it as accessed the first time. - */ - if (prev_index != index) - mark_page_accessed(page); - prev_index = index; - - /* - * Ok, we have the page, and it's up-to-date, so - * now we can copy it to user space... - * - * The actor routine returns how many bytes were actually used.. - * NOTE! This may not be the same as how much of a user buffer - * we filled up (we may be padding etc), so we can only update - * "pos" here (the actor routine has to update the user buffer - * pointers and the remaining count). - */ - ret = actor(desc, page, offset, nr); - offset += ret; - index += offset >> PAGE_CACHE_SHIFT; - offset &= ~PAGE_CACHE_MASK; - - if (ret == nr && desc->count) - continue; - goto out; - -page_not_up_to_date: - /* Get exclusive access to the page ... */ - lock_page(page); - - /* Did it get unhashed before we got the lock? */ - if (!page->mapping) { - unlock_page(page); - continue; - } - - /* Did somebody else fill it already? */ - if (PageUptodate(page)) { - unlock_page(page); - goto page_ok; - } - -readpage: - /* Start the actual read. The read will unlock the page. */ - error = mapping->a_ops->readpage(filp, page); - - if (unlikely(error)) { - if (error == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto find_page; - } - goto readpage_error; - } - - if (!PageUptodate(page)) { - lock_page(page); - if (!PageUptodate(page)) { - if (page->mapping == NULL) { - /* - * invalidate_inode_pages got it - */ - unlock_page(page); - goto find_page; - } - unlock_page(page); - error = -EIO; - shrink_readahead_size_eio(filp, &ra); - goto readpage_error; - } - unlock_page(page); - } - - /* - * i_size must be checked after we have done ->readpage. - * - * Checking i_size after the readpage allows us to calculate - * the correct value for "nr", which means the zero-filled - * part of the page is not copied back to userspace (unless - * another truncate extends the file - this is desired though). - */ - isize = i_size_read(inode); - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(!isize || index > end_index)) { - goto out; - } - - /* nr is the maximum number of bytes to copy from this page */ - nr = PAGE_CACHE_SIZE; - if (index == end_index) { - nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; - if (nr <= offset) { - goto out; - } - } - nr = nr - offset; - goto page_ok; - -readpage_error: - /* UHHUH! A synchronous read error occurred. Report it */ - desc->error = error; - goto out; - -no_cached_page: - /* - * Ok, it wasn't cached, so we need to create a new - * page.. - */ - if (!cached_page) { - cached_page = page_cache_alloc_cold(mapping); - if (!cached_page) { - desc->error = -ENOMEM; - goto out; - } - } - error = add_to_page_cache_lru(cached_page, mapping, - index, GFP_KERNEL); - if (error) { - if (error == -EEXIST) - goto find_page; - desc->error = error; - goto out; - } - page = cached_page; - cached_page = NULL; - if (prev_page) - page_cache_release(prev_page); - prev_page = page; - goto readpage; - } - -out: - *_ra = ra; - if (prefer_adaptive_readahead()) - _ra->prev_page = prev_index; - - *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; - if (cached_page) - page_cache_release(cached_page); - if (prev_page) - page_cache_release(prev_page); - if (filp) - file_accessed(filp); -} -EXPORT_SYMBOL(do_generic_mapping_read); - -int file_read_actor(read_descriptor_t *desc, struct page *page, - unsigned long offset, unsigned long size) -{ - char *kaddr; - unsigned long left, count = desc->count; - - if (size > count) - size = count; - - /* - * Faults on the destination of a read are common, so do it before - * taking the kmap. - */ - if (!fault_in_pages_writeable(desc->arg.buf, size)) { - kaddr = kmap_atomic(page, KM_USER0); - left = __copy_to_user_inatomic(desc->arg.buf, - kaddr + offset, size); - kunmap_atomic(kaddr, KM_USER0); - if (left == 0) - goto success; - } - - /* Do it the slow way */ - kaddr = kmap(page); - left = __copy_to_user(desc->arg.buf, kaddr + offset, size); - kunmap(page); - - if (left) { - size -= left; - desc->error = -EFAULT; - } -success: - desc->count = count - size; - desc->written += size; - desc->arg.buf += size; - return size; -} -EXPORT_SYMBOL_GPL(file_read_actor); - -/** - * __generic_file_aio_read - generic filesystem read routine - * @iocb: kernel I/O control block - * @iov: io vector request - * @nr_segs: number of segments in the iovec - * @ppos: current file position - * - * This is the "read()" routine for all filesystems - * that can use the page cache directly. - */ -ssize_t -generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - struct file *filp = iocb->ki_filp; - ssize_t retval; - unsigned long seg; - size_t count; - loff_t *ppos = &iocb->ki_pos; - - count = 0; - for (seg = 0; seg < nr_segs; seg++) { - const struct iovec *iv = &iov[seg]; - - /* - * If any segment has a negative length, or the cumulative - * length ever wraps negative then return -EINVAL. - */ - count += iv->iov_len; - if (unlikely((ssize_t)(count|iv->iov_len) < 0)) - return -EINVAL; - if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len)) - continue; - if (seg == 0) - return -EFAULT; - nr_segs = seg; - count -= iv->iov_len; /* This segment is no good */ - break; - } - - /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ - if (filp->f_flags & O_DIRECT) { - loff_t size; - struct address_space *mapping; - struct inode *inode; - - mapping = filp->f_mapping; - inode = mapping->host; - retval = 0; - if (!count) - goto out; /* skip atime */ - size = i_size_read(inode); - if (pos < size) { - retval = generic_file_direct_IO(READ, iocb, - iov, pos, nr_segs); - if (retval > 0 && !is_sync_kiocb(iocb)) - retval = -EIOCBQUEUED; - if (retval > 0) - *ppos = pos + retval; - } - file_accessed(filp); - goto out; - } - - retval = 0; - if (count) { - for (seg = 0; seg < nr_segs; seg++) { - read_descriptor_t desc; - - desc.written = 0; - desc.arg.buf = iov[seg].iov_base; - desc.count = iov[seg].iov_len; - if (desc.count == 0) - continue; - desc.error = 0; - do_generic_file_read(filp,ppos,&desc,file_read_actor); - retval += desc.written; - if (desc.error) { - retval = retval ?: desc.error; - break; - } - } - } -out: - return retval; -} -EXPORT_SYMBOL(generic_file_aio_read); - -int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) -{ - ssize_t written; - unsigned long count = desc->count; - struct file *file = desc->arg.data; - - if (size > count) - size = count; - - written = file->f_op->sendpage(file, page, offset, - size, &file->f_pos, sizeerror = written; - written = 0; - } - desc->count = count - written; - desc->written += written; - return written; -} - -ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos, - size_t count, read_actor_t actor, void *target) -{ - read_descriptor_t desc; - - if (!count) - return 0; - - desc.written = 0; - desc.count = count; - desc.arg.data = target; - desc.error = 0; - - do_generic_file_read(in_file, ppos, &desc, actor); - if (desc.written) - return desc.written; - return desc.error; -} -EXPORT_SYMBOL(generic_file_sendfile); - -static ssize_t -do_readahead(struct address_space *mapping, struct file *filp, - unsigned long index, unsigned long nr) -{ - if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) - return -EINVAL; - - force_page_cache_readahead(mapping, filp, index, - max_sane_readahead(nr)); - return 0; -} - -asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) -{ - ssize_t ret; - struct file *file; - - ret = -EBADF; - file = fget(fd); - if (file) { - if (file->f_mode & FMODE_READ) { - struct address_space *mapping = file->f_mapping; - unsigned long start = offset >> PAGE_CACHE_SHIFT; - unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT; - unsigned long len = end - start + 1; - ret = do_readahead(mapping, file, start, len); - } - fput(file); - } - return ret; -} - -#ifdef CONFIG_MMU -static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); -/** - * page_cache_read - adds requested page to the page cache if not already there - * @file: file to read - * @offset: page index - * - * This adds the requested page to the page cache if it isn't already there, - * and schedules an I/O to read in its contents from disk. - */ -static int fastcall page_cache_read(struct file * file, unsigned long offset) -{ - struct address_space *mapping = file->f_mapping; - struct page *page; - int ret; - - do { - page = page_cache_alloc_cold(mapping); - if (!page) - return -ENOMEM; - - ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); - if (ret == 0) - ret = mapping->a_ops->readpage(file, page); - else if (ret == -EEXIST) - ret = 0; /* losing race to add is OK */ - - page_cache_release(page); - - } while (ret == AOP_TRUNCATED_PAGE); - - return ret; -} - -#define MMAP_LOTSAMISS (100) - -/** - * filemap_nopage - read in file data for page fault handling - * @area: the applicable vm_area - * @address: target address to read in - * @type: returned with VM_FAULT_{MINOR,MAJOR} if not %NULL - * - * filemap_nopage() is invoked via the vma operations vector for a - * mapped memory region to read in file data during a page fault. - * - * The goto's are kind of ugly, but this streamlines the normal case of having - * it in the page cache, and handles the special cases reasonably without - * having a lot of duplicated code. - */ -struct page *filemap_nopage(struct vm_area_struct *area, - unsigned long address, int *type) -{ - int error; - struct file *file = area->vm_file; - struct address_space *mapping = file->f_mapping; - struct file_ra_state *ra = &file->f_ra; - struct inode *inode = mapping->host; - struct page *page; - unsigned long size, pgoff; - int did_readaround = 0, majmin = VM_FAULT_MINOR; - - ra->flags |= RA_FLAG_MMAP; - pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; - -retry_all: - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (pgoff >= size) - goto outside_data_content; - - /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(area)) - goto no_cached_page; - - /* - * The readahead code wants to be told about each and every page - * so it can build and shrink its windows appropriately - * - * For sequential accesses, we use the generic readahead logic. - */ - if (!prefer_adaptive_readahead() && VM_SequentialReadHint(area)) - page_cache_readahead(mapping, ra, file, pgoff, 1); - - /* - * Do we have something in the page cache already? - */ -retry_find: - page = find_get_page(mapping, pgoff); - if (prefer_adaptive_readahead() && VM_SequentialReadHint(area)) { - if (!page) { - page_cache_readahead_adaptive(mapping, ra, - file, NULL, NULL, - pgoff, pgoff, pgoff + 1); - page = find_get_page(mapping, pgoff); - } else if (PageReadahead(page)) { - page_cache_readahead_adaptive(mapping, ra, - file, NULL, page, - pgoff, pgoff, pgoff + 1); - } - } - if (!page) { - unsigned long ra_pages; - - if (VM_SequentialReadHint(area)) { - if (!prefer_adaptive_readahead()) - handle_ra_miss(mapping, ra, pgoff); - goto no_cached_page; - } - ra->mmap_miss++; - - /* - * Do we miss much more than hit in this file? If so, - * stop bothering with read-ahead. It will only hurt. - */ - if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS) - goto no_cached_page; - - /* - * To keep the pgmajfault counter straight, we need to - * check did_readaround, as this is an inner loop. - */ - if (!did_readaround) { - majmin = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - did_readaround = 1; - ra_pages = max_sane_readahead(file->f_ra.ra_pages); - if (ra_pages) { - pgoff_t start = 0; - - if (pgoff > ra_pages / 2) - start = pgoff - ra_pages / 2; - do_page_cache_readahead(mapping, file, start, ra_pages); - } - page = find_get_page(mapping, pgoff); - if (!page) - goto no_cached_page; - } - - if (!did_readaround) - ra->mmap_hit++; - - if (prefer_adaptive_readahead()) - readahead_cache_hit(ra, page); - - if (readahead_debug_level >= 6) - printk(KERN_DEBUG "read-mmap(ino=%lu, idx=%lu, hint=%s, io=%s)\n", - inode->i_ino, pgoff, - VM_RandomReadHint(area) ? "random" : - (VM_SequentialReadHint(area) ? "sequential" : "none"), - PageUptodate(page) ? "hit" : "miss"); - - /* - * Ok, found a page in the page cache, now we need to check - * that it's up-to-date. - */ - if (!PageUptodate(page)) - goto page_not_uptodate; - -success: - /* - * Found the page and have a reference on it. - */ - mark_page_accessed(page); - if (type) - *type = majmin; - if (prefer_adaptive_readahead()) - ra->prev_page = page->index; - return page; - -outside_data_content: - /* - * An external ptracer can access pages that normally aren't - * accessible.. - */ - if (area->vm_mm == current->mm) - return NULL; - /* Fall through to the non-read-ahead case */ -no_cached_page: - /* - * We're only likely to ever get here if MADV_RANDOM is in - * effect. - */ - error = page_cache_read(file, pgoff); - grab_swap_token(); - - /* - * The page we want has now been added to the page cache. - * In the unlikely event that someone removed it in the - * meantime, we'll just come back here and read it again. - */ - if (error >= 0) - goto retry_find; - - /* - * An error return from page_cache_read can result if the - * system is low on memory, or a problem occurs while trying - * to schedule I/O. - */ - if (error == -ENOMEM) - return NOPAGE_OOM; - return NULL; - -page_not_uptodate: - if (!did_readaround) { - majmin = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - lock_page(page); - - /* Did it get unhashed while we waited for it? */ - if (!page->mapping) { - unlock_page(page); - page_cache_release(page); - goto retry_all; - } - - /* Did somebody else get it up-to-date? */ - if (PageUptodate(page)) { - unlock_page(page); - goto success; - } - - error = mapping->a_ops->readpage(file, page); - if (!error) { - wait_on_page_locked(page); - if (PageUptodate(page)) - goto success; - } else if (error == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto retry_find; - } - - /* - * Umm, take care of errors if the page isn't up-to-date. - * Try to re-read it _once_. We do this synchronously, - * because there really aren't any performance issues here - * and we need to check for errors. - */ - lock_page(page); - - /* Somebody truncated the page on us? */ - if (!page->mapping) { - unlock_page(page); - page_cache_release(page); - goto retry_all; - } - - /* Somebody else successfully read it in? */ - if (PageUptodate(page)) { - unlock_page(page); - goto success; - } - ClearPageError(page); - error = mapping->a_ops->readpage(file, page); - if (!error) { - wait_on_page_locked(page); - if (PageUptodate(page)) - goto success; - } else if (error == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto retry_find; - } - - /* - * Things didn't work out. Return zero to tell the - * mm layer so, possibly freeing the page cache page first. - */ - shrink_readahead_size_eio(file, ra); - page_cache_release(page); - return NULL; -} -EXPORT_SYMBOL(filemap_nopage); - -static struct page * filemap_getpage(struct file *file, unsigned long pgoff, - int nonblock) -{ - struct address_space *mapping = file->f_mapping; - struct page *page; - int error; - - /* - * Do we have something in the page cache already? - */ -retry_find: - page = find_get_page(mapping, pgoff); - if (!page) { - if (nonblock) - return NULL; - goto no_cached_page; - } - - /* - * Ok, found a page in the page cache, now we need to check - * that it's up-to-date. - */ - if (!PageUptodate(page)) { - if (nonblock) { - page_cache_release(page); - return NULL; - } - goto page_not_uptodate; - } - -success: - /* - * Found the page and have a reference on it. - */ - mark_page_accessed(page); - return page; - -no_cached_page: - error = page_cache_read(file, pgoff); - - /* - * The page we want has now been added to the page cache. - * In the unlikely event that someone removed it in the - * meantime, we'll just come back here and read it again. - */ - if (error >= 0) - goto retry_find; - - /* - * An error return from page_cache_read can result if the - * system is low on memory, or a problem occurs while trying - * to schedule I/O. - */ - return NULL; - -page_not_uptodate: - lock_page(page); - - /* Did it get unhashed while we waited for it? */ - if (!page->mapping) { - unlock_page(page); - goto err; - } - - /* Did somebody else get it up-to-date? */ - if (PageUptodate(page)) { - unlock_page(page); - goto success; - } - - error = mapping->a_ops->readpage(file, page); - if (!error) { - wait_on_page_locked(page); - if (PageUptodate(page)) - goto success; - } else if (error == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto retry_find; - } - - /* - * Umm, take care of errors if the page isn't up-to-date. - * Try to re-read it _once_. We do this synchronously, - * because there really aren't any performance issues here - * and we need to check for errors. - */ - lock_page(page); - - /* Somebody truncated the page on us? */ - if (!page->mapping) { - unlock_page(page); - goto err; - } - /* Somebody else successfully read it in? */ - if (PageUptodate(page)) { - unlock_page(page); - goto success; - } - - ClearPageError(page); - error = mapping->a_ops->readpage(file, page); - if (!error) { - wait_on_page_locked(page); - if (PageUptodate(page)) - goto success; - } else if (error == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto retry_find; - } - - /* - * Things didn't work out. Return zero to tell the - * mm layer so, possibly freeing the page cache page first. - */ -err: - page_cache_release(page); - - return NULL; -} - -int filemap_populate(struct vm_area_struct *vma, unsigned long addr, - unsigned long len, pgprot_t prot, unsigned long pgoff, - int nonblock) -{ - struct file *file = vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - unsigned long size; - struct mm_struct *mm = vma->vm_mm; - struct page *page; - int err; - - if (!nonblock) - force_page_cache_readahead(mapping, vma->vm_file, - pgoff, len >> PAGE_CACHE_SHIFT); - -repeat: - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (pgoff + (len >> PAGE_CACHE_SHIFT) > size) - return -EINVAL; - - page = filemap_getpage(file, pgoff, nonblock); - - /* XXX: This is wrong, a filesystem I/O error may have happened. Fix that as - * done in shmem_populate calling shmem_getpage */ - if (!page && !nonblock) - return -ENOMEM; - - if (page) { - err = install_page(mm, vma, addr, page, prot); - if (err) { - page_cache_release(page); - return err; - } - } else if (vma->vm_flags & VM_NONLINEAR) { - /* No page was found just because we can't read it in now (being - * here implies nonblock != 0), but the page may exist, so set - * the PTE to fault it in later. */ - err = install_file_pte(mm, vma, addr, pgoff, prot); - if (err) - return err; - } - - len -= PAGE_SIZE; - addr += PAGE_SIZE; - pgoff++; - if (len) - goto repeat; - - return 0; -} -EXPORT_SYMBOL(filemap_populate); - -struct vm_operations_struct generic_file_vm_ops = { - .nopage = filemap_nopage, - .populate = filemap_populate, -}; - -/* This is used for a general mmap of a disk file */ - -int generic_file_mmap(struct file * file, struct vm_area_struct * vma) -{ - struct address_space *mapping = file->f_mapping; - - if (!mapping->a_ops->readpage) - return -ENOEXEC; - file_accessed(file); - vma->vm_ops = &generic_file_vm_ops; - return 0; -} - -/* - * This is for filesystems which do not implement ->writepage. - */ -int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) -{ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) - return -EINVAL; - return generic_file_mmap(file, vma); -} -#else -int generic_file_mmap(struct file * file, struct vm_area_struct * vma) -{ - return -ENOSYS; -} -int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) -{ - return -ENOSYS; -} -#endif /* CONFIG_MMU */ - -EXPORT_SYMBOL(generic_file_mmap); -EXPORT_SYMBOL(generic_file_readonly_mmap); - -static inline struct page *__read_cache_page(struct address_space *mapping, - unsigned long index, - int (*filler)(void *,struct page*), - void *data) -{ - struct page *page, *cached_page = NULL; - int err; -repeat: - page = find_get_page(mapping, index); - if (!page) { - if (!cached_page) { - cached_page = page_cache_alloc_cold(mapping); - if (!cached_page) - return ERR_PTR(-ENOMEM); - } - err = add_to_page_cache_lru(cached_page, mapping, - index, GFP_KERNEL); - if (err == -EEXIST) - goto repeat; - if (err < 0) { - /* Presumably ENOMEM for radix tree node */ - page_cache_release(cached_page); - return ERR_PTR(err); - } - page = cached_page; - cached_page = NULL; - err = filler(data, page); - if (err < 0) { - page_cache_release(page); - page = ERR_PTR(err); - } - } - if (cached_page) - page_cache_release(cached_page); - return page; -} - -/** - * read_cache_page - read into page cache, fill it if needed - * @mapping: the page's address_space - * @index: the page index - * @filler: function to perform the read - * @data: destination for read data - * - * Read into the page cache. If a page already exists, - * and PageUptodate() is not set, try to fill the page. - */ -struct page *read_cache_page(struct address_space *mapping, - unsigned long index, - int (*filler)(void *,struct page*), - void *data) -{ - struct page *page; - int err; - -retry: - page = __read_cache_page(mapping, index, filler, data); - if (IS_ERR(page)) - goto out; - mark_page_accessed(page); - if (PageUptodate(page)) - goto out; - - lock_page(page); - if (!page->mapping) { - unlock_page(page); - page_cache_release(page); - goto retry; - } - if (PageUptodate(page)) { - unlock_page(page); - goto out; - } - err = filler(data, page); - if (err < 0) { - page_cache_release(page); - page = ERR_PTR(err); - } - out: - return page; -} -EXPORT_SYMBOL(read_cache_page); - -/* - * If the page was newly created, increment its refcount and add it to the - * caller's lru-buffering pagevec. This function is specifically for - * generic_file_write(). - */ -static inline struct page * -__grab_cache_page(struct address_space *mapping, unsigned long index, - struct page **cached_page, struct pagevec *lru_pvec) -{ - int err; - struct page *page; -repeat: - page = find_lock_page(mapping, index); - if (!page) { - if (!*cached_page) { - *cached_page = page_cache_alloc(mapping); - if (!*cached_page) - return NULL; - } - err = add_to_page_cache(*cached_page, mapping, - index, GFP_KERNEL); - if (err == -EEXIST) - goto repeat; - if (err == 0) { - page = *cached_page; - page_cache_get(page); - if (!pagevec_add(lru_pvec, page)) - __pagevec_lru_add(lru_pvec); - *cached_page = NULL; - } - } - return page; -} - -/* - * The logic we want is - * - * if suid or (sgid and xgrp) - * remove privs - */ -int remove_suid(struct dentry *dentry) -{ - mode_t mode = dentry->d_inode->i_mode; - int kill = 0; - int result = 0; - - /* suid always must be killed */ - if (unlikely(mode & S_ISUID)) - kill = ATTR_KILL_SUID; - - /* - * sgid without any exec bits is just a mandatory locking mark; leave - * it alone. If some exec bits are set, it's a real sgid; kill it. - */ - if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) - kill |= ATTR_KILL_SGID; - - if (unlikely(kill && !capable(CAP_FSETID))) { - struct iattr newattrs; - - newattrs.ia_valid = ATTR_FORCE | kill; - result = notify_change(dentry, &newattrs); - } - return result; -} -EXPORT_SYMBOL(remove_suid); - -size_t -__filemap_copy_from_user_iovec_inatomic(char *vaddr, - const struct iovec *iov, size_t base, size_t bytes) -{ - size_t copied = 0, left = 0; - - while (bytes) { - char __user *buf = iov->iov_base + base; - int copy = min(bytes, iov->iov_len - base); - - base = 0; - left = __copy_from_user_inatomic_nocache(vaddr, buf, copy); - copied += copy; - bytes -= copy; - vaddr += copy; - iov++; - - if (unlikely(left)) - break; - } - return copied - left; -} - -/* - * Performs necessary checks before doing a write - * - * Can adjust writing position or amount of bytes to write. - * Returns appropriate error code that caller should return or - * zero in case that write should be allowed. - */ -inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) -{ - struct inode *inode = file->f_mapping->host; - unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - - if (unlikely(*pos < 0)) - return -EINVAL; - - if (!isblk) { - /* FIXME: this is for backwards compatibility with 2.4 */ - if (file->f_flags & O_APPEND) - *pos = i_size_read(inode); - - if (limit != RLIM_INFINITY) { - if (*pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - if (*count > limit - (typeof(limit))*pos) { - *count = limit - (typeof(limit))*pos; - } - } - } - - /* - * LFS rule - */ - if (unlikely(*pos + *count > MAX_NON_LFS && - !(file->f_flags & O_LARGEFILE))) { - if (*pos >= MAX_NON_LFS) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - if (*count > MAX_NON_LFS - (unsigned long)*pos) { - *count = MAX_NON_LFS - (unsigned long)*pos; - } - } - - /* - * Are we about to exceed the fs block limit ? - * - * If we have written data it becomes a short write. If we have - * exceeded without writing data we send a signal and return EFBIG. - * Linus frestrict idea will clean these up nicely.. - */ - if (likely(!isblk)) { - if (unlikely(*pos >= inode->i_sb->s_maxbytes)) { - if (*count || *pos > inode->i_sb->s_maxbytes) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - /* zero-length writes at ->s_maxbytes are OK */ - } - - if (unlikely(*pos + *count > inode->i_sb->s_maxbytes)) - *count = inode->i_sb->s_maxbytes - *pos; - } else { - loff_t isize; - if (bdev_read_only(I_BDEV(inode))) - return -EPERM; - isize = i_size_read(inode); - if (*pos >= isize) { - if (*count || *pos > isize) - return -ENOSPC; - } - - if (*pos + *count > isize) - *count = isize - *pos; - } - return 0; -} -EXPORT_SYMBOL(generic_write_checks); - -ssize_t -generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long *nr_segs, loff_t pos, loff_t *ppos, - size_t count, size_t ocount) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t written; - - if (count != ocount) - *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); - - written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); - if (written > 0) { - loff_t end = pos + written; - if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { - i_size_write(inode, end); - mark_inode_dirty(inode); - } - *ppos = end; - } - - /* - * Sync the fs metadata but not the minor inode changes and - * of course not the data as we did direct DMA for the IO. - * i_mutex is held, which protects generic_osync_inode() from - * livelocking. - */ - if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); - if (err < 0) - written = err; - } - if (written == count && !is_sync_kiocb(iocb)) - written = -EIOCBQUEUED; - return written; -} -EXPORT_SYMBOL(generic_file_direct_write); - -/** - * write_actor - copy data from user buffer - * @page: the page to copy data to - * @offset: offset within the page - * @bytes: number of bytes to copy - * @desc: pointer to user buffer is obtained from here - * - * This is used to copy data from user buffer into @page in case of i/o vector - * has 1 segment. In case of write, in short. - */ -static size_t write_actor(struct page *page, unsigned long offset, - size_t bytes, const write_descriptor_t *desc) -{ - return filemap_copy_from_user(page, offset, desc->buf, bytes); -} - -/** - * write_iovec_actor - copy data from i/o vector - * @page: the page to copy data to - * @offset: offset within the page - * @bytes: number of bytes to copy - * @desc: current iovec and offset in it are obtained from here - * - * This is used to copy data from user buffer into @page in case of i/o vector - * has more than segment. In case of writev, in short. - */ -static size_t write_iovec_actor(struct page *page, unsigned long offset, - size_t bytes, const write_descriptor_t *desc) -{ - return filemap_copy_from_user_iovec(page, offset, desc->cur_iov, - desc->iov_off, bytes); -} - -/** - * generic_batch_write - generic implementation of batched write - * @file: the file to write to - * @desc: set of write arguments - * @lru_pvec: multipage container to batch adding pages to LRU list - * @cached_page: allocated but not used on previous call - * @written: returned number of bytes successfully written - * - * This implementation of batch_write method writes not more than one page of - * file. It faults in user space, allocates page and calls prepare_write and - * commit_write address space operations. User data are copied by an actor - * which is set by caller depending on whether write or writev is on the way. - */ -static long generic_batch_write(struct file *file, - const write_descriptor_t *desc, - struct pagevec *lru_pvec, - struct page **cached_page, size_t *written) -{ - const struct address_space_operations *a_ops = file->f_mapping->a_ops; - struct page *page; - unsigned long index; - size_t bytes; - unsigned long offset; - long status; - - /* offset within page write is to start at */ - offset = (desc->pos & (PAGE_CACHE_SIZE - 1)); - - /* index of page we are to write to */ - index = desc->pos >> PAGE_CACHE_SHIFT; - - /* number of bytes which can be written to the page */ - bytes = PAGE_CACHE_SIZE - offset; - - /* Limit the size of the copy to the caller's write size */ - bytes = min(bytes, desc->count); - - /* - * Limit the size of the copy to that of the current segment, - * because fault_in_pages_readable() doesn't know how to walk - * segments. - */ - bytes = min(bytes, desc->cur_iov->iov_len - desc->iov_off); - - while (1) { - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - */ - fault_in_pages_readable(desc->buf, bytes); - - page = __grab_cache_page(file->f_mapping, index, cached_page, - lru_pvec); - if (!page) - return -ENOMEM; - - status = a_ops->prepare_write(file, page, offset, - offset+bytes); - if (unlikely(status)) { - loff_t isize = i_size_read(file->f_mapping->host); - - if (status != AOP_TRUNCATED_PAGE) - unlock_page(page); - page_cache_release(page); - if (status == AOP_TRUNCATED_PAGE) - continue; - /* - * prepare_write() may have instantiated a few - * blocks outside i_size. Trim these off - * again. - */ - if (desc->pos + bytes > isize) - vmtruncate(file->f_mapping->host, isize); - return status; - } - - /* - * call write actor in order to copy user data to the - * page - */ - *written = desc->actor(page, offset, bytes, desc); - - flush_dcache_page(page); - status = a_ops->commit_write(file, page, offset, offset+bytes); - if (status == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - continue; - } - - unlock_page(page); - mark_page_accessed(page); - page_cache_release(page); - break; - } - /* - * If commit_write returned error - write failed and we zero - * number of written bytes. If write_actor copied less than it - * was asked to we return -EFAULT and number of bytes - * actually written. - */ - if (status) - *written = 0; - else if (*written != bytes) - status = -EFAULT; - return status; -} - -ssize_t -generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, loff_t *ppos, - size_t count, ssize_t written) -{ - struct file *file = iocb->ki_filp; - struct address_space * mapping = file->f_mapping; - const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; - long status; - struct page *cached_page = NULL; - struct pagevec lru_pvec; - write_descriptor_t desc; - size_t copied = 0; - - pagevec_init(&lru_pvec, 0); - - /* - * initialize write descriptor fields: position to write to - * and number of bytes to write - */ - desc.pos = pos; - desc.count = count; - - /* - * handle partial DIO write. Adjust cur_iov if needed. - */ - if (likely(nr_segs == 1)) { - desc.cur_iov = iov; - desc.iov_off = written; - desc.actor = write_actor; - } else { - filemap_set_next_iovec(&desc.cur_iov, &desc.iov_off, written); - desc.actor = write_iovec_actor; - } - /* pointer to user buffer */ - desc.buf = desc.cur_iov->iov_base + desc.iov_off; - - do { - /* - * When calling the filesystem for writes, there is processing - * that must be done: - * 1) per word - * 2) per page - * 3) per call to the FS - * If the FS is called per page, then it turns out that 3) - * costs more than 1) and 2) for sophisticated filesystems. To - * allow the FS to choose to pay the cost of 3) only once we - * call batch_write, if the FS supports it. - */ - if (a_ops->batch_write) - status = a_ops->batch_write(file, &desc, &lru_pvec, - &cached_page, &copied); - else - status = generic_batch_write(file, &desc, &lru_pvec, - &cached_page, &copied); - if (likely(copied > 0)) { - written += copied; - desc.count -= copied; - if (desc.count) { - /* - * not everything is written yet. Adjust write - * descriptor for next iteration - */ - desc.pos += copied; - if (unlikely(nr_segs > 1)) - filemap_set_next_iovec(&desc.cur_iov, - &desc.iov_off, - copied); - else - desc.iov_off += copied; - desc.buf = desc.cur_iov->iov_base + - desc.iov_off; - } - } - if (status < 0) - break; - balance_dirty_pages_ratelimited(mapping); - cond_resched(); - } while (desc.count); - *ppos = pos + written; - - if (cached_page) - page_cache_release(cached_page); - - /* - * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC - */ - if (likely(status >= 0)) { - if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - if (!a_ops->writepage || !is_sync_kiocb(iocb)) - status = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - } - } - - /* - * If we get here for O_DIRECT writes then we must have fallen through - * to buffered writes (block instantiation inside i_size). So we sync - * the file data here, to try to honour O_DIRECT expectations. - */ - if (unlikely(file->f_flags & O_DIRECT) && written) - status = filemap_write_and_wait(mapping); - - pagevec_lru_add(&lru_pvec); - return written ? written : status; -} -EXPORT_SYMBOL(generic_file_buffered_write); - -static ssize_t -__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) -{ - struct file *file = iocb->ki_filp; - const struct address_space * mapping = file->f_mapping; - size_t ocount; /* original count */ - size_t count; /* after file limit checks */ - struct inode *inode = mapping->host; - unsigned long seg; - loff_t pos; - ssize_t written; - ssize_t err; - - ocount = 0; - for (seg = 0; seg < nr_segs; seg++) { - const struct iovec *iv = &iov[seg]; - - /* - * If any segment has a negative length, or the cumulative - * length ever wraps negative then return -EINVAL. - */ - ocount += iv->iov_len; - if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) - return -EINVAL; - if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) - continue; - if (seg == 0) - return -EFAULT; - nr_segs = seg; - ocount -= iv->iov_len; /* This segment is no good */ - break; - } - - count = ocount; - pos = *ppos; - - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); - - /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; - written = 0; - - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); - if (err) - goto out; - - if (count == 0) - goto out; - - err = remove_suid(file->f_dentry); - if (err) - goto out; - - file_update_time(file); - - /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ - if (unlikely(file->f_flags & O_DIRECT)) { - written = generic_file_direct_write(iocb, iov, - &nr_segs, pos, ppos, count, ocount); - if (written < 0 || written == count) - goto out; - /* - * direct-io write to a hole: fall through to buffered I/O - * for completing the rest of the request. - */ - pos += written; - count -= written; - } - - written = generic_file_buffered_write(iocb, iov, nr_segs, - pos, ppos, count, written); -out: - current->backing_dev_info = NULL; - return written ? written : err; -} - -ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, - const struct iovec *iov, unsigned long nr_segs, loff_t pos) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret; - - BUG_ON(iocb->ki_pos != pos); - - ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, - &iocb->ki_pos); - - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - ssize_t err; - - err = sync_page_range_nolock(inode, mapping, pos, ret); - if (err < 0) - ret = err; - } - return ret; -} -EXPORT_SYMBOL(generic_file_aio_write_nolock); - -ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret; - - BUG_ON(iocb->ki_pos != pos); - - mutex_lock(&inode->i_mutex); - ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, - &iocb->ki_pos); - mutex_unlock(&inode->i_mutex); - - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - ssize_t err; - - err = sync_page_range(inode, mapping, pos, ret); - if (err < 0) - ret = err; - } - return ret; -} -EXPORT_SYMBOL(generic_file_aio_write); - -/* - * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something - * went wrong during pagecache shootdown. - */ -static ssize_t -generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - ssize_t retval; - size_t write_len = 0; - - /* - * If it's a write, unmap all mmappings of the file up-front. This - * will cause any pte dirty bits to be propagated into the pageframes - * for the subsequent filemap_write_and_wait(). - */ - if (rw == WRITE) { - write_len = iov_length(iov, nr_segs); - if (mapping_mapped(mapping)) - unmap_mapping_range(mapping, offset, write_len, 0); - } - - retval = filemap_write_and_wait(mapping); - if (retval == 0) { - retval = mapping->a_ops->direct_IO(rw, iocb, iov, - offset, nr_segs); - if (rw == WRITE && mapping->nrpages) { - pgoff_t end = (offset + write_len - 1) - >> PAGE_CACHE_SHIFT; - int err = invalidate_inode_pages2_range(mapping, - offset >> PAGE_CACHE_SHIFT, end); - if (err) - retval = err; - } - } - return retval; -} diff -urN newtree/mm/migrate.c newtree.2/mm/migrate.c --- newtree/mm/migrate.c 2006-07-05 10:06:57.000000000 -0400 +++ newtree.2/mm/migrate.c 2006-07-11 13:50:10.000000000 -0400 @@ -29,8 +29,6 @@ #include #include -#include "internal.h" - #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) /* @@ -303,15 +301,18 @@ return 0; } - write_lock_irq(&mapping->tree_lock); + SetPageNoNewRefs(page); + smp_wmb(); + spin_lock_irq(&mapping->tree_lock); radix_pointer = (struct page **)radix_tree_lookup_slot( &mapping->page_tree, page_index(page)); if (page_count(page) != 2 + !!PagePrivate(page) || - radix_tree_deref_slot(radix_pointer) != page) { - write_unlock_irq(&mapping->tree_lock); + radix_tree_deref_slot(radix_pointer) != page) { + spin_unlock_irq(&mapping->tree_lock); + ClearPageNoNewRefs(page); return -EAGAIN; } @@ -326,9 +327,16 @@ } #endif - radix_tree_replace_slot(radix_pointer, newpage); + SetPageNoNewRefs(newpage); + radix_tree_replace_slot(radix_pointer, newpage); + page->mapping = NULL; + + spin_unlock_irq(&mapping->tree_lock); __put_page(page); - write_unlock_irq(&mapping->tree_lock); + + smp_wmb(); + ClearPageNoNewRefs(page); + ClearPageNoNewRefs(newpage); return 0; } diff -urN newtree/mm/page-writeback.c newtree.2/mm/page-writeback.c --- newtree/mm/page-writeback.c 2006-07-05 10:06:57.000000000 -0400 +++ newtree.2/mm/page-writeback.c 2006-07-11 13:50:10.000000000 -0400 @@ -630,7 +630,7 @@ struct address_space *mapping2; if (mapping) { - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); @@ -640,7 +640,7 @@ radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, @@ -719,23 +719,23 @@ WARN_ON_ONCE(!PageLocked(page)); if (mapping) { - write_lock_irqsave(&mapping->tree_lock, flags); + spin_lock_irqsave(&mapping->tree_lock, flags); if (TestClearPageDirty(page)) { radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); - write_unlock_irqrestore(&mapping->tree_lock, flags); /* * We can continue to use `mapping' here because the * page is locked, which pins the address_space */ + spin_unlock_irqrestore(&mapping->tree_lock, flags); if (mapping_cap_account_dirty(mapping)) { page_mkclean(page); dec_zone_page_state(page, NR_FILE_DIRTY); } return 1; } - write_unlock_irqrestore(&mapping->tree_lock, flags); + spin_unlock_irqrestore(&mapping->tree_lock, flags); return 0; } return TestClearPageDirty(page); @@ -778,33 +778,32 @@ int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - int ret; if (mapping) { unsigned long flags; + int ret; - write_lock_irqsave(&mapping->tree_lock, flags); + spin_lock_irqsave(&mapping->tree_lock, flags); ret = TestClearPageWriteback(page); if (ret) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_WRITEBACK); - write_unlock_irqrestore(&mapping->tree_lock, flags); - } else { - ret = TestClearPageWriteback(page); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + return ret; } - return ret; + return TestClearPageWriteback(page); } int test_set_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - int ret; if (mapping) { unsigned long flags; + int ret; - write_lock_irqsave(&mapping->tree_lock, flags); + spin_lock_irqsave(&mapping->tree_lock, flags); ret = TestSetPageWriteback(page); if (!ret) radix_tree_tag_set(&mapping->page_tree, @@ -814,27 +813,24 @@ radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); - write_unlock_irqrestore(&mapping->tree_lock, flags); - } else { - ret = TestSetPageWriteback(page); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + return ret; } - return ret; + return TestSetPageWriteback(page); } EXPORT_SYMBOL(test_set_page_writeback); /* - * Return true if any of the pages in the mapping are marged with the + * Return true if any of the pages in the mapping are marked with the * passed tag. */ int mapping_tagged(struct address_space *mapping, int tag) { - unsigned long flags; int ret; - - read_lock_irqsave(&mapping->tree_lock, flags); + rcu_read_lock(); ret = radix_tree_tagged(&mapping->page_tree, tag); - read_unlock_irqrestore(&mapping->tree_lock, flags); + rcu_read_unlock(); return ret; } EXPORT_SYMBOL(mapping_tagged); diff -urN newtree/mm/readahead.c newtree.2/mm/readahead.c --- newtree/mm/readahead.c 2006-07-05 10:06:57.000000000 -0400 +++ newtree.2/mm/readahead.c 2006-07-11 13:50:10.000000000 -0400 @@ -398,21 +398,21 @@ /* * Preallocate as many pages as we will need. */ - read_lock_irq(&mapping->tree_lock); for (page_idx = 0; page_idx < nr_to_read; page_idx++) { pgoff_t page_offset = offset + page_idx; if (page_offset > end_index) break; + /* Don't need mapping->tree_lock - lookup can be racy */ + rcu_read_lock(); page = radix_tree_lookup(&mapping->page_tree, page_offset); + rcu_read_unlock(); if (page) continue; - read_unlock_irq(&mapping->tree_lock); cond_resched(); page = page_cache_alloc_cold(mapping); - read_lock_irq(&mapping->tree_lock); if (!page) break; page->index = page_offset; @@ -421,7 +421,6 @@ SetPageReadahead(page); ret++; } - read_unlock_irq(&mapping->tree_lock); /* * Now start the IO. We ignore I/O errors - if the page is not @@ -1324,7 +1323,7 @@ pgoff_t ra_index; cond_resched(); - read_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); ra_index = radix_tree_scan_hole(&mapping->page_tree, index, max_scan); #ifdef DEBUG_READAHEAD_RADIXTREE BUG_ON(!__probe_page(mapping, index)); @@ -1336,7 +1335,7 @@ if (ra_index != ~0UL && ra_index - index < max_scan) WARN_ON(__probe_page(mapping, ra_index)); #endif - read_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); if (ra_index <= index + max_scan) return ra_index; @@ -1359,13 +1358,13 @@ * Poor man's radix_tree_scan_data_backward() implementation. * Acceptable because max_scan won't be large. */ - read_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); for (; origin - index < max_scan;) if (__probe_page(mapping, --index)) { - read_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return index + 1; } - read_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return 0; } @@ -1416,7 +1415,7 @@ * The count here determines ra_size. */ cond_resched(); - read_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); index = radix_tree_scan_hole_backward(&mapping->page_tree, offset - 1, ra_max); #ifdef DEBUG_READAHEAD_RADIXTREE @@ -1458,7 +1457,7 @@ break; out_unlock: - read_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); /* * For sequential read that extends from index 0, the counted value diff -urN newtree/mm/swap_prefetch.c newtree.2/mm/swap_prefetch.c --- newtree/mm/swap_prefetch.c 2006-07-05 10:06:57.000000000 -0400 +++ newtree.2/mm/swap_prefetch.c 2006-07-11 13:50:10.000000000 -0400 @@ -190,10 +190,10 @@ enum trickle_return ret = TRICKLE_FAILED; struct page *page; - read_lock_irq(&swapper_space.tree_lock); + spin_lock_irq(&swapper_space.tree_lock); /* Entry may already exist */ page = radix_tree_lookup(&swapper_space.page_tree, entry.val); - read_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); if (page) { remove_from_swapped_list(entry.val); goto out; diff -urN newtree/mm/swap_state.c newtree.2/mm/swap_state.c --- newtree/mm/swap_state.c 2006-07-05 10:06:57.000000000 -0400 +++ newtree.2/mm/swap_state.c 2006-07-11 13:50:10.000000000 -0400 @@ -39,7 +39,7 @@ struct address_space swapper_space = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), - .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock), + .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), .a_ops = &swap_aops, .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), .backing_dev_info = &swap_backing_dev_info, @@ -79,7 +79,9 @@ BUG_ON(PagePrivate(page)); error = radix_tree_preload(gfp_mask); if (!error) { - write_lock_irq(&swapper_space.tree_lock); + SetPageNoNewRefs(page); + smp_wmb(); + spin_lock_irq(&swapper_space.tree_lock); error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); if (!error) { @@ -91,7 +93,9 @@ total_swapcache_pages++; __inc_zone_page_state(page, NR_FILE_PAGES); } - write_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); + smp_wmb(); + ClearPageNoNewRefs(page); radix_tree_preload_end(); } return error; @@ -206,9 +210,9 @@ entry.val = page_private(page); - write_lock_irq(&swapper_space.tree_lock); + spin_lock_irq(&swapper_space.tree_lock); __delete_from_swap_cache(page); - write_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); swap_free(entry); page_cache_release(page); @@ -238,7 +242,7 @@ int move_from_swap_cache(struct page *page, unsigned long index, struct address_space *mapping) { - int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC); + int err = __add_to_page_cache(page, mapping, index, GFP_ATOMIC); if (!err) { delete_from_swap_cache(page); /* shift page from clean_pages to dirty_pages list */ @@ -295,6 +299,29 @@ } } +struct page *find_get_swap_page(swp_entry_t entry) +{ + struct page *page; + + rcu_read_lock(); +repeat: + page = radix_tree_lookup(&swapper_space.page_tree, entry.val); + if (page) { + page = page_cache_get_speculative(page); + if (unlikely(!page)) + goto repeat; + /* Has the page been truncated? */ + if (unlikely(!PageSwapCache(page) + || page_private(page) != entry.val)) { + page_cache_release(page); + goto repeat; + } + } + rcu_read_unlock(); + + return page; +} + /* * Lookup a swap entry in the swap cache. A found page will be returned * unlocked and with its refcount incremented - we rely on the kernel @@ -305,7 +332,7 @@ { struct page *page; - page = find_get_page(&swapper_space, entry.val); + page = find_get_swap_page(entry); if (page) INC_CACHE_INFO(find_success); @@ -335,7 +362,7 @@ * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ - found_page = find_get_page(&swapper_space, entry.val); + found_page = find_get_swap_page(entry); if (found_page) break; diff -urN newtree/mm/swapfile.c newtree.2/mm/swapfile.c --- newtree/mm/swapfile.c 2006-07-05 10:06:57.000000000 -0400 +++ newtree.2/mm/swapfile.c 2006-07-11 13:50:10.000000000 -0400 @@ -367,13 +367,13 @@ retval = 0; if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the swapcache lock held.. */ - write_lock_irq(&swapper_space.tree_lock); + spin_lock_irq(&swapper_space.tree_lock); if ((page_count(page) == 2) && !PageWriteback(page)) { __delete_from_swap_cache(page); SetPageDirty(page); retval = 1; } - write_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); } spin_unlock(&swap_lock); @@ -400,7 +400,7 @@ p = swap_info_get(entry); if (p) { if (swap_entry_free(p, swp_offset(entry)) == 1) { - page = find_get_page(&swapper_space, entry.val); + page = find_get_swap_page(entry); if (page && unlikely(TestSetPageLocked(page))) { page_cache_release(page); page = NULL; diff -urN newtree/mm/truncate.c newtree.2/mm/truncate.c --- newtree/mm/truncate.c 2006-07-05 10:06:57.000000000 -0400 +++ newtree.2/mm/truncate.c 2006-07-11 13:50:10.000000000 -0400 @@ -67,15 +67,15 @@ if (PagePrivate(page) && !try_to_release_page(page, 0)) return 0; - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); if (PageDirty(page)) { - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return 0; } BUG_ON(PagePrivate(page)); __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); ClearPageUptodate(page); page_cache_release(page); /* pagecache ref */ return 1; diff -urN newtree/mm/vmscan.c newtree.2/mm/vmscan.c --- newtree/mm/vmscan.c 2006-07-08 06:15:26.000000000 -0400 +++ newtree.2/mm/vmscan.c 2006-07-11 13:50:10.000000000 -0400 @@ -382,7 +382,9 @@ if (!mapping) return 0; /* truncate got there first */ - write_lock_irq(&mapping->tree_lock); + SetPageNoNewRefs(page); + smp_wmb(); + spin_lock_irq(&mapping->tree_lock); /* * The non-racy check for busy page. It is critical to check @@ -399,19 +401,23 @@ swp_entry_t swap = { .val = page_private(page) }; add_to_swapped_list(page); __delete_from_swap_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); swap_free(swap); - __put_page(page); /* The pagecache ref */ - return 1; + goto free_it; } __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); - __put_page(page); + spin_unlock_irq(&mapping->tree_lock); + +free_it: + smp_wmb(); + __ClearPageNoNewRefs(page); + __put_page(page); /* The pagecache ref */ return 1; cannot_free: - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); + ClearPageNoNewRefs(page); return 0; }