diff -urN newtree/drivers/mtd/devices/block2mtd.c newtree.2/drivers/mtd/devices/block2mtd.c
--- newtree/drivers/mtd/devices/block2mtd.c	2006-07-05 10:06:57.000000000 -0400
+++ newtree.2/drivers/mtd/devices/block2mtd.c	2006-07-11 13:50:10.000000000 -0400
@@ -58,28 +58,27 @@
 
 	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
 
-	read_lock_irq(&mapping->tree_lock);
 	for (i = 0; i < PAGE_READAHEAD; i++) {
 		pagei = index + i;
 		if (pagei > end_index) {
 			INFO("Overrun end of disk in cache readahead\n");
 			break;
 		}
+		/* Don't need mapping->tree_lock - lookup can be racy */
+		rcu_read_lock();
 		page = radix_tree_lookup(&mapping->page_tree, pagei);
+		rcu_read_unlock();
 		if (page && (!i))
 			break;
 		if (page)
 			continue;
-		read_unlock_irq(&mapping->tree_lock);
 		page = page_cache_alloc_cold(mapping);
-		read_lock_irq(&mapping->tree_lock);
 		if (!page)
 			break;
 		page->index = pagei;
 		list_add(&page->lru, &page_pool);
 		ret++;
 	}
-	read_unlock_irq(&mapping->tree_lock);
 	if (ret)
 		read_cache_pages(mapping, &page_pool, filler, NULL);
 }
diff -urN newtree/fs/buffer.c newtree.2/fs/buffer.c
--- newtree/fs/buffer.c	2006-07-05 10:06:57.000000000 -0400
+++ newtree.2/fs/buffer.c	2006-07-11 13:50:10.000000000 -0400
@@ -848,7 +848,7 @@
 	spin_unlock(&mapping->private_lock);
 
 	if (!TestSetPageDirty(page)) {
-		write_lock_irq(&mapping->tree_lock);
+		spin_lock_irq(&mapping->tree_lock);
 		if (page->mapping) {	/* Race with truncate? */
 			if (mapping_cap_account_dirty(mapping))
 				__inc_zone_page_state(page, NR_FILE_DIRTY);
@@ -856,7 +856,7 @@
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
 		}
-		write_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 		return 1;
 	}
diff -urN newtree/fs/inode.c newtree.2/fs/inode.c
--- newtree/fs/inode.c	2006-07-05 10:06:57.000000000 -0400
+++ newtree.2/fs/inode.c	2006-07-11 13:50:10.000000000 -0400
@@ -194,7 +194,7 @@
 	mutex_init(&inode->i_mutex);
 	init_rwsem(&inode->i_alloc_sem);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
-	rwlock_init(&inode->i_data.tree_lock);
+	spin_lock_init(&inode->i_data.tree_lock);
 	spin_lock_init(&inode->i_data.i_mmap_lock);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
 	spin_lock_init(&inode->i_data.private_lock);
diff -urN newtree/fs/reiser4/as_ops.c newtree.2/fs/reiser4/as_ops.c
--- newtree/fs/reiser4/as_ops.c	2006-07-05 10:06:57.000000000 -0400
+++ newtree.2/fs/reiser4/as_ops.c	2006-07-11 13:50:10.000000000 -0400
@@ -77,7 +77,7 @@
 		struct address_space *mapping = page->mapping;
 
 		if (mapping) {
-			write_lock_irq(&mapping->tree_lock);
+			spin_lock_irq(&mapping->tree_lock);
 
 			/* check for race with truncate */
 			if (page->mapping) {
@@ -89,7 +89,7 @@
 						   page->index,
 						   PAGECACHE_TAG_REISER4_MOVED);
 			}
-			write_unlock_irq(&mapping->tree_lock);
+			spin_unlock_irq(&mapping->tree_lock);
 			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 		}
 	}
diff -urN newtree/fs/reiser4/jnode.c newtree.2/fs/reiser4/jnode.c
--- newtree/fs/reiser4/jnode.c	2006-07-05 10:06:57.000000000 -0400
+++ newtree.2/fs/reiser4/jnode.c	2006-07-11 13:50:10.000000000 -0400
@@ -434,9 +434,9 @@
 	if (rtree->rnode == NULL) {
 		/* prevent inode from being pruned when it has jnodes attached
 		   to it */
-		write_lock_irq(&inode->i_data.tree_lock);
+                spin_lock_irq(&inode->i_data.tree_lock);
 		inode->i_data.nrpages++;
-		write_unlock_irq(&inode->i_data.tree_lock);
+		spin_unlock_irq(&inode->i_data.tree_lock);
 	}
 	assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0));
 	check_me("zam-1045",
@@ -464,9 +464,9 @@
 	check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index));
 	if (rtree->rnode == NULL) {
 		/* inode can be pruned now */
-		write_lock_irq(&inode->i_data.tree_lock);
+		spin_lock_irq(&inode->i_data.tree_lock);
 		inode->i_data.nrpages--;
-		write_unlock_irq(&inode->i_data.tree_lock);
+		spin_unlock_irq(&inode->i_data.tree_lock);
 	}
 }
 
diff -urN newtree/fs/reiser4/plugin/file/cryptcompress.c newtree.2/fs/reiser4/plugin/file/cryptcompress.c
--- newtree/fs/reiser4/plugin/file/cryptcompress.c	2006-07-08 09:55:17.000000000 -0400
+++ newtree.2/fs/reiser4/plugin/file/cryptcompress.c	2006-07-11 13:50:10.000000000 -0400
@@ -3415,7 +3415,7 @@
 {
 	int i;
 	void * ret;
-	read_lock_irq(&mapping->tree_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	for (i = 0; i < clust->nr_pages; i++) {
 		assert("edward-1438", clust->pages[i] != NULL);
 		ret = radix_tree_tag_clear(&mapping->page_tree,
@@ -3423,7 +3423,7 @@
 					   PAGECACHE_TAG_REISER4_MOVED);
 		assert("edward-1439", ret == clust->pages[i]);
 	}
-	read_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 }
 
 /* Capture an anonymous pager cluster. (Page cluser is
@@ -3448,11 +3448,11 @@
 	if (unlikely(result)) {
 		/* set cleared tag back, so it will be
 		   possible to capture it again later */
-		read_lock_irq(&inode->i_mapping->tree_lock);
+		spin_lock_irq(&inode->i_mapping->tree_lock);
 		radix_tree_tag_set(&inode->i_mapping->page_tree,
 				   clust_to_pg(clust->index, inode),
 				   PAGECACHE_TAG_REISER4_MOVED);
-		read_unlock_irq(&inode->i_mapping->tree_lock);
+		spin_unlock_irq(&inode->i_mapping->tree_lock);
 
 		release_cluster_pages_and_jnode(clust);
 	}
diff -urN newtree/fs/reiser4/plugin/file/file.c newtree.2/fs/reiser4/plugin/file/file.c
--- newtree/fs/reiser4/plugin/file/file.c	2006-07-05 10:06:57.000000000 -0400
+++ newtree.2/fs/reiser4/plugin/file/file.c	2006-07-11 13:50:10.000000000 -0400
@@ -830,9 +830,9 @@
 {
 	int result;
 
-	read_lock_irq(&inode->i_mapping->tree_lock);
+	spin_lock_irq(&inode->i_mapping->tree_lock);
 	result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED);
-	read_unlock_irq(&inode->i_mapping->tree_lock);
+	spin_unlock_irq(&inode->i_mapping->tree_lock);
 	return result;
 }
 
@@ -978,7 +978,7 @@
 	nr = 0;
 
 	/* find pages tagged MOVED */
-	write_lock_irq(&mapping->tree_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree,
 					     (void **)pvec.pages, *index, count,
 					     PAGECACHE_TAG_REISER4_MOVED);
@@ -987,7 +987,7 @@
 		 * there are no pages tagged MOVED in mapping->page_tree
 		 * starting from *index
 		 */
-		write_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		*index = (pgoff_t)-1;
 		return 0;
 	}
@@ -1001,7 +1001,7 @@
 					 PAGECACHE_TAG_REISER4_MOVED);
 		assert("vs-49", p == pvec.pages[i]);
 	}
-	write_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 
 
 	*index = pvec.pages[i - 1]->index + 1;
@@ -1026,13 +1026,13 @@
 				 * set MOVED tag to all pages which left not
 				 * captured
 				 */
-				write_lock_irq(&mapping->tree_lock);
+				spin_lock_irq(&mapping->tree_lock);
 				for (; i < pagevec_count(&pvec); i ++) {
 					radix_tree_tag_set(&mapping->page_tree,
 							   pvec.pages[i]->index,
 							   PAGECACHE_TAG_REISER4_MOVED);
 				}
-				write_unlock_irq(&mapping->tree_lock);
+				spin_unlock_irq(&mapping->tree_lock);
 
 				pagevec_release(&pvec);
 				return result;
@@ -1042,11 +1042,11 @@
 				 * 0 for Writeback-ed page. Set MOVED tag on
 				 * that page
 				 */
-				write_lock_irq(&mapping->tree_lock);
+				spin_lock_irq(&mapping->tree_lock);
 				radix_tree_tag_set(&mapping->page_tree,
 						   pvec.pages[i]->index,
 						   PAGECACHE_TAG_REISER4_MOVED);
-				write_unlock_irq(&mapping->tree_lock);
+				spin_unlock_irq(&mapping->tree_lock);
 				if (i == 0)
 					*index = pvec.pages[0]->index;
 				else
@@ -1122,7 +1122,7 @@
 	mapping = inode->i_mapping;
 	from = 0;
 	result = 0;
-	read_lock_irq(&mapping->tree_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	while (result == 0) {
 		struct page *page;
 
@@ -1136,17 +1136,17 @@
 		/* page may not leave radix tree because it is protected from truncating by inode->i_mutex locked by
 		   sys_fsync */
 		page_cache_get(page);
-		read_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 
 		from = page->index + 1;
 
 		result = sync_page(page);
 
 		page_cache_release(page);
-		read_lock_irq(&mapping->tree_lock);
+		spin_lock_irq(&mapping->tree_lock);
 	}
 
-	read_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	return result;
 }
 
diff -urN newtree/include/asm-arm/cacheflush.h newtree.2/include/asm-arm/cacheflush.h
--- newtree/include/asm-arm/cacheflush.h	2006-07-05 10:06:57.000000000 -0400
+++ newtree.2/include/asm-arm/cacheflush.h	2006-07-11 13:50:10.000000000 -0400
@@ -326,9 +326,9 @@
 extern void flush_dcache_page(struct page *);
 
 #define flush_dcache_mmap_lock(mapping) \
-	write_lock_irq(&(mapping)->tree_lock)
+	spin_lock_irq(&(mapping)->tree_lock)
 #define flush_dcache_mmap_unlock(mapping) \
-	write_unlock_irq(&(mapping)->tree_lock)
+	spin_unlock_irq(&(mapping)->tree_lock)
 
 #define flush_icache_user_range(vma,page,addr,len) \
 	flush_dcache_page(page)
diff -urN newtree/include/asm-parisc/cacheflush.h newtree.2/include/asm-parisc/cacheflush.h
--- newtree/include/asm-parisc/cacheflush.h	2006-07-05 10:06:57.000000000 -0400
+++ newtree.2/include/asm-parisc/cacheflush.h	2006-07-11 13:50:10.000000000 -0400
@@ -57,9 +57,9 @@
 extern void flush_dcache_page(struct page *page);
 
 #define flush_dcache_mmap_lock(mapping) \
-	write_lock_irq(&(mapping)->tree_lock)
+	spin_lock_irq(&(mapping)->tree_lock)
 #define flush_dcache_mmap_unlock(mapping) \
-	write_unlock_irq(&(mapping)->tree_lock)
+	spin_unlock_irq(&(mapping)->tree_lock)
 
 #define flush_icache_page(vma,page)	do { flush_kernel_dcache_page(page); flush_kernel_icache_page(page_address(page)); } while (0)
 
diff -urN newtree/include/linux/fs.h newtree.2/include/linux/fs.h
--- newtree/include/linux/fs.h	2006-07-08 09:55:03.000000000 -0400
+++ newtree.2/include/linux/fs.h	2006-07-11 13:50:10.000000000 -0400
@@ -419,7 +419,7 @@
 struct address_space {
 	struct inode		*host;		/* owner: inode, block_device */
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
-	rwlock_t		tree_lock;	/* and rwlock protecting it */
+	spinlock_t		tree_lock;	/* and lock protecting it */
 	unsigned int		i_mmap_writable;/* count VM_SHARED mappings */
 	struct prio_tree_root	i_mmap;		/* tree of private and shared mappings */
 	struct list_head	i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
diff -urN newtree/include/linux/page-flags.h newtree.2/include/linux/page-flags.h
--- newtree/include/linux/page-flags.h	2006-07-05 10:06:57.000000000 -0400
+++ newtree.2/include/linux/page-flags.h	2006-07-11 13:50:10.000000000 -0400
@@ -87,6 +87,8 @@
 #define PG_buddy		19	/* Page is free, on buddy lists */
 
 #define PG_readahead		20	/* Reminder to do readahead */
+#define PG_nonewrefs          21      /* Block concurrent pagecache lookups
+                                         * while testing refcount */
 
 
 #if (BITS_PER_LONG > 32)
@@ -103,16 +105,13 @@
 /*
  * Manipulation of page state flags
  */
-#define PageLocked(page)		\
-		test_bit(PG_locked, &(page)->flags)
-#define SetPageLocked(page)		\
-		set_bit(PG_locked, &(page)->flags)
-#define TestSetPageLocked(page)		\
-		test_and_set_bit(PG_locked, &(page)->flags)
-#define ClearPageLocked(page)		\
-		clear_bit(PG_locked, &(page)->flags)
-#define TestClearPageLocked(page)	\
-		test_and_clear_bit(PG_locked, &(page)->flags)
+#define PageLocked(page)	test_bit(PG_locked, &(page)->flags)
+#define SetPageLocked(page)	set_bit(PG_locked, &(page)->flags)
+#define __SetPageLocked(page)	__set_bit(PG_locked, &(page)->flags)
+#define TestSetPageLocked(page)	test_and_set_bit(PG_locked, &(page)->flags)
+#define ClearPageLocked(page)	clear_bit(PG_locked, &(page)->flags)
+#define __ClearPageLocked(page)	__clear_bit(PG_locked, &(page)->flags)
+#define TestClearPageLocked(page) test_and_clear_bit(PG_locked, &(page)->flags)
 
 #define PageError(page)		test_bit(PG_error, &(page)->flags)
 #define SetPageError(page)	set_bit(PG_error, &(page)->flags)
@@ -253,6 +252,11 @@
 #define SetPageReadahead(page)	set_bit(PG_readahead, &(page)->flags)
 #define TestClearPageReadahead(page) test_and_clear_bit(PG_readahead, &(page)->flags)
 
+#define PageNoNewRefs(page)	test_bit(PG_nonewrefs, &(page)->flags)
+#define SetPageNoNewRefs(page)	set_bit(PG_nonewrefs, &(page)->flags)
+#define ClearPageNoNewRefs(page) clear_bit(PG_nonewrefs, &(page)->flags)
+#define __ClearPageNoNewRefs(page) __clear_bit(PG_nonewrefs, &(page)->flags)
+
 struct page;	/* forward declaration */
 
 int test_clear_page_dirty(struct page *page);
diff -urN newtree/include/linux/pagemap.h newtree.2/include/linux/pagemap.h
--- newtree/include/linux/pagemap.h	2006-07-05 10:06:57.000000000 -0400
+++ newtree.2/include/linux/pagemap.h	2006-07-11 13:50:10.000000000 -0400
@@ -11,6 +11,8 @@
 #include <linux/compiler.h>
 #include <asm/uaccess.h>
 #include <linux/gfp.h>
+#include <linux/page-flags.h>
+#include <linux/hardirq.h> /* for in_interrupt() */
 
 /*
  * Bits in mapping->flags.  The lower __GFP_BITS_SHIFT bits are the page
@@ -51,6 +53,76 @@
 #define page_cache_release(page)	put_page(page)
 void release_pages(struct page **pages, int nr, int cold);
 
+static inline struct page *page_cache_get_speculative(struct page *page)
+{
+	VM_BUG_ON(in_interrupt());
+
+#ifndef CONFIG_SMP
+# ifdef CONFIG_PREEMPT
+	VM_BUG_ON(!in_atomic());
+# endif
+	/*
+	 * Preempt must be disabled here - we rely on rcu_read_lock doing
+	 * this for us.
+	 *
+	 * Pagecache won't be truncated from interrupt context, so if we have
+	 * found a page in the radix tree here, we have pinned its refcount by
+	 * disabling preempt, and hence no need for the "speculative get" that
+	 * SMP requires.
+	 */
+	VM_BUG_ON(page_count(page) == 0);
+	atomic_inc(&page->_count);
+
+#else
+	if (unlikely(!get_page_unless_zero(page)))
+		return NULL; /* page has been freed */
+
+	/*
+	 * Note that get_page_unless_zero provides a memory barrier.
+	 * This is needed to ensure PageNoNewRefs is evaluated after the
+	 * page refcount has been raised. See below comment.
+	 */
+
+	/*
+	 * PageNoNewRefs is set in order to prevent new references to the
+	 * page (eg. before it gets removed from pagecache). Wait until it
+	 * becomes clear (and checks below will ensure we still have the
+	 * correct one).
+	 */
+	while (unlikely(PageNoNewRefs(page)))
+		cpu_relax();
+
+	/*
+	 * smp_rmb is to ensure the load of page->flags (for PageNoNewRefs())
+	 * is performed before a future load used to ensure the page is
+	 * the correct one (usually: page->mapping and page->index).
+	 *
+	 * Those places that set PageNoNewRefs have the following pattern:
+	 * 	SetPageNoNewRefs(page)
+	 * 	wmb();
+	 * 	if (page_count(page) == X)
+	 * 		remove page from pagecache
+	 * 	wmb();
+	 * 	ClearPageNoNewRefs(page)
+	 *
+	 * So if PageNoNewRefs() becomes clear _after_ we've elevated the page
+	 * refcount, then either the page is safely pinned in pagecache, or it
+	 * has already been removed. In the latter case the caller's recheck
+	 * (usually of page->mapping and page->index) fails - provided it is
+	 * loaded after testing PageNoNewRefs() (what the smp_rmb is for).
+	 *
+	 * If the load was out of order, page->mapping might be loaded before
+	 * the page is removed from pagecache while PageNoNewRefs evaluated
+	 * after the ClearPageNoNewRefs().
+	 */
+	smp_rmb();
+
+#endif
+	VM_BUG_ON(PageCompound(page) && (struct page *)page_private(page) != page);
+
+	return page;
+}
+
 #ifdef CONFIG_NUMA
 extern struct page *page_cache_alloc(struct address_space *x);
 extern struct page *page_cache_alloc_cold(struct address_space *x);
@@ -110,6 +182,8 @@
 
 int add_to_page_cache(struct page *page, struct address_space *mapping,
 				unsigned long index, gfp_t gfp_mask);
+int __add_to_page_cache(struct page *page, struct address_space *mapping,
+				unsigned long index, gfp_t gfp_mask);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				unsigned long index, gfp_t gfp_mask);
 extern void remove_from_page_cache(struct page *page);
diff -urN newtree/include/linux/swap.h newtree.2/include/linux/swap.h
--- newtree/include/linux/swap.h	2006-07-08 06:15:08.000000000 -0400
+++ newtree.2/include/linux/swap.h	2006-07-11 13:50:10.000000000 -0400
@@ -228,6 +228,7 @@
 		struct address_space *);
 extern void free_page_and_swap_cache(struct page *);
 extern void free_pages_and_swap_cache(struct page **, int);
+extern struct page * find_get_swap_page(swp_entry_t);
 extern struct page * lookup_swap_cache(swp_entry_t);
 extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma,
 					   unsigned long addr);
diff -urN newtree/mm/filemap.c newtree.2/mm/filemap.c
--- newtree/mm/filemap.c	2006-07-11 12:42:13.000000000 -0400
+++ newtree.2/mm/filemap.c	2006-07-11 13:50:10.000000000 -0400
@@ -117,7 +117,7 @@
 /*
  * Remove a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
- * is safe.  The caller must hold a write_lock on the mapping's tree_lock.
+ * is safe.  The caller must hold the mapping's tree_lock.
  */
 void __remove_from_page_cache(struct page *page)
 {
@@ -136,9 +136,9 @@
 
 	BUG_ON(!PageLocked(page));
 
-	write_lock_irq(&mapping->tree_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	__remove_from_page_cache(page);
-	write_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 }
 EXPORT_SYMBOL(remove_from_page_cache);
 
@@ -431,42 +431,6 @@
 	return err;
 }
 
-/**
- * add_to_page_cache - add newly allocated pagecache pages
- * @page:	page to add
- * @mapping:	the page's address_space
- * @offset:	page index
- * @gfp_mask:	page allocation mode
- *
- * This function is used to add newly allocated pagecache pages;
- * the page is new, so we can just run SetPageLocked() against it.
- * The other page state flags were set by rmqueue().
- *
- * This function does not add the page to the LRU.  The caller must do that.
- */
-int add_to_page_cache(struct page *page, struct address_space *mapping,
-		pgoff_t offset, gfp_t gfp_mask)
-{
-	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
-
-	if (error == 0) {
-		write_lock_irq(&mapping->tree_lock);
-		error = radix_tree_insert(&mapping->page_tree, offset, page);
-		if (!error) {
-			page_cache_get(page);
-			SetPageLocked(page);
-			page->mapping = mapping;
-			page->index = offset;
-			mapping->nrpages++;
-			__inc_zone_page_state(page, NR_FILE_PAGES);
-		}
-		write_unlock_irq(&mapping->tree_lock);
-		radix_tree_preload_end();
-	}
-	return error;
-}
-EXPORT_SYMBOL(add_to_page_cache);
-
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t offset, gfp_t gfp_mask)
 {
@@ -498,6 +462,96 @@
 EXPORT_SYMBOL(page_cache_alloc_cold);
 #endif
 
+static int add_to_page_cache_nolock(struct page *page,
+			struct address_space *mapping, pgoff_t offset)
+{
+	int error;
+
+	/*
+	 * Can get away with less atomic ops and without using
+	 * Set/ClearPageNoNewRefs if we order operations correctly.
+	 */
+	page_cache_get(page);
+	__SetPageLocked(page);
+	page->mapping = mapping;
+	page->index = offset;
+
+	/* radix_tree_insert provides a write memory barrier */
+	error = radix_tree_insert(&mapping->page_tree, offset, page);
+
+	if (!error) {
+		mapping->nrpages++;
+                __inc_zone_page_state(page, NR_FILE_PAGES);
+	} else {
+		page->mapping = NULL;
+		__ClearPageLocked(page);
+		__put_page(page);
+	}
+
+	return error;
+}
+
+/**
+ * add_to_page_cache - add newly allocated pagecache pages
+ * @page:	page to add
+ * @mapping:	the page's address_space
+ * @offset:	page index
+ * @gfp_mask:	page allocation mode
+ *
+ * This function is used to add newly allocated pagecache pages;
+ * the page is new, so we can just run __SetPageLocked() against it.
+ * The other page state flags were set by rmqueue().
+ *
+ * This function does not add the page to the LRU.  The caller must do that.
+ */
+int add_to_page_cache(struct page *page, struct address_space *mapping,
+		pgoff_t offset, gfp_t gfp_mask)
+{
+	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+
+	if (error == 0) {
+		spin_lock_irq(&mapping->tree_lock);
+		error = add_to_page_cache_nolock(page, mapping, offset);
+		spin_unlock_irq(&mapping->tree_lock);
+		radix_tree_preload_end();
+	}
+	return error;
+}
+EXPORT_SYMBOL(add_to_page_cache);
+
+/*
+ * Same as add_to_page_cache, but works on pages that are already in
+ * swapcache and possibly visible to external lookups.
+ * (special case for move_from_swap_cache).
+ */
+int __add_to_page_cache(struct page *page, struct address_space *mapping,
+                pgoff_t offset, gfp_t gfp_mask)
+{
+        int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+
+        if (error == 0) {
+                SetPageNoNewRefs(page);
+                smp_wmb();
+                spin_lock_irq(&mapping->tree_lock);
+
+                error = radix_tree_insert(&mapping->page_tree, offset, page);
+                if (!error) {
+                        page_cache_get(page);
+                        SetPageLocked(page);
+                        page->mapping = mapping;
+                        page->index = offset;
+                        mapping->nrpages++;
+                        __inc_zone_page_state(page, NR_FILE_PAGES);
+                }
+
+                spin_unlock_irq(&mapping->tree_lock);
+                smp_wmb();
+                ClearPageNoNewRefs(page);
+                radix_tree_preload_end();
+        }
+        return error;
+}
+
 /*
  * In order to wait for pages to become available there must be
  * waitqueues associated with pages. By using a hash table of
@@ -602,30 +656,36 @@
 {
 	int exists;
 
-	read_lock_irq(&mapping->tree_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	exists = __probe_page(mapping, offset);
-	read_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 
 	return exists;
 }
 
-/**
- * find_get_page - find and get a page reference
- * @mapping: the address_space to search
- * @offset: the page index
- *
- * A rather lightweight function, finding and getting a reference to a
- * hashed page atomically.
+/*
+ * find_get_page - find and get a reference to a pagecache page.
  */
-struct page * find_get_page(struct address_space *mapping, unsigned long offset)
+struct page *find_get_page(struct address_space *mapping, unsigned long offset)
 {
 	struct page *page;
 
-	read_lock_irq(&mapping->tree_lock);
+        rcu_read_lock();
+repeat:
 	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page)
-		page_cache_get(page);
-	read_unlock_irq(&mapping->tree_lock);
+        if (page) {
+                page = page_cache_get_speculative(page);
+                if (unlikely(!page))
+                        goto repeat;
+                /* Has the page been truncated? */
+                if (unlikely(page->mapping != mapping
+                                || page->index != offset)) {
+                        page_cache_release(page);
+                        goto repeat;
+                }
+        }
+        rcu_read_unlock();
+
 	return page;
 }
 EXPORT_SYMBOL(find_get_page);
@@ -641,11 +701,11 @@
 {
 	struct page *page;
 
-	read_lock_irq(&mapping->tree_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	page = radix_tree_lookup(&mapping->page_tree, offset);
 	if (page && TestSetPageLocked(page))
 		page = NULL;
-	read_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	return page;
 }
 EXPORT_SYMBOL(find_trylock_page);
@@ -663,28 +723,28 @@
 struct page *find_lock_page(struct address_space *mapping,
 				unsigned long offset)
 {
+
 	struct page *page;
 
-	read_lock_irq(&mapping->tree_lock);
 repeat:
+	rcu_read_lock();
 	page = radix_tree_lookup(&mapping->page_tree, offset);
 	if (page) {
-		page_cache_get(page);
-		if (TestSetPageLocked(page)) {
-			read_unlock_irq(&mapping->tree_lock);
-			__lock_page(page);
-			read_lock_irq(&mapping->tree_lock);
-
-			/* Has the page been truncated while we slept? */
-			if (unlikely(page->mapping != mapping ||
-				     page->index != offset)) {
-				unlock_page(page);
-				page_cache_release(page);
-				goto repeat;
-			}
+		page = page_cache_get_speculative(page);
+		rcu_read_unlock();
+		if (unlikely(!page))
+			goto repeat;
+		lock_page(page);
+		/* Has the page been truncated? */
+		if (unlikely(page->mapping != mapping
+				|| page->index != offset)) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto repeat;
 		}
-	}
-	read_unlock_irq(&mapping->tree_lock);
+	} else
+		rcu_read_unlock();
+
 	return page;
 }
 EXPORT_SYMBOL(find_lock_page);
@@ -752,16 +812,41 @@
 unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 			    unsigned int nr_pages, struct page **pages)
 {
+
 	unsigned int i;
-	unsigned int ret;
+	unsigned int nr_found;
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup(&mapping->page_tree,
+	rcu_read_lock();
+repeat:
+	nr_found = radix_tree_gang_lookup(&mapping->page_tree,
 				(void **)pages, start, nr_pages);
-	for (i = 0; i < ret; i++)
-		page_cache_get(pages[i]);
-	read_unlock_irq(&mapping->tree_lock);
-	return ret;
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+		page = page_cache_get_speculative(pages[i]);
+		if (unlikely(!page)) {
+bail:
+			/*
+			 * must return at least 1 page, so caller continues
+			 * calling in.
+			 */
+			if (i == 0)
+				goto repeat;
+			break;
+		}
+
+		/* Has the page been truncated? */
+		if (unlikely(page->mapping != mapping
+				|| page->index < start)) {
+			page_cache_release(page);
+			goto bail;
+		}
+
+		pages[i] = page;
+		/* ensure we don't pick up pages that have moved behind us */
+		start = page->index+1;
+	}
+	rcu_read_unlock();
+	return i;
 }
 EXPORT_SYMBOL(find_get_pages);
 
@@ -781,19 +866,36 @@
 			       unsigned int nr_pages, struct page **pages)
 {
 	unsigned int i;
-	unsigned int ret;
+	unsigned int nr_found;
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup(&mapping->page_tree,
+	rcu_read_lock();
+repeat:
+	nr_found = radix_tree_gang_lookup(&mapping->page_tree,
 				(void **)pages, index, nr_pages);
-	for (i = 0; i < ret; i++) {
-		if (pages[i]->mapping == NULL || pages[i]->index != index)
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+		page = page_cache_get_speculative(pages[i]);
+		if (unlikely(!page)) {
+bail:
+			/*
+			 * must return at least 1 page, so caller continues
+			 * calling in.
+			 */
+			if (i == 0)
+				goto repeat;
 			break;
+		}
 
-		page_cache_get(pages[i]);
+		/* Has the page been truncated? */
+		if (unlikely(page->mapping != mapping
+				|| page->index != index)) {
+			page_cache_release(page);
+			goto bail;
+		}
+		pages[i] = page;
 		index++;
 	}
-	read_unlock_irq(&mapping->tree_lock);
+	rcu_read_unlock();
 	return i;
 }
 EXPORT_SYMBOL(find_get_pages_tag);
@@ -813,17 +915,41 @@
 			int tag, unsigned int nr_pages, struct page **pages)
 {
 	unsigned int i;
-	unsigned int ret;
+	unsigned int nr_found;
+	pgoff_t start = *index;
 
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
-				(void **)pages, *index, nr_pages, tag);
-	for (i = 0; i < ret; i++)
-		page_cache_get(pages[i]);
-	if (ret)
-		*index = pages[ret - 1]->index + 1;
-	read_unlock_irq(&mapping->tree_lock);
-	return ret;
+	rcu_read_lock();
+repeat:
+	nr_found = radix_tree_gang_lookup_tag(&mapping->page_tree,
+				(void **)pages, start, nr_pages, tag);
+	for (i = 0; i < nr_found; i++) {
+		struct page *page;
+		page = page_cache_get_speculative(pages[i]);
+		if (unlikely(!page)) {
+bail:
+			/*
+			 * must return at least 1 page, so caller continues
+			 * calling in.
+			 */
+			if (i == 0)
+				goto repeat;
+			break;
+		}
+
+		/* Has the page been truncated? */
+		if (unlikely(page->mapping != mapping
+				|| page->index < start)) {
+			page_cache_release(page);
+			goto bail;
+		}
+
+		pages[i] = page;
+		/* ensure we don't pick up pages that have moved behind us */
+		start = page->index+1;
+	}
+	rcu_read_unlock();
+	*index = start;
+	return i;
 }
 
 /**
diff -urN newtree/mm/filemap.c.orig newtree.2/mm/filemap.c.orig
--- newtree/mm/filemap.c.orig	2006-07-08 06:14:27.000000000 -0400
+++ newtree.2/mm/filemap.c.orig	1969-12-31 19:00:00.000000000 -0500
@@ -1,2555 +0,0 @@
-/*
- *	linux/mm/filemap.c
- *
- * Copyright (C) 1994-1999  Linus Torvalds
- */
-
-/*
- * This file handles the generic file mmap semantics used by
- * most "normal" filesystems (but you don't /have/ to use this:
- * the NFS filesystem used to do this differently, for example)
- */
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/compiler.h>
-#include <linux/fs.h>
-#include <linux/uaccess.h>
-#include <linux/aio.h>
-#include <linux/capability.h>
-#include <linux/kernel_stat.h>
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/mman.h>
-#include <linux/pagemap.h>
-#include <linux/file.h>
-#include <linux/uio.h>
-#include <linux/hash.h>
-#include <linux/writeback.h>
-#include <linux/pagevec.h>
-#include <linux/blkdev.h>
-#include <linux/security.h>
-#include <linux/syscalls.h>
-#include <linux/cpuset.h>
-#include "filemap.h"
-#include "internal.h"
-
-/*
- * FIXME: remove all knowledge of the buffer layer from the core VM
- */
-#include <linux/buffer_head.h> /* for generic_osync_inode */
-
-#include <asm/mman.h>
-
-static ssize_t
-generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-	loff_t offset, unsigned long nr_segs);
-
-#ifdef CONFIG_DEBUG_READAHEAD
-extern u32 readahead_debug_level;
-#else
-#define readahead_debug_level 0
-#endif /* CONFIG_DEBUG_READAHEAD */
-
-/*
- * Shared mappings implemented 30.11.1994. It's not fully working yet,
- * though.
- *
- * Shared mappings now work. 15.8.1995  Bruno.
- *
- * finished 'unifying' the page and buffer cache and SMP-threaded the
- * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
- *
- * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
- */
-
-/*
- * Lock ordering:
- *
- *  ->i_mmap_lock		(vmtruncate)
- *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
- *      ->swap_lock		(exclusive_swap_page, others)
- *        ->mapping->tree_lock
- *
- *  ->i_mutex
- *    ->i_mmap_lock		(truncate->unmap_mapping_range)
- *
- *  ->mmap_sem
- *    ->i_mmap_lock
- *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
- *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
- *
- *  ->mmap_sem
- *    ->lock_page		(access_process_vm)
- *
- *  ->mmap_sem
- *    ->i_mutex			(msync)
- *
- *  ->i_mutex
- *    ->i_alloc_sem             (various)
- *
- *  ->inode_lock
- *    ->sb_lock			(fs/fs-writeback.c)
- *    ->mapping->tree_lock	(__sync_single_inode)
- *
- *  ->i_mmap_lock
- *    ->anon_vma.lock		(vma_adjust)
- *
- *  ->anon_vma.lock
- *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
- *
- *  ->page_table_lock or pte_lock
- *    ->swap_lock		(try_to_unmap_one)
- *    ->private_lock		(try_to_unmap_one)
- *    ->tree_lock		(try_to_unmap_one)
- *    ->zone.lru_lock		(follow_page->mark_page_accessed)
- *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
- *    ->private_lock		(page_remove_rmap->set_page_dirty)
- *    ->tree_lock		(page_remove_rmap->set_page_dirty)
- *    ->inode_lock		(page_remove_rmap->set_page_dirty)
- *    ->inode_lock		(zap_pte_range->set_page_dirty)
- *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
- *
- *  ->task->proc_lock
- *    ->dcache_lock		(proc_pid_lookup)
- */
-
-/*
- * Remove a page from the page cache and free it. Caller has to make
- * sure the page is locked and that nobody else uses it - or that usage
- * is safe.  The caller must hold a write_lock on the mapping's tree_lock.
- */
-void __remove_from_page_cache(struct page *page)
-{
-	struct address_space *mapping = page->mapping;
-
-	radix_tree_delete(&mapping->page_tree, page->index);
-	page->mapping = NULL;
-	mapping->nrpages--;
-	__dec_zone_page_state(page, NR_FILE_PAGES);
-}
-EXPORT_SYMBOL(__remove_from_page_cache);
-
-void remove_from_page_cache(struct page *page)
-{
-	struct address_space *mapping = page->mapping;
-
-	BUG_ON(!PageLocked(page));
-
-	write_lock_irq(&mapping->tree_lock);
-	__remove_from_page_cache(page);
-	write_unlock_irq(&mapping->tree_lock);
-}
-EXPORT_SYMBOL(remove_from_page_cache);
-
-static int sync_page(void *word)
-{
-	struct address_space *mapping;
-	struct page *page;
-
-	page = container_of((unsigned long *)word, struct page, flags);
-
-	/*
-	 * page_mapping() is being called without PG_locked held.
-	 * Some knowledge of the state and use of the page is used to
-	 * reduce the requirements down to a memory barrier.
-	 * The danger here is of a stale page_mapping() return value
-	 * indicating a struct address_space different from the one it's
-	 * associated with when it is associated with one.
-	 * After smp_mb(), it's either the correct page_mapping() for
-	 * the page, or an old page_mapping() and the page's own
-	 * page_mapping() has gone NULL.
-	 * The ->sync_page() address_space operation must tolerate
-	 * page_mapping() going NULL. By an amazing coincidence,
-	 * this comes about because none of the users of the page
-	 * in the ->sync_page() methods make essential use of the
-	 * page_mapping(), merely passing the page down to the backing
-	 * device's unplug functions when it's non-NULL, which in turn
-	 * ignore it for all cases but swap, where only page_private(page) is
-	 * of interest. When page_mapping() does go NULL, the entire
-	 * call stack gracefully ignores the page and returns.
-	 * -- wli
-	 */
-	smp_mb();
-	mapping = page_mapping(page);
-	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
-		mapping->a_ops->sync_page(page);
-	io_schedule();
-	return 0;
-}
-
-/**
- * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
- * @mapping:	address space structure to write
- * @start:	offset in bytes where the range starts
- * @end:	offset in bytes where the range ends (inclusive)
- * @sync_mode:	enable synchronous operation
- *
- * Start writeback against all of a mapping's dirty pages that lie
- * within the byte offsets <start, end> inclusive.
- *
- * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
- * opposed to a regular memory cleansing writeback.  The difference between
- * these two operations is that if a dirty page/buffer is encountered, it must
- * be waited upon, and not just skipped over.
- */
-int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
-				loff_t end, int sync_mode)
-{
-	int ret;
-	struct writeback_control wbc = {
-		.sync_mode = sync_mode,
-		.nr_to_write = mapping->nrpages * 2,
-		.range_start = start,
-		.range_end = end,
-	};
-
-	if (!mapping_cap_writeback_dirty(mapping))
-		return 0;
-
-	ret = do_writepages(mapping, &wbc);
-	return ret;
-}
-
-static inline int __filemap_fdatawrite(struct address_space *mapping,
-	int sync_mode)
-{
-	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
-}
-
-int filemap_fdatawrite(struct address_space *mapping)
-{
-	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
-}
-EXPORT_SYMBOL(filemap_fdatawrite);
-
-static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
-				loff_t end)
-{
-	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
-}
-
-/**
- * filemap_flush - mostly a non-blocking flush
- * @mapping:	target address_space
- *
- * This is a mostly non-blocking flush.  Not suitable for data-integrity
- * purposes - I/O may not be started against all dirty pages.
- */
-int filemap_flush(struct address_space *mapping)
-{
-	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
-}
-EXPORT_SYMBOL(filemap_flush);
-
-/**
- * wait_on_page_writeback_range - wait for writeback to complete
- * @mapping:	target address_space
- * @start:	beginning page index
- * @end:	ending page index
- *
- * Wait for writeback to complete against pages indexed by start->end
- * inclusive
- */
-int wait_on_page_writeback_range(struct address_space *mapping,
-				pgoff_t start, pgoff_t end)
-{
-	struct pagevec pvec;
-	int nr_pages;
-	int ret = 0;
-	pgoff_t index;
-
-	if (end < start)
-		return 0;
-
-	pagevec_init(&pvec, 0);
-	index = start;
-	while ((index <= end) &&
-			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-			PAGECACHE_TAG_WRITEBACK,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
-		unsigned i;
-
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
-
-			/* until radix tree lookup accepts end_index */
-			if (page->index > end)
-				continue;
-
-			wait_on_page_writeback(page);
-			if (PageError(page))
-				ret = -EIO;
-		}
-		pagevec_release(&pvec);
-		cond_resched();
-	}
-
-	/* Check for outstanding write errors */
-	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
-		ret = -ENOSPC;
-	if (test_and_clear_bit(AS_EIO, &mapping->flags))
-		ret = -EIO;
-
-	return ret;
-}
-EXPORT_SYMBOL(add_to_page_cache_lru);
-
-/**
- * sync_page_range - write and wait on all pages in the passed range
- * @inode:	target inode
- * @mapping:	target address_space
- * @pos:	beginning offset in pages to write
- * @count:	number of bytes to write
- *
- * Write and wait upon all the pages in the passed range.  This is a "data
- * integrity" operation.  It waits upon in-flight writeout before starting and
- * waiting upon new writeout.  If there was an IO error, return it.
- *
- * We need to re-take i_mutex during the generic_osync_inode list walk because
- * it is otherwise livelockable.
- */
-int sync_page_range(struct inode *inode, struct address_space *mapping,
-			loff_t pos, loff_t count)
-{
-	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
-	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
-	int ret;
-
-	if (!mapping_cap_writeback_dirty(mapping) || !count)
-		return 0;
-	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
-	if (ret == 0) {
-		mutex_lock(&inode->i_mutex);
-		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
-		mutex_unlock(&inode->i_mutex);
-	}
-	if (ret == 0)
-		ret = wait_on_page_writeback_range(mapping, start, end);
-	return ret;
-}
-EXPORT_SYMBOL(sync_page_range);
-
-/**
- * sync_page_range_nolock
- * @inode:	target inode
- * @mapping:	target address_space
- * @pos:	beginning offset in pages to write
- * @count:	number of bytes to write
- *
- * Note: Holding i_mutex across sync_page_range_nolock is not a good idea
- * as it forces O_SYNC writers to different parts of the same file
- * to be serialised right until io completion.
- */
-int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
-			   loff_t pos, loff_t count)
-{
-	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
-	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
-	int ret;
-
-	if (!mapping_cap_writeback_dirty(mapping) || !count)
-		return 0;
-	ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
-	if (ret == 0)
-		ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
-	if (ret == 0)
-		ret = wait_on_page_writeback_range(mapping, start, end);
-	return ret;
-}
-EXPORT_SYMBOL(sync_page_range_nolock);
-
-/**
- * filemap_fdatawait - wait for all under-writeback pages to complete
- * @mapping: address space structure to wait for
- *
- * Walk the list of under-writeback pages of the given address space
- * and wait for all of them.
- */
-int filemap_fdatawait(struct address_space *mapping)
-{
-	loff_t i_size = i_size_read(mapping->host);
-
-	if (i_size == 0)
-		return 0;
-
-	return wait_on_page_writeback_range(mapping, 0,
-				(i_size - 1) >> PAGE_CACHE_SHIFT);
-}
-EXPORT_SYMBOL(filemap_fdatawait);
-
-int filemap_write_and_wait(struct address_space *mapping)
-{
-	int err = 0;
-
-	if (mapping->nrpages) {
-		err = filemap_fdatawrite(mapping);
-		/*
-		 * Even if the above returned error, the pages may be
-		 * written partially (e.g. -ENOSPC), so we wait for it.
-		 * But the -EIO is special case, it may indicate the worst
-		 * thing (e.g. bug) happened, so we avoid waiting for it.
-		 */
-		if (err != -EIO) {
-			int err2 = filemap_fdatawait(mapping);
-			if (!err)
-				err = err2;
-		}
-	}
-	return err;
-}
-EXPORT_SYMBOL(filemap_write_and_wait);
-
-/**
- * filemap_write_and_wait_range - write out & wait on a file range
- * @mapping:	the address_space for the pages
- * @lstart:	offset in bytes where the range starts
- * @lend:	offset in bytes where the range ends (inclusive)
- *
- * Write out and wait upon file offsets lstart->lend, inclusive.
- *
- * Note that `lend' is inclusive (describes the last byte to be written) so
- * that this function can be used to write to the very end-of-file (end = -1).
- */
-int filemap_write_and_wait_range(struct address_space *mapping,
-				 loff_t lstart, loff_t lend)
-{
-	int err = 0;
-
-	if (mapping->nrpages) {
-		err = __filemap_fdatawrite_range(mapping, lstart, lend,
-						 WB_SYNC_ALL);
-		/* See comment of filemap_write_and_wait() */
-		if (err != -EIO) {
-			int err2 = wait_on_page_writeback_range(mapping,
-						lstart >> PAGE_CACHE_SHIFT,
-						lend >> PAGE_CACHE_SHIFT);
-			if (!err)
-				err = err2;
-		}
-	}
-	return err;
-}
-
-/**
- * add_to_page_cache - add newly allocated pagecache pages
- * @page:	page to add
- * @mapping:	the page's address_space
- * @offset:	page index
- * @gfp_mask:	page allocation mode
- *
- * This function is used to add newly allocated pagecache pages;
- * the page is new, so we can just run SetPageLocked() against it.
- * The other page state flags were set by rmqueue().
- *
- * This function does not add the page to the LRU.  The caller must do that.
- */
-int add_to_page_cache(struct page *page, struct address_space *mapping,
-		pgoff_t offset, gfp_t gfp_mask)
-{
-	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
-
-	if (error == 0) {
-		write_lock_irq(&mapping->tree_lock);
-		error = radix_tree_insert(&mapping->page_tree, offset, page);
-		if (!error) {
-			page_cache_get(page);
-			SetPageLocked(page);
-			page->mapping = mapping;
-			page->index = offset;
-			mapping->nrpages++;
-			__inc_zone_page_state(page, NR_FILE_PAGES);
-		}
-		write_unlock_irq(&mapping->tree_lock);
-		radix_tree_preload_end();
-	}
-	return error;
-}
-EXPORT_SYMBOL(add_to_page_cache);
-
-int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
-				pgoff_t offset, gfp_t gfp_mask)
-{
-	int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
-	if (ret == 0)
-		lru_cache_add(page);
-	return ret;
-}
-
-#ifdef CONFIG_NUMA
-struct page *page_cache_alloc(struct address_space *x)
-{
-	if (cpuset_do_page_mem_spread()) {
-		int n = cpuset_mem_spread_node();
-		return alloc_pages_node(n, mapping_gfp_mask(x), 0);
-	}
-	return alloc_pages(mapping_gfp_mask(x), 0);
-}
-EXPORT_SYMBOL(page_cache_alloc);
-
-struct page *page_cache_alloc_cold(struct address_space *x)
-{
-	if (cpuset_do_page_mem_spread()) {
-		int n = cpuset_mem_spread_node();
-		return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0);
-	}
-	return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
-}
-EXPORT_SYMBOL(page_cache_alloc_cold);
-#endif
-
-/*
- * In order to wait for pages to become available there must be
- * waitqueues associated with pages. By using a hash table of
- * waitqueues where the bucket discipline is to maintain all
- * waiters on the same queue and wake all when any of the pages
- * become available, and for the woken contexts to check to be
- * sure the appropriate page became available, this saves space
- * at a cost of "thundering herd" phenomena during rare hash
- * collisions.
- */
-static wait_queue_head_t *page_waitqueue(struct page *page)
-{
-	const struct zone *zone = page_zone(page);
-
-	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
-}
-
-static inline void wake_up_page(struct page *page, int bit)
-{
-	__wake_up_bit(page_waitqueue(page), &page->flags, bit);
-}
-
-void fastcall wait_on_page_bit(struct page *page, int bit_nr)
-{
-	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
-
-	if (test_bit(bit_nr, &page->flags))
-		__wait_on_bit(page_waitqueue(page), &wait, sync_page,
-							TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_on_page_bit);
-
-/**
- * unlock_page - unlock a locked page
- * @page: the page
- *
- * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
- * Also wakes sleepers in wait_on_page_writeback() because the wakeup
- * mechananism between PageLocked pages and PageWriteback pages is shared.
- * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
- *
- * The first mb is necessary to safely close the critical section opened by the
- * TestSetPageLocked(), the second mb is necessary to enforce ordering between
- * the clear_bit and the read of the waitqueue (to avoid SMP races with a
- * parallel wait_on_page_locked()).
- */
-void fastcall unlock_page(struct page *page)
-{
-	smp_mb__before_clear_bit();
-	if (!TestClearPageLocked(page))
-		BUG();
-	smp_mb__after_clear_bit(); 
-	wake_up_page(page, PG_locked);
-}
-EXPORT_SYMBOL(unlock_page);
-
-/**
- * end_page_writeback - end writeback against a page
- * @page: the page
- */
-void end_page_writeback(struct page *page)
-{
-	if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
-		if (!test_clear_page_writeback(page))
-			BUG();
-	}
-	smp_mb__after_clear_bit();
-	wake_up_page(page, PG_writeback);
-}
-EXPORT_SYMBOL(end_page_writeback);
-
-/**
- * __lock_page - get a lock on the page, assuming we need to sleep to get it
- * @page: the page to lock
- *
- * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
- * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
- * chances are that on the second loop, the block layer's plug list is empty,
- * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
- */
-void fastcall __lock_page(struct page *page)
-{
-	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
-
-	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
-							TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(__lock_page);
-
-/*
- * Probing page existence.
- */
-int __probe_page(struct address_space *mapping, pgoff_t offset)
-{
-	return !! radix_tree_lookup(&mapping->page_tree, offset);
-}
-
-/*
- * Here we just do not bother to grab the page, it's meaningless anyway.
- */
-int probe_page(struct address_space *mapping, pgoff_t offset)
-{
-	int exists;
-
-	read_lock_irq(&mapping->tree_lock);
-	exists = __probe_page(mapping, offset);
-	read_unlock_irq(&mapping->tree_lock);
-
-	return exists;
-}
-
-/**
- * find_get_page - find and get a page reference
- * @mapping: the address_space to search
- * @offset: the page index
- *
- * A rather lightweight function, finding and getting a reference to a
- * hashed page atomically.
- */
-struct page * find_get_page(struct address_space *mapping, unsigned long offset)
-{
-	struct page *page;
-
-	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page)
-		page_cache_get(page);
-	read_unlock_irq(&mapping->tree_lock);
-	return page;
-}
-EXPORT_SYMBOL(find_get_page);
-
-/**
- * find_trylock_page - find and lock a page
- * @mapping: the address_space to search
- * @offset: the page index
- *
- * Same as find_get_page(), but trylock it instead of incrementing the count.
- */
-struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
-{
-	struct page *page;
-
-	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page && TestSetPageLocked(page))
-		page = NULL;
-	read_unlock_irq(&mapping->tree_lock);
-	return page;
-}
-EXPORT_SYMBOL(find_trylock_page);
-
-/**
- * find_lock_page - locate, pin and lock a pagecache page
- * @mapping: the address_space to search
- * @offset: the page index
- *
- * Locates the desired pagecache page, locks it, increments its reference
- * count and returns its address.
- *
- * Returns zero if the page was not present. find_lock_page() may sleep.
- */
-struct page *find_lock_page(struct address_space *mapping,
-				unsigned long offset)
-{
-	struct page *page;
-
-	read_lock_irq(&mapping->tree_lock);
-repeat:
-	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page) {
-		page_cache_get(page);
-		if (TestSetPageLocked(page)) {
-			read_unlock_irq(&mapping->tree_lock);
-			__lock_page(page);
-			read_lock_irq(&mapping->tree_lock);
-
-			/* Has the page been truncated while we slept? */
-			if (unlikely(page->mapping != mapping ||
-				     page->index != offset)) {
-				unlock_page(page);
-				page_cache_release(page);
-				goto repeat;
-			}
-		}
-	}
-	read_unlock_irq(&mapping->tree_lock);
-	return page;
-}
-EXPORT_SYMBOL(find_lock_page);
-
-/**
- * find_or_create_page - locate or add a pagecache page
- * @mapping: the page's address_space
- * @index: the page's index into the mapping
- * @gfp_mask: page allocation mode
- *
- * Locates a page in the pagecache.  If the page is not present, a new page
- * is allocated using @gfp_mask and is added to the pagecache and to the VM's
- * LRU list.  The returned page is locked and has its reference count
- * incremented.
- *
- * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
- * allocation!
- *
- * find_or_create_page() returns the desired page's address, or zero on
- * memory exhaustion.
- */
-struct page *find_or_create_page(struct address_space *mapping,
-		unsigned long index, gfp_t gfp_mask)
-{
-	struct page *page, *cached_page = NULL;
-	int err;
-repeat:
-	page = find_lock_page(mapping, index);
-	if (!page) {
-		if (!cached_page) {
-			cached_page = alloc_page(gfp_mask);
-			if (!cached_page)
-				return NULL;
-		}
-		err = add_to_page_cache_lru(cached_page, mapping,
-					index, gfp_mask);
-		if (!err) {
-			page = cached_page;
-			cached_page = NULL;
-		} else if (err == -EEXIST)
-			goto repeat;
-	}
-	if (cached_page)
-		page_cache_release(cached_page);
-	return page;
-}
-EXPORT_SYMBOL(find_or_create_page);
-
-/**
- * find_get_pages - gang pagecache lookup
- * @mapping:	The address_space to search
- * @start:	The starting page index
- * @nr_pages:	The maximum number of pages
- * @pages:	Where the resulting pages are placed
- *
- * find_get_pages() will search for and return a group of up to
- * @nr_pages pages in the mapping.  The pages are placed at @pages.
- * find_get_pages() takes a reference against the returned pages.
- *
- * The search returns a group of mapping-contiguous pages with ascending
- * indexes.  There may be holes in the indices due to not-present pages.
- *
- * find_get_pages() returns the number of pages which were found.
- */
-unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
-			    unsigned int nr_pages, struct page **pages)
-{
-	unsigned int i;
-	unsigned int ret;
-
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup(&mapping->page_tree,
-				(void **)pages, start, nr_pages);
-	for (i = 0; i < ret; i++)
-		page_cache_get(pages[i]);
-	read_unlock_irq(&mapping->tree_lock);
-	return ret;
-}
-EXPORT_SYMBOL(find_get_pages);
-
-/**
- * find_get_pages_contig - gang contiguous pagecache lookup
- * @mapping:	The address_space to search
- * @index:	The starting page index
- * @nr_pages:	The maximum number of pages
- * @pages:	Where the resulting pages are placed
- *
- * find_get_pages_contig() works exactly like find_get_pages(), except
- * that the returned number of pages are guaranteed to be contiguous.
- *
- * find_get_pages_contig() returns the number of pages which were found.
- */
-unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
-			       unsigned int nr_pages, struct page **pages)
-{
-	unsigned int i;
-	unsigned int ret;
-
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup(&mapping->page_tree,
-				(void **)pages, index, nr_pages);
-	for (i = 0; i < ret; i++) {
-		if (pages[i]->mapping == NULL || pages[i]->index != index)
-			break;
-
-		page_cache_get(pages[i]);
-		index++;
-	}
-	read_unlock_irq(&mapping->tree_lock);
-	return i;
-}
-EXPORT_SYMBOL(find_get_pages_tag);
-
-/**
- * find_get_pages_tag - find and return pages that match @tag
- * @mapping:	the address_space to search
- * @index:	the starting page index
- * @tag:	the tag index
- * @nr_pages:	the maximum number of pages
- * @pages:	where the resulting pages are placed
- *
- * Like find_get_pages, except we only return pages which are tagged with
- * @tag.   We update @index to index the next page for the traversal.
- */
-unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
-			int tag, unsigned int nr_pages, struct page **pages)
-{
-	unsigned int i;
-	unsigned int ret;
-
-	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
-				(void **)pages, *index, nr_pages, tag);
-	for (i = 0; i < ret; i++)
-		page_cache_get(pages[i]);
-	if (ret)
-		*index = pages[ret - 1]->index + 1;
-	read_unlock_irq(&mapping->tree_lock);
-	return ret;
-}
-
-/**
- * grab_cache_page_nowait - returns locked page at given index in given cache
- * @mapping: target address_space
- * @index: the page index
- *
- * Same as grab_cache_page, but do not wait if the page is unavailable.
- * This is intended for speculative data generators, where the data can
- * be regenerated if the page couldn't be grabbed.  This routine should
- * be safe to call while holding the lock for another page.
- *
- * Clear __GFP_FS when allocating the page to avoid recursion into the fs
- * and deadlock against the caller's locked page.
- */
-struct page *
-grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
-{
-	struct page *page = find_get_page(mapping, index);
-	gfp_t gfp_mask;
-
-	if (page) {
-		if (!TestSetPageLocked(page))
-			return page;
-		page_cache_release(page);
-		return NULL;
-	}
-	gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS;
-	page = alloc_pages(gfp_mask, 0);
-	if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) {
-		page_cache_release(page);
-		page = NULL;
-	}
-	return page;
-}
-EXPORT_SYMBOL(grab_cache_page_nowait);
-
-/*
- * CD/DVDs are error prone. When a medium error occurs, the driver may fail
- * a _large_ part of the i/o request. Imagine the worst scenario:
- *
- *      ---R__________________________________________B__________
- *         ^ reading here                             ^ bad block(assume 4k)
- *
- * read(R) => miss => readahead(R...B) => media error => frustrating retries
- * => failing the whole request => read(R) => read(R+1) =>
- * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
- * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
- * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
- *
- * It is going insane. Fix it by quickly scaling down the readahead size.
- */
-static void shrink_readahead_size_eio(struct file *filp,
-					struct file_ra_state *ra)
-{
-	if (!ra->ra_pages)
-		return;
-
-	ra->ra_pages /= 4;
-	printk(KERN_WARNING "Reducing readahead size to %luK\n",
-			ra->ra_pages << (PAGE_CACHE_SHIFT - 10));
-}
-
-/**
- * do_generic_mapping_read - generic file read routine
- * @mapping:	address_space to be read
- * @_ra:	file's readahead state
- * @filp:	the file to read
- * @ppos:	current file position
- * @desc:	read_descriptor
- * @actor:	read method
- *
- * This is a generic file read routine, and uses the
- * mapping->a_ops->readpage() function for the actual low-level stuff.
- *
- * This is really ugly. But the goto's actually try to clarify some
- * of the logic when it comes to error handling etc.
- *
- * Note the struct file* is only passed for the use of readpage.
- * It may be NULL.
- */
-void do_generic_mapping_read(struct address_space *mapping,
-			     struct file_ra_state *_ra,
-			     struct file *filp,
-			     loff_t *ppos,
-			     read_descriptor_t *desc,
-			     read_actor_t actor)
-{
-	struct inode *inode = mapping->host;
-	unsigned long index;
-	unsigned long end_index;
-	unsigned long offset;
-	unsigned long last_index;
-	unsigned long next_index;
-	unsigned long prev_index;
-	loff_t isize;
-	struct page *cached_page;
-	struct page *prev_page;
-	int error;
-	struct file_ra_state ra = *_ra;
-
-	cached_page = NULL;
-	prev_page = NULL;
-	index = *ppos >> PAGE_CACHE_SHIFT;
-	next_index = index;
-	prev_index = ra.prev_page;
-	last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
-	offset = *ppos & ~PAGE_CACHE_MASK;
-
-	isize = i_size_read(inode);
-	if (!isize)
-		goto out;
-
-	if (readahead_debug_level >= 5)
-		printk(KERN_DEBUG "read-file(ino=%lu, req=%lu+%lu)\n",
-			inode->i_ino, index, last_index - index);
-
-	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
-	for (;;) {
-		struct page *page;
-		unsigned long nr, ret;
-
-		/* nr is the maximum number of bytes to copy from this page */
-		nr = PAGE_CACHE_SIZE;
-		if (index >= end_index) {
-			if (index > end_index)
-				goto out;
-			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
-			if (nr <= offset) {
-				goto out;
-			}
-		}
-		nr = nr - offset;
-
-		cond_resched();
-
-		if (!prefer_adaptive_readahead() && index == next_index)
-			next_index = page_cache_readahead(mapping, &ra, filp,
-					index, last_index - index);
-
-find_page:
-		page = find_get_page(mapping, index);
-		if (prefer_adaptive_readahead()) {
-			if (unlikely(page == NULL)) {
-				ra.prev_page = prev_index;
-				page_cache_readahead_adaptive(mapping, &ra,
-						filp, prev_page, NULL,
-						*ppos >> PAGE_CACHE_SHIFT,
-						index, last_index);
-				page = find_get_page(mapping, index);
-			} else if (PageReadahead(page)) {
-				ra.prev_page = prev_index;
-				page_cache_readahead_adaptive(mapping, &ra,
-						filp, prev_page, page,
-						*ppos >> PAGE_CACHE_SHIFT,
-						index, last_index);
-			}
-		}
-		if (unlikely(page == NULL)) {
-			if (!prefer_adaptive_readahead())
-				handle_ra_miss(mapping, &ra, index);
-			goto no_cached_page;
-		}
-
-		if (prev_page)
-			page_cache_release(prev_page);
-		prev_page = page;
-
-		if (prefer_adaptive_readahead())
-			readahead_cache_hit(&ra, page);
-
-		if (readahead_debug_level >= 7)
-			printk(KERN_DEBUG "read-page(ino=%lu, idx=%lu, io=%s)\n",
-				inode->i_ino, index,
-				PageUptodate(page) ? "hit" : "miss");
-
-		if (!PageUptodate(page))
-			goto page_not_up_to_date;
-page_ok:
-
-		/* If users can be writing to this page using arbitrary
-		 * virtual addresses, take care about potential aliasing
-		 * before reading the page on the kernel side.
-		 */
-		if (mapping_writably_mapped(mapping))
-			flush_dcache_page(page);
-
-		/*
-		 * When (part of) the same page is read multiple times
-		 * in succession, only mark it as accessed the first time.
-		 */
-		if (prev_index != index)
-			mark_page_accessed(page);
-		prev_index = index;
-
-		/*
-		 * Ok, we have the page, and it's up-to-date, so
-		 * now we can copy it to user space...
-		 *
-		 * The actor routine returns how many bytes were actually used..
-		 * NOTE! This may not be the same as how much of a user buffer
-		 * we filled up (we may be padding etc), so we can only update
-		 * "pos" here (the actor routine has to update the user buffer
-		 * pointers and the remaining count).
-		 */
-		ret = actor(desc, page, offset, nr);
-		offset += ret;
-		index += offset >> PAGE_CACHE_SHIFT;
-		offset &= ~PAGE_CACHE_MASK;
-
-		if (ret == nr && desc->count)
-			continue;
-		goto out;
-
-page_not_up_to_date:
-		/* Get exclusive access to the page ... */
-		lock_page(page);
-
-		/* Did it get unhashed before we got the lock? */
-		if (!page->mapping) {
-			unlock_page(page);
-			continue;
-		}
-
-		/* Did somebody else fill it already? */
-		if (PageUptodate(page)) {
-			unlock_page(page);
-			goto page_ok;
-		}
-
-readpage:
-		/* Start the actual read. The read will unlock the page. */
-		error = mapping->a_ops->readpage(filp, page);
-
-		if (unlikely(error)) {
-			if (error == AOP_TRUNCATED_PAGE) {
-				page_cache_release(page);
-				goto find_page;
-			}
-			goto readpage_error;
-		}
-
-		if (!PageUptodate(page)) {
-			lock_page(page);
-			if (!PageUptodate(page)) {
-				if (page->mapping == NULL) {
-					/*
-					 * invalidate_inode_pages got it
-					 */
-					unlock_page(page);
-					goto find_page;
-				}
-				unlock_page(page);
-				error = -EIO;
-				shrink_readahead_size_eio(filp, &ra);
-				goto readpage_error;
-			}
-			unlock_page(page);
-		}
-
-		/*
-		 * i_size must be checked after we have done ->readpage.
-		 *
-		 * Checking i_size after the readpage allows us to calculate
-		 * the correct value for "nr", which means the zero-filled
-		 * part of the page is not copied back to userspace (unless
-		 * another truncate extends the file - this is desired though).
-		 */
-		isize = i_size_read(inode);
-		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
-		if (unlikely(!isize || index > end_index)) {
-			goto out;
-		}
-
-		/* nr is the maximum number of bytes to copy from this page */
-		nr = PAGE_CACHE_SIZE;
-		if (index == end_index) {
-			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
-			if (nr <= offset) {
-				goto out;
-			}
-		}
-		nr = nr - offset;
-		goto page_ok;
-
-readpage_error:
-		/* UHHUH! A synchronous read error occurred. Report it */
-		desc->error = error;
-		goto out;
-
-no_cached_page:
-		/*
-		 * Ok, it wasn't cached, so we need to create a new
-		 * page..
-		 */
-		if (!cached_page) {
-			cached_page = page_cache_alloc_cold(mapping);
-			if (!cached_page) {
-				desc->error = -ENOMEM;
-				goto out;
-			}
-		}
-		error = add_to_page_cache_lru(cached_page, mapping,
-						index, GFP_KERNEL);
-		if (error) {
-			if (error == -EEXIST)
-				goto find_page;
-			desc->error = error;
-			goto out;
-		}
-		page = cached_page;
-		cached_page = NULL;
-		if (prev_page)
-			page_cache_release(prev_page);
-		prev_page = page;
-		goto readpage;
-	}
-
-out:
-	*_ra = ra;
-	if (prefer_adaptive_readahead())
-		_ra->prev_page = prev_index;
-
-	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
-	if (cached_page)
-		page_cache_release(cached_page);
-	if (prev_page)
-		page_cache_release(prev_page);
-	if (filp)
-		file_accessed(filp);
-}
-EXPORT_SYMBOL(do_generic_mapping_read);
-
-int file_read_actor(read_descriptor_t *desc, struct page *page,
-			unsigned long offset, unsigned long size)
-{
-	char *kaddr;
-	unsigned long left, count = desc->count;
-
-	if (size > count)
-		size = count;
-
-	/*
-	 * Faults on the destination of a read are common, so do it before
-	 * taking the kmap.
-	 */
-	if (!fault_in_pages_writeable(desc->arg.buf, size)) {
-		kaddr = kmap_atomic(page, KM_USER0);
-		left = __copy_to_user_inatomic(desc->arg.buf,
-						kaddr + offset, size);
-		kunmap_atomic(kaddr, KM_USER0);
-		if (left == 0)
-			goto success;
-	}
-
-	/* Do it the slow way */
-	kaddr = kmap(page);
-	left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
-	kunmap(page);
-
-	if (left) {
-		size -= left;
-		desc->error = -EFAULT;
-	}
-success:
-	desc->count = count - size;
-	desc->written += size;
-	desc->arg.buf += size;
-	return size;
-}
-EXPORT_SYMBOL_GPL(file_read_actor);
-
-/**
- * __generic_file_aio_read - generic filesystem read routine
- * @iocb:	kernel I/O control block
- * @iov:	io vector request
- * @nr_segs:	number of segments in the iovec
- * @ppos:	current file position
- *
- * This is the "read()" routine for all filesystems
- * that can use the page cache directly.
- */
-ssize_t
-generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos)
-{
-	struct file *filp = iocb->ki_filp;
-	ssize_t retval;
-	unsigned long seg;
-	size_t count;
-	loff_t *ppos = &iocb->ki_pos;
-
-	count = 0;
-	for (seg = 0; seg < nr_segs; seg++) {
-		const struct iovec *iv = &iov[seg];
-
-		/*
-		 * If any segment has a negative length, or the cumulative
-		 * length ever wraps negative then return -EINVAL.
-		 */
-		count += iv->iov_len;
-		if (unlikely((ssize_t)(count|iv->iov_len) < 0))
-			return -EINVAL;
-		if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
-			continue;
-		if (seg == 0)
-			return -EFAULT;
-		nr_segs = seg;
-		count -= iv->iov_len;	/* This segment is no good */
-		break;
-	}
-
-	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
-	if (filp->f_flags & O_DIRECT) {
-		loff_t size;
-		struct address_space *mapping;
-		struct inode *inode;
-
-		mapping = filp->f_mapping;
-		inode = mapping->host;
-		retval = 0;
-		if (!count)
-			goto out; /* skip atime */
-		size = i_size_read(inode);
-		if (pos < size) {
-			retval = generic_file_direct_IO(READ, iocb,
-						iov, pos, nr_segs);
-			if (retval > 0 && !is_sync_kiocb(iocb))
-				retval = -EIOCBQUEUED;
-			if (retval > 0)
-				*ppos = pos + retval;
-		}
-		file_accessed(filp);
-		goto out;
-	}
-
-	retval = 0;
-	if (count) {
-		for (seg = 0; seg < nr_segs; seg++) {
-			read_descriptor_t desc;
-
-			desc.written = 0;
-			desc.arg.buf = iov[seg].iov_base;
-			desc.count = iov[seg].iov_len;
-			if (desc.count == 0)
-				continue;
-			desc.error = 0;
-			do_generic_file_read(filp,ppos,&desc,file_read_actor);
-			retval += desc.written;
-			if (desc.error) {
-				retval = retval ?: desc.error;
-				break;
-			}
-		}
-	}
-out:
-	return retval;
-}
-EXPORT_SYMBOL(generic_file_aio_read);
-
-int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
-{
-	ssize_t written;
-	unsigned long count = desc->count;
-	struct file *file = desc->arg.data;
-
-	if (size > count)
-		size = count;
-
-	written = file->f_op->sendpage(file, page, offset,
-				       size, &file->f_pos, size<count);
-	if (written < 0) {
-		desc->error = written;
-		written = 0;
-	}
-	desc->count = count - written;
-	desc->written += written;
-	return written;
-}
-
-ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
-			 size_t count, read_actor_t actor, void *target)
-{
-	read_descriptor_t desc;
-
-	if (!count)
-		return 0;
-
-	desc.written = 0;
-	desc.count = count;
-	desc.arg.data = target;
-	desc.error = 0;
-
-	do_generic_file_read(in_file, ppos, &desc, actor);
-	if (desc.written)
-		return desc.written;
-	return desc.error;
-}
-EXPORT_SYMBOL(generic_file_sendfile);
-
-static ssize_t
-do_readahead(struct address_space *mapping, struct file *filp,
-	     unsigned long index, unsigned long nr)
-{
-	if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
-		return -EINVAL;
-
-	force_page_cache_readahead(mapping, filp, index,
-					max_sane_readahead(nr));
-	return 0;
-}
-
-asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
-{
-	ssize_t ret;
-	struct file *file;
-
-	ret = -EBADF;
-	file = fget(fd);
-	if (file) {
-		if (file->f_mode & FMODE_READ) {
-			struct address_space *mapping = file->f_mapping;
-			unsigned long start = offset >> PAGE_CACHE_SHIFT;
-			unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
-			unsigned long len = end - start + 1;
-			ret = do_readahead(mapping, file, start, len);
-		}
-		fput(file);
-	}
-	return ret;
-}
-
-#ifdef CONFIG_MMU
-static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
-/**
- * page_cache_read - adds requested page to the page cache if not already there
- * @file:	file to read
- * @offset:	page index
- *
- * This adds the requested page to the page cache if it isn't already there,
- * and schedules an I/O to read in its contents from disk.
- */
-static int fastcall page_cache_read(struct file * file, unsigned long offset)
-{
-	struct address_space *mapping = file->f_mapping;
-	struct page *page; 
-	int ret;
-
-	do {
-		page = page_cache_alloc_cold(mapping);
-		if (!page)
-			return -ENOMEM;
-
-		ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
-		if (ret == 0)
-			ret = mapping->a_ops->readpage(file, page);
-		else if (ret == -EEXIST)
-			ret = 0; /* losing race to add is OK */
-
-		page_cache_release(page);
-
-	} while (ret == AOP_TRUNCATED_PAGE);
-		
-	return ret;
-}
-
-#define MMAP_LOTSAMISS  (100)
-
-/**
- * filemap_nopage - read in file data for page fault handling
- * @area:	the applicable vm_area
- * @address:	target address to read in
- * @type:	returned with VM_FAULT_{MINOR,MAJOR} if not %NULL
- *
- * filemap_nopage() is invoked via the vma operations vector for a
- * mapped memory region to read in file data during a page fault.
- *
- * The goto's are kind of ugly, but this streamlines the normal case of having
- * it in the page cache, and handles the special cases reasonably without
- * having a lot of duplicated code.
- */
-struct page *filemap_nopage(struct vm_area_struct *area,
-				unsigned long address, int *type)
-{
-	int error;
-	struct file *file = area->vm_file;
-	struct address_space *mapping = file->f_mapping;
-	struct file_ra_state *ra = &file->f_ra;
-	struct inode *inode = mapping->host;
-	struct page *page;
-	unsigned long size, pgoff;
-	int did_readaround = 0, majmin = VM_FAULT_MINOR;
-
-	ra->flags |= RA_FLAG_MMAP;
-	pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
-
-retry_all:
-	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (pgoff >= size)
-		goto outside_data_content;
-
-	/* If we don't want any read-ahead, don't bother */
-	if (VM_RandomReadHint(area))
-		goto no_cached_page;
-
-	/*
-	 * The readahead code wants to be told about each and every page
-	 * so it can build and shrink its windows appropriately
-	 *
-	 * For sequential accesses, we use the generic readahead logic.
-	 */
-	if (!prefer_adaptive_readahead() && VM_SequentialReadHint(area))
-		page_cache_readahead(mapping, ra, file, pgoff, 1);
-
-	/*
-	 * Do we have something in the page cache already?
-	 */
-retry_find:
-	page = find_get_page(mapping, pgoff);
-	if (prefer_adaptive_readahead() && VM_SequentialReadHint(area)) {
-		if (!page) {
-			page_cache_readahead_adaptive(mapping, ra,
-						file, NULL, NULL,
-						pgoff, pgoff, pgoff + 1);
-			page = find_get_page(mapping, pgoff);
-		} else if (PageReadahead(page)) {
-			page_cache_readahead_adaptive(mapping, ra,
-						file, NULL, page,
-						pgoff, pgoff, pgoff + 1);
-		}
-	}
-	if (!page) {
-		unsigned long ra_pages;
-
-		if (VM_SequentialReadHint(area)) {
-			if (!prefer_adaptive_readahead())
-				handle_ra_miss(mapping, ra, pgoff);
-			goto no_cached_page;
-		}
-		ra->mmap_miss++;
-
-		/*
-		 * Do we miss much more than hit in this file? If so,
-		 * stop bothering with read-ahead. It will only hurt.
-		 */
-		if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS)
-			goto no_cached_page;
-
-		/*
-		 * To keep the pgmajfault counter straight, we need to
-		 * check did_readaround, as this is an inner loop.
-		 */
-		if (!did_readaround) {
-			majmin = VM_FAULT_MAJOR;
-			count_vm_event(PGMAJFAULT);
-		}
-		did_readaround = 1;
-		ra_pages = max_sane_readahead(file->f_ra.ra_pages);
-		if (ra_pages) {
-			pgoff_t start = 0;
-
-			if (pgoff > ra_pages / 2)
-				start = pgoff - ra_pages / 2;
-			do_page_cache_readahead(mapping, file, start, ra_pages);
-		}
-		page = find_get_page(mapping, pgoff);
-		if (!page)
-			goto no_cached_page;
-	}
-
-	if (!did_readaround)
-		ra->mmap_hit++;
-
-	if (prefer_adaptive_readahead())
-		readahead_cache_hit(ra, page);
-
-	if (readahead_debug_level >= 6)
-		printk(KERN_DEBUG "read-mmap(ino=%lu, idx=%lu, hint=%s, io=%s)\n",
-			inode->i_ino, pgoff,
-			VM_RandomReadHint(area) ? "random" :
-			(VM_SequentialReadHint(area) ? "sequential" : "none"),
-			PageUptodate(page) ? "hit" : "miss");
-
-	/*
-	 * Ok, found a page in the page cache, now we need to check
-	 * that it's up-to-date.
-	 */
-	if (!PageUptodate(page))
-		goto page_not_uptodate;
-
-success:
-	/*
-	 * Found the page and have a reference on it.
-	 */
-	mark_page_accessed(page);
-	if (type)
-		*type = majmin;
-	if (prefer_adaptive_readahead())
-		ra->prev_page = page->index;
-	return page;
-
-outside_data_content:
-	/*
-	 * An external ptracer can access pages that normally aren't
-	 * accessible..
-	 */
-	if (area->vm_mm == current->mm)
-		return NULL;
-	/* Fall through to the non-read-ahead case */
-no_cached_page:
-	/*
-	 * We're only likely to ever get here if MADV_RANDOM is in
-	 * effect.
-	 */
-	error = page_cache_read(file, pgoff);
-	grab_swap_token();
-
-	/*
-	 * The page we want has now been added to the page cache.
-	 * In the unlikely event that someone removed it in the
-	 * meantime, we'll just come back here and read it again.
-	 */
-	if (error >= 0)
-		goto retry_find;
-
-	/*
-	 * An error return from page_cache_read can result if the
-	 * system is low on memory, or a problem occurs while trying
-	 * to schedule I/O.
-	 */
-	if (error == -ENOMEM)
-		return NOPAGE_OOM;
-	return NULL;
-
-page_not_uptodate:
-	if (!did_readaround) {
-		majmin = VM_FAULT_MAJOR;
-		count_vm_event(PGMAJFAULT);
-	}
-	lock_page(page);
-
-	/* Did it get unhashed while we waited for it? */
-	if (!page->mapping) {
-		unlock_page(page);
-		page_cache_release(page);
-		goto retry_all;
-	}
-
-	/* Did somebody else get it up-to-date? */
-	if (PageUptodate(page)) {
-		unlock_page(page);
-		goto success;
-	}
-
-	error = mapping->a_ops->readpage(file, page);
-	if (!error) {
-		wait_on_page_locked(page);
-		if (PageUptodate(page))
-			goto success;
-	} else if (error == AOP_TRUNCATED_PAGE) {
-		page_cache_release(page);
-		goto retry_find;
-	}
-
-	/*
-	 * Umm, take care of errors if the page isn't up-to-date.
-	 * Try to re-read it _once_. We do this synchronously,
-	 * because there really aren't any performance issues here
-	 * and we need to check for errors.
-	 */
-	lock_page(page);
-
-	/* Somebody truncated the page on us? */
-	if (!page->mapping) {
-		unlock_page(page);
-		page_cache_release(page);
-		goto retry_all;
-	}
-
-	/* Somebody else successfully read it in? */
-	if (PageUptodate(page)) {
-		unlock_page(page);
-		goto success;
-	}
-	ClearPageError(page);
-	error = mapping->a_ops->readpage(file, page);
-	if (!error) {
-		wait_on_page_locked(page);
-		if (PageUptodate(page))
-			goto success;
-	} else if (error == AOP_TRUNCATED_PAGE) {
-		page_cache_release(page);
-		goto retry_find;
-	}
-
-	/*
-	 * Things didn't work out. Return zero to tell the
-	 * mm layer so, possibly freeing the page cache page first.
-	 */
-	shrink_readahead_size_eio(file, ra);
-	page_cache_release(page);
-	return NULL;
-}
-EXPORT_SYMBOL(filemap_nopage);
-
-static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
-					int nonblock)
-{
-	struct address_space *mapping = file->f_mapping;
-	struct page *page;
-	int error;
-
-	/*
-	 * Do we have something in the page cache already?
-	 */
-retry_find:
-	page = find_get_page(mapping, pgoff);
-	if (!page) {
-		if (nonblock)
-			return NULL;
-		goto no_cached_page;
-	}
-
-	/*
-	 * Ok, found a page in the page cache, now we need to check
-	 * that it's up-to-date.
-	 */
-	if (!PageUptodate(page)) {
-		if (nonblock) {
-			page_cache_release(page);
-			return NULL;
-		}
-		goto page_not_uptodate;
-	}
-
-success:
-	/*
-	 * Found the page and have a reference on it.
-	 */
-	mark_page_accessed(page);
-	return page;
-
-no_cached_page:
-	error = page_cache_read(file, pgoff);
-
-	/*
-	 * The page we want has now been added to the page cache.
-	 * In the unlikely event that someone removed it in the
-	 * meantime, we'll just come back here and read it again.
-	 */
-	if (error >= 0)
-		goto retry_find;
-
-	/*
-	 * An error return from page_cache_read can result if the
-	 * system is low on memory, or a problem occurs while trying
-	 * to schedule I/O.
-	 */
-	return NULL;
-
-page_not_uptodate:
-	lock_page(page);
-
-	/* Did it get unhashed while we waited for it? */
-	if (!page->mapping) {
-		unlock_page(page);
-		goto err;
-	}
-
-	/* Did somebody else get it up-to-date? */
-	if (PageUptodate(page)) {
-		unlock_page(page);
-		goto success;
-	}
-
-	error = mapping->a_ops->readpage(file, page);
-	if (!error) {
-		wait_on_page_locked(page);
-		if (PageUptodate(page))
-			goto success;
-	} else if (error == AOP_TRUNCATED_PAGE) {
-		page_cache_release(page);
-		goto retry_find;
-	}
-
-	/*
-	 * Umm, take care of errors if the page isn't up-to-date.
-	 * Try to re-read it _once_. We do this synchronously,
-	 * because there really aren't any performance issues here
-	 * and we need to check for errors.
-	 */
-	lock_page(page);
-
-	/* Somebody truncated the page on us? */
-	if (!page->mapping) {
-		unlock_page(page);
-		goto err;
-	}
-	/* Somebody else successfully read it in? */
-	if (PageUptodate(page)) {
-		unlock_page(page);
-		goto success;
-	}
-
-	ClearPageError(page);
-	error = mapping->a_ops->readpage(file, page);
-	if (!error) {
-		wait_on_page_locked(page);
-		if (PageUptodate(page))
-			goto success;
-	} else if (error == AOP_TRUNCATED_PAGE) {
-		page_cache_release(page);
-		goto retry_find;
-	}
-
-	/*
-	 * Things didn't work out. Return zero to tell the
-	 * mm layer so, possibly freeing the page cache page first.
-	 */
-err:
-	page_cache_release(page);
-
-	return NULL;
-}
-
-int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
-		unsigned long len, pgprot_t prot, unsigned long pgoff,
-		int nonblock)
-{
-	struct file *file = vma->vm_file;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
-	unsigned long size;
-	struct mm_struct *mm = vma->vm_mm;
-	struct page *page;
-	int err;
-
-	if (!nonblock)
-		force_page_cache_readahead(mapping, vma->vm_file,
-					pgoff, len >> PAGE_CACHE_SHIFT);
-
-repeat:
-	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (pgoff + (len >> PAGE_CACHE_SHIFT) > size)
-		return -EINVAL;
-
-	page = filemap_getpage(file, pgoff, nonblock);
-
-	/* XXX: This is wrong, a filesystem I/O error may have happened. Fix that as
-	 * done in shmem_populate calling shmem_getpage */
-	if (!page && !nonblock)
-		return -ENOMEM;
-
-	if (page) {
-		err = install_page(mm, vma, addr, page, prot);
-		if (err) {
-			page_cache_release(page);
-			return err;
-		}
-	} else if (vma->vm_flags & VM_NONLINEAR) {
-		/* No page was found just because we can't read it in now (being
-		 * here implies nonblock != 0), but the page may exist, so set
-		 * the PTE to fault it in later. */
-		err = install_file_pte(mm, vma, addr, pgoff, prot);
-		if (err)
-			return err;
-	}
-
-	len -= PAGE_SIZE;
-	addr += PAGE_SIZE;
-	pgoff++;
-	if (len)
-		goto repeat;
-
-	return 0;
-}
-EXPORT_SYMBOL(filemap_populate);
-
-struct vm_operations_struct generic_file_vm_ops = {
-	.nopage		= filemap_nopage,
-	.populate	= filemap_populate,
-};
-
-/* This is used for a general mmap of a disk file */
-
-int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
-{
-	struct address_space *mapping = file->f_mapping;
-
-	if (!mapping->a_ops->readpage)
-		return -ENOEXEC;
-	file_accessed(file);
-	vma->vm_ops = &generic_file_vm_ops;
-	return 0;
-}
-
-/*
- * This is for filesystems which do not implement ->writepage.
- */
-int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
-		return -EINVAL;
-	return generic_file_mmap(file, vma);
-}
-#else
-int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
-{
-	return -ENOSYS;
-}
-int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
-{
-	return -ENOSYS;
-}
-#endif /* CONFIG_MMU */
-
-EXPORT_SYMBOL(generic_file_mmap);
-EXPORT_SYMBOL(generic_file_readonly_mmap);
-
-static inline struct page *__read_cache_page(struct address_space *mapping,
-				unsigned long index,
-				int (*filler)(void *,struct page*),
-				void *data)
-{
-	struct page *page, *cached_page = NULL;
-	int err;
-repeat:
-	page = find_get_page(mapping, index);
-	if (!page) {
-		if (!cached_page) {
-			cached_page = page_cache_alloc_cold(mapping);
-			if (!cached_page)
-				return ERR_PTR(-ENOMEM);
-		}
-		err = add_to_page_cache_lru(cached_page, mapping,
-					index, GFP_KERNEL);
-		if (err == -EEXIST)
-			goto repeat;
-		if (err < 0) {
-			/* Presumably ENOMEM for radix tree node */
-			page_cache_release(cached_page);
-			return ERR_PTR(err);
-		}
-		page = cached_page;
-		cached_page = NULL;
-		err = filler(data, page);
-		if (err < 0) {
-			page_cache_release(page);
-			page = ERR_PTR(err);
-		}
-	}
-	if (cached_page)
-		page_cache_release(cached_page);
-	return page;
-}
-
-/**
- * read_cache_page - read into page cache, fill it if needed
- * @mapping:	the page's address_space
- * @index:	the page index
- * @filler:	function to perform the read
- * @data:	destination for read data
- *
- * Read into the page cache. If a page already exists,
- * and PageUptodate() is not set, try to fill the page.
- */
-struct page *read_cache_page(struct address_space *mapping,
-				unsigned long index,
-				int (*filler)(void *,struct page*),
-				void *data)
-{
-	struct page *page;
-	int err;
-
-retry:
-	page = __read_cache_page(mapping, index, filler, data);
-	if (IS_ERR(page))
-		goto out;
-	mark_page_accessed(page);
-	if (PageUptodate(page))
-		goto out;
-
-	lock_page(page);
-	if (!page->mapping) {
-		unlock_page(page);
-		page_cache_release(page);
-		goto retry;
-	}
-	if (PageUptodate(page)) {
-		unlock_page(page);
-		goto out;
-	}
-	err = filler(data, page);
-	if (err < 0) {
-		page_cache_release(page);
-		page = ERR_PTR(err);
-	}
- out:
-	return page;
-}
-EXPORT_SYMBOL(read_cache_page);
-
-/*
- * If the page was newly created, increment its refcount and add it to the
- * caller's lru-buffering pagevec.  This function is specifically for
- * generic_file_write().
- */
-static inline struct page *
-__grab_cache_page(struct address_space *mapping, unsigned long index,
-			struct page **cached_page, struct pagevec *lru_pvec)
-{
-	int err;
-	struct page *page;
-repeat:
-	page = find_lock_page(mapping, index);
-	if (!page) {
-		if (!*cached_page) {
-			*cached_page = page_cache_alloc(mapping);
-			if (!*cached_page)
-				return NULL;
-		}
-		err = add_to_page_cache(*cached_page, mapping,
-					index, GFP_KERNEL);
-		if (err == -EEXIST)
-			goto repeat;
-		if (err == 0) {
-			page = *cached_page;
-			page_cache_get(page);
-			if (!pagevec_add(lru_pvec, page))
-				__pagevec_lru_add(lru_pvec);
-			*cached_page = NULL;
-		}
-	}
-	return page;
-}
-
-/*
- * The logic we want is
- *
- *	if suid or (sgid and xgrp)
- *		remove privs
- */
-int remove_suid(struct dentry *dentry)
-{
-	mode_t mode = dentry->d_inode->i_mode;
-	int kill = 0;
-	int result = 0;
-
-	/* suid always must be killed */
-	if (unlikely(mode & S_ISUID))
-		kill = ATTR_KILL_SUID;
-
-	/*
-	 * sgid without any exec bits is just a mandatory locking mark; leave
-	 * it alone.  If some exec bits are set, it's a real sgid; kill it.
-	 */
-	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
-		kill |= ATTR_KILL_SGID;
-
-	if (unlikely(kill && !capable(CAP_FSETID))) {
-		struct iattr newattrs;
-
-		newattrs.ia_valid = ATTR_FORCE | kill;
-		result = notify_change(dentry, &newattrs);
-	}
-	return result;
-}
-EXPORT_SYMBOL(remove_suid);
-
-size_t
-__filemap_copy_from_user_iovec_inatomic(char *vaddr,
-			const struct iovec *iov, size_t base, size_t bytes)
-{
-	size_t copied = 0, left = 0;
-
-	while (bytes) {
-		char __user *buf = iov->iov_base + base;
-		int copy = min(bytes, iov->iov_len - base);
-
-		base = 0;
-		left = __copy_from_user_inatomic_nocache(vaddr, buf, copy);
-		copied += copy;
-		bytes -= copy;
-		vaddr += copy;
-		iov++;
-
-		if (unlikely(left))
-			break;
-	}
-	return copied - left;
-}
-
-/*
- * Performs necessary checks before doing a write
- *
- * Can adjust writing position or amount of bytes to write.
- * Returns appropriate error code that caller should return or
- * zero in case that write should be allowed.
- */
-inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
-{
-	struct inode *inode = file->f_mapping->host;
-	unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-
-        if (unlikely(*pos < 0))
-                return -EINVAL;
-
-	if (!isblk) {
-		/* FIXME: this is for backwards compatibility with 2.4 */
-		if (file->f_flags & O_APPEND)
-                        *pos = i_size_read(inode);
-
-		if (limit != RLIM_INFINITY) {
-			if (*pos >= limit) {
-				send_sig(SIGXFSZ, current, 0);
-				return -EFBIG;
-			}
-			if (*count > limit - (typeof(limit))*pos) {
-				*count = limit - (typeof(limit))*pos;
-			}
-		}
-	}
-
-	/*
-	 * LFS rule
-	 */
-	if (unlikely(*pos + *count > MAX_NON_LFS &&
-				!(file->f_flags & O_LARGEFILE))) {
-		if (*pos >= MAX_NON_LFS) {
-			send_sig(SIGXFSZ, current, 0);
-			return -EFBIG;
-		}
-		if (*count > MAX_NON_LFS - (unsigned long)*pos) {
-			*count = MAX_NON_LFS - (unsigned long)*pos;
-		}
-	}
-
-	/*
-	 * Are we about to exceed the fs block limit ?
-	 *
-	 * If we have written data it becomes a short write.  If we have
-	 * exceeded without writing data we send a signal and return EFBIG.
-	 * Linus frestrict idea will clean these up nicely..
-	 */
-	if (likely(!isblk)) {
-		if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
-			if (*count || *pos > inode->i_sb->s_maxbytes) {
-				send_sig(SIGXFSZ, current, 0);
-				return -EFBIG;
-			}
-			/* zero-length writes at ->s_maxbytes are OK */
-		}
-
-		if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
-			*count = inode->i_sb->s_maxbytes - *pos;
-	} else {
-		loff_t isize;
-		if (bdev_read_only(I_BDEV(inode)))
-			return -EPERM;
-		isize = i_size_read(inode);
-		if (*pos >= isize) {
-			if (*count || *pos > isize)
-				return -ENOSPC;
-		}
-
-		if (*pos + *count > isize)
-			*count = isize - *pos;
-	}
-	return 0;
-}
-EXPORT_SYMBOL(generic_write_checks);
-
-ssize_t
-generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
-		size_t count, size_t ocount)
-{
-	struct file	*file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct inode	*inode = mapping->host;
-	ssize_t		written;
-
-	if (count != ocount)
-		*nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
-
-	written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs);
-	if (written > 0) {
-		loff_t end = pos + written;
-		if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
-			i_size_write(inode,  end);
-			mark_inode_dirty(inode);
-		}
-		*ppos = end;
-	}
-
-	/*
-	 * Sync the fs metadata but not the minor inode changes and
-	 * of course not the data as we did direct DMA for the IO.
-	 * i_mutex is held, which protects generic_osync_inode() from
-	 * livelocking.
-	 */
-	if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
-		if (err < 0)
-			written = err;
-	}
-	if (written == count && !is_sync_kiocb(iocb))
-		written = -EIOCBQUEUED;
-	return written;
-}
-EXPORT_SYMBOL(generic_file_direct_write);
-
-/**
- * write_actor - copy data from user buffer
- * @page: the page to copy data to
- * @offset: offset within the page
- * @bytes: number of bytes to copy
- * @desc: pointer to user buffer is obtained from here
- *
- * This is used to copy data from user buffer into @page in case of i/o vector
- * has 1 segment. In case of write, in short.
- */
-static size_t write_actor(struct page *page, unsigned long offset,
-			  size_t bytes, const write_descriptor_t *desc)
-{
-	return filemap_copy_from_user(page, offset, desc->buf, bytes);
-}
-
-/**
- * write_iovec_actor - copy data from i/o vector
- * @page: the page to copy data to
- * @offset: offset within the page
- * @bytes: number of bytes to copy
- * @desc: current iovec and offset in it are obtained from here
- *
- * This is used to copy data from user buffer into @page in case of i/o vector
- * has more than segment. In case of writev, in short.
- */
-static size_t write_iovec_actor(struct page *page, unsigned long offset,
- 				size_t bytes, const write_descriptor_t *desc)
-{
- 	return filemap_copy_from_user_iovec(page, offset, desc->cur_iov,
-					    desc->iov_off, bytes);
-}
-
-/**
- * generic_batch_write - generic implementation of batched write
- * @file: the file to write to
- * @desc: set of write arguments
- * @lru_pvec: multipage container to batch adding pages to LRU list
- * @cached_page: allocated but not used on previous call
- * @written: returned number of bytes successfully written
- *
- * This implementation of batch_write method writes not more than one page of
- * file. It faults in user space, allocates page and calls prepare_write and
- * commit_write address space operations. User data are copied by an actor
- * which is set by caller depending on whether write or writev is on the way.
- */
-static long generic_batch_write(struct file *file,
- 				const write_descriptor_t *desc,
- 				struct pagevec *lru_pvec,
- 				struct page **cached_page, size_t *written)
-{
- 	const struct address_space_operations *a_ops = file->f_mapping->a_ops;
-	struct page *page;
-	unsigned long index;
-	size_t bytes;
-	unsigned long offset;
-	long status;
-
-	/* offset within page write is to start at */
-	offset = (desc->pos & (PAGE_CACHE_SIZE - 1));
-
-	/* index of page we are to write to */
-	index = desc->pos >> PAGE_CACHE_SHIFT;
-
-	/* number of bytes which can be written to the page */
-	bytes = PAGE_CACHE_SIZE - offset;
-
-	/* Limit the size of the copy to the caller's write size */
-	bytes = min(bytes, desc->count);
-
-	/*
-	 * Limit the size of the copy to that of the current segment,
-	 * because fault_in_pages_readable() doesn't know how to walk
-	 * segments.
-	 */
-	bytes = min(bytes, desc->cur_iov->iov_len - desc->iov_off);
-
-	while (1) {
-		/*
-		 * Bring in the user page that we will copy from _first_.
-		 * Otherwise there's a nasty deadlock on copying from the
-		 * same page as we're writing to, without it being marked
-		 * up-to-date.
-		 */
-		fault_in_pages_readable(desc->buf, bytes);
-
-		page = __grab_cache_page(file->f_mapping, index, cached_page,
-					 lru_pvec);
-		if (!page)
-			return -ENOMEM;
-
-		status = a_ops->prepare_write(file, page, offset,
-					      offset+bytes);
-		if (unlikely(status)) {
-			loff_t isize = i_size_read(file->f_mapping->host);
-
-			if (status != AOP_TRUNCATED_PAGE)
-				unlock_page(page);
-			page_cache_release(page);
-			if (status == AOP_TRUNCATED_PAGE)
-				continue;
-			/*
-			 * prepare_write() may have instantiated a few
-			 * blocks outside i_size.  Trim these off
-			 * again.
-			 */
-			if (desc->pos + bytes > isize)
-				vmtruncate(file->f_mapping->host, isize);
-			return status;
-		}
-
-		/*
-		 * call write actor in order to copy user data to the
-		 * page
-		 */
-		*written = desc->actor(page, offset, bytes, desc);
-
-		flush_dcache_page(page);
-		status = a_ops->commit_write(file, page, offset, offset+bytes);
-		if (status == AOP_TRUNCATED_PAGE) {
-			page_cache_release(page);
-			continue;
-		}
-
-		unlock_page(page);
-		mark_page_accessed(page);
-		page_cache_release(page);
-		break;
-	}
-	/*
-	 * If commit_write returned error - write failed and we zero
-	 * number of written bytes. If write_actor copied less than it
-	 * was asked to we return -EFAULT and number of bytes
-	 * actually written.
-	 */
-	if (status)
-		*written = 0;
-	else if (*written != bytes)
-		status = -EFAULT;
-	return status;
-}
-
-ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
-			    unsigned long nr_segs, loff_t pos, loff_t *ppos,
-			    size_t count, ssize_t written)
-{
-	struct file *file = iocb->ki_filp;
-	struct address_space * mapping = file->f_mapping;
-	const struct address_space_operations *a_ops = mapping->a_ops;
-	struct inode 	*inode = mapping->host;
-	long		status;
-	struct page	*cached_page = NULL;
-	struct pagevec	lru_pvec;
-	write_descriptor_t desc;
-	size_t copied = 0;
-
-	pagevec_init(&lru_pvec, 0);
-
-	/*
-	 * initialize write descriptor fields: position to write to
-	 * and number of bytes to write
-	 */
-	desc.pos = pos;
-	desc.count = count;
-
-	/*
-	 * handle partial DIO write.  Adjust cur_iov if needed.
-	 */
-	if (likely(nr_segs == 1)) {
-		desc.cur_iov = iov;
-		desc.iov_off = written;
-		desc.actor = write_actor;
-	} else {
-		filemap_set_next_iovec(&desc.cur_iov, &desc.iov_off, written);
-		desc.actor = write_iovec_actor;
-	}
-	/* pointer to user buffer */
-	desc.buf = desc.cur_iov->iov_base + desc.iov_off;
-
-	do {
-		/*
-		 * When calling the filesystem for writes, there is processing
-		 * that must be done:
-		 * 1) per word
-		 * 2) per page
-		 * 3) per call to the FS
-		 * If the FS is called per page, then it turns out that 3)
-		 * costs more than 1) and 2) for sophisticated filesystems.  To
-		 * allow the FS to choose to pay the cost of 3) only once we
-		 * call batch_write, if the FS supports it.
-		 */
-		if (a_ops->batch_write)
-			status = a_ops->batch_write(file, &desc, &lru_pvec,
-						    &cached_page, &copied);
-		else
-			status = generic_batch_write(file, &desc, &lru_pvec,
-						     &cached_page, &copied);
-		if (likely(copied > 0)) {
-			written += copied;
-			desc.count -= copied;
-			if (desc.count) {
-				/*
-				 * not everything is written yet. Adjust write
-				 * descriptor for next iteration
-				 */
-				desc.pos += copied;
-				if (unlikely(nr_segs > 1))
-					filemap_set_next_iovec(&desc.cur_iov,
-							       &desc.iov_off,
-							       copied);
-				else
-					desc.iov_off += copied;
-				desc.buf = desc.cur_iov->iov_base +
-					desc.iov_off;
-			}
-		}
-		if (status < 0)
-			break;
-		balance_dirty_pages_ratelimited(mapping);
-		cond_resched();
-	} while (desc.count);	
-	*ppos = pos + written;
-
-	if (cached_page)
-		page_cache_release(cached_page);
-
-	/*
-	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
-	 */
-	if (likely(status >= 0)) {
-		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-			if (!a_ops->writepage || !is_sync_kiocb(iocb))
-				status = generic_osync_inode(inode, mapping,
-						OSYNC_METADATA|OSYNC_DATA);
-		}
-  	}
-	
-	/*
-	 * If we get here for O_DIRECT writes then we must have fallen through
-	 * to buffered writes (block instantiation inside i_size).  So we sync
-	 * the file data here, to try to honour O_DIRECT expectations.
-	 */
-	if (unlikely(file->f_flags & O_DIRECT) && written)
-		status = filemap_write_and_wait(mapping);
-
-	pagevec_lru_add(&lru_pvec);
-	return written ? written : status;
-}
-EXPORT_SYMBOL(generic_file_buffered_write);
-
-static ssize_t
-__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
-				unsigned long nr_segs, loff_t *ppos)
-{
-	struct file *file = iocb->ki_filp;
-	const struct address_space * mapping = file->f_mapping;
-	size_t ocount;		/* original count */
-	size_t count;		/* after file limit checks */
-	struct inode 	*inode = mapping->host;
-	unsigned long	seg;
-	loff_t		pos;
-	ssize_t		written;
-	ssize_t		err;
-
-	ocount = 0;
-	for (seg = 0; seg < nr_segs; seg++) {
-		const struct iovec *iv = &iov[seg];
-
-		/*
-		 * If any segment has a negative length, or the cumulative
-		 * length ever wraps negative then return -EINVAL.
-		 */
-		ocount += iv->iov_len;
-		if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
-			return -EINVAL;
-		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
-			continue;
-		if (seg == 0)
-			return -EFAULT;
-		nr_segs = seg;
-		ocount -= iv->iov_len;	/* This segment is no good */
-		break;
-	}
-
-	count = ocount;
-	pos = *ppos;
-
-	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
-
-	/* We can write back this queue in page reclaim */
-	current->backing_dev_info = mapping->backing_dev_info;
-	written = 0;
-
-	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
-	if (err)
-		goto out;
-
-	if (count == 0)
-		goto out;
-
-	err = remove_suid(file->f_dentry);
-	if (err)
-		goto out;
-
-	file_update_time(file);
-
-	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
-	if (unlikely(file->f_flags & O_DIRECT)) {
-		written = generic_file_direct_write(iocb, iov,
-				&nr_segs, pos, ppos, count, ocount);
-		if (written < 0 || written == count)
-			goto out;
-		/*
-		 * direct-io write to a hole: fall through to buffered I/O
-		 * for completing the rest of the request.
-		 */
-		pos += written;
-		count -= written;
-	}
-
-	written = generic_file_buffered_write(iocb, iov, nr_segs,
-			pos, ppos, count, written);
-out:
-	current->backing_dev_info = NULL;
-	return written ? written : err;
-}
-
-ssize_t generic_file_aio_write_nolock(struct kiocb *iocb,
-		const struct iovec *iov, unsigned long nr_segs, loff_t pos)
-{
-	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
-	ssize_t ret;
-
-	BUG_ON(iocb->ki_pos != pos);
-
-	ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
-			&iocb->ki_pos);
-
-	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		ssize_t err;
-
-		err = sync_page_range_nolock(inode, mapping, pos, ret);
-		if (err < 0)
-			ret = err;
-	}
-	return ret;
-}
-EXPORT_SYMBOL(generic_file_aio_write_nolock);
-
-ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos)
-{
-	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
-	ssize_t ret;
-
-	BUG_ON(iocb->ki_pos != pos);
-
-	mutex_lock(&inode->i_mutex);
-	ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
-			&iocb->ki_pos);
-	mutex_unlock(&inode->i_mutex);
-
-	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		ssize_t err;
-
-		err = sync_page_range(inode, mapping, pos, ret);
-		if (err < 0)
-			ret = err;
-	}
-	return ret;
-}
-EXPORT_SYMBOL(generic_file_aio_write);
-
-/*
- * Called under i_mutex for writes to S_ISREG files.   Returns -EIO if something
- * went wrong during pagecache shootdown.
- */
-static ssize_t
-generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-	loff_t offset, unsigned long nr_segs)
-{
-	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	ssize_t retval;
-	size_t write_len = 0;
-
-	/*
-	 * If it's a write, unmap all mmappings of the file up-front.  This
-	 * will cause any pte dirty bits to be propagated into the pageframes
-	 * for the subsequent filemap_write_and_wait().
-	 */
-	if (rw == WRITE) {
-		write_len = iov_length(iov, nr_segs);
-	       	if (mapping_mapped(mapping))
-			unmap_mapping_range(mapping, offset, write_len, 0);
-	}
-
-	retval = filemap_write_and_wait(mapping);
-	if (retval == 0) {
-		retval = mapping->a_ops->direct_IO(rw, iocb, iov,
-						offset, nr_segs);
-		if (rw == WRITE && mapping->nrpages) {
-			pgoff_t end = (offset + write_len - 1)
-						>> PAGE_CACHE_SHIFT;
-			int err = invalidate_inode_pages2_range(mapping,
-					offset >> PAGE_CACHE_SHIFT, end);
-			if (err)
-				retval = err;
-		}
-	}
-	return retval;
-}
diff -urN newtree/mm/migrate.c newtree.2/mm/migrate.c
--- newtree/mm/migrate.c	2006-07-05 10:06:57.000000000 -0400
+++ newtree.2/mm/migrate.c	2006-07-11 13:50:10.000000000 -0400
@@ -29,8 +29,6 @@
 #include <linux/vmalloc.h>
 #include <linux/security.h>
 
-#include "internal.h"
-
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 
 /*
@@ -303,15 +301,18 @@
 		return 0;
 	}
 
-	write_lock_irq(&mapping->tree_lock);
+	SetPageNoNewRefs(page);
+	smp_wmb();
+	spin_lock_irq(&mapping->tree_lock);
 
 	radix_pointer = (struct page **)radix_tree_lookup_slot(
 						&mapping->page_tree,
 						page_index(page));
 
 	if (page_count(page) != 2 + !!PagePrivate(page) ||
 			radix_tree_deref_slot(radix_pointer) != page) {
-		write_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irq(&mapping->tree_lock);
+		ClearPageNoNewRefs(page);
 		return -EAGAIN;
 	}
 
@@ -326,9 +327,16 @@
 	}
 #endif
 
+	SetPageNoNewRefs(newpage);
 	radix_tree_replace_slot(radix_pointer, newpage);
+	page->mapping = NULL;
+
+	spin_unlock_irq(&mapping->tree_lock);
 	__put_page(page);
-	write_unlock_irq(&mapping->tree_lock);
+
+	smp_wmb();
+	ClearPageNoNewRefs(page);
+	ClearPageNoNewRefs(newpage);
 
 	return 0;
 }
diff -urN newtree/mm/page-writeback.c newtree.2/mm/page-writeback.c
--- newtree/mm/page-writeback.c	2006-07-05 10:06:57.000000000 -0400
+++ newtree.2/mm/page-writeback.c	2006-07-11 13:50:10.000000000 -0400
@@ -630,7 +630,7 @@
 		struct address_space *mapping2;
 
 		if (mapping) {
-			write_lock_irq(&mapping->tree_lock);
+			spin_lock_irq(&mapping->tree_lock);
 			mapping2 = page_mapping(page);
 			if (mapping2) { /* Race with truncate? */
 				BUG_ON(mapping2 != mapping);
@@ -640,7 +640,7 @@
 				radix_tree_tag_set(&mapping->page_tree,
 					page_index(page), PAGECACHE_TAG_DIRTY);
 			}
-			write_unlock_irq(&mapping->tree_lock);
+			spin_unlock_irq(&mapping->tree_lock);
 			if (mapping->host) {
 				/* !PageAnon && !swapper_space */
 				__mark_inode_dirty(mapping->host,
@@ -719,23 +719,23 @@
 
 	WARN_ON_ONCE(!PageLocked(page));
 	if (mapping) {
-		write_lock_irqsave(&mapping->tree_lock, flags);
+		spin_lock_irqsave(&mapping->tree_lock, flags);
 		if (TestClearPageDirty(page)) {
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
-			write_unlock_irqrestore(&mapping->tree_lock, flags);
+			spin_unlock_irqrestore(&mapping->tree_lock, flags);
 			/*
 			 * We can continue to use `mapping' here because the
 			 * page is locked, which pins the address_space
 			 */
 			if (mapping_cap_account_dirty(mapping)) {
 				page_mkclean(page);
 				dec_zone_page_state(page, NR_FILE_DIRTY);
 			}
 			return 1;
 		}
-		write_unlock_irqrestore(&mapping->tree_lock, flags);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
 		return 0;
 	}
 	return TestClearPageDirty(page);
@@ -778,33 +778,32 @@
 int test_clear_page_writeback(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
-	int ret;
 
 	if (mapping) {
 		unsigned long flags;
+		int ret;
 
-		write_lock_irqsave(&mapping->tree_lock, flags);
+		spin_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestClearPageWriteback(page);
 		if (ret)
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
-		write_unlock_irqrestore(&mapping->tree_lock, flags);
-	} else {
-		ret = TestClearPageWriteback(page);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		return ret;
 	}
-	return ret;
+	return TestClearPageWriteback(page);
 }
 
 int test_set_page_writeback(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
-	int ret;
 
 	if (mapping) {
 		unsigned long flags;
+		int ret;
 
-		write_lock_irqsave(&mapping->tree_lock, flags);
+		spin_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestSetPageWriteback(page);
 		if (!ret)
 			radix_tree_tag_set(&mapping->page_tree,
@@ -814,27 +813,24 @@
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
-		write_unlock_irqrestore(&mapping->tree_lock, flags);
-	} else {
-		ret = TestSetPageWriteback(page);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		return ret;
 	}
-	return ret;
+	return TestSetPageWriteback(page);
 
 }
 EXPORT_SYMBOL(test_set_page_writeback);
 
 /*
- * Return true if any of the pages in the mapping are marged with the
+ * Return true if any of the pages in the mapping are marked with the
  * passed tag.
  */
 int mapping_tagged(struct address_space *mapping, int tag)
 {
-	unsigned long flags;
 	int ret;
-
-	read_lock_irqsave(&mapping->tree_lock, flags);
+	rcu_read_lock();
 	ret = radix_tree_tagged(&mapping->page_tree, tag);
-	read_unlock_irqrestore(&mapping->tree_lock, flags);
+	rcu_read_unlock();
 	return ret;
 }
 EXPORT_SYMBOL(mapping_tagged);
diff -urN newtree/mm/readahead.c newtree.2/mm/readahead.c
--- newtree/mm/readahead.c	2006-07-05 10:06:57.000000000 -0400
+++ newtree.2/mm/readahead.c	2006-07-11 13:50:10.000000000 -0400
@@ -398,21 +398,21 @@
 	/*
 	 * Preallocate as many pages as we will need.
 	 */
-	read_lock_irq(&mapping->tree_lock);
 	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
 		pgoff_t page_offset = offset + page_idx;
 		
 		if (page_offset > end_index)
 			break;
 
+		/* Don't need mapping->tree_lock - lookup can be racy */
+		rcu_read_lock();
 		page = radix_tree_lookup(&mapping->page_tree, page_offset);
+		rcu_read_unlock();
 		if (page)
 			continue;
 
-		read_unlock_irq(&mapping->tree_lock);
 		cond_resched();
 		page = page_cache_alloc_cold(mapping);
-		read_lock_irq(&mapping->tree_lock);
 		if (!page)
 			break;
 		page->index = page_offset;
@@ -421,7 +421,6 @@
 			SetPageReadahead(page);
 		ret++;
 	}
-	read_unlock_irq(&mapping->tree_lock);
 
 	/*
 	 * Now start the IO.  We ignore I/O errors - if the page is not
@@ -1324,7 +1323,7 @@
 	pgoff_t ra_index;
 
 	cond_resched();
-	read_lock_irq(&mapping->tree_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	ra_index = radix_tree_scan_hole(&mapping->page_tree, index, max_scan);
 #ifdef DEBUG_READAHEAD_RADIXTREE
 	BUG_ON(!__probe_page(mapping, index));
@@ -1336,7 +1335,7 @@
 	if (ra_index != ~0UL && ra_index - index < max_scan)
 		WARN_ON(__probe_page(mapping, ra_index));
 #endif
-	read_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 
 	if (ra_index <= index + max_scan)
 		return ra_index;
@@ -1359,13 +1358,13 @@
 	 * Poor man's radix_tree_scan_data_backward() implementation.
 	 * Acceptable because max_scan won't be large.
 	 */
-	read_lock_irq(&mapping->tree_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	for (; origin - index < max_scan;)
 		if (__probe_page(mapping, --index)) {
-			read_unlock_irq(&mapping->tree_lock);
+			spin_unlock_irq(&mapping->tree_lock);
 			return index + 1;
 		}
-	read_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 
 	return 0;
 }
@@ -1416,7 +1415,7 @@
 	 * The count here determines ra_size.
 	 */
 	cond_resched();
-	read_lock_irq(&mapping->tree_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	index = radix_tree_scan_hole_backward(&mapping->page_tree,
 							offset - 1, ra_max);
 #ifdef DEBUG_READAHEAD_RADIXTREE
@@ -1458,7 +1457,7 @@
 			break;
 
 out_unlock:
-	read_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 
 	/*
 	 *  For sequential read that extends from index 0, the counted value
diff -urN newtree/mm/swap_prefetch.c newtree.2/mm/swap_prefetch.c
--- newtree/mm/swap_prefetch.c	2006-07-05 10:06:57.000000000 -0400
+++ newtree.2/mm/swap_prefetch.c	2006-07-11 13:50:10.000000000 -0400
@@ -190,10 +190,10 @@
 	enum trickle_return ret = TRICKLE_FAILED;
 	struct page *page;
 
-	read_lock_irq(&swapper_space.tree_lock);
+	rcu_read_lock();
 	/* Entry may already exist */
 	page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
-	read_unlock_irq(&swapper_space.tree_lock);
+	rcu_read_unlock();
 	if (page) {
 		remove_from_swapped_list(entry.val);
 		goto out;
diff -urN newtree/mm/swap_state.c newtree.2/mm/swap_state.c
--- newtree/mm/swap_state.c	2006-07-05 10:06:57.000000000 -0400
+++ newtree.2/mm/swap_state.c	2006-07-11 13:50:10.000000000 -0400
@@ -39,7 +39,7 @@
 
 struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
-	.tree_lock	= __RW_LOCK_UNLOCKED(swapper_space.tree_lock),
+	.tree_lock	= __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
 	.a_ops		= &swap_aops,
 	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
 	.backing_dev_info = &swap_backing_dev_info,
@@ -79,7 +79,9 @@
 	BUG_ON(PagePrivate(page));
 	error = radix_tree_preload(gfp_mask);
 	if (!error) {
-		write_lock_irq(&swapper_space.tree_lock);
+		SetPageNoNewRefs(page);
+		smp_wmb();
+		spin_lock_irq(&swapper_space.tree_lock);
 		error = radix_tree_insert(&swapper_space.page_tree,
 						entry.val, page);
 		if (!error) {
@@ -91,7 +93,9 @@
 			total_swapcache_pages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
 		}
-		write_unlock_irq(&swapper_space.tree_lock);
+		spin_unlock_irq(&swapper_space.tree_lock);
+		smp_wmb();
+		ClearPageNoNewRefs(page);
 		radix_tree_preload_end();
 	}
 	return error;
@@ -206,9 +210,9 @@
 
 	entry.val = page_private(page);
 
-	write_lock_irq(&swapper_space.tree_lock);
+	spin_lock_irq(&swapper_space.tree_lock);
 	__delete_from_swap_cache(page);
-	write_unlock_irq(&swapper_space.tree_lock);
+	spin_unlock_irq(&swapper_space.tree_lock);
 
 	swap_free(entry);
 	page_cache_release(page);
@@ -238,7 +242,7 @@
 int move_from_swap_cache(struct page *page, unsigned long index,
 		struct address_space *mapping)
 {
-	int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);
+	int err = __add_to_page_cache(page, mapping, index, GFP_ATOMIC);
 	if (!err) {
 		delete_from_swap_cache(page);
 		/* shift page from clean_pages to dirty_pages list */
@@ -295,6 +299,29 @@
 	}
 }
 
+struct page *find_get_swap_page(swp_entry_t entry)
+{
+	struct page *page;
+
+	rcu_read_lock();
+repeat:
+	page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
+	if (page) {
+		page = page_cache_get_speculative(page);
+		if (unlikely(!page))
+			goto repeat;
+		/* Left the swap cache, or now caching a different entry? */
+		if (unlikely(!PageSwapCache(page)
+				|| page_private(page) != entry.val)) {
+			page_cache_release(page);
+			goto repeat;
+		}
+	}
+	rcu_read_unlock();
+
+	return page;
+}
+
 /*
  * Lookup a swap entry in the swap cache. A found page will be returned
  * unlocked and with its refcount incremented - we rely on the kernel
@@ -305,7 +332,7 @@
 {
 	struct page *page;
 
-	page = find_get_page(&swapper_space, entry.val);
+	page = find_get_swap_page(entry);
 
 	if (page)
 		INC_CACHE_INFO(find_success);
@@ -335,7 +362,7 @@
 		 * called after lookup_swap_cache() failed, re-calling
 		 * that would confuse statistics.
 		 */
-		found_page = find_get_page(&swapper_space, entry.val);
+		found_page = find_get_swap_page(entry);
 		if (found_page)
 			break;
 
diff -urN newtree/mm/swapfile.c newtree.2/mm/swapfile.c
--- newtree/mm/swapfile.c	2006-07-05 10:06:57.000000000 -0400
+++ newtree.2/mm/swapfile.c	2006-07-11 13:50:10.000000000 -0400
@@ -367,13 +367,13 @@
 	retval = 0;
 	if (p->swap_map[swp_offset(entry)] == 1) {
 		/* Recheck the page count with the swapcache lock held.. */
-		write_lock_irq(&swapper_space.tree_lock);
+		spin_lock_irq(&swapper_space.tree_lock);
 		if ((page_count(page) == 2) && !PageWriteback(page)) {
 			__delete_from_swap_cache(page);
 			SetPageDirty(page);
 			retval = 1;
 		}
-		write_unlock_irq(&swapper_space.tree_lock);
+		spin_unlock_irq(&swapper_space.tree_lock);
 	}
 	spin_unlock(&swap_lock);
 
@@ -400,7 +400,7 @@
 	p = swap_info_get(entry);
 	if (p) {
 		if (swap_entry_free(p, swp_offset(entry)) == 1) {
-			page = find_get_page(&swapper_space, entry.val);
+			page = find_get_swap_page(entry);
 			if (page && unlikely(TestSetPageLocked(page))) {
 				page_cache_release(page);
 				page = NULL;
diff -urN newtree/mm/truncate.c newtree.2/mm/truncate.c
--- newtree/mm/truncate.c	2006-07-05 10:06:57.000000000 -0400
+++ newtree.2/mm/truncate.c	2006-07-11 13:50:10.000000000 -0400
@@ -67,15 +67,15 @@
 	if (PagePrivate(page) && !try_to_release_page(page, 0))
 		return 0;
 
-	write_lock_irq(&mapping->tree_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	if (PageDirty(page)) {
-		write_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		return 0;
 	}
 
 	BUG_ON(PagePrivate(page));
 	__remove_from_page_cache(page);
-	write_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	ClearPageUptodate(page);
 	page_cache_release(page);	/* pagecache ref */
 	return 1;
diff -urN newtree/mm/vmscan.c newtree.2/mm/vmscan.c
--- newtree/mm/vmscan.c	2006-07-08 06:15:26.000000000 -0400
+++ newtree.2/mm/vmscan.c	2006-07-11 13:50:10.000000000 -0400
@@ -382,7 +382,9 @@
 	if (!mapping)
 		return 0;		/* truncate got there first */
 
-	write_lock_irq(&mapping->tree_lock);
+	SetPageNoNewRefs(page);
+	smp_wmb();
+	spin_lock_irq(&mapping->tree_lock);
 
 	/*
 	 * The non-racy check for busy page.  It is critical to check
@@ -399,19 +401,23 @@
 		swp_entry_t swap = { .val = page_private(page) };
 		add_to_swapped_list(page);
 		__delete_from_swap_cache(page);
-		write_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		swap_free(swap);
-		__put_page(page);	/* The pagecache ref */
-		return 1;
+		goto free_it;
 	}
 
 	__remove_from_page_cache(page);
-	write_unlock_irq(&mapping->tree_lock);
-	__put_page(page);
+	spin_unlock_irq(&mapping->tree_lock);
+
+free_it:
+	smp_wmb();
+	__ClearPageNoNewRefs(page);
+	__put_page(page); /* The pagecache ref */
 	return 1;
 
 cannot_free:
-	write_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
+	ClearPageNoNewRefs(page);
 	return 0;
 }