diff -urN oldtree/arch/um/drivers/ubd_kern.c newtree/arch/um/drivers/ubd_kern.c
--- oldtree/arch/um/drivers/ubd_kern.c	2006-07-15 14:53:08.000000000 -0700
+++ newtree/arch/um/drivers/ubd_kern.c	2006-08-03 13:27:09.000000000 -0700
@@ -986,8 +986,6 @@
 	__u64 offset;
 	int len;
 
-	if(req->rq_status == RQ_INACTIVE) return(1);
-
 	/* This should be impossible now */
 	if((rq_data_dir(req) == WRITE) && !dev->openflags.w){
 		printk("Write attempted on readonly ubd device %s\n",
diff -urN oldtree/block/as-iosched.c newtree/block/as-iosched.c
--- oldtree/block/as-iosched.c	2006-07-15 14:53:08.000000000 -0700
+++ newtree/block/as-iosched.c	2006-08-03 13:27:09.000000000 -0700
@@ -14,7 +14,6 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/compiler.h>
-#include <linux/hash.h>
 #include <linux/rbtree.h>
 #include <linux/interrupt.h>
 
@@ -93,9 +92,8 @@
 	struct rb_root sort_list[2];
 	struct list_head fifo_list[2];
 
-	struct as_rq *next_arq[2];	/* next in sort order */
+	struct request *next_rq[2];	/* next in sort order */
 	sector_t last_sector[2];	/* last REQ_SYNC & REQ_ASYNC sectors */
-	struct hlist_head *hash;	/* request hash */
 
 	unsigned long exit_prob;	/* probability a task will exit while
 					   being waited on */
@@ -115,7 +113,6 @@
 	int write_batch_count;		/* max # of reqs in a write batch */
 	int current_write_count;	/* how many requests left this batch */
 	int write_batch_idled;		/* has the write batch gone idle? */
-	mempool_t *arq_pool;
 
 	enum anticipation_status antic_status;
 	unsigned long antic_start;	/* jiffies: when it started */
@@ -133,8 +130,6 @@
 	unsigned long antic_expire;
 };
 
-#define list_entry_fifo(ptr)	list_entry((ptr), struct as_rq, fifo)
-
 /*
  * per-request data.
  */
@@ -150,40 +145,14 @@
 	AS_RQ_POSTSCHED,	/* when they shouldn't be */
 };
 
-struct as_rq {
-	/*
-	 * rbtree index, key is the starting offset
-	 */
-	struct rb_node rb_node;
-	sector_t rb_key;
-
-	struct request *request;
-
-	struct io_context *io_context;	/* The submitting task */
-
-	/*
-	 * request hash, key is the ending offset (for back merge lookup)
-	 */
-	struct hlist_node hash;
-
-	/*
-	 * expire fifo
-	 */
-	struct list_head fifo;
-	unsigned long expires;
+#define RQ_IOC(rq)	((struct io_context *) (rq)->elevator_private)
+#define RQ_STATE(rq)	((enum arq_state)(rq)->elevator_private2)
+#define RQ_SET_STATE(rq, state)	((rq)->elevator_private2 = (void *) state)
 
-	unsigned int is_sync;
-	enum arq_state state;
-};
-
-#define RQ_DATA(rq)	((struct as_rq *) (rq)->elevator_private)
-
-static kmem_cache_t *arq_pool;
-
-static atomic_t ioc_count = ATOMIC_INIT(0);
+static DEFINE_PER_CPU(unsigned long, ioc_count);
 static struct completion *ioc_gone;
 
-static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq);
+static void as_move_to_dispatch(struct as_data *ad, struct request *rq);
 static void as_antic_stop(struct as_data *ad);
 
 /*
@@ -194,7 +163,8 @@
 static void free_as_io_context(struct as_io_context *aic)
 {
 	kfree(aic);
-	if (atomic_dec_and_test(&ioc_count) && ioc_gone)
+	elv_ioc_count_dec(ioc_count);
+	if (ioc_gone && !elv_ioc_count_read(ioc_count))
 		complete(ioc_gone);
 }
 
@@ -230,7 +200,7 @@
 		ret->seek_total = 0;
 		ret->seek_samples = 0;
 		ret->seek_mean = 0;
-		atomic_inc(&ioc_count);
+		elv_ioc_count_inc(ioc_count);
 	}
 
 	return ret;
@@ -240,9 +210,9 @@
  * If the current task has no AS IO context then create one and initialise it.
  * Then take a ref on the task's io context and return it.
  */
-static struct io_context *as_get_io_context(void)
+static struct io_context *as_get_io_context(int node)
 {
-	struct io_context *ioc = get_io_context(GFP_ATOMIC);
+	struct io_context *ioc = get_io_context(GFP_ATOMIC, node);
 	if (ioc && !ioc->aic) {
 		ioc->aic = alloc_as_io_context();
 		if (!ioc->aic) {
@@ -253,194 +223,43 @@
 	return ioc;
 }
 
-static void as_put_io_context(struct as_rq *arq)
+static void as_put_io_context(struct request *rq)
 {
 	struct as_io_context *aic;
 
-	if (unlikely(!arq->io_context))
+	if (unlikely(!RQ_IOC(rq)))
 		return;
 
-	aic = arq->io_context->aic;
+	aic = RQ_IOC(rq)->aic;
 
-	if (arq->is_sync == REQ_SYNC && aic) {
+	if (rq_is_sync(rq) && aic) {
 		spin_lock(&aic->lock);
 		set_bit(AS_TASK_IORUNNING, &aic->state);
 		aic->last_end_request = jiffies;
 		spin_unlock(&aic->lock);
 	}
 
-	put_io_context(arq->io_context);
-}
-
-/*
- * the back merge hash support functions
- */
-static const int as_hash_shift = 6;
-#define AS_HASH_BLOCK(sec)	((sec) >> 3)
-#define AS_HASH_FN(sec)		(hash_long(AS_HASH_BLOCK((sec)), as_hash_shift))
-#define AS_HASH_ENTRIES		(1 << as_hash_shift)
-#define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
-
-static inline void __as_del_arq_hash(struct as_rq *arq)
-{
-	hlist_del_init(&arq->hash);
-}
-
-static inline void as_del_arq_hash(struct as_rq *arq)
-{
-	if (!hlist_unhashed(&arq->hash))
-		__as_del_arq_hash(arq);
-}
-
-static void as_add_arq_hash(struct as_data *ad, struct as_rq *arq)
-{
-	struct request *rq = arq->request;
-
-	BUG_ON(!hlist_unhashed(&arq->hash));
-
-	hlist_add_head(&arq->hash, &ad->hash[AS_HASH_FN(rq_hash_key(rq))]);
-}
-
-/*
- * move hot entry to front of chain
- */
-static inline void as_hot_arq_hash(struct as_data *ad, struct as_rq *arq)
-{
-	struct request *rq = arq->request;
-	struct hlist_head *head = &ad->hash[AS_HASH_FN(rq_hash_key(rq))];
-
-	if (hlist_unhashed(&arq->hash)) {
-		WARN_ON(1);
-		return;
-	}
-
-	if (&arq->hash != head->first) {
-		hlist_del(&arq->hash);
-		hlist_add_head(&arq->hash, head);
-	}
-}
-
-static struct request *as_find_arq_hash(struct as_data *ad, sector_t offset)
-{
-	struct hlist_head *hash_list = &ad->hash[AS_HASH_FN(offset)];
-	struct hlist_node *entry, *next;
-	struct as_rq *arq;
-
-	hlist_for_each_entry_safe(arq, entry, next, hash_list, hash) {
-		struct request *__rq = arq->request;
-
-		BUG_ON(hlist_unhashed(&arq->hash));
-
-		if (!rq_mergeable(__rq)) {
-			as_del_arq_hash(arq);
-			continue;
-		}
-
-		if (rq_hash_key(__rq) == offset)
-			return __rq;
-	}
-
-	return NULL;
+	put_io_context(RQ_IOC(rq));
 }
 
 /*
  * rb tree support functions
  */
-#define rb_entry_arq(node)	rb_entry((node), struct as_rq, rb_node)
-#define ARQ_RB_ROOT(ad, arq)	(&(ad)->sort_list[(arq)->is_sync])
-#define rq_rb_key(rq)		(rq)->sector
-
-/*
- * as_find_first_arq finds the first (lowest sector numbered) request
- * for the specified data_dir. Used to sweep back to the start of the disk
- * (1-way elevator) after we process the last (highest sector) request.
- */
-static struct as_rq *as_find_first_arq(struct as_data *ad, int data_dir)
-{
-	struct rb_node *n = ad->sort_list[data_dir].rb_node;
-
-	if (n == NULL)
-		return NULL;
-
-	for (;;) {
-		if (n->rb_left == NULL)
-			return rb_entry_arq(n);
-
-		n = n->rb_left;
-	}
-}
-
-/*
- * Add the request to the rb tree if it is unique.  If there is an alias (an
- * existing request against the same sector), which can happen when using
- * direct IO, then return the alias.
- */
-static struct as_rq *__as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
-{
-	struct rb_node **p = &ARQ_RB_ROOT(ad, arq)->rb_node;
-	struct rb_node *parent = NULL;
-	struct as_rq *__arq;
-	struct request *rq = arq->request;
-
-	arq->rb_key = rq_rb_key(rq);
-
-	while (*p) {
-		parent = *p;
-		__arq = rb_entry_arq(parent);
-
-		if (arq->rb_key < __arq->rb_key)
-			p = &(*p)->rb_left;
-		else if (arq->rb_key > __arq->rb_key)
-			p = &(*p)->rb_right;
-		else
-			return __arq;
-	}
-
-	rb_link_node(&arq->rb_node, parent, p);
-	rb_insert_color(&arq->rb_node, ARQ_RB_ROOT(ad, arq));
-
-	return NULL;
-}
+#define RQ_RB_ROOT(ad, rq)	(&(ad)->sort_list[rq_is_sync((rq))])
 
-static void as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
+static void as_add_rq_rb(struct as_data *ad, struct request *rq)
 {
-	struct as_rq *alias;
+	struct request *alias;
 
-	while ((unlikely(alias = __as_add_arq_rb(ad, arq)))) {
+	while ((unlikely(alias = elv_rb_add(RQ_RB_ROOT(ad, rq), rq)))) {
 		as_move_to_dispatch(ad, alias);
 		as_antic_stop(ad);
 	}
 }
 
-static inline void as_del_arq_rb(struct as_data *ad, struct as_rq *arq)
+static inline void as_del_rq_rb(struct as_data *ad, struct request *rq)
 {
-	if (!RB_EMPTY_NODE(&arq->rb_node)) {
-		WARN_ON(1);
-		return;
-	}
-
-	rb_erase(&arq->rb_node, ARQ_RB_ROOT(ad, arq));
-	RB_CLEAR_NODE(&arq->rb_node);
-}
-
-static struct request *
-as_find_arq_rb(struct as_data *ad, sector_t sector, int data_dir)
-{
-	struct rb_node *n = ad->sort_list[data_dir].rb_node;
-	struct as_rq *arq;
-
-	while (n) {
-		arq = rb_entry_arq(n);
-
-		if (sector < arq->rb_key)
-			n = n->rb_left;
-		else if (sector > arq->rb_key)
-			n = n->rb_right;
-		else
-			return arq->request;
-	}
-
-	return NULL;
+	elv_rb_del(RQ_RB_ROOT(ad, rq), rq);
 }
 
 /*
@@ -458,26 +277,26 @@
  * as_choose_req selects the preferred one of two requests of the same data_dir
  * ignoring time - eg. timeouts, which is the job of as_dispatch_request
  */
-static struct as_rq *
-as_choose_req(struct as_data *ad, struct as_rq *arq1, struct as_rq *arq2)
+static struct request *
+as_choose_req(struct as_data *ad, struct request *rq1, struct request *rq2)
 {
 	int data_dir;
 	sector_t last, s1, s2, d1, d2;
 	int r1_wrap=0, r2_wrap=0;	/* requests are behind the disk head */
 	const sector_t maxback = MAXBACK;
 
-	if (arq1 == NULL || arq1 == arq2)
-		return arq2;
-	if (arq2 == NULL)
-		return arq1;
+	if (rq1 == NULL || rq1 == rq2)
+		return rq2;
+	if (rq2 == NULL)
+		return rq1;
 
-	data_dir = arq1->is_sync;
+	data_dir = rq_is_sync(rq1);
 
 	last = ad->last_sector[data_dir];
-	s1 = arq1->request->sector;
-	s2 = arq2->request->sector;
+	s1 = rq1->sector;
+	s2 = rq2->sector;
 
-	BUG_ON(data_dir != arq2->is_sync);
+	BUG_ON(data_dir != rq_is_sync(rq2));
 
 	/*
 	 * Strict one way elevator _except_ in the case where we allow
@@ -504,61 +323,58 @@
 
 	/* Found required data */
 	if (!r1_wrap && r2_wrap)
-		return arq1;
+		return rq1;
 	else if (!r2_wrap && r1_wrap)
-		return arq2;
+		return rq2;
 	else if (r1_wrap && r2_wrap) {
 		/* both behind the head */
 		if (s1 <= s2)
-			return arq1;
+			return rq1;
 		else
-			return arq2;
+			return rq2;
 	}
 
 	/* Both requests in front of the head */
 	if (d1 < d2)
-		return arq1;
+		return rq1;
 	else if (d2 < d1)
-		return arq2;
+		return rq2;
 	else {
 		if (s1 >= s2)
-			return arq1;
+			return rq1;
 		else
-			return arq2;
+			return rq2;
 	}
 }
 
 /*
- * as_find_next_arq finds the next request after @prev in elevator order.
+ * as_find_next_rq finds the next request after @prev in elevator order.
  * this with as_choose_req form the basis for how the scheduler chooses
  * what request to process next. Anticipation works on top of this.
  */
-static struct as_rq *as_find_next_arq(struct as_data *ad, struct as_rq *last)
+static struct request *
+as_find_next_rq(struct as_data *ad, struct request *last)
 {
-	const int data_dir = last->is_sync;
-	struct as_rq *ret;
 	struct rb_node *rbnext = rb_next(&last->rb_node);
 	struct rb_node *rbprev = rb_prev(&last->rb_node);
-	struct as_rq *arq_next, *arq_prev;
+	struct request *next = NULL, *prev = NULL;
 
-	BUG_ON(!RB_EMPTY_NODE(&last->rb_node));
+	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
 
 	if (rbprev)
-		arq_prev = rb_entry_arq(rbprev);
-	else
-		arq_prev = NULL;
+		prev = rb_entry_rq(rbprev);
 
 	if (rbnext)
-		arq_next = rb_entry_arq(rbnext);
+		next = rb_entry_rq(rbnext);
 	else {
-		arq_next = as_find_first_arq(ad, data_dir);
-		if (arq_next == last)
-			arq_next = NULL;
-	}
+		const int data_dir = rq_is_sync(last);
 
-	ret = as_choose_req(ad,	arq_next, arq_prev);
+		rbnext = rb_first(&ad->sort_list[data_dir]);
+		if (rbnext && rbnext != &last->rb_node)
+			next = rb_entry_rq(rbnext);
+	}
 
-	return ret;
+	return as_choose_req(ad, next, prev);
 }
 
 /*
@@ -712,8 +528,7 @@
 static void as_update_iohist(struct as_data *ad, struct as_io_context *aic,
 				struct request *rq)
 {
-	struct as_rq *arq = RQ_DATA(rq);
-	int data_dir = arq->is_sync;
+	int data_dir = rq_is_sync(rq);
 	unsigned long thinktime = 0;
 	sector_t seek_dist;
 
@@ -752,11 +567,11 @@
  * previous one issued.
  */
 static int as_close_req(struct as_data *ad, struct as_io_context *aic,
-				struct as_rq *arq)
+			struct request *rq)
 {
 	unsigned long delay;	/* milliseconds */
 	sector_t last = ad->last_sector[ad->batch_data_dir];
-	sector_t next = arq->request->sector;
+	sector_t next = rq->sector;
 	sector_t delta; /* acceptable close offset (in sectors) */
 	sector_t s;
 
@@ -813,7 +628,7 @@
  *
  * If this task has queued some other IO, do not enter enticipation.
  */
-static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq)
+static int as_can_break_anticipation(struct as_data *ad, struct request *rq)
 {
 	struct io_context *ioc;
 	struct as_io_context *aic;
@@ -821,7 +636,7 @@
 	ioc = ad->io_context;
 	BUG_ON(!ioc);
 
-	if (arq && ioc == arq->io_context) {
+	if (rq && ioc == RQ_IOC(rq)) {
 		/* request from same process */
 		return 1;
 	}
@@ -848,7 +663,7 @@
 		return 1;
 	}
 
-	if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, aic, arq)) {
+	if (rq && rq_is_sync(rq) && as_close_req(ad, aic, rq)) {
 		/*
 		 * Found a close request that is not one of ours.
 		 *
@@ -864,7 +679,7 @@
 			ad->exit_no_coop = (7*ad->exit_no_coop)/8;
 		}
 
-		as_update_iohist(ad, aic, arq->request);
+		as_update_iohist(ad, aic, rq);
 		return 1;
 	}
 
@@ -891,10 +706,10 @@
 }
 
 /*
- * as_can_anticipate indicates whether we should either run arq
+ * as_can_anticipate indicates whether we should either run rq
  * or keep anticipating a better request.
  */
-static int as_can_anticipate(struct as_data *ad, struct as_rq *arq)
+static int as_can_anticipate(struct as_data *ad, struct request *rq)
 {
 	if (!ad->io_context)
 		/*
@@ -908,7 +723,7 @@
 		 */
 		return 0;
 
-	if (as_can_break_anticipation(ad, arq))
+	if (as_can_break_anticipation(ad, rq))
 		/*
 		 * This request is a good candidate. Don't keep anticipating,
 		 * run it.
@@ -926,16 +741,16 @@
 }
 
 /*
- * as_update_arq must be called whenever a request (arq) is added to
+ * as_update_rq must be called whenever a request (rq) is added to
  * the sort_list. This function keeps caches up to date, and checks if the
  * request might be one we are "anticipating"
  */
-static void as_update_arq(struct as_data *ad, struct as_rq *arq)
+static void as_update_rq(struct as_data *ad, struct request *rq)
 {
-	const int data_dir = arq->is_sync;
+	const int data_dir = rq_is_sync(rq);
 
-	/* keep the next_arq cache up to date */
-	ad->next_arq[data_dir] = as_choose_req(ad, arq, ad->next_arq[data_dir]);
+	/* keep the next_rq cache up to date */
+	ad->next_rq[data_dir] = as_choose_req(ad, rq, ad->next_rq[data_dir]);
 
 	/*
 	 * have we been anticipating this request?
@@ -944,7 +759,7 @@
 	 */
 	if (ad->antic_status == ANTIC_WAIT_REQ
 			|| ad->antic_status == ANTIC_WAIT_NEXT) {
-		if (as_can_break_anticipation(ad, arq))
+		if (as_can_break_anticipation(ad, rq))
 			as_antic_stop(ad);
 	}
 }
@@ -984,12 +799,11 @@
 static void as_completed_request(request_queue_t *q, struct request *rq)
 {
 	struct as_data *ad = q->elevator->elevator_data;
-	struct as_rq *arq = RQ_DATA(rq);
 
 	WARN_ON(!list_empty(&rq->queuelist));
 
-	if (arq->state != AS_RQ_REMOVED) {
-		printk("arq->state %d\n", arq->state);
+	if (RQ_STATE(rq) != AS_RQ_REMOVED) {
+		printk("rq->state %d\n", RQ_STATE(rq));
 		WARN_ON(1);
 		goto out;
 	}
@@ -1009,14 +823,14 @@
 	 * actually serviced. This should help devices with big TCQ windows
 	 * and writeback caches
 	 */
-	if (ad->new_batch && ad->batch_data_dir == arq->is_sync) {
+	if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) {
 		update_write_batch(ad);
 		ad->current_batch_expires = jiffies +
 				ad->batch_expire[REQ_SYNC];
 		ad->new_batch = 0;
 	}
 
-	if (ad->io_context == arq->io_context && ad->io_context) {
+	if (ad->io_context == RQ_IOC(rq) && ad->io_context) {
 		ad->antic_start = jiffies;
 		ad->ioc_finished = 1;
 		if (ad->antic_status == ANTIC_WAIT_REQ) {
@@ -1028,9 +842,9 @@
 		}
 	}
 
-	as_put_io_context(arq);
+	as_put_io_context(rq);
 out:
-	arq->state = AS_RQ_POSTSCHED;
+	RQ_SET_STATE(rq, AS_RQ_POSTSCHED);
 }
 
 /*
@@ -1041,27 +855,27 @@
  */
 static void as_remove_queued_request(request_queue_t *q, struct request *rq)
 {
-	struct as_rq *arq = RQ_DATA(rq);
-	const int data_dir = arq->is_sync;
+	const int data_dir = rq_is_sync(rq);
 	struct as_data *ad = q->elevator->elevator_data;
+	struct io_context *ioc;
 
-	WARN_ON(arq->state != AS_RQ_QUEUED);
+	WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED);
 
-	if (arq->io_context && arq->io_context->aic) {
-		BUG_ON(!atomic_read(&arq->io_context->aic->nr_queued));
-		atomic_dec(&arq->io_context->aic->nr_queued);
+	ioc = RQ_IOC(rq);
+	if (ioc && ioc->aic) {
+		BUG_ON(!atomic_read(&ioc->aic->nr_queued));
+		atomic_dec(&ioc->aic->nr_queued);
 	}
 
 	/*
-	 * Update the "next_arq" cache if we are about to remove its
+	 * Update the "next_rq" cache if we are about to remove its
 	 * entry
 	 */
-	if (ad->next_arq[data_dir] == arq)
-		ad->next_arq[data_dir] = as_find_next_arq(ad, arq);
+	if (ad->next_rq[data_dir] == rq)
+		ad->next_rq[data_dir] = as_find_next_rq(ad, rq);
 
-	list_del_init(&arq->fifo);
-	as_del_arq_hash(arq);
-	as_del_arq_rb(ad, arq);
+	rq_fifo_clear(rq);
+	as_del_rq_rb(ad, rq);
 }
 
 /*
@@ -1074,7 +888,7 @@
  */
 static int as_fifo_expired(struct as_data *ad, int adir)
 {
-	struct as_rq *arq;
+	struct request *rq;
 	long delta_jif;
 
 	delta_jif = jiffies - ad->last_check_fifo[adir];
@@ -1088,9 +902,9 @@
 	if (list_empty(&ad->fifo_list[adir]))
 		return 0;
 
-	arq = list_entry_fifo(ad->fifo_list[adir].next);
+	rq = rq_entry_fifo(ad->fifo_list[adir].next);
 
-	return time_after(jiffies, arq->expires);
+	return time_after(jiffies, rq_fifo_time(rq));
 }
 
 /*
@@ -1113,25 +927,25 @@
 /*
  * move an entry to dispatch queue
  */
-static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq)
+static void as_move_to_dispatch(struct as_data *ad, struct request *rq)
 {
-	struct request *rq = arq->request;
-	const int data_dir = arq->is_sync;
+	const int data_dir = rq_is_sync(rq);
 
-	BUG_ON(!RB_EMPTY_NODE(&arq->rb_node));
+	BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
 
 	as_antic_stop(ad);
 	ad->antic_status = ANTIC_OFF;
 
 	/*
 	 * This has to be set in order to be correctly updated by
-	 * as_find_next_arq
+	 * as_find_next_rq
 	 */
 	ad->last_sector[data_dir] = rq->sector + rq->nr_sectors;
 
 	if (data_dir == REQ_SYNC) {
+		struct io_context *ioc = RQ_IOC(rq);
 		/* In case we have to anticipate after this */
-		copy_io_context(&ad->io_context, &arq->io_context);
+		copy_io_context(&ad->io_context, &ioc);
 	} else {
 		if (ad->io_context) {
 			put_io_context(ad->io_context);
@@ -1143,19 +957,19 @@
 	}
 	ad->ioc_finished = 0;
 
-	ad->next_arq[data_dir] = as_find_next_arq(ad, arq);
+	ad->next_rq[data_dir] = as_find_next_rq(ad, rq);
 
 	/*
 	 * take it off the sort and fifo list, add to dispatch queue
 	 */
 	as_remove_queued_request(ad->q, rq);
-	WARN_ON(arq->state != AS_RQ_QUEUED);
+	WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED);
 
 	elv_dispatch_sort(ad->q, rq);
 
-	arq->state = AS_RQ_DISPATCHED;
-	if (arq->io_context && arq->io_context->aic)
-		atomic_inc(&arq->io_context->aic->nr_dispatched);
+	RQ_SET_STATE(rq, AS_RQ_DISPATCHED);
+	if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
+		atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched);
 	ad->nr_dispatched++;
 }
 
@@ -1167,9 +981,9 @@
 static int as_dispatch_request(request_queue_t *q, int force)
 {
 	struct as_data *ad = q->elevator->elevator_data;
-	struct as_rq *arq;
 	const int reads = !list_empty(&ad->fifo_list[REQ_SYNC]);
 	const int writes = !list_empty(&ad->fifo_list[REQ_ASYNC]);
+	struct request *rq;
 
 	if (unlikely(force)) {
 		/*
@@ -1185,14 +999,14 @@
 		ad->changed_batch = 0;
 		ad->new_batch = 0;
 
-		while (ad->next_arq[REQ_SYNC]) {
-			as_move_to_dispatch(ad, ad->next_arq[REQ_SYNC]);
+		while (ad->next_rq[REQ_SYNC]) {
+			as_move_to_dispatch(ad, ad->next_rq[REQ_SYNC]);
 			dispatched++;
 		}
 		ad->last_check_fifo[REQ_SYNC] = jiffies;
 
-		while (ad->next_arq[REQ_ASYNC]) {
-			as_move_to_dispatch(ad, ad->next_arq[REQ_ASYNC]);
+		while (ad->next_rq[REQ_ASYNC]) {
+			as_move_to_dispatch(ad, ad->next_rq[REQ_ASYNC]);
 			dispatched++;
 		}
 		ad->last_check_fifo[REQ_ASYNC] = jiffies;
@@ -1216,19 +1030,19 @@
 		/*
 		 * batch is still running or no reads or no writes
 		 */
-		arq = ad->next_arq[ad->batch_data_dir];
+		rq = ad->next_rq[ad->batch_data_dir];
 
 		if (ad->batch_data_dir == REQ_SYNC && ad->antic_expire) {
 			if (as_fifo_expired(ad, REQ_SYNC))
 				goto fifo_expired;
 
-			if (as_can_anticipate(ad, arq)) {
+			if (as_can_anticipate(ad, rq)) {
 				as_antic_waitreq(ad);
 				return 0;
 			}
 		}
 
-		if (arq) {
+		if (rq) {
 			/* we have a "next request" */
 			if (reads && !writes)
 				ad->current_batch_expires =
@@ -1256,7 +1070,7 @@
 			ad->changed_batch = 1;
 		}
 		ad->batch_data_dir = REQ_SYNC;
-		arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
+		rq = rq_entry_fifo(ad->fifo_list[REQ_SYNC].next);
 		ad->last_check_fifo[ad->batch_data_dir] = jiffies;
 		goto dispatch_request;
 	}
@@ -1282,7 +1096,7 @@
 		ad->batch_data_dir = REQ_ASYNC;
 		ad->current_write_count = ad->write_batch_count;
 		ad->write_batch_idled = 0;
-		arq = ad->next_arq[ad->batch_data_dir];
+		rq = ad->next_rq[ad->batch_data_dir];
 		goto dispatch_request;
 	}
 
@@ -1296,8 +1110,7 @@
 
 	if (as_fifo_expired(ad, ad->batch_data_dir)) {
 fifo_expired:
-		arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
-		BUG_ON(arq == NULL);
+		rq = rq_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
 	}
 
 	if (ad->changed_batch) {
@@ -1316,70 +1129,58 @@
 	}
 
 	/*
-	 * arq is the selected appropriate request.
+	 * rq is the selected appropriate request.
 	 */
-	as_move_to_dispatch(ad, arq);
+	as_move_to_dispatch(ad, rq);
 
 	return 1;
 }
 
 /*
- * add arq to rbtree and fifo
+ * add rq to rbtree and fifo
  */
 static void as_add_request(request_queue_t *q, struct request *rq)
 {
 	struct as_data *ad = q->elevator->elevator_data;
-	struct as_rq *arq = RQ_DATA(rq);
 	int data_dir;
 
-	arq->state = AS_RQ_NEW;
+	RQ_SET_STATE(rq, AS_RQ_NEW);
 
-	if (rq_data_dir(arq->request) == READ
-			|| (arq->request->flags & REQ_RW_SYNC))
-		arq->is_sync = 1;
-	else
-		arq->is_sync = 0;
-	data_dir = arq->is_sync;
+	data_dir = rq_is_sync(rq);
 
-	arq->io_context = as_get_io_context();
+	rq->elevator_private = as_get_io_context(q->node);
 
-	if (arq->io_context) {
-		as_update_iohist(ad, arq->io_context->aic, arq->request);
-		atomic_inc(&arq->io_context->aic->nr_queued);
+	if (RQ_IOC(rq)) {
+		as_update_iohist(ad, RQ_IOC(rq)->aic, rq);
+		atomic_inc(&RQ_IOC(rq)->aic->nr_queued);
 	}
 
-	as_add_arq_rb(ad, arq);
-	if (rq_mergeable(arq->request))
-		as_add_arq_hash(ad, arq);
+	as_add_rq_rb(ad, rq);
 
 	/*
 	 * set expire time (only used for reads) and add to fifo list
 	 */
-	arq->expires = jiffies + ad->fifo_expire[data_dir];
-	list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]);
+	rq_set_fifo_time(rq, jiffies + ad->fifo_expire[data_dir]);
+	list_add_tail(&rq->queuelist, &ad->fifo_list[data_dir]);
 
-	as_update_arq(ad, arq); /* keep state machine up to date */
-	arq->state = AS_RQ_QUEUED;
+	as_update_rq(ad, rq); /* keep state machine up to date */
+	RQ_SET_STATE(rq, AS_RQ_QUEUED);
 }
 
 static void as_activate_request(request_queue_t *q, struct request *rq)
 {
-	struct as_rq *arq = RQ_DATA(rq);
-
-	WARN_ON(arq->state != AS_RQ_DISPATCHED);
-	arq->state = AS_RQ_REMOVED;
-	if (arq->io_context && arq->io_context->aic)
-		atomic_dec(&arq->io_context->aic->nr_dispatched);
+	WARN_ON(RQ_STATE(rq) != AS_RQ_DISPATCHED);
+	RQ_SET_STATE(rq, AS_RQ_REMOVED);
+	if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
+		atomic_dec(&RQ_IOC(rq)->aic->nr_dispatched);
 }
 
 static void as_deactivate_request(request_queue_t *q, struct request *rq)
 {
-	struct as_rq *arq = RQ_DATA(rq);
-
-	WARN_ON(arq->state != AS_RQ_REMOVED);
-	arq->state = AS_RQ_DISPATCHED;
-	if (arq->io_context && arq->io_context->aic)
-		atomic_inc(&arq->io_context->aic->nr_dispatched);
+	WARN_ON(RQ_STATE(rq) != AS_RQ_REMOVED);
+	RQ_SET_STATE(rq, AS_RQ_DISPATCHED);
+	if (RQ_IOC(rq) && RQ_IOC(rq)->aic)
+		atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched);
 }
 
 /*
@@ -1396,93 +1197,35 @@
 		&& list_empty(&ad->fifo_list[REQ_SYNC]);
 }
 
-static struct request *as_former_request(request_queue_t *q,
-					struct request *rq)
-{
-	struct as_rq *arq = RQ_DATA(rq);
-	struct rb_node *rbprev = rb_prev(&arq->rb_node);
-	struct request *ret = NULL;
-
-	if (rbprev)
-		ret = rb_entry_arq(rbprev)->request;
-
-	return ret;
-}
-
-static struct request *as_latter_request(request_queue_t *q,
-					struct request *rq)
-{
-	struct as_rq *arq = RQ_DATA(rq);
-	struct rb_node *rbnext = rb_next(&arq->rb_node);
-	struct request *ret = NULL;
-
-	if (rbnext)
-		ret = rb_entry_arq(rbnext)->request;
-
-	return ret;
-}
-
 static int
 as_merge(request_queue_t *q, struct request **req, struct bio *bio)
 {
 	struct as_data *ad = q->elevator->elevator_data;
 	sector_t rb_key = bio->bi_sector + bio_sectors(bio);
 	struct request *__rq;
-	int ret;
-
-	/*
-	 * see if the merge hash can satisfy a back merge
-	 */
-	__rq = as_find_arq_hash(ad, bio->bi_sector);
-	if (__rq) {
-		BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
-
-		if (elv_rq_merge_ok(__rq, bio)) {
-			ret = ELEVATOR_BACK_MERGE;
-			goto out;
-		}
-	}
 
 	/*
 	 * check for front merge
 	 */
-	__rq = as_find_arq_rb(ad, rb_key, bio_data_dir(bio));
-	if (__rq) {
-		BUG_ON(rb_key != rq_rb_key(__rq));
-
-		if (elv_rq_merge_ok(__rq, bio)) {
-			ret = ELEVATOR_FRONT_MERGE;
-			goto out;
-		}
+	__rq = elv_rb_find(&ad->sort_list[bio_data_dir(bio)], rb_key);
+	if (__rq && elv_rq_merge_ok(__rq, bio)) {
+		*req = __rq;
+		return ELEVATOR_FRONT_MERGE;
 	}
 
 	return ELEVATOR_NO_MERGE;
-out:
-	if (ret) {
-		if (rq_mergeable(__rq))
-			as_hot_arq_hash(ad, RQ_DATA(__rq));
-	}
-	*req = __rq;
-	return ret;
 }
 
-static void as_merged_request(request_queue_t *q, struct request *req)
+static void as_merged_request(request_queue_t *q, struct request *req, int type)
 {
 	struct as_data *ad = q->elevator->elevator_data;
-	struct as_rq *arq = RQ_DATA(req);
-
-	/*
-	 * hash always needs to be repositioned, key is end sector
-	 */
-	as_del_arq_hash(arq);
-	as_add_arq_hash(ad, arq);
 
 	/*
 	 * if the merge was a front merge, we need to reposition request
 	 */
-	if (rq_rb_key(req) != arq->rb_key) {
-		as_del_arq_rb(ad, arq);
-		as_add_arq_rb(ad, arq);
+	if (type == ELEVATOR_FRONT_MERGE) {
+		as_del_rq_rb(ad, req);
+		as_add_rq_rb(ad, req);
 		/*
 		 * Note! At this stage of this and the next function, our next
 		 * request may not be optimal - eg the request may have "grown"
@@ -1494,38 +1237,22 @@
 static void as_merged_requests(request_queue_t *q, struct request *req,
 			 	struct request *next)
 {
-	struct as_data *ad = q->elevator->elevator_data;
-	struct as_rq *arq = RQ_DATA(req);
-	struct as_rq *anext = RQ_DATA(next);
-
-	BUG_ON(!arq);
-	BUG_ON(!anext);
-
 	/*
-	 * reposition arq (this is the merged request) in hash, and in rbtree
-	 * in case of a front merge
+	 * if next expires before rq, assign its expire time to arq
+	 * and move into next position (next will be deleted) in fifo
 	 */
-	as_del_arq_hash(arq);
-	as_add_arq_hash(ad, arq);
+	if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
+		if (time_before(rq_fifo_time(next), rq_fifo_time(req))) {
+			struct io_context *rioc = RQ_IOC(req);
+			struct io_context *nioc = RQ_IOC(next);
 
-	if (rq_rb_key(req) != arq->rb_key) {
-		as_del_arq_rb(ad, arq);
-		as_add_arq_rb(ad, arq);
-	}
-
-	/*
-	 * if anext expires before arq, assign its expire time to arq
-	 * and move into anext position (anext will be deleted) in fifo
-	 */
-	if (!list_empty(&arq->fifo) && !list_empty(&anext->fifo)) {
-		if (time_before(anext->expires, arq->expires)) {
-			list_move(&arq->fifo, &anext->fifo);
-			arq->expires = anext->expires;
+			list_move(&req->queuelist, &next->queuelist);
+			rq_set_fifo_time(req, rq_fifo_time(next));
 			/*
 			 * Don't copy here but swap, because when anext is
 			 * removed below, it must contain the unused context
 			 */
-			swap_io_context(&arq->io_context, &anext->io_context);
+			swap_io_context(&rioc, &nioc);
 		}
 	}
 
@@ -1533,9 +1260,9 @@
 	 * kill knowledge of next, this one is a goner
 	 */
 	as_remove_queued_request(q, next);
-	as_put_io_context(anext);
+	as_put_io_context(next);
 
-	anext->state = AS_RQ_MERGED;
+	RQ_SET_STATE(next, AS_RQ_MERGED);
 }
 
 /*
@@ -1553,61 +1280,18 @@
 	unsigned long flags;
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	if (!as_queue_empty(q))
-		q->request_fn(q);
+	blk_start_queueing(q);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void as_put_request(request_queue_t *q, struct request *rq)
-{
-	struct as_data *ad = q->elevator->elevator_data;
-	struct as_rq *arq = RQ_DATA(rq);
-
-	if (!arq) {
-		WARN_ON(1);
-		return;
-	}
-
-	if (unlikely(arq->state != AS_RQ_POSTSCHED &&
-		     arq->state != AS_RQ_PRESCHED &&
-		     arq->state != AS_RQ_MERGED)) {
-		printk("arq->state %d\n", arq->state);
-		WARN_ON(1);
-	}
-
-	mempool_free(arq, ad->arq_pool);
-	rq->elevator_private = NULL;
-}
-
-static int as_set_request(request_queue_t *q, struct request *rq,
-			  struct bio *bio, gfp_t gfp_mask)
-{
-	struct as_data *ad = q->elevator->elevator_data;
-	struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask);
-
-	if (arq) {
-		memset(arq, 0, sizeof(*arq));
-		RB_CLEAR_NODE(&arq->rb_node);
-		arq->request = rq;
-		arq->state = AS_RQ_PRESCHED;
-		arq->io_context = NULL;
-		INIT_HLIST_NODE(&arq->hash);
-		INIT_LIST_HEAD(&arq->fifo);
-		rq->elevator_private = arq;
-		return 0;
-	}
-
-	return 1;
-}
-
-static int as_may_queue(request_queue_t *q, int rw, struct bio *bio)
+static int as_may_queue(request_queue_t *q, int rw)
 {
 	int ret = ELV_MQUEUE_MAY;
 	struct as_data *ad = q->elevator->elevator_data;
 	struct io_context *ioc;
 	if (ad->antic_status == ANTIC_WAIT_REQ ||
 			ad->antic_status == ANTIC_WAIT_NEXT) {
-		ioc = as_get_io_context();
+		ioc = as_get_io_context(q->node);
 		if (ad->io_context == ioc)
 			ret = ELV_MQUEUE_MUST;
 		put_io_context(ioc);
@@ -1626,23 +1310,16 @@
 	BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC]));
 	BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC]));
 
-	mempool_destroy(ad->arq_pool);
 	put_io_context(ad->io_context);
-	kfree(ad->hash);
 	kfree(ad);
 }
 
 /*
- * initialize elevator private data (as_data), and alloc a arq for
- * each request on the free lists
+ * initialize elevator private data (as_data).
  */
 static void *as_init_queue(request_queue_t *q, elevator_t *e)
 {
 	struct as_data *ad;
-	int i;
-
-	if (!arq_pool)
-		return NULL;
 
 	ad = kmalloc_node(sizeof(*ad), GFP_KERNEL, q->node);
 	if (!ad)
@@ -1651,30 +1328,12 @@
 
 	ad->q = q; /* Identify what queue the data belongs to */
 
-	ad->hash = kmalloc_node(sizeof(struct hlist_head)*AS_HASH_ENTRIES,
-				GFP_KERNEL, q->node);
-	if (!ad->hash) {
-		kfree(ad);
-		return NULL;
-	}
-
-	ad->arq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
-				mempool_free_slab, arq_pool, q->node);
-	if (!ad->arq_pool) {
-		kfree(ad->hash);
-		kfree(ad);
-		return NULL;
-	}
-
 	/* anticipatory scheduling helpers */
 	ad->antic_timer.function = as_antic_timeout;
 	ad->antic_timer.data = (unsigned long)q;
 	init_timer(&ad->antic_timer);
 	INIT_WORK(&ad->antic_work, as_work_handler, q);
 
-	for (i = 0; i < AS_HASH_ENTRIES; i++)
-		INIT_HLIST_HEAD(&ad->hash[i]);
-
 	INIT_LIST_HEAD(&ad->fifo_list[REQ_SYNC]);
 	INIT_LIST_HEAD(&ad->fifo_list[REQ_ASYNC]);
 	ad->sort_list[REQ_SYNC] = RB_ROOT;
@@ -1787,10 +1446,8 @@
 		.elevator_deactivate_req_fn = 	as_deactivate_request,
 		.elevator_queue_empty_fn =	as_queue_empty,
 		.elevator_completed_req_fn =	as_completed_request,
-		.elevator_former_req_fn =	as_former_request,
-		.elevator_latter_req_fn =	as_latter_request,
-		.elevator_set_req_fn =		as_set_request,
-		.elevator_put_req_fn =		as_put_request,
+		.elevator_former_req_fn =	elv_rb_former_request,
+		.elevator_latter_req_fn =	elv_rb_latter_request,
 		.elevator_may_queue_fn =	as_may_queue,
 		.elevator_init_fn =		as_init_queue,
 		.elevator_exit_fn =		as_exit_queue,
@@ -1806,11 +1463,6 @@
 {
 	int ret;
 
-	arq_pool = kmem_cache_create("as_arq", sizeof(struct as_rq),
-				     0, 0, NULL, NULL);
-	if (!arq_pool)
-		return -ENOMEM;
-
 	ret = elv_register(&iosched_as);
 	if (!ret) {
 		/*
@@ -1822,7 +1474,6 @@
 		return 0;
 	}
 
-	kmem_cache_destroy(arq_pool);
 	return ret;
 }
 
@@ -1833,10 +1484,9 @@
 	ioc_gone = &all_gone;
 	/* ioc_gone's update must be visible before reading ioc_count */
 	smp_wmb();
-	if (atomic_read(&ioc_count))
+	if (elv_ioc_count_read(ioc_count))
 		wait_for_completion(ioc_gone);
 	synchronize_rcu();
-	kmem_cache_destroy(arq_pool);
 }
 
 module_init(as_init);
diff -urN oldtree/block/blktrace.c newtree/block/blktrace.c
--- oldtree/block/blktrace.c	2006-08-02 07:14:14.000000000 -0700
+++ newtree/block/blktrace.c	2006-08-03 13:28:27.000000000 -0700
@@ -69,7 +69,7 @@
 /*
  * Bio action bits of interest
  */
-static u32 bio_act[5] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC), 0, BLK_TC_ACT(BLK_TC_AHEAD) };
+static u32 bio_act[9] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC), 0, BLK_TC_ACT(BLK_TC_AHEAD), 0, 0, 0, BLK_TC_ACT(BLK_TC_META) };
 
 /*
  * More could be added as needed, taking care to increment the decrementer
@@ -80,7 +80,9 @@
 #define trace_sync_bit(rw)	\
 	(((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
 #define trace_ahead_bit(rw)	\
-	(((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD))
+        (((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD))
+#define trace_meta_bit(rw)    \
+        (((rw) & (1 << BIO_RW_META)) >> (BIO_RW_META - 3))
 
 /*
  * The worker for the various blk_add_trace*() types. Fills out a
@@ -103,6 +105,7 @@
 	what |= bio_act[trace_barrier_bit(rw)];
 	what |= bio_act[trace_sync_bit(rw)];
 	what |= bio_act[trace_ahead_bit(rw)];
+	what |= bio_act[trace_meta_bit(rw)];
 
 	pid = tsk->pid;
 	if (unlikely(act_log_check(bt, what, sector, pid)))
diff -urN oldtree/block/cfq-iosched.c newtree/block/cfq-iosched.c
--- oldtree/block/cfq-iosched.c	2006-08-02 07:14:14.000000000 -0700
+++ newtree/block/cfq-iosched.c	2006-08-03 13:30:15.000000000 -0700
@@ -17,7 +17,6 @@
  * tunables
  */
 static const int cfq_quantum = 4;		/* max queue in one round of service */
-static const int cfq_queued = 8;		/* minimum rq allocate limit per-queue*/
 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
 static const int cfq_back_max = 16 * 1024;	/* maximum backwards seek, in KiB */
 static const int cfq_back_penalty = 2;		/* penalty of a backwards seek */
@@ -32,8 +31,6 @@
 
 #define CFQ_KEY_ASYNC		(0)
 
-static DEFINE_SPINLOCK(cfq_exit_lock);
-
 /*
  * for the hash of cfqq inside the cfqd
  */
@@ -41,37 +38,19 @@
 #define CFQ_QHASH_ENTRIES	(1 << CFQ_QHASH_SHIFT)
 #define list_entry_qhash(entry)	hlist_entry((entry), struct cfq_queue, cfq_hash)
 
-/*
- * for the hash of crq inside the cfqq
- */
-#define CFQ_MHASH_SHIFT		6
-#define CFQ_MHASH_BLOCK(sec)	((sec) >> 3)
-#define CFQ_MHASH_ENTRIES	(1 << CFQ_MHASH_SHIFT)
-#define CFQ_MHASH_FN(sec)	hash_long(CFQ_MHASH_BLOCK(sec), CFQ_MHASH_SHIFT)
-#define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
-#define list_entry_hash(ptr)	hlist_entry((ptr), struct cfq_rq, hash)
-
 #define list_entry_cfqq(ptr)	list_entry((ptr), struct cfq_queue, cfq_list)
-#define list_entry_fifo(ptr)	list_entry((ptr), struct request, queuelist)
 
-#define RQ_DATA(rq)		(rq)->elevator_private
+#define RQ_CIC(rq)		((struct cfq_io_context*)(rq)->elevator_private)
+#define RQ_CFQQ(rq)		((rq)->elevator_private2)
 
-/*
- * rb-tree defines
- */
-#define rb_entry_crq(node)	rb_entry((node), struct cfq_rq, rb_node)
-#define rq_rb_key(rq)		(rq)->sector
-
-static kmem_cache_t *crq_pool;
 static kmem_cache_t *cfq_pool;
 static kmem_cache_t *cfq_ioc_pool;
 
-static atomic_t ioc_count = ATOMIC_INIT(0);
+static DEFINE_PER_CPU(unsigned long, ioc_count);
 static struct completion *ioc_gone;
 
 #define CFQ_PRIO_LISTS		IOPRIO_BE_NR
 #define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
-#define cfq_class_be(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_BE)
 #define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
 
 #define ASYNC			(0)
@@ -103,29 +82,14 @@
 	unsigned int busy_queues;
 
 	/*
-	 * non-ordered list of empty cfqq's
-	 */
-	struct list_head empty_list;
-
-	/*
 	 * cfqq lookup hash
 	 */
 	struct hlist_head *cfq_hash;
 
-	/*
-	 * global crq hash for all queues
-	 */
-	struct hlist_head *crq_hash;
-
-	mempool_t *crq_pool;
-
 	int rq_in_driver;
 	int hw_tag;
 
 	/*
-	 * schedule slice state info
-	 */
-	/*
 	 * idle window management
 	 */
 	struct timer_list idle_slice_timer;
@@ -141,13 +105,10 @@
 	sector_t last_sector;
 	unsigned long last_end_request;
 
-	unsigned int rq_starved;
-
 	/*
 	 * tunables, see top of file
 	 */
 	unsigned int cfq_quantum;
-	unsigned int cfq_queued;
 	unsigned int cfq_fifo_expire[2];
 	unsigned int cfq_back_penalty;
 	unsigned int cfq_back_max;
@@ -170,23 +131,24 @@
 	struct hlist_node cfq_hash;
 	/* hash key */
 	unsigned int key;
-	/* on either rr or empty list of cfqd */
+	/* member of the rr/busy/cur/idle cfqd list */
 	struct list_head cfq_list;
 	/* sorted list of pending requests */
 	struct rb_root sort_list;
 	/* if fifo isn't expired, next request to serve */
-	struct cfq_rq *next_crq;
+	struct request *next_rq;
 	/* requests queued in sort_list */
 	int queued[2];
 	/* currently allocated requests */
 	int allocated[2];
+	/* pending metadata requests */
+	int meta_pending;
 	/* fifo list of requests in sort_list */
 	struct list_head fifo;
 
 	unsigned long slice_start;
 	unsigned long slice_end;
 	unsigned long slice_left;
-	unsigned long service_last;
 
 	/* number of requests that are on the dispatch list */
 	int on_dispatch[2];
@@ -199,18 +161,6 @@
 	unsigned int flags;
 };
 
-struct cfq_rq {
-	struct rb_node rb_node;
-	sector_t rb_key;
-	struct request *request;
-	struct hlist_node hash;
-
-	struct cfq_queue *cfq_queue;
-	struct cfq_io_context *io_context;
-
-	unsigned int crq_flags;
-};
-
 enum cfqq_state_flags {
 	CFQ_CFQQ_FLAG_on_rr = 0,
 	CFQ_CFQQ_FLAG_wait_request,
@@ -220,6 +170,7 @@
 	CFQ_CFQQ_FLAG_fifo_expire,
 	CFQ_CFQQ_FLAG_idle_window,
 	CFQ_CFQQ_FLAG_prio_changed,
+	CFQ_CFQQ_FLAG_queue_new,
 };
 
 #define CFQ_CFQQ_FNS(name)						\
@@ -244,70 +195,14 @@
 CFQ_CFQQ_FNS(fifo_expire);
 CFQ_CFQQ_FNS(idle_window);
 CFQ_CFQQ_FNS(prio_changed);
+CFQ_CFQQ_FNS(queue_new);
 #undef CFQ_CFQQ_FNS
 
-enum cfq_rq_state_flags {
-	CFQ_CRQ_FLAG_is_sync = 0,
-};
-
-#define CFQ_CRQ_FNS(name)						\
-static inline void cfq_mark_crq_##name(struct cfq_rq *crq)		\
-{									\
-	crq->crq_flags |= (1 << CFQ_CRQ_FLAG_##name);			\
-}									\
-static inline void cfq_clear_crq_##name(struct cfq_rq *crq)		\
-{									\
-	crq->crq_flags &= ~(1 << CFQ_CRQ_FLAG_##name);			\
-}									\
-static inline int cfq_crq_##name(const struct cfq_rq *crq)		\
-{									\
-	return (crq->crq_flags & (1 << CFQ_CRQ_FLAG_##name)) != 0;	\
-}
-
-CFQ_CRQ_FNS(is_sync);
-#undef CFQ_CRQ_FNS
-
 static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
-static void cfq_dispatch_insert(request_queue_t *, struct cfq_rq *);
+static void cfq_dispatch_insert(request_queue_t *, struct request *);
 static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, unsigned int key, struct task_struct *tsk, gfp_t gfp_mask);
 
 /*
- * lots of deadline iosched dupes, can be abstracted later...
- */
-static inline void cfq_del_crq_hash(struct cfq_rq *crq)
-{
-	hlist_del_init(&crq->hash);
-}
-
-static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
-{
-	const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request));
-
-	hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
-}
-
-static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
-{
-	struct hlist_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
-	struct hlist_node *entry, *next;
-
-	hlist_for_each_safe(entry, next, hash_list) {
-		struct cfq_rq *crq = list_entry_hash(entry);
-		struct request *__rq = crq->request;
-
-		if (!rq_mergeable(__rq)) {
-			cfq_del_crq_hash(crq);
-			continue;
-		}
-
-		if (rq_hash_key(__rq) == offset)
-			return __rq;
-	}
-
-	return NULL;
-}
-
-/*
  * scheduler run of queue, if there are requests pending and no one in the
  * driver that will restart queueing
  */
@@ -333,12 +228,12 @@
 }
 
 /*
- * Lifted from AS - choose which of crq1 and crq2 that is best served now.
+ * Lifted from AS - choose which of rq1 and rq2 that is best served now.
  * We choose the request that is closest to the head right now. Distance
  * behind the head is penalized and only allowed to a certain extent.
  */
-static struct cfq_rq *
-cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
+static struct request *
+cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
 {
 	sector_t last, s1, s2, d1 = 0, d2 = 0;
 	unsigned long back_max;
@@ -346,18 +241,22 @@
 #define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
 	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
 
-	if (crq1 == NULL || crq1 == crq2)
-		return crq2;
-	if (crq2 == NULL)
-		return crq1;
-
-	if (cfq_crq_is_sync(crq1) && !cfq_crq_is_sync(crq2))
-		return crq1;
-	else if (cfq_crq_is_sync(crq2) && !cfq_crq_is_sync(crq1))
-		return crq2;
+	if (rq1 == NULL || rq1 == rq2)
+		return rq2;
+	if (rq2 == NULL)
+		return rq1;
+
+	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
+		return rq1;
+	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
+		return rq2;
+	if (rq_is_meta(rq1) && !rq_is_meta(rq2))
+		return rq1;
+	else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
+		return rq2;
 
-	s1 = crq1->request->sector;
-	s2 = crq2->request->sector;
+	s1 = rq1->sector;
+	s2 = rq2->sector;
 
 	last = cfqd->last_sector;
 
@@ -392,23 +291,23 @@
 	 * check two variables for all permutations: --> faster!
 	 */
 	switch (wrap) {
-	case 0: /* common case for CFQ: crq1 and crq2 not wrapped */
+	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
 		if (d1 < d2)
-			return crq1;
+			return rq1;
 		else if (d2 < d1)
-			return crq2;
+			return rq2;
 		else {
 			if (s1 >= s2)
-				return crq1;
+				return rq1;
 			else
-				return crq2;
+				return rq2;
 		}
 
 	case CFQ_RQ2_WRAP:
-		return crq1;
+		return rq1;
 	case CFQ_RQ1_WRAP:
-		return crq2;
-	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both crqs wrapped */
+		return rq2;
+	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
 	default:
 		/*
 		 * Since both rqs are wrapped,
@@ -417,50 +316,43 @@
 		 * since back seek takes more time than forward.
 		 */
 		if (s1 <= s2)
-			return crq1;
+			return rq1;
 		else
-			return crq2;
+			return rq2;
 	}
 }
 
 /*
  * would be nice to take fifo expire time into account as well
  */
-static struct cfq_rq *
-cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-		  struct cfq_rq *last)
+static struct request *
+cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+		  struct request *last)
 {
-	struct cfq_rq *crq_next = NULL, *crq_prev = NULL;
-	struct rb_node *rbnext, *rbprev;
+	struct rb_node *rbnext = rb_next(&last->rb_node);
+	struct rb_node *rbprev = rb_prev(&last->rb_node);
+	struct request *next = NULL, *prev = NULL;
 
-	if (!(rbnext = rb_next(&last->rb_node))) {
-		rbnext = rb_first(&cfqq->sort_list);
-		if (rbnext == &last->rb_node)
-			rbnext = NULL;
-	}
-
-	rbprev = rb_prev(&last->rb_node);
+	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
 
 	if (rbprev)
-		crq_prev = rb_entry_crq(rbprev);
-	if (rbnext)
-		crq_next = rb_entry_crq(rbnext);
+		prev = rb_entry_rq(rbprev);
 
-	return cfq_choose_req(cfqd, crq_next, crq_prev);
-}
-
-static void cfq_update_next_crq(struct cfq_rq *crq)
-{
-	struct cfq_queue *cfqq = crq->cfq_queue;
+	if (rbnext)
+		next = rb_entry_rq(rbnext);
+	else {
+		rbnext = rb_first(&cfqq->sort_list);
+		if (rbnext && rbnext != &last->rb_node)
+			next = rb_entry_rq(rbnext);
+	}
 
-	if (cfqq->next_crq == crq)
-		cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq);
+	return cfq_choose_req(cfqd, next, prev);
 }
 
 static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
 {
 	struct cfq_data *cfqd = cfqq->cfqd;
-	struct list_head *list, *entry;
+	struct list_head *list;
 
 	BUG_ON(!cfq_cfqq_on_rr(cfqq));
 
@@ -485,31 +377,14 @@
 	}
 
 	/*
-	 * if queue was preempted, just add to front to be fair. busy_rr
-	 * isn't sorted, but insert at the back for fairness.
+	 * If this queue was preempted or is new (never been serviced), let
+	 * it be added first for fairness. Otherwise, just add to the back
+	 * of the list.
 	 */
-	if (preempted || list == &cfqd->busy_rr) {
-		if (preempted)
-			list = list->prev;
-
-		list_add_tail(&cfqq->cfq_list, list);
-		return;
-	}
+	if (preempted || cfq_cfqq_queue_new(cfqq))
+		list = list->prev;
 
-	/*
-	 * sort by when queue was last serviced
-	 */
-	entry = list;
-	while ((entry = entry->prev) != list) {
-		struct cfq_queue *__cfqq = list_entry_cfqq(entry);
-
-		if (!__cfqq->service_last)
-			break;
-		if (time_before(__cfqq->service_last, cfqq->service_last))
-			break;
-	}
-
-	list_add(&cfqq->cfq_list, entry);
+	list_add_tail(&cfqq->cfq_list, list);
 }
 
 /*
@@ -531,7 +406,7 @@
 {
 	BUG_ON(!cfq_cfqq_on_rr(cfqq));
 	cfq_clear_cfqq_on_rr(cfqq);
-	list_move(&cfqq->cfq_list, &cfqd->empty_list);
+	list_del_init(&cfqq->cfq_list);
 
 	BUG_ON(!cfqd->busy_queues);
 	cfqd->busy_queues--;
@@ -540,81 +415,43 @@
 /*
  * rb tree support functions
  */
-static inline void cfq_del_crq_rb(struct cfq_rq *crq)
+static inline void cfq_del_rq_rb(struct request *rq)
 {
-	struct cfq_queue *cfqq = crq->cfq_queue;
+	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 	struct cfq_data *cfqd = cfqq->cfqd;
-	const int sync = cfq_crq_is_sync(crq);
+	const int sync = rq_is_sync(rq);
 
 	BUG_ON(!cfqq->queued[sync]);
 	cfqq->queued[sync]--;
 
-	cfq_update_next_crq(crq);
-
-	rb_erase(&crq->rb_node, &cfqq->sort_list);
+	elv_rb_del(&cfqq->sort_list, rq);
 
 	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
 		cfq_del_cfqq_rr(cfqd, cfqq);
 }
 
-static struct cfq_rq *
-__cfq_add_crq_rb(struct cfq_rq *crq)
-{
-	struct rb_node **p = &crq->cfq_queue->sort_list.rb_node;
-	struct rb_node *parent = NULL;
-	struct cfq_rq *__crq;
-
-	while (*p) {
-		parent = *p;
-		__crq = rb_entry_crq(parent);
-
-		if (crq->rb_key < __crq->rb_key)
-			p = &(*p)->rb_left;
-		else if (crq->rb_key > __crq->rb_key)
-			p = &(*p)->rb_right;
-		else
-			return __crq;
-	}
-
-	rb_link_node(&crq->rb_node, parent, p);
-	return NULL;
-}
-
-static void cfq_add_crq_rb(struct cfq_rq *crq)
+static void cfq_add_rq_rb(struct request *rq)
 {
-	struct cfq_queue *cfqq = crq->cfq_queue;
+	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 	struct cfq_data *cfqd = cfqq->cfqd;
-	struct request *rq = crq->request;
-	struct cfq_rq *__alias;
+	struct request *__alias;
 
-	crq->rb_key = rq_rb_key(rq);
-	cfqq->queued[cfq_crq_is_sync(crq)]++;
+	cfqq->queued[rq_is_sync(rq)]++;
 
 	/*
 	 * looks a little odd, but the first insert might return an alias.
 	 * if that happens, put the alias on the dispatch list
 	 */
-	while ((__alias = __cfq_add_crq_rb(crq)) != NULL)
+	while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
 		cfq_dispatch_insert(cfqd->queue, __alias);
-
-	rb_insert_color(&crq->rb_node, &cfqq->sort_list);
-
-	if (!cfq_cfqq_on_rr(cfqq))
-		cfq_add_cfqq_rr(cfqd, cfqq);
-
-	/*
-	 * check if this request is a better next-serve candidate
-	 */
-	cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
 }
 
 static inline void
-cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
+cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
 {
-	rb_erase(&crq->rb_node, &cfqq->sort_list);
-	cfqq->queued[cfq_crq_is_sync(crq)]--;
-
-	cfq_add_crq_rb(crq);
+	elv_rb_del(&cfqq->sort_list, rq);
+	cfqq->queued[rq_is_sync(rq)]--;
+	cfq_add_rq_rb(rq);
 }
 
 static struct request *
@@ -623,27 +460,14 @@
 	struct task_struct *tsk = current;
 	pid_t key = cfq_queue_pid(tsk, bio_data_dir(bio));
 	struct cfq_queue *cfqq;
-	struct rb_node *n;
-	sector_t sector;
 
 	cfqq = cfq_find_cfq_hash(cfqd, key, tsk->ioprio);
-	if (!cfqq)
-		goto out;
-
-	sector = bio->bi_sector + bio_sectors(bio);
-	n = cfqq->sort_list.rb_node;
-	while (n) {
-		struct cfq_rq *crq = rb_entry_crq(n);
+	if (cfqq) {
+		sector_t sector = bio->bi_sector + bio_sectors(bio);
 
-		if (sector < crq->rb_key)
-			n = n->rb_left;
-		else if (sector > crq->rb_key)
-			n = n->rb_right;
-		else
-			return crq->request;
+		return elv_rb_find(&cfqq->sort_list, sector);
 	}
 
-out:
 	return NULL;
 }
 
@@ -673,11 +497,18 @@
 
 static void cfq_remove_request(struct request *rq)
 {
-	struct cfq_rq *crq = RQ_DATA(rq);
+	struct cfq_queue *cfqq = RQ_CFQQ(rq);
+
+	if (cfqq->next_rq == rq)
+		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
 
 	list_del_init(&rq->queuelist);
-	cfq_del_crq_rb(crq);
-	cfq_del_crq_hash(crq);
+	cfq_del_rq_rb(rq);
+
+	if (rq_is_meta(rq)) {
+		WARN_ON(!cfqq->meta_pending);
+		cfqq->meta_pending--;
+	}
 }
 
 static int
@@ -685,39 +516,23 @@
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct request *__rq;
-	int ret;
-
-	__rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
-	if (__rq && elv_rq_merge_ok(__rq, bio)) {
-		ret = ELEVATOR_BACK_MERGE;
-		goto out;
-	}
 
 	__rq = cfq_find_rq_fmerge(cfqd, bio);
 	if (__rq && elv_rq_merge_ok(__rq, bio)) {
-		ret = ELEVATOR_FRONT_MERGE;
-		goto out;
+		*req = __rq;
+		return ELEVATOR_FRONT_MERGE;
 	}
 
 	return ELEVATOR_NO_MERGE;
-out:
-	*req = __rq;
-	return ret;
 }
 
-static void cfq_merged_request(request_queue_t *q, struct request *req)
+static void cfq_merged_request(request_queue_t *q, struct request *req,
+			       int type)
 {
-	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct cfq_rq *crq = RQ_DATA(req);
-
-	cfq_del_crq_hash(crq);
-	cfq_add_crq_hash(cfqd, crq);
-
-	if (rq_rb_key(req) != crq->rb_key) {
-		struct cfq_queue *cfqq = crq->cfq_queue;
+	if (type == ELEVATOR_FRONT_MERGE) {
+		struct cfq_queue *cfqq = RQ_CFQQ(req);
 
-		cfq_update_next_crq(crq);
-		cfq_reposition_crq_rb(cfqq, crq);
+		cfq_reposition_rq_rb(cfqq, req);
 	}
 }
 
@@ -725,8 +540,6 @@
 cfq_merged_requests(request_queue_t *q, struct request *rq,
 		    struct request *next)
 {
-	cfq_merged_request(q, rq);
-
 	/*
 	 * reposition in fifo if next is older than rq
 	 */
@@ -768,13 +581,12 @@
 	if (cfq_cfqq_wait_request(cfqq))
 		del_timer(&cfqd->idle_slice_timer);
 
-	if (!preempted && !cfq_cfqq_dispatched(cfqq)) {
-		cfqq->service_last = now;
+	if (!preempted && !cfq_cfqq_dispatched(cfqq))
 		cfq_schedule_dispatch(cfqd);
-	}
 
 	cfq_clear_cfqq_must_dispatch(cfqq);
 	cfq_clear_cfqq_wait_request(cfqq);
+	cfq_clear_cfqq_queue_new(cfqq);
 
 	/*
 	 * store what was left of this slice, if the queue idled out
@@ -868,26 +680,25 @@
 {
 	struct cfq_queue *cfqq = NULL;
 
-	/*
-	 * if current list is non-empty, grab first entry. if it is empty,
-	 * get next prio level and grab first entry then if any are spliced
-	 */
-	if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1)
+	if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) {
+		/*
+		 * if current list is non-empty, grab first entry. if it is
+		 * empty, get next prio level and grab first entry then if any
+		 * are spliced
+		 */
 		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
-
-	/*
-	 * If no new queues are available, check if the busy list has some
-	 * before falling back to idle io.
-	 */
-	if (!cfqq && !list_empty(&cfqd->busy_rr))
+	} else if (!list_empty(&cfqd->busy_rr)) {
+		/*
+		 * If no new queues are available, check if the busy list has
+		 * some before falling back to idle io.
+		 */
 		cfqq = list_entry_cfqq(cfqd->busy_rr.next);
-
-	/*
-	 * if we have idle queues and no rt or be queues had pending
-	 * requests, either allow immediate service if the grace period
-	 * has passed or arm the idle grace timer
-	 */
-	if (!cfqq && !list_empty(&cfqd->idle_rr)) {
+	} else if (!list_empty(&cfqd->idle_rr)) {
+		/*
+		 * if we have idle queues and no rt or be queues had pending
+		 * requests, either allow immediate service if the grace period
+		 * has passed or arm the idle grace timer
+		 */
 		unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE;
 
 		if (time_after_eq(jiffies, end))
@@ -942,16 +753,14 @@
 	return 1;
 }
 
-static void cfq_dispatch_insert(request_queue_t *q, struct cfq_rq *crq)
+static void cfq_dispatch_insert(request_queue_t *q, struct request *rq)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct cfq_queue *cfqq = crq->cfq_queue;
-	struct request *rq;
+        struct cfq_queue *cfqq = RQ_CFQQ(rq);
 
-	cfqq->next_crq = cfq_find_next_crq(cfqd, cfqq, crq);
-	cfq_remove_request(crq->request);
-	cfqq->on_dispatch[cfq_crq_is_sync(crq)]++;
-	elv_dispatch_sort(q, crq->request);
+        cfq_remove_request(rq);
+        cfqq->on_dispatch[rq_is_sync(rq)]++;
+        elv_dispatch_sort(q, rq);
 
 	rq = list_entry(q->queue_head.prev, struct request, queuelist);
 	cfqd->last_sector = rq->sector + rq->nr_sectors;
@@ -960,24 +769,23 @@
 /*
  * return expired entry, or NULL to just start from scratch in rbtree
  */
-static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq)
+static inline struct request *cfq_check_fifo(struct cfq_queue *cfqq)
 {
 	struct cfq_data *cfqd = cfqq->cfqd;
 	struct request *rq;
-	struct cfq_rq *crq;
+	int fifo;
 
 	if (cfq_cfqq_fifo_expire(cfqq))
 		return NULL;
+	if (list_empty(&cfqq->fifo))
+		return NULL;
 
-	if (!list_empty(&cfqq->fifo)) {
-		int fifo = cfq_cfqq_class_sync(cfqq);
+	fifo = cfq_cfqq_class_sync(cfqq);
+	rq = rq_entry_fifo(cfqq->fifo.next);
 
-		crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next));
-		rq = crq->request;
-		if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) {
-			cfq_mark_cfqq_fifo_expire(cfqq);
-			return crq;
-		}
+	if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) {
+		cfq_mark_cfqq_fifo_expire(cfqq);
+		return rq;
 	}
 
 	return NULL;
@@ -1063,25 +871,25 @@
 	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
 
 	do {
-		struct cfq_rq *crq;
+		struct request *rq;
 
 		/*
 		 * follow expired path, else get first next available
 		 */
-		if ((crq = cfq_check_fifo(cfqq)) == NULL)
-			crq = cfqq->next_crq;
+		if ((rq = cfq_check_fifo(cfqq)) == NULL)
+			rq = cfqq->next_rq;
 
 		/*
 		 * finally, insert request into driver dispatch list
 		 */
-		cfq_dispatch_insert(cfqd->queue, crq);
+		cfq_dispatch_insert(cfqd->queue, rq);
 
 		cfqd->dispatch_slice++;
 		dispatched++;
 
 		if (!cfqd->active_cic) {
-			atomic_inc(&crq->io_context->ioc->refcount);
-			cfqd->active_cic = crq->io_context;
+			atomic_inc(&RQ_CIC(rq)->ioc->refcount);
+			cfqd->active_cic = RQ_CIC(rq);
 		}
 
 		if (RB_EMPTY_ROOT(&cfqq->sort_list))
@@ -1112,13 +920,12 @@
 cfq_forced_dispatch_cfqqs(struct list_head *list)
 {
 	struct cfq_queue *cfqq, *next;
-	struct cfq_rq *crq;
 	int dispatched;
 
 	dispatched = 0;
 	list_for_each_entry_safe(cfqq, next, list, cfq_list) {
-		while ((crq = cfqq->next_crq)) {
-			cfq_dispatch_insert(cfqq->cfqd->queue, crq);
+		while (cfqq->next_rq) {
+			cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
 			dispatched++;
 		}
 		BUG_ON(!list_empty(&cfqq->fifo));
@@ -1194,8 +1001,8 @@
 }
 
 /*
- * task holds one reference to the queue, dropped when task exits. each crq
- * in-flight on this queue also holds a reference, dropped when crq is freed.
+ * task holds one reference to the queue, dropped when task exits. each rq
+ * in-flight on this queue also holds a reference, dropped when rq is freed.
  *
  * queue lock must be held here.
  */
@@ -1223,7 +1030,7 @@
 	kmem_cache_free(cfq_pool, cfqq);
 }
 
-static inline struct cfq_queue *
+static struct cfq_queue *
 __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
 		    const int hashval)
 {
@@ -1260,62 +1067,63 @@
 		freed++;
 	}
 
-	if (atomic_sub_and_test(freed, &ioc_count) && ioc_gone)
+	elv_ioc_count_mod(ioc_count, -freed);
+
+	if (ioc_gone && !elv_ioc_count_read(ioc_count))
 		complete(ioc_gone);
 }
 
-static void cfq_trim(struct io_context *ioc)
+static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
-	ioc->set_ioprio = NULL;
-	cfq_free_io_context(ioc);
+	if (unlikely(cfqq == cfqd->active_queue))
+		__cfq_slice_expired(cfqd, cfqq, 0);
+
+	cfq_put_queue(cfqq);
 }
 
-/*
- * Called with interrupts disabled
- */
-static void cfq_exit_single_io_context(struct cfq_io_context *cic)
+static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
+					 struct cfq_io_context *cic)
 {
-	struct cfq_data *cfqd = cic->key;
-	request_queue_t *q;
-
-	if (!cfqd)
-		return;
-
-	q = cfqd->queue;
-
-	WARN_ON(!irqs_disabled());
-
-	spin_lock(q->queue_lock);
+	list_del_init(&cic->queue_list);
+	smp_wmb();
+	cic->key = NULL;
 
 	if (cic->cfqq[ASYNC]) {
-		if (unlikely(cic->cfqq[ASYNC] == cfqd->active_queue))
-			__cfq_slice_expired(cfqd, cic->cfqq[ASYNC], 0);
-		cfq_put_queue(cic->cfqq[ASYNC]);
+		cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]);
 		cic->cfqq[ASYNC] = NULL;
 	}
 
 	if (cic->cfqq[SYNC]) {
-		if (unlikely(cic->cfqq[SYNC] == cfqd->active_queue))
-			__cfq_slice_expired(cfqd, cic->cfqq[SYNC], 0);
-		cfq_put_queue(cic->cfqq[SYNC]);
+		cfq_exit_cfqq(cfqd, cic->cfqq[SYNC]);
 		cic->cfqq[SYNC] = NULL;
 	}
+}
 
-	cic->key = NULL;
-	list_del_init(&cic->queue_list);
-	spin_unlock(q->queue_lock);
+
+/*
+ * Called with interrupts disabled
+ */
+static void cfq_exit_single_io_context(struct cfq_io_context *cic)
+{
+	struct cfq_data *cfqd = cic->key;
+
+	if (cfqd) {
+		request_queue_t *q = cfqd->queue;
+
+		spin_lock_irq(q->queue_lock);
+		__cfq_exit_single_io_context(cfqd, cic);
+		spin_unlock_irq(q->queue_lock);
+	}
 }
 
 static void cfq_exit_io_context(struct io_context *ioc)
 {
 	struct cfq_io_context *__cic;
-	unsigned long flags;
 	struct rb_node *n;
 
 	/*
 	 * put the reference this task is holding to the various queues
 	 */
-	spin_lock_irqsave(&cfq_exit_lock, flags);
 
 	n = rb_first(&ioc->cic_root);
 	while (n != NULL) {
@@ -1324,22 +1132,21 @@
 		cfq_exit_single_io_context(__cic);
 		n = rb_next(n);
 	}
-
-	spin_unlock_irqrestore(&cfq_exit_lock, flags);
 }
 
 static struct cfq_io_context *
 cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 {
-	struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask);
+	struct cfq_io_context *cic;
 
+	cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask, cfqd->queue->node);
 	if (cic) {
 		memset(cic, 0, sizeof(*cic));
 		cic->last_end_request = jiffies;
 		INIT_LIST_HEAD(&cic->queue_list);
 		cic->dtor = cfq_free_io_context;
 		cic->exit = cfq_exit_io_context;
-		atomic_inc(&ioc_count);
+		elv_ioc_count_inc(ioc_count);
 	}
 
 	return cic;
@@ -1420,15 +1227,12 @@
 	spin_unlock(cfqd->queue->queue_lock);
 }
 
-/*
- * callback from sys_ioprio_set, irqs are disabled
- */
-static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio)
+static void cfq_ioc_set_ioprio(struct io_context *ioc)
 {
 	struct cfq_io_context *cic;
 	struct rb_node *n;
 
-	spin_lock(&cfq_exit_lock);
+	ioc->ioprio_changed = 0;
 
 	n = rb_first(&ioc->cic_root);
 	while (n != NULL) {
@@ -1437,10 +1241,6 @@
 		changed_ioprio(cic);
 		n = rb_next(n);
 	}
-
-	spin_unlock(&cfq_exit_lock);
-
-	return 0;
 }
 
 static struct cfq_queue *
@@ -1460,12 +1260,18 @@
 			cfqq = new_cfqq;
 			new_cfqq = NULL;
 		} else if (gfp_mask & __GFP_WAIT) {
+			/*
+			 * Inform the allocator of the fact that we will
+			 * just repeat this allocation if it fails, to allow
+			 * the allocator to do whatever it needs to attempt to
+			 * free memory.
+			 */
 			spin_unlock_irq(cfqd->queue->queue_lock);
-			new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
+			new_cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask|__GFP_NOFAIL, cfqd->queue->node);
 			spin_lock_irq(cfqd->queue->queue_lock);
 			goto retry;
 		} else {
-			cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
+			cfqq = kmem_cache_alloc_node(cfq_pool, gfp_mask, cfqd->queue->node);
 			if (!cfqq)
 				goto out;
 		}
@@ -1480,13 +1286,13 @@
 		hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
 		atomic_set(&cfqq->ref, 0);
 		cfqq->cfqd = cfqd;
-		cfqq->service_last = 0;
 		/*
 		 * set ->slice_left to allow preemption for a new process
 		 */
 		cfqq->slice_left = 2 * cfqd->cfq_slice_idle;
 		cfq_mark_cfqq_idle_window(cfqq);
 		cfq_mark_cfqq_prio_changed(cfqq);
+		cfq_mark_cfqq_queue_new(cfqq);
 		cfq_init_prio_data(cfqq);
 	}
 
@@ -1502,12 +1308,10 @@
 static void
 cfq_drop_dead_cic(struct io_context *ioc, struct cfq_io_context *cic)
 {
-	spin_lock(&cfq_exit_lock);
+	WARN_ON(!list_empty(&cic->queue_list));
 	rb_erase(&cic->rb_node, &ioc->cic_root);
-	list_del_init(&cic->queue_list);
-	spin_unlock(&cfq_exit_lock);
 	kmem_cache_free(cfq_ioc_pool, cic);
-	atomic_dec(&ioc_count);
+	elv_ioc_count_dec(ioc_count);
 }
 
 static struct cfq_io_context *
@@ -1551,7 +1355,6 @@
 	cic->ioc = ioc;
 	cic->key = cfqd;
 
-	ioc->set_ioprio = cfq_ioc_set_ioprio;
 restart:
 	parent = NULL;
 	p = &ioc->cic_root.rb_node;
@@ -1573,11 +1376,12 @@
 			BUG();
 	}
 
-	spin_lock(&cfq_exit_lock);
 	rb_link_node(&cic->rb_node, parent, p);
 	rb_insert_color(&cic->rb_node, &ioc->cic_root);
+
+	spin_lock_irq(cfqd->queue->queue_lock);
 	list_add(&cic->queue_list, &cfqd->cic_list);
-	spin_unlock(&cfq_exit_lock);
+	spin_unlock_irq(cfqd->queue->queue_lock);
 }
 
 /*
@@ -1593,7 +1397,7 @@
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
-	ioc = get_io_context(gfp_mask);
+	ioc = get_io_context(gfp_mask, cfqd->queue->node);
 	if (!ioc)
 		return NULL;
 
@@ -1607,6 +1411,10 @@
 
 	cfq_cic_link(cfqd, ioc, cic);
 out:
+	smp_read_barrier_depends();
+	if (unlikely(ioc->ioprio_changed))
+		cfq_ioc_set_ioprio(ioc);
+
 	return cic;
 err:
 	put_io_context(ioc);
@@ -1640,15 +1448,15 @@
 
 static void
 cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
-		       struct cfq_rq *crq)
+		       struct request *rq)
 {
 	sector_t sdist;
 	u64 total;
 
-	if (cic->last_request_pos < crq->request->sector)
-		sdist = crq->request->sector - cic->last_request_pos;
+	if (cic->last_request_pos < rq->sector)
+		sdist = rq->sector - cic->last_request_pos;
 	else
-		sdist = cic->last_request_pos - crq->request->sector;
+		sdist = cic->last_request_pos - rq->sector;
 
 	/*
 	 * Don't allow the seek distance to get too large from the
@@ -1699,7 +1507,7 @@
  */
 static int
 cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
-		   struct cfq_rq *crq)
+		   struct request *rq)
 {
 	struct cfq_queue *cfqq = cfqd->active_queue;
 
@@ -1718,7 +1526,17 @@
 	 */
 	if (new_cfqq->slice_left < cfqd->cfq_slice_idle)
 		return 0;
-	if (cfq_crq_is_sync(crq) && !cfq_cfqq_sync(cfqq))
+	/*
+	 * if the new request is sync, but the currently running queue is
+	 * not, let the sync request have priority.
+	 */
+	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
+		return 1;
+	/*
+	 * So both queues are sync. Let the new request get disk time if
+	 * it's a metadata request and the current queue is doing regular IO.
+	 */
+	if (rq_is_meta(rq) && !cfqq->meta_pending)
 		return 1;
 
 	return 0;
@@ -1730,47 +1548,45 @@
  */
 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
-	struct cfq_queue *__cfqq, *next;
-
-	list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list)
-		cfq_resort_rr_list(__cfqq, 1);
+	cfq_slice_expired(cfqd, 1);
 
 	if (!cfqq->slice_left)
 		cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2;
 
-	cfqq->slice_end = cfqq->slice_left + jiffies;
-	cfq_slice_expired(cfqd, 1);
-	__cfq_set_active_queue(cfqd, cfqq);
-}
-
-/*
- * should really be a ll_rw_blk.c helper
- */
-static void cfq_start_queueing(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	request_queue_t *q = cfqd->queue;
+	/*
+	 * Put the new queue at the front of the of the current list,
+	 * so we know that it will be selected next.
+	 */
+	BUG_ON(!cfq_cfqq_on_rr(cfqq));
+	list_move(&cfqq->cfq_list, &cfqd->cur_rr);
 
-	if (!blk_queue_plugged(q))
-		q->request_fn(q);
-	else
-		__generic_unplug_device(q);
+	cfqq->slice_end = cfqq->slice_left + jiffies;
 }
 
 /*
- * Called when a new fs request (crq) is added (to cfqq). Check if there's
+ * Called when a new fs request (rq) is added (to cfqq). Check if there's
  * something we should do about it
  */
 static void
-cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-		 struct cfq_rq *crq)
+cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+		struct request *rq)
 {
-	struct cfq_io_context *cic = crq->io_context;
+	struct cfq_io_context *cic = RQ_CIC(rq);
+
+	if (rq_is_meta(rq))
+		cfqq->meta_pending++;
+
+	/*
+	 * check if this request is a better next-serve candidate)) {
+	 */
+	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);
+	BUG_ON(!cfqq->next_rq);
 
 	/*
 	 * we never wait for an async request and we don't allow preemption
 	 * of an async request. so just return early
 	 */
-	if (!cfq_crq_is_sync(crq)) {
+	if (!rq_is_sync(rq)) {
 		/*
 		 * sync process issued an async request, if it's waiting
 		 * then expire it and kick rq handling.
@@ -1778,17 +1594,17 @@
 		if (cic == cfqd->active_cic &&
 		    del_timer(&cfqd->idle_slice_timer)) {
 			cfq_slice_expired(cfqd, 0);
-			cfq_start_queueing(cfqd, cfqq);
+			blk_start_queueing(cfqd->queue);
 		}
 		return;
 	}
 
 	cfq_update_io_thinktime(cfqd, cic);
-	cfq_update_io_seektime(cfqd, cic, crq);
+	cfq_update_io_seektime(cfqd, cic, rq);
 	cfq_update_idle_window(cfqd, cfqq, cic);
 
 	cic->last_queue = jiffies;
-	cic->last_request_pos = crq->request->sector + crq->request->nr_sectors;
+	cic->last_request_pos = rq->sector + rq->nr_sectors;
 
 	if (cfqq == cfqd->active_queue) {
 		/*
@@ -1799,9 +1615,9 @@
 		if (cfq_cfqq_wait_request(cfqq)) {
 			cfq_mark_cfqq_must_dispatch(cfqq);
 			del_timer(&cfqd->idle_slice_timer);
-			cfq_start_queueing(cfqd, cfqq);
+			blk_start_queueing(cfqd->queue);
 		}
-	} else if (cfq_should_preempt(cfqd, cfqq, crq)) {
+	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
 		/*
 		 * not the active queue - expire current slice if it is
 		 * idle and has expired it's mean thinktime or this new queue
@@ -1809,34 +1625,32 @@
 		 */
 		cfq_preempt_queue(cfqd, cfqq);
 		cfq_mark_cfqq_must_dispatch(cfqq);
-		cfq_start_queueing(cfqd, cfqq);
+		blk_start_queueing(cfqd->queue);
 	}
 }
 
 static void cfq_insert_request(request_queue_t *q, struct request *rq)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct cfq_rq *crq = RQ_DATA(rq);
-	struct cfq_queue *cfqq = crq->cfq_queue;
+	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 
 	cfq_init_prio_data(cfqq);
 
-	cfq_add_crq_rb(crq);
+	cfq_add_rq_rb(rq);
 
-	list_add_tail(&rq->queuelist, &cfqq->fifo);
+	if (!cfq_cfqq_on_rr(cfqq))
+		cfq_add_cfqq_rr(cfqd, cfqq);
 
-	if (rq_mergeable(rq))
-		cfq_add_crq_hash(cfqd, crq);
+	list_add_tail(&rq->queuelist, &cfqq->fifo);
 
-	cfq_crq_enqueued(cfqd, cfqq, crq);
+	cfq_rq_enqueued(cfqd, cfqq, rq);
 }
 
 static void cfq_completed_request(request_queue_t *q, struct request *rq)
 {
-	struct cfq_rq *crq = RQ_DATA(rq);
-	struct cfq_queue *cfqq = crq->cfq_queue;
+	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 	struct cfq_data *cfqd = cfqq->cfqd;
-	const int sync = cfq_crq_is_sync(crq);
+	const int sync = rq_is_sync(rq);
 	unsigned long now;
 
 	now = jiffies;
@@ -1849,15 +1663,11 @@
 	if (!cfq_class_idle(cfqq))
 		cfqd->last_end_request = now;
 
-	if (!cfq_cfqq_dispatched(cfqq)) {
-		if (cfq_cfqq_on_rr(cfqq)) {
-			cfqq->service_last = now;
-			cfq_resort_rr_list(cfqq, 0);
-		}
-	}
+	if (!cfq_cfqq_dispatched(cfqq) && cfq_cfqq_on_rr(cfqq))
+		cfq_resort_rr_list(cfqq, 0);
 
 	if (sync)
-		crq->io_context->last_end_request = now;
+		RQ_CIC(rq)->last_end_request = now;
 
 	/*
 	 * If this is the active queue, check if it needs to be expired,
@@ -1873,30 +1683,6 @@
 	}
 }
 
-static struct request *
-cfq_former_request(request_queue_t *q, struct request *rq)
-{
-	struct cfq_rq *crq = RQ_DATA(rq);
-	struct rb_node *rbprev = rb_prev(&crq->rb_node);
-
-	if (rbprev)
-		return rb_entry_crq(rbprev)->request;
-
-	return NULL;
-}
-
-static struct request *
-cfq_latter_request(request_queue_t *q, struct request *rq)
-{
-	struct cfq_rq *crq = RQ_DATA(rq);
-	struct rb_node *rbnext = rb_next(&crq->rb_node);
-
-	if (rbnext)
-		return rb_entry_crq(rbnext)->request;
-
-	return NULL;
-}
-
 /*
  * we temporarily boost lower priority queues if they are holding fs exclusive
  * resources. they are boosted to normal prio (CLASS_BE/4)
@@ -1933,9 +1719,7 @@
 		cfq_resort_rr_list(cfqq, 0);
 }
 
-static inline int
-__cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-		struct task_struct *task, int rw)
+static inline int __cfq_may_queue(struct cfq_queue *cfqq)
 {
 	if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
 	    !cfq_cfqq_must_alloc_slice(cfqq)) {
@@ -1946,7 +1730,7 @@
 	return ELV_MQUEUE_MAY;
 }
 
-static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio)
+static int cfq_may_queue(request_queue_t *q, int rw)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct task_struct *tsk = current;
@@ -1963,48 +1747,30 @@
 		cfq_init_prio_data(cfqq);
 		cfq_prio_boost(cfqq);
 
-		return __cfq_may_queue(cfqd, cfqq, tsk, rw);
+		return __cfq_may_queue(cfqq);
 	}
 
 	return ELV_MQUEUE_MAY;
 }
 
-static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq)
-{
-	struct cfq_data *cfqd = q->elevator->elevator_data;
-
-	if (unlikely(cfqd->rq_starved)) {
-		struct request_list *rl = &q->rq;
-
-		smp_mb();
-		if (waitqueue_active(&rl->wait[READ]))
-			wake_up(&rl->wait[READ]);
-		if (waitqueue_active(&rl->wait[WRITE]))
-			wake_up(&rl->wait[WRITE]);
-	}
-}
-
 /*
  * queue lock held here
  */
 static void cfq_put_request(request_queue_t *q, struct request *rq)
 {
-	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct cfq_rq *crq = RQ_DATA(rq);
+	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 
-	if (crq) {
-		struct cfq_queue *cfqq = crq->cfq_queue;
+	if (cfqq) {
 		const int rw = rq_data_dir(rq);
 
 		BUG_ON(!cfqq->allocated[rw]);
 		cfqq->allocated[rw]--;
 
-		put_io_context(crq->io_context->ioc);
+		put_io_context(RQ_CIC(rq)->ioc);
 
-		mempool_free(crq, cfqd->crq_pool);
 		rq->elevator_private = NULL;
+		rq->elevator_private2 = NULL;
 
-		cfq_check_waiters(q, cfqq);
 		cfq_put_queue(cfqq);
 	}
 }
@@ -2013,8 +1779,7 @@
  * Allocate cfq data structures associated with this request.
  */
 static int
-cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
-		gfp_t gfp_mask)
+cfq_set_request(request_queue_t *q, struct request *rq, gfp_t gfp_mask)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct task_struct *tsk = current;
@@ -2022,7 +1787,6 @@
 	const int rw = rq_data_dir(rq);
 	pid_t key = cfq_queue_pid(tsk, rw);
 	struct cfq_queue *cfqq;
-	struct cfq_rq *crq;
 	unsigned long flags;
 	int is_sync = key != CFQ_KEY_ASYNC;
 
@@ -2046,42 +1810,18 @@
 
 	cfqq->allocated[rw]++;
 	cfq_clear_cfqq_must_alloc(cfqq);
-	cfqd->rq_starved = 0;
 	atomic_inc(&cfqq->ref);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-
-	crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
-	if (crq) {
-		RB_CLEAR_NODE(&crq->rb_node);
-		crq->rb_key = 0;
-		crq->request = rq;
-		INIT_HLIST_NODE(&crq->hash);
-		crq->cfq_queue = cfqq;
-		crq->io_context = cic;
 
-		if (is_sync)
-			cfq_mark_crq_is_sync(crq);
-		else
-			cfq_clear_crq_is_sync(crq);
+	spin_unlock_irqrestore(q->queue_lock, flags);
 
-		rq->elevator_private = crq;
-		return 0;
-	}
+	rq->elevator_private = cic;
+	rq->elevator_private2 = cfqq;
+	return 0;
 
-	spin_lock_irqsave(q->queue_lock, flags);
-	cfqq->allocated[rw]--;
-	if (!(cfqq->allocated[0] + cfqq->allocated[1]))
-		cfq_mark_cfqq_must_alloc(cfqq);
-	cfq_put_queue(cfqq);
 queue_fail:
 	if (cic)
 		put_io_context(cic->ioc);
-	/*
-	 * mark us rq allocation starved. we need to kickstart the process
-	 * ourselves if there are no pending requests that can do it for us.
-	 * that would be an extremely rare OOM situation
-	 */
-	cfqd->rq_starved = 1;
+
 	cfq_schedule_dispatch(cfqd);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 	return 1;
@@ -2090,27 +1830,10 @@
 static void cfq_kick_queue(void *data)
 {
 	request_queue_t *q = data;
-	struct cfq_data *cfqd = q->elevator->elevator_data;
 	unsigned long flags;
 
 	spin_lock_irqsave(q->queue_lock, flags);
-
-	if (cfqd->rq_starved) {
-		struct request_list *rl = &q->rq;
-
-		/*
-		 * we aren't guaranteed to get a request after this, but we
-		 * have to be opportunistic
-		 */
-		smp_mb();
-		if (waitqueue_active(&rl->wait[READ]))
-			wake_up(&rl->wait[READ]);
-		if (waitqueue_active(&rl->wait[WRITE]))
-			wake_up(&rl->wait[WRITE]);
-	}
-
-	blk_remove_plug(q);
-	q->request_fn(q);
+	blk_start_queueing(q);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
@@ -2193,7 +1916,6 @@
 
 	cfq_shutdown_timer_wq(cfqd);
 
-	spin_lock(&cfq_exit_lock);
 	spin_lock_irq(q->queue_lock);
 
 	if (cfqd->active_queue)
@@ -2203,25 +1925,14 @@
 		struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
 							struct cfq_io_context,
 							queue_list);
-		if (cic->cfqq[ASYNC]) {
-			cfq_put_queue(cic->cfqq[ASYNC]);
-			cic->cfqq[ASYNC] = NULL;
-		}
-		if (cic->cfqq[SYNC]) {
-			cfq_put_queue(cic->cfqq[SYNC]);
-			cic->cfqq[SYNC] = NULL;
-		}
-		cic->key = NULL;
-		list_del_init(&cic->queue_list);
+
+		__cfq_exit_single_io_context(cfqd, cic);
 	}
 
 	spin_unlock_irq(q->queue_lock);
-	spin_unlock(&cfq_exit_lock);
 
 	cfq_shutdown_timer_wq(cfqd);
 
-	mempool_destroy(cfqd->crq_pool);
-	kfree(cfqd->crq_hash);
 	kfree(cfqd->cfq_hash);
 	kfree(cfqd);
 }
@@ -2231,7 +1942,7 @@
 	struct cfq_data *cfqd;
 	int i;
 
-	cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
+	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
 	if (!cfqd)
 		return NULL;
 
@@ -2243,23 +1954,12 @@
 	INIT_LIST_HEAD(&cfqd->busy_rr);
 	INIT_LIST_HEAD(&cfqd->cur_rr);
 	INIT_LIST_HEAD(&cfqd->idle_rr);
-	INIT_LIST_HEAD(&cfqd->empty_list);
 	INIT_LIST_HEAD(&cfqd->cic_list);
 
-	cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
-	if (!cfqd->crq_hash)
-		goto out_crqhash;
-
-	cfqd->cfq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL);
+	cfqd->cfq_hash = kmalloc_node(sizeof(struct hlist_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL, q->node);
 	if (!cfqd->cfq_hash)
-		goto out_cfqhash;
-
-	cfqd->crq_pool = mempool_create_slab_pool(BLKDEV_MIN_RQ, crq_pool);
-	if (!cfqd->crq_pool)
-		goto out_crqpool;
+		goto out_free;
 
-	for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
-		INIT_HLIST_HEAD(&cfqd->crq_hash[i]);
 	for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
 		INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
 
@@ -2275,7 +1975,6 @@
 
 	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q);
 
-	cfqd->cfq_queued = cfq_queued;
 	cfqd->cfq_quantum = cfq_quantum;
 	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
 	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
@@ -2287,19 +1986,13 @@
 	cfqd->cfq_slice_idle = cfq_slice_idle;
 
 	return cfqd;
-out_crqpool:
-	kfree(cfqd->cfq_hash);
-out_cfqhash:
-	kfree(cfqd->crq_hash);
-out_crqhash:
+out_free:
 	kfree(cfqd);
 	return NULL;
 }
 
 static void cfq_slab_kill(void)
 {
-	if (crq_pool)
-		kmem_cache_destroy(crq_pool);
 	if (cfq_pool)
 		kmem_cache_destroy(cfq_pool);
 	if (cfq_ioc_pool)
@@ -2308,11 +2001,6 @@
 
 static int __init cfq_slab_setup(void)
 {
-	crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
-					NULL, NULL);
-	if (!crq_pool)
-		goto fail;
-
 	cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
 					NULL, NULL);
 	if (!cfq_pool)
@@ -2358,7 +2046,6 @@
 	return cfq_var_show(__data, (page));				\
 }
 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
-SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0);
 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
 SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
@@ -2386,7 +2073,6 @@
 	return ret;							\
 }
 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
-STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0);
 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
@@ -2402,7 +2088,6 @@
 
 static struct elv_fs_entry cfq_attrs[] = {
 	CFQ_ATTR(quantum),
-	CFQ_ATTR(queued),
 	CFQ_ATTR(fifo_expire_sync),
 	CFQ_ATTR(fifo_expire_async),
 	CFQ_ATTR(back_seek_max),
@@ -2425,14 +2110,14 @@
 		.elevator_deactivate_req_fn =	cfq_deactivate_request,
 		.elevator_queue_empty_fn =	cfq_queue_empty,
 		.elevator_completed_req_fn =	cfq_completed_request,
-		.elevator_former_req_fn =	cfq_former_request,
-		.elevator_latter_req_fn =	cfq_latter_request,
+		.elevator_former_req_fn =	elv_rb_former_request,
+		.elevator_latter_req_fn =	elv_rb_latter_request,
 		.elevator_set_req_fn =		cfq_set_request,
 		.elevator_put_req_fn =		cfq_put_request,
 		.elevator_may_queue_fn =	cfq_may_queue,
 		.elevator_init_fn =		cfq_init_queue,
 		.elevator_exit_fn =		cfq_exit_queue,
-		.trim =				cfq_trim,
+		.trim =				cfq_free_io_context,
 	},
 	.elevator_attrs =	cfq_attrs,
 	.elevator_name =	"cfq",
@@ -2468,7 +2153,7 @@
 	ioc_gone = &all_gone;
 	/* ioc_gone's update must be visible before reading ioc_count */
 	smp_wmb();
-	if (atomic_read(&ioc_count))
+	if (elv_ioc_count_read(ioc_count))
 		wait_for_completion(ioc_gone);
 	synchronize_rcu();
 	cfq_slab_kill();
diff -urN oldtree/block/deadline-iosched.c newtree/block/deadline-iosched.c
--- oldtree/block/deadline-iosched.c	2006-07-15 14:53:08.000000000 -0700
+++ newtree/block/deadline-iosched.c	2006-08-03 13:27:10.000000000 -0700
@@ -12,7 +12,6 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/compiler.h>
-#include <linux/hash.h>
 #include <linux/rbtree.h>
 
 /*
@@ -24,13 +23,6 @@
 static const int fifo_batch = 16;       /* # of sequential requests treated as one
 				     by the above parameters. For throughput. */
 
-static const int deadline_hash_shift = 5;
-#define DL_HASH_BLOCK(sec)	((sec) >> 3)
-#define DL_HASH_FN(sec)		(hash_long(DL_HASH_BLOCK((sec)), deadline_hash_shift))
-#define DL_HASH_ENTRIES		(1 << deadline_hash_shift)
-#define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
-#define ON_HASH(drq)		(!hlist_unhashed(&(drq)->hash))
-
 struct deadline_data {
 	/*
 	 * run time data
@@ -45,8 +37,7 @@
 	/*
 	 * next in sort order. read, write or both are NULL
 	 */
-	struct deadline_rq *next_drq[2];
-	struct hlist_head *hash;	/* request hash */
+	struct request *next_rq[2];
 	unsigned int batching;		/* number of sequential requests made */
 	sector_t last_sector;		/* head position */
 	unsigned int starved;		/* times reads have starved writes */
@@ -58,240 +49,69 @@
 	int fifo_batch;
 	int writes_starved;
 	int front_merges;
-
-	mempool_t *drq_pool;
-};
-
-/*
- * pre-request data.
- */
-struct deadline_rq {
-	/*
-	 * rbtree index, key is the starting offset
-	 */
-	struct rb_node rb_node;
-	sector_t rb_key;
-
-	struct request *request;
-
-	/*
-	 * request hash, key is the ending offset (for back merge lookup)
-	 */
-	struct hlist_node hash;
-
-	/*
-	 * expire fifo
-	 */
-	struct list_head fifo;
-	unsigned long expires;
 };
 
-static void deadline_move_request(struct deadline_data *dd, struct deadline_rq *drq);
-
-static kmem_cache_t *drq_pool;
-
-#define RQ_DATA(rq)	((struct deadline_rq *) (rq)->elevator_private)
-
-/*
- * the back merge hash support functions
- */
-static inline void __deadline_del_drq_hash(struct deadline_rq *drq)
-{
-	hlist_del_init(&drq->hash);
-}
-
-static inline void deadline_del_drq_hash(struct deadline_rq *drq)
-{
-	if (ON_HASH(drq))
-		__deadline_del_drq_hash(drq);
-}
-
-static inline void
-deadline_add_drq_hash(struct deadline_data *dd, struct deadline_rq *drq)
-{
-	struct request *rq = drq->request;
-
-	BUG_ON(ON_HASH(drq));
-
-	hlist_add_head(&drq->hash, &dd->hash[DL_HASH_FN(rq_hash_key(rq))]);
-}
-
-/*
- * move hot entry to front of chain
- */
-static inline void
-deadline_hot_drq_hash(struct deadline_data *dd, struct deadline_rq *drq)
-{
-	struct request *rq = drq->request;
-	struct hlist_head *head = &dd->hash[DL_HASH_FN(rq_hash_key(rq))];
-
-	if (ON_HASH(drq) && &drq->hash != head->first) {
-		hlist_del(&drq->hash);
-		hlist_add_head(&drq->hash, head);
-	}
-}
-
-static struct request *
-deadline_find_drq_hash(struct deadline_data *dd, sector_t offset)
-{
-	struct hlist_head *hash_list = &dd->hash[DL_HASH_FN(offset)];
-	struct hlist_node *entry, *next;
-	struct deadline_rq *drq;
-
-	hlist_for_each_entry_safe(drq, entry, next, hash_list, hash) {
-		struct request *__rq = drq->request;
-
-		BUG_ON(!ON_HASH(drq));
-
-		if (!rq_mergeable(__rq)) {
-			__deadline_del_drq_hash(drq);
-			continue;
-		}
-
-		if (rq_hash_key(__rq) == offset)
-			return __rq;
-	}
-
-	return NULL;
-}
-
-/*
- * rb tree support functions
- */
-#define rb_entry_drq(node)	rb_entry((node), struct deadline_rq, rb_node)
-#define DRQ_RB_ROOT(dd, drq)	(&(dd)->sort_list[rq_data_dir((drq)->request)])
-#define rq_rb_key(rq)		(rq)->sector
-
-static struct deadline_rq *
-__deadline_add_drq_rb(struct deadline_data *dd, struct deadline_rq *drq)
-{
-	struct rb_node **p = &DRQ_RB_ROOT(dd, drq)->rb_node;
-	struct rb_node *parent = NULL;
-	struct deadline_rq *__drq;
-
-	while (*p) {
-		parent = *p;
-		__drq = rb_entry_drq(parent);
-
-		if (drq->rb_key < __drq->rb_key)
-			p = &(*p)->rb_left;
-		else if (drq->rb_key > __drq->rb_key)
-			p = &(*p)->rb_right;
-		else
-			return __drq;
-	}
+static void deadline_move_request(struct deadline_data *, struct request *);
 
-	rb_link_node(&drq->rb_node, parent, p);
-	return NULL;
-}
+#define RQ_RB_ROOT(dd, rq)	(&(dd)->sort_list[rq_data_dir((rq))])
 
 static void
-deadline_add_drq_rb(struct deadline_data *dd, struct deadline_rq *drq)
+deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
 {
-	struct deadline_rq *__alias;
-
-	drq->rb_key = rq_rb_key(drq->request);
+	struct rb_root *root = RQ_RB_ROOT(dd, rq);
+	struct request *__alias;
 
 retry:
-	__alias = __deadline_add_drq_rb(dd, drq);
-	if (!__alias) {
-		rb_insert_color(&drq->rb_node, DRQ_RB_ROOT(dd, drq));
-		return;
+	__alias = elv_rb_add(root, rq);
+	if (unlikely(__alias)) {
+		deadline_move_request(dd, __alias);
+		goto retry;
 	}
-
-	deadline_move_request(dd, __alias);
-	goto retry;
 }
 
 static inline void
-deadline_del_drq_rb(struct deadline_data *dd, struct deadline_rq *drq)
+deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
 {
-	const int data_dir = rq_data_dir(drq->request);
+	const int data_dir = rq_data_dir(rq);
 
-	if (dd->next_drq[data_dir] == drq) {
-		struct rb_node *rbnext = rb_next(&drq->rb_node);
+	if (dd->next_rq[data_dir] == rq) {
+		struct rb_node *rbnext = rb_next(&rq->rb_node);
 
-		dd->next_drq[data_dir] = NULL;
+		dd->next_rq[data_dir] = NULL;
 		if (rbnext)
-			dd->next_drq[data_dir] = rb_entry_drq(rbnext);
+			dd->next_rq[data_dir] = rb_entry_rq(rbnext);
 	}
 
-	BUG_ON(!RB_EMPTY_NODE(&drq->rb_node));
-	rb_erase(&drq->rb_node, DRQ_RB_ROOT(dd, drq));
-	RB_CLEAR_NODE(&drq->rb_node);
-}
-
-static struct request *
-deadline_find_drq_rb(struct deadline_data *dd, sector_t sector, int data_dir)
-{
-	struct rb_node *n = dd->sort_list[data_dir].rb_node;
-	struct deadline_rq *drq;
-
-	while (n) {
-		drq = rb_entry_drq(n);
-
-		if (sector < drq->rb_key)
-			n = n->rb_left;
-		else if (sector > drq->rb_key)
-			n = n->rb_right;
-		else
-			return drq->request;
-	}
-
-	return NULL;
+	elv_rb_del(RQ_RB_ROOT(dd, rq), rq);
 }
 
 /*
- * deadline_find_first_drq finds the first (lowest sector numbered) request
- * for the specified data_dir. Used to sweep back to the start of the disk
- * (1-way elevator) after we process the last (highest sector) request.
- */
-static struct deadline_rq *
-deadline_find_first_drq(struct deadline_data *dd, int data_dir)
-{
-	struct rb_node *n = dd->sort_list[data_dir].rb_node;
-
-	for (;;) {
-		if (n->rb_left == NULL)
-			return rb_entry_drq(n);
-		
-		n = n->rb_left;
-	}
-}
-
-/*
- * add drq to rbtree and fifo
+ * add rq to rbtree and fifo
  */
 static void
 deadline_add_request(struct request_queue *q, struct request *rq)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
-	struct deadline_rq *drq = RQ_DATA(rq);
+	const int data_dir = rq_data_dir(rq);
 
-	const int data_dir = rq_data_dir(drq->request);
+	deadline_add_rq_rb(dd, rq);
 
-	deadline_add_drq_rb(dd, drq);
 	/*
 	 * set expire time (only used for reads) and add to fifo list
 	 */
-	drq->expires = jiffies + dd->fifo_expire[data_dir];
-	list_add_tail(&drq->fifo, &dd->fifo_list[data_dir]);
-
-	if (rq_mergeable(rq))
-		deadline_add_drq_hash(dd, drq);
+	rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]);
+	list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
 }
 
 /*
- * remove rq from rbtree, fifo, and hash
+ * remove rq from rbtree and fifo.
  */
 static void deadline_remove_request(request_queue_t *q, struct request *rq)
 {
-	struct deadline_rq *drq = RQ_DATA(rq);
 	struct deadline_data *dd = q->elevator->elevator_data;
 
-	list_del_init(&drq->fifo);
-	deadline_del_drq_rb(dd, drq);
-	deadline_del_drq_hash(drq);
+	rq_fifo_clear(rq);
+	deadline_del_rq_rb(dd, rq);
 }
 
 static int
@@ -302,27 +122,14 @@
 	int ret;
 
 	/*
-	 * see if the merge hash can satisfy a back merge
-	 */
-	__rq = deadline_find_drq_hash(dd, bio->bi_sector);
-	if (__rq) {
-		BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
-
-		if (elv_rq_merge_ok(__rq, bio)) {
-			ret = ELEVATOR_BACK_MERGE;
-			goto out;
-		}
-	}
-
-	/*
 	 * check for front merge
 	 */
 	if (dd->front_merges) {
-		sector_t rb_key = bio->bi_sector + bio_sectors(bio);
+		sector_t sector = bio->bi_sector + bio_sectors(bio);
 
-		__rq = deadline_find_drq_rb(dd, rb_key, bio_data_dir(bio));
+		__rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
 		if (__rq) {
-			BUG_ON(rb_key != rq_rb_key(__rq));
+			BUG_ON(sector != __rq->sector);
 
 			if (elv_rq_merge_ok(__rq, bio)) {
 				ret = ELEVATOR_FRONT_MERGE;
@@ -333,29 +140,21 @@
 
 	return ELEVATOR_NO_MERGE;
 out:
-	if (ret)
-		deadline_hot_drq_hash(dd, RQ_DATA(__rq));
 	*req = __rq;
 	return ret;
 }
 
-static void deadline_merged_request(request_queue_t *q, struct request *req)
+static void deadline_merged_request(request_queue_t *q, struct request *req,
+				    int type)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
-	struct deadline_rq *drq = RQ_DATA(req);
-
-	/*
-	 * hash always needs to be repositioned, key is end sector
-	 */
-	deadline_del_drq_hash(drq);
-	deadline_add_drq_hash(dd, drq);
 
 	/*
 	 * if the merge was a front merge, we need to reposition request
 	 */
-	if (rq_rb_key(req) != drq->rb_key) {
-		deadline_del_drq_rb(dd, drq);
-		deadline_add_drq_rb(dd, drq);
+	if (type == ELEVATOR_FRONT_MERGE) {
+		elv_rb_del(RQ_RB_ROOT(dd, req), req);
+		deadline_add_rq_rb(dd, req);
 	}
 }
 
@@ -363,33 +162,14 @@
 deadline_merged_requests(request_queue_t *q, struct request *req,
 			 struct request *next)
 {
-	struct deadline_data *dd = q->elevator->elevator_data;
-	struct deadline_rq *drq = RQ_DATA(req);
-	struct deadline_rq *dnext = RQ_DATA(next);
-
-	BUG_ON(!drq);
-	BUG_ON(!dnext);
-
-	/*
-	 * reposition drq (this is the merged request) in hash, and in rbtree
-	 * in case of a front merge
-	 */
-	deadline_del_drq_hash(drq);
-	deadline_add_drq_hash(dd, drq);
-
-	if (rq_rb_key(req) != drq->rb_key) {
-		deadline_del_drq_rb(dd, drq);
-		deadline_add_drq_rb(dd, drq);
-	}
-
 	/*
-	 * if dnext expires before drq, assign its expire time to drq
-	 * and move into dnext position (dnext will be deleted) in fifo
+	 * if next expires before rq, assign its expire time to rq
+	 * and move into next position (next will be deleted) in fifo
 	 */
-	if (!list_empty(&drq->fifo) && !list_empty(&dnext->fifo)) {
-		if (time_before(dnext->expires, drq->expires)) {
-			list_move(&drq->fifo, &dnext->fifo);
-			drq->expires = dnext->expires;
+	if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
+		if (time_before(rq_fifo_time(next), rq_fifo_time(req))) {
+			list_move(&req->queuelist, &next->queuelist);
+			rq_set_fifo_time(req, rq_fifo_time(next));
 		}
 	}
 
@@ -403,52 +183,50 @@
  * move request from sort list to dispatch queue.
  */
 static inline void
-deadline_move_to_dispatch(struct deadline_data *dd, struct deadline_rq *drq)
+deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
 {
-	request_queue_t *q = drq->request->q;
+	request_queue_t *q = rq->q;
 
-	deadline_remove_request(q, drq->request);
-	elv_dispatch_add_tail(q, drq->request);
+	deadline_remove_request(q, rq);
+	elv_dispatch_add_tail(q, rq);
 }
 
 /*
  * move an entry to dispatch queue
  */
 static void
-deadline_move_request(struct deadline_data *dd, struct deadline_rq *drq)
+deadline_move_request(struct deadline_data *dd, struct request *rq)
 {
-	const int data_dir = rq_data_dir(drq->request);
-	struct rb_node *rbnext = rb_next(&drq->rb_node);
+	const int data_dir = rq_data_dir(rq);
+	struct rb_node *rbnext = rb_next(&rq->rb_node);
 
-	dd->next_drq[READ] = NULL;
-	dd->next_drq[WRITE] = NULL;
+	dd->next_rq[READ] = NULL;
+	dd->next_rq[WRITE] = NULL;
 
 	if (rbnext)
-		dd->next_drq[data_dir] = rb_entry_drq(rbnext);
+		dd->next_rq[data_dir] = rb_entry_rq(rbnext);
 	
-	dd->last_sector = drq->request->sector + drq->request->nr_sectors;
+	dd->last_sector = rq->sector + rq->nr_sectors;
 
 	/*
 	 * take it off the sort and fifo list, move
 	 * to dispatch queue
 	 */
-	deadline_move_to_dispatch(dd, drq);
+	deadline_move_to_dispatch(dd, rq);
 }
 
-#define list_entry_fifo(ptr)	list_entry((ptr), struct deadline_rq, fifo)
-
 /*
  * deadline_check_fifo returns 0 if there are no expired reads on the fifo,
  * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
  */
 static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
 {
-	struct deadline_rq *drq = list_entry_fifo(dd->fifo_list[ddir].next);
+	struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
 
 	/*
-	 * drq is expired!
+	 * rq is expired!
 	 */
-	if (time_after(jiffies, drq->expires))
+	if (time_after(jiffies, rq_fifo_time(rq)))
 		return 1;
 
 	return 0;
@@ -463,21 +241,21 @@
 	struct deadline_data *dd = q->elevator->elevator_data;
 	const int reads = !list_empty(&dd->fifo_list[READ]);
 	const int writes = !list_empty(&dd->fifo_list[WRITE]);
-	struct deadline_rq *drq;
+	struct request *rq;
 	int data_dir;
 
 	/*
 	 * batches are currently reads XOR writes
 	 */
-	if (dd->next_drq[WRITE])
-		drq = dd->next_drq[WRITE];
+	if (dd->next_rq[WRITE])
+		rq = dd->next_rq[WRITE];
 	else
-		drq = dd->next_drq[READ];
+		rq = dd->next_rq[READ];
 
-	if (drq) {
+	if (rq) {
 		/* we have a "next request" */
 		
-		if (dd->last_sector != drq->request->sector)
+		if (dd->last_sector != rq->sector)
 			/* end the batch on a non sequential request */
 			dd->batching += dd->fifo_batch;
 		
@@ -526,30 +304,33 @@
 	if (deadline_check_fifo(dd, data_dir)) {
 		/* An expired request exists - satisfy it */
 		dd->batching = 0;
-		drq = list_entry_fifo(dd->fifo_list[data_dir].next);
+		rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
 		
-	} else if (dd->next_drq[data_dir]) {
+	} else if (dd->next_rq[data_dir]) {
 		/*
 		 * The last req was the same dir and we have a next request in
 		 * sort order. No expired requests so continue on from here.
 		 */
-		drq = dd->next_drq[data_dir];
+		rq = dd->next_rq[data_dir];
 	} else {
+		struct rb_node *node;
 		/*
 		 * The last req was the other direction or we have run out of
 		 * higher-sectored requests. Go back to the lowest sectored
 		 * request (1 way elevator) and start a new batch.
 		 */
 		dd->batching = 0;
-		drq = deadline_find_first_drq(dd, data_dir);
+		node = rb_first(&dd->sort_list[data_dir]);
+		if (node)
+			rq = rb_entry_rq(node);
 	}
 
 dispatch_request:
 	/*
-	 * drq is the selected appropriate request.
+	 * rq is the selected appropriate request.
 	 */
 	dd->batching++;
-	deadline_move_request(dd, drq);
+	deadline_move_request(dd, rq);
 
 	return 1;
 }
@@ -562,30 +343,6 @@
 		&& list_empty(&dd->fifo_list[READ]);
 }
 
-static struct request *
-deadline_former_request(request_queue_t *q, struct request *rq)
-{
-	struct deadline_rq *drq = RQ_DATA(rq);
-	struct rb_node *rbprev = rb_prev(&drq->rb_node);
-
-	if (rbprev)
-		return rb_entry_drq(rbprev)->request;
-
-	return NULL;
-}
-
-static struct request *
-deadline_latter_request(request_queue_t *q, struct request *rq)
-{
-	struct deadline_rq *drq = RQ_DATA(rq);
-	struct rb_node *rbnext = rb_next(&drq->rb_node);
-
-	if (rbnext)
-		return rb_entry_drq(rbnext)->request;
-
-	return NULL;
-}
-
 static void deadline_exit_queue(elevator_t *e)
 {
 	struct deadline_data *dd = e->elevator_data;
@@ -593,46 +350,21 @@
 	BUG_ON(!list_empty(&dd->fifo_list[READ]));
 	BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
 
-	mempool_destroy(dd->drq_pool);
-	kfree(dd->hash);
 	kfree(dd);
 }
 
 /*
- * initialize elevator private data (deadline_data), and alloc a drq for
- * each request on the free lists
+ * initialize elevator private data (deadline_data).
  */
 static void *deadline_init_queue(request_queue_t *q, elevator_t *e)
 {
 	struct deadline_data *dd;
-	int i;
-
-	if (!drq_pool)
-		return NULL;
 
 	dd = kmalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
 	if (!dd)
 		return NULL;
 	memset(dd, 0, sizeof(*dd));
 
-	dd->hash = kmalloc_node(sizeof(struct hlist_head)*DL_HASH_ENTRIES,
-				GFP_KERNEL, q->node);
-	if (!dd->hash) {
-		kfree(dd);
-		return NULL;
-	}
-
-	dd->drq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
-					mempool_free_slab, drq_pool, q->node);
-	if (!dd->drq_pool) {
-		kfree(dd->hash);
-		kfree(dd);
-		return NULL;
-	}
-
-	for (i = 0; i < DL_HASH_ENTRIES; i++)
-		INIT_HLIST_HEAD(&dd->hash[i]);
-
 	INIT_LIST_HEAD(&dd->fifo_list[READ]);
 	INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
 	dd->sort_list[READ] = RB_ROOT;
@@ -645,39 +377,6 @@
 	return dd;
 }
 
-static void deadline_put_request(request_queue_t *q, struct request *rq)
-{
-	struct deadline_data *dd = q->elevator->elevator_data;
-	struct deadline_rq *drq = RQ_DATA(rq);
-
-	mempool_free(drq, dd->drq_pool);
-	rq->elevator_private = NULL;
-}
-
-static int
-deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
-		     gfp_t gfp_mask)
-{
-	struct deadline_data *dd = q->elevator->elevator_data;
-	struct deadline_rq *drq;
-
-	drq = mempool_alloc(dd->drq_pool, gfp_mask);
-	if (drq) {
-		memset(drq, 0, sizeof(*drq));
-		RB_CLEAR_NODE(&drq->rb_node);
-		drq->request = rq;
-
-		INIT_HLIST_NODE(&drq->hash);
-
-		INIT_LIST_HEAD(&drq->fifo);
-
-		rq->elevator_private = drq;
-		return 0;
-	}
-
-	return 1;
-}
-
 /*
  * sysfs parts below
  */
@@ -757,10 +456,8 @@
 		.elevator_dispatch_fn =		deadline_dispatch_requests,
 		.elevator_add_req_fn =		deadline_add_request,
 		.elevator_queue_empty_fn =	deadline_queue_empty,
-		.elevator_former_req_fn =	deadline_former_request,
-		.elevator_latter_req_fn =	deadline_latter_request,
-		.elevator_set_req_fn =		deadline_set_request,
-		.elevator_put_req_fn = 		deadline_put_request,
+		.elevator_former_req_fn =	elv_rb_former_request,
+		.elevator_latter_req_fn =	elv_rb_latter_request,
 		.elevator_init_fn =		deadline_init_queue,
 		.elevator_exit_fn =		deadline_exit_queue,
 	},
@@ -772,24 +469,11 @@
 
 static int __init deadline_init(void)
 {
-	int ret;
-
-	drq_pool = kmem_cache_create("deadline_drq", sizeof(struct deadline_rq),
-				     0, 0, NULL, NULL);
-
-	if (!drq_pool)
-		return -ENOMEM;
-
-	ret = elv_register(&iosched_deadline);
-	if (ret)
-		kmem_cache_destroy(drq_pool);
-
-	return ret;
+	return elv_register(&iosched_deadline);
 }
 
 static void __exit deadline_exit(void)
 {
-	kmem_cache_destroy(drq_pool);
 	elv_unregister(&iosched_deadline);
 }
 
diff -urN oldtree/block/elevator.c newtree/block/elevator.c
--- oldtree/block/elevator.c	2006-07-15 14:53:08.000000000 -0700
+++ newtree/block/elevator.c	2006-08-03 13:27:10.000000000 -0700
@@ -33,6 +33,7 @@
 #include <linux/compiler.h>
 #include <linux/delay.h>
 #include <linux/blktrace_api.h>
+#include <linux/hash.h>
 
 #include <asm/uaccess.h>
 
@@ -40,6 +41,16 @@
 static LIST_HEAD(elv_list);
 
 /*
+ * Merge hash stuff.
+ */
+static const int elv_hash_shift = 6;
+#define ELV_HASH_BLOCK(sec)	((sec) >> 3)
+#define ELV_HASH_FN(sec)	(hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift))
+#define ELV_HASH_ENTRIES	(1 << elv_hash_shift)
+#define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
+#define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))
+
+/*
  * can we safely merge with this request?
  */
 inline int elv_rq_merge_ok(struct request *rq, struct bio *bio)
@@ -56,8 +67,7 @@
 	/*
 	 * same device and no special stuff set, merge is ok
 	 */
-	if (rq->rq_disk == bio->bi_bdev->bd_disk &&
-	    !rq->waiting && !rq->special)
+	if (rq->rq_disk == bio->bi_bdev->bd_disk && !rq->special)
 		return 1;
 
 	return 0;
@@ -151,27 +161,44 @@
 
 static struct kobj_type elv_ktype;
 
-static elevator_t *elevator_alloc(struct elevator_type *e)
+static elevator_t *elevator_alloc(request_queue_t *q, struct elevator_type *e)
 {
-	elevator_t *eq = kmalloc(sizeof(elevator_t), GFP_KERNEL);
-	if (eq) {
-		memset(eq, 0, sizeof(*eq));
-		eq->ops = &e->ops;
-		eq->elevator_type = e;
-		kobject_init(&eq->kobj);
-		snprintf(eq->kobj.name, KOBJ_NAME_LEN, "%s", "iosched");
-		eq->kobj.ktype = &elv_ktype;
-		mutex_init(&eq->sysfs_lock);
-	} else {
-		elevator_put(e);
-	}
+	elevator_t *eq;
+	int i;
+
+	eq = kmalloc_node(sizeof(elevator_t), GFP_KERNEL, q->node);
+	if (unlikely(!eq))
+		goto err;
+
+	memset(eq, 0, sizeof(*eq));
+	eq->ops = &e->ops;
+	eq->elevator_type = e;
+	kobject_init(&eq->kobj);
+	snprintf(eq->kobj.name, KOBJ_NAME_LEN, "%s", "iosched");
+	eq->kobj.ktype = &elv_ktype;
+	mutex_init(&eq->sysfs_lock);
+
+	eq->hash = kmalloc_node(sizeof(struct hlist_head) * ELV_HASH_ENTRIES,
+					GFP_KERNEL, q->node);
+	if (!eq->hash)
+		goto err;
+
+	for (i = 0; i < ELV_HASH_ENTRIES; i++)
+		INIT_HLIST_HEAD(&eq->hash[i]);
+
 	return eq;
+err:
+	kfree(eq);
+	elevator_put(e);
+	return NULL;
 }
 
 static void elevator_release(struct kobject *kobj)
 {
 	elevator_t *e = container_of(kobj, elevator_t, kobj);
+
 	elevator_put(e->elevator_type);
+	kfree(e->hash);
 	kfree(e);
 }
 
@@ -198,7 +225,7 @@
 		e = elevator_get("noop");
 	}
 
-	eq = elevator_alloc(e);
+	eq = elevator_alloc(q, e);
 	if (!eq)
 		return -ENOMEM;
 
@@ -212,6 +239,8 @@
 	return ret;
 }
 
+EXPORT_SYMBOL(elevator_init);
+
 void elevator_exit(elevator_t *e)
 {
 	mutex_lock(&e->sysfs_lock);
@@ -223,10 +252,118 @@
 	kobject_put(&e->kobj);
 }
 
+EXPORT_SYMBOL(elevator_exit);
+
+static inline void __elv_rqhash_del(struct request *rq)
+{
+	hlist_del_init(&rq->hash);
+}
+
+static void elv_rqhash_del(request_queue_t *q, struct request *rq)
+{
+	if (ELV_ON_HASH(rq))
+		__elv_rqhash_del(rq);
+}
+
+static void elv_rqhash_add(request_queue_t *q, struct request *rq)
+{
+	elevator_t *e = q->elevator;
+
+	BUG_ON(ELV_ON_HASH(rq));
+	hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]);
+}
+
+static void elv_rqhash_reposition(request_queue_t *q, struct request *rq)
+{
+	__elv_rqhash_del(rq);
+	elv_rqhash_add(q, rq);
+}
+
+static struct request *elv_rqhash_find(request_queue_t *q, sector_t offset)
+{
+	elevator_t *e = q->elevator;
+	struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)];
+	struct hlist_node *entry, *next;
+	struct request *rq;
+
+	hlist_for_each_entry_safe(rq, entry, next, hash_list, hash) {
+		BUG_ON(!ELV_ON_HASH(rq));
+
+		if (unlikely(!rq_mergeable(rq))) {
+			__elv_rqhash_del(rq);
+			continue;
+		}
+
+		if (rq_hash_key(rq) == offset)
+			return rq;
+	}
+
+	return NULL;
+}
+
+/*
+ * RB-tree support functions for inserting/lookup/removal of requests
+ * in a sorted RB tree.
+ */
+struct request *elv_rb_add(struct rb_root *root, struct request *rq)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct request *__rq;
+
+	while (*p) {
+		parent = *p;
+		__rq = rb_entry(parent, struct request, rb_node);
+
+		if (rq->sector < __rq->sector)
+			p = &(*p)->rb_left;
+		else if (rq->sector > __rq->sector)
+			p = &(*p)->rb_right;
+		else
+			return __rq;
+	}
+
+	rb_link_node(&rq->rb_node, parent, p);
+	rb_insert_color(&rq->rb_node, root);
+	return NULL;
+}
+
+EXPORT_SYMBOL(elv_rb_add);
+
+void elv_rb_del(struct rb_root *root, struct request *rq)
+{
+	BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
+	rb_erase(&rq->rb_node, root);
+	RB_CLEAR_NODE(&rq->rb_node);
+}
+
+EXPORT_SYMBOL(elv_rb_del);
+
+struct request *elv_rb_find(struct rb_root *root, sector_t sector)
+{
+	struct rb_node *n = root->rb_node;
+	struct request *rq;
+
+	while (n) {
+		rq = rb_entry(n, struct request, rb_node);
+
+		if (sector < rq->sector)
+			n = n->rb_left;
+		else if (sector > rq->sector)
+			n = n->rb_right;
+		else
+			return rq;
+	}
+
+	return NULL;
+}
+
+EXPORT_SYMBOL(elv_rb_find);
+
 /*
  * Insert rq into dispatch queue of q.  Queue lock must be held on
- * entry.  If sort != 0, rq is sort-inserted; otherwise, rq will be
- * appended to the dispatch queue.  To be used by specific elevators.
+ * entry.  rq is sort insted into the dispatch queue. To be used by
+ * specific elevators.
  */
 void elv_dispatch_sort(request_queue_t *q, struct request *rq)
 {
@@ -235,6 +372,9 @@
 
 	if (q->last_merge == rq)
 		q->last_merge = NULL;
+
+	elv_rqhash_del(q, rq);
+
 	q->nr_sorted--;
 
 	boundary = q->end_sector;
@@ -258,11 +398,38 @@
 	list_add(&rq->queuelist, entry);
 }
 
+EXPORT_SYMBOL(elv_dispatch_sort);
+
+/*
+ * Insert rq into dispatch queue of q.  Queue lock must be held on
+ * entry.  rq is added to the back of the dispatch queue. To be used by
+ * specific elevators.
+ */
+void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
+{
+	if (q->last_merge == rq)
+		q->last_merge = NULL;
+
+	elv_rqhash_del(q, rq);
+
+	q->nr_sorted--;
+
+	q->end_sector = rq_end_sector(rq);
+	q->boundary_rq = rq;
+	list_add_tail(&rq->queuelist, &q->queue_head);
+}
+
+EXPORT_SYMBOL(elv_dispatch_add_tail);
+
 int elv_merge(request_queue_t *q, struct request **req, struct bio *bio)
 {
 	elevator_t *e = q->elevator;
+	struct request *__rq;
 	int ret;
 
+	/*
+	 * First try one-hit cache.
+	 */
 	if (q->last_merge) {
 		ret = elv_try_merge(q->last_merge, bio);
 		if (ret != ELEVATOR_NO_MERGE) {
@@ -271,18 +438,30 @@
 		}
 	}
 
+	/*
+	 * See if our hash lookup can find a potential backmerge.
+	 */
+	__rq = elv_rqhash_find(q, bio->bi_sector);
+	if (__rq && elv_rq_merge_ok(__rq, bio)) {
+		*req = __rq;
+		return ELEVATOR_BACK_MERGE;
+	}
+
 	if (e->ops->elevator_merge_fn)
 		return e->ops->elevator_merge_fn(q, req, bio);
 
 	return ELEVATOR_NO_MERGE;
 }
 
-void elv_merged_request(request_queue_t *q, struct request *rq)
+void elv_merged_request(request_queue_t *q, struct request *rq, int type)
 {
 	elevator_t *e = q->elevator;
 
 	if (e->ops->elevator_merged_fn)
-		e->ops->elevator_merged_fn(q, rq);
+		e->ops->elevator_merged_fn(q, rq, type);
+
+	if (type == ELEVATOR_BACK_MERGE)
+		elv_rqhash_reposition(q, rq);
 
 	q->last_merge = rq;
 }
@@ -294,8 +473,11 @@
 
 	if (e->ops->elevator_merge_req_fn)
 		e->ops->elevator_merge_req_fn(q, rq, next);
-	q->nr_sorted--;
 
+	elv_rqhash_reposition(q, rq);
+	elv_rqhash_del(q, next);
+
+	q->nr_sorted--;
 	q->last_merge = rq;
 }
 
@@ -371,8 +553,12 @@
 		BUG_ON(!blk_fs_request(rq));
 		rq->flags |= REQ_SORTED;
 		q->nr_sorted++;
-		if (q->last_merge == NULL && rq_mergeable(rq))
-			q->last_merge = rq;
+		if (rq_mergeable(rq)) {
+			elv_rqhash_add(q, rq);
+			if (!q->last_merge)
+				q->last_merge = rq;
+		}
+
 		/*
 		 * Some ioscheds (cfq) run q->request_fn directly, so
 		 * rq cannot be accessed after calling
@@ -461,6 +647,8 @@
 	elv_insert(q, rq, where);
 }
 
+EXPORT_SYMBOL(__elv_add_request);
+
 void elv_add_request(request_queue_t *q, struct request *rq, int where,
 		     int plug)
 {
@@ -471,6 +659,8 @@
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
+EXPORT_SYMBOL(elv_add_request);
+
 static inline struct request *__elv_next_request(request_queue_t *q)
 {
 	struct request *rq;
@@ -554,9 +744,12 @@
 	return rq;
 }
 
+EXPORT_SYMBOL(elv_next_request);
+
 void elv_dequeue_request(request_queue_t *q, struct request *rq)
 {
 	BUG_ON(list_empty(&rq->queuelist));
+	BUG_ON(ELV_ON_HASH(rq));
 
 	list_del_init(&rq->queuelist);
 
@@ -569,6 +762,8 @@
 		q->in_flight++;
 }
 
+EXPORT_SYMBOL(elv_dequeue_request);
+
 int elv_queue_empty(request_queue_t *q)
 {
 	elevator_t *e = q->elevator;
@@ -582,6 +777,8 @@
 	return 1;
 }
 
+EXPORT_SYMBOL(elv_queue_empty);
+
 struct request *elv_latter_request(request_queue_t *q, struct request *rq)
 {
 	elevator_t *e = q->elevator;
@@ -600,13 +797,12 @@
 	return NULL;
 }
 
-int elv_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
-		    gfp_t gfp_mask)
+int elv_set_request(request_queue_t *q, struct request *rq, gfp_t gfp_mask)
 {
 	elevator_t *e = q->elevator;
 
 	if (e->ops->elevator_set_req_fn)
-		return e->ops->elevator_set_req_fn(q, rq, bio, gfp_mask);
+		return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
 
 	rq->elevator_private = NULL;
 	return 0;
@@ -620,12 +816,12 @@
 		e->ops->elevator_put_req_fn(q, rq);
 }
 
-int elv_may_queue(request_queue_t *q, int rw, struct bio *bio)
+int elv_may_queue(request_queue_t *q, int rw)
 {
 	elevator_t *e = q->elevator;
 
 	if (e->ops->elevator_may_queue_fn)
-		return e->ops->elevator_may_queue_fn(q, rw, bio);
+		return e->ops->elevator_may_queue_fn(q, rw);
 
 	return ELV_MQUEUE_MAY;
 }
@@ -791,7 +987,7 @@
 	/*
 	 * Allocate new elevator
 	 */
-	e = elevator_alloc(new_e);
+	e = elevator_alloc(q, new_e);
 	if (!e)
 		return 0;
 
@@ -907,11 +1103,26 @@
 	return len;
 }
 
-EXPORT_SYMBOL(elv_dispatch_sort);
-EXPORT_SYMBOL(elv_add_request);
-EXPORT_SYMBOL(__elv_add_request);
-EXPORT_SYMBOL(elv_next_request);
-EXPORT_SYMBOL(elv_dequeue_request);
-EXPORT_SYMBOL(elv_queue_empty);
-EXPORT_SYMBOL(elevator_exit);
-EXPORT_SYMBOL(elevator_init);
+struct request *elv_rb_former_request(request_queue_t *q, struct request *rq)
+{
+	struct rb_node *rbprev = rb_prev(&rq->rb_node);
+
+	if (rbprev)
+		return rb_entry_rq(rbprev);
+
+	return NULL;
+}
+
+EXPORT_SYMBOL(elv_rb_former_request);
+
+struct request *elv_rb_latter_request(request_queue_t *q, struct request *rq)
+{
+	struct rb_node *rbnext = rb_next(&rq->rb_node);
+
+	if (rbnext)
+		return rb_entry_rq(rbnext);
+
+	return NULL;
+}
+
+EXPORT_SYMBOL(elv_rb_latter_request);
diff -urN oldtree/block/ll_rw_blk.c newtree/block/ll_rw_blk.c
--- oldtree/block/ll_rw_blk.c	2006-08-02 07:14:14.000000000 -0700
+++ newtree/block/ll_rw_blk.c	2006-08-03 13:27:10.000000000 -0700
@@ -39,6 +39,7 @@
 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io);
 static void init_request_from_bio(struct request *req, struct bio *bio);
 static int __make_request(request_queue_t *q, struct bio *bio);
+static struct io_context *current_io_context(gfp_t gfp_flags, int node);
 
 /*
  * For the allocated request tables
@@ -274,19 +275,19 @@
 
 EXPORT_SYMBOL(blk_queue_make_request);
 
-static inline void rq_init(request_queue_t *q, struct request *rq)
+static void rq_init(request_queue_t *q, struct request *rq)
 {
 	INIT_LIST_HEAD(&rq->queuelist);
 	INIT_LIST_HEAD(&rq->donelist);
 
 	rq->errors = 0;
-	rq->rq_status = RQ_ACTIVE;
 	rq->bio = rq->biotail = NULL;
+	INIT_HLIST_NODE(&rq->hash);
+	RB_CLEAR_NODE(&rq->rb_node);
 	rq->ioprio = 0;
 	rq->buffer = NULL;
 	rq->ref_count = 1;
 	rq->q = q;
-	rq->waiting = NULL;
 	rq->special = NULL;
 	rq->data_len = 0;
 	rq->data = NULL;
@@ -446,8 +447,8 @@
 	rq_init(q, rq);
 	rq->flags = REQ_HARDBARRIER;
 	rq->elevator_private = NULL;
+	rq->elevator_private2 = NULL;
 	rq->rq_disk = q->bar_rq.rq_disk;
-	rq->rl = NULL;
 	rq->end_io = end_io;
 	q->prepare_flush_fn(q, rq);
 
@@ -472,7 +473,7 @@
 	rq->flags = bio_data_dir(q->orig_bar_rq->bio);
 	rq->flags |= q->ordered & QUEUE_ORDERED_FUA ? REQ_FUA : 0;
 	rq->elevator_private = NULL;
-	rq->rl = NULL;
+	rq->elevator_private2 = NULL;
 	init_request_from_bio(rq, q->orig_bar_rq->bio);
 	rq->end_io = bar_end_io;
 
@@ -584,8 +585,8 @@
 	return 0;
 }
 
-static inline int ordered_bio_endio(struct request *rq, struct bio *bio,
-				    unsigned int nbytes, int error)
+static int ordered_bio_endio(struct request *rq, struct bio *bio,
+			     unsigned int nbytes, int error)
 {
 	request_queue_t *q = rq->q;
 	bio_end_io_t *endio;
@@ -1967,9 +1968,8 @@
 	mempool_free(rq, q->rq.rq_pool);
 }
 
-static inline struct request *
-blk_alloc_request(request_queue_t *q, int rw, struct bio *bio,
-		  int priv, gfp_t gfp_mask)
+static struct request *
+blk_alloc_request(request_queue_t *q, int rw, int priv, gfp_t gfp_mask)
 {
 	struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
 
@@ -1980,10 +1980,10 @@
 	 * first three bits are identical in rq->flags and bio->bi_rw,
 	 * see bio.h and blkdev.h
 	 */
-	rq->flags = rw;
+	rq->flags = rw | REQ_ALLOCED;
 
 	if (priv) {
-		if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) {
+		if (unlikely(elv_set_request(q, rq, gfp_mask))) {
 			mempool_free(rq, q->rq.rq_pool);
 			return NULL;
 		}
@@ -2074,13 +2074,13 @@
 	struct io_context *ioc = NULL;
 	int may_queue, priv;
 
-	may_queue = elv_may_queue(q, rw, bio);
+	may_queue = elv_may_queue(q, rw);
 	if (may_queue == ELV_MQUEUE_NO)
 		goto rq_starved;
 
 	if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) {
 		if (rl->count[rw]+1 >= q->nr_requests) {
-			ioc = current_io_context(GFP_ATOMIC);
+			ioc = current_io_context(GFP_ATOMIC, q->node);
 			/*
 			 * The queue will fill after this allocation, so set
 			 * it as full, and mark this process as "batching".
@@ -2122,7 +2122,7 @@
 
 	spin_unlock_irq(q->queue_lock);
 
-	rq = blk_alloc_request(q, rw, bio, priv, gfp_mask);
+	rq = blk_alloc_request(q, rw, priv, gfp_mask);
 	if (unlikely(!rq)) {
 		/*
 		 * Allocation failed presumably due to memory. Undo anything
@@ -2158,7 +2158,6 @@
 		ioc->nr_batch_requests--;
 	
 	rq_init(q, rq);
-	rq->rl = rl;
 
 	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
 out:
@@ -2201,7 +2200,7 @@
 			 * up to a big batch of them for a small period time.
 			 * See ioc_batching, ioc_set_batching
 			 */
-			ioc = current_io_context(GFP_NOIO);
+			ioc = current_io_context(GFP_NOIO, q->node);
 			ioc_set_batching(q, ioc);
 
 			spin_lock_irq(q->queue_lock);
@@ -2233,6 +2232,25 @@
 EXPORT_SYMBOL(blk_get_request);
 
 /**
+ * blk_start_queueing - initiate dispatch of requests to device
+ * @q:		request queue to kick into gear
+ *
+ * This is basically a helper to remove the need to know whether a queue
+ * is plugged or not if someone just wants to initiate dispatch of requests
+ * for this queue.
+ *
+ * The queue lock must be held with interrupts disabled.
+ */
+void blk_start_queueing(request_queue_t *q)
+{
+	if (!blk_queue_plugged(q))
+		q->request_fn(q);
+	else
+		__generic_unplug_device(q);
+}
+EXPORT_SYMBOL(blk_start_queueing);
+
+/**
  * blk_requeue_request - put a request back on queue
  * @q:		request queue where request should be inserted
  * @rq:		request to be inserted
@@ -2298,11 +2316,7 @@
 
 	drive_stat_acct(rq, rq->nr_sectors, 1);
 	__elv_add_request(q, rq, where, 0);
-
-	if (blk_queue_plugged(q))
-		__generic_unplug_device(q);
-	else
-		q->request_fn(q);
+	blk_start_queueing(q);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
@@ -2538,10 +2552,9 @@
 		rq->sense_len = 0;
 	}
 
-	rq->waiting = &wait;
+	rq->end_io_data = &wait;
 	blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
 	wait_for_completion(&wait);
-	rq->waiting = NULL;
 
 	if (rq->errors)
 		err = -EIO;
@@ -2650,8 +2663,6 @@
  */
 void __blk_put_request(request_queue_t *q, struct request *req)
 {
-	struct request_list *rl = req->rl;
-
 	if (unlikely(!q))
 		return;
 	if (unlikely(--req->ref_count))
@@ -2659,18 +2670,16 @@
 
 	elv_completed_request(q, req);
 
-	req->rq_status = RQ_INACTIVE;
-	req->rl = NULL;
-
 	/*
 	 * Request may not have originated from ll_rw_blk. if not,
 	 * it didn't come out of our reserved rq pools
 	 */
-	if (rl) {
+	if (req->flags & REQ_ALLOCED) {
 		int rw = rq_data_dir(req);
 		int priv = req->flags & REQ_ELVPRIV;
 
 		BUG_ON(!list_empty(&req->queuelist));
+		BUG_ON(!hlist_unhashed(&req->hash));
 
 		blk_free_request(q, req);
 		freed_request(q, rw, priv);
@@ -2704,9 +2713,9 @@
  */
 void blk_end_sync_rq(struct request *rq, int error)
 {
-	struct completion *waiting = rq->waiting;
+	struct completion *waiting = rq->end_io_data;
 
-	rq->waiting = NULL;
+	rq->end_io_data = NULL;
 	__blk_put_request(rq->q, rq);
 
 	/*
@@ -2757,7 +2766,7 @@
 
 	if (rq_data_dir(req) != rq_data_dir(next)
 	    || req->rq_disk != next->rq_disk
-	    || next->waiting || next->special)
+	    || next->special)
 		return 0;
 
 	/*
@@ -2834,10 +2843,11 @@
 
 	if (bio_sync(bio))
 		req->flags |= REQ_RW_SYNC;
+	if (bio_rw_meta(bio))
+		req->flags |= REQ_RW_META;
 
 	req->errors = 0;
 	req->hard_sector = req->sector = bio->bi_sector;
-	req->waiting = NULL;
 	req->ioprio = bio_prio(bio);
 	req->rq_disk = bio->bi_bdev->bd_disk;
 	req->start_time = jiffies;
@@ -2848,17 +2858,11 @@
 static int __make_request(request_queue_t *q, struct bio *bio)
 {
 	struct request *req;
-	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync;
-	unsigned short prio;
-	sector_t sector;
+	int el_ret, nr_sectors, barrier, err;
+	const unsigned short prio = bio_prio(bio);
+	const int sync = bio_sync(bio);
 
-	sector = bio->bi_sector;
 	nr_sectors = bio_sectors(bio);
-	cur_nr_sectors = bio_cur_sectors(bio);
-	prio = bio_prio(bio);
-
-	rw = bio_data_dir(bio);
-	sync = bio_sync(bio);
 
 	/*
 	 * low level driver can indicate that it wants pages above a
@@ -2867,8 +2871,6 @@
 	 */
 	blk_queue_bounce(q, &bio);
 
-	spin_lock_prefetch(q->queue_lock);
-
 	barrier = bio_barrier(bio);
 	if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
 		err = -EOPNOTSUPP;
@@ -2896,7 +2898,7 @@
 			req->ioprio = ioprio_best(req->ioprio, prio);
 			drive_stat_acct(req, nr_sectors, 0);
 			if (!attempt_back_merge(q, req))
-				elv_merged_request(q, req);
+				elv_merged_request(q, req, el_ret);
 			goto out;
 
 		case ELEVATOR_FRONT_MERGE:
@@ -2916,14 +2918,14 @@
 			 * not touch req->buffer either...
 			 */
 			req->buffer = bio_data(bio);
-			req->current_nr_sectors = cur_nr_sectors;
-			req->hard_cur_sectors = cur_nr_sectors;
-			req->sector = req->hard_sector = sector;
+			req->current_nr_sectors = bio_cur_sectors(bio);
+			req->hard_cur_sectors = req->current_nr_sectors;
+			req->sector = req->hard_sector = bio->bi_sector;
 			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
 			req->ioprio = ioprio_best(req->ioprio, prio);
 			drive_stat_acct(req, nr_sectors, 0);
 			if (!attempt_front_merge(q, req))
-				elv_merged_request(q, req);
+				elv_merged_request(q, req, el_ret);
 			goto out;
 
 		/* ELV_NO_MERGE: elevator says don't/can't merge. */
@@ -2936,7 +2938,7 @@
 	 * Grab a free request. This is might sleep but can not fail.
 	 * Returns with the queue unlocked.
 	 */
-	req = get_request_wait(q, rw, bio);
+	req = get_request_wait(q, bio_data_dir(bio), bio);
 
 	/*
 	 * After dropping the lock and possibly sleeping here, our request
@@ -3660,7 +3662,7 @@
  * but since the current task itself holds a reference, the context can be
  * used in general code, so long as it stays within `current` context.
  */
-struct io_context *current_io_context(gfp_t gfp_flags)
+static struct io_context *current_io_context(gfp_t gfp_flags, int node)
 {
 	struct task_struct *tsk = current;
 	struct io_context *ret;
@@ -3669,11 +3671,11 @@
 	if (likely(ret))
 		return ret;
 
-	ret = kmem_cache_alloc(iocontext_cachep, gfp_flags);
+	ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
 	if (ret) {
 		atomic_set(&ret->refcount, 1);
 		ret->task = current;
-		ret->set_ioprio = NULL;
+		ret->ioprio_changed = 0;
 		ret->last_waited = jiffies; /* doesn't matter... */
 		ret->nr_batch_requests = 0; /* because this is 0 */
 		ret->aic = NULL;
@@ -3691,10 +3693,10 @@
  *
  * This is always called in the context of the task which submitted the I/O.
  */
-struct io_context *get_io_context(gfp_t gfp_flags)
+struct io_context *get_io_context(gfp_t gfp_flags, int node)
 {
 	struct io_context *ret;
-	ret = current_io_context(gfp_flags);
+	ret = current_io_context(gfp_flags, node);
 	if (likely(ret))
 		atomic_inc(&ret->refcount);
 	return ret;
@@ -3806,6 +3808,7 @@
 	unsigned long ra_kb;
 	ssize_t ret = queue_var_store(&ra_kb, page, count);
 
+	spin_lock_irq(q->queue_lock);
 	q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
 
 	return ret;
diff -urN oldtree/block/noop-iosched.c newtree/block/noop-iosched.c
--- oldtree/block/noop-iosched.c	2006-07-15 14:53:08.000000000 -0700
+++ newtree/block/noop-iosched.c	2006-08-03 13:27:10.000000000 -0700
@@ -69,7 +69,7 @@
 {
 	struct noop_data *nd;
 
-	nd = kmalloc(sizeof(*nd), GFP_KERNEL);
+	nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
 	if (!nd)
 		return NULL;
 	INIT_LIST_HEAD(&nd->queue);
diff -urN oldtree/drivers/block/DAC960.c newtree/drivers/block/DAC960.c
--- oldtree/drivers/block/DAC960.c	2006-07-15 14:53:08.000000000 -0700
+++ newtree/drivers/block/DAC960.c	2006-08-03 13:27:10.000000000 -0700
@@ -3331,7 +3331,7 @@
 		Command->DmaDirection = PCI_DMA_TODEVICE;
 		Command->CommandType = DAC960_WriteCommand;
 	}
-	Command->Completion = Request->waiting;
+	Command->Completion = Request->end_io_data;
 	Command->LogicalDriveNumber = (long)Request->rq_disk->private_data;
 	Command->BlockNumber = Request->sector;
 	Command->BlockCount = Request->nr_sectors;
diff -urN oldtree/drivers/block/cciss.c newtree/drivers/block/cciss.c
--- oldtree/drivers/block/cciss.c	2006-08-02 07:14:15.000000000 -0700
+++ newtree/drivers/block/cciss.c	2006-08-03 13:31:02.000000000 -0700
@@ -1227,7 +1227,6 @@
 		int nr_sectors = bio_sectors(bio);
 
 		bio->bi_next = NULL;
-		blk_finished_io(len);
 		bio_endio(bio, nr_sectors << 9, status ? 0 : -EIO);
 		bio = xbh;
 	}
diff -urN oldtree/drivers/block/cpqarray.c newtree/drivers/block/cpqarray.c
--- oldtree/drivers/block/cpqarray.c	2006-07-15 14:53:08.000000000 -0700
+++ newtree/drivers/block/cpqarray.c	2006-08-03 13:27:10.000000000 -0700
@@ -989,7 +989,6 @@
 		xbh = bio->bi_next;
 		bio->bi_next = NULL;
 		
-		blk_finished_io(nr_sectors);
 		bio_endio(bio, nr_sectors << 9, ok ? 0 : -EIO);
 
 		bio = xbh;
diff -urN oldtree/drivers/block/paride/pd.c newtree/drivers/block/paride/pd.c
--- oldtree/drivers/block/paride/pd.c	2006-07-15 14:53:08.000000000 -0700
+++ newtree/drivers/block/paride/pd.c	2006-08-03 13:27:10.000000000 -0700
@@ -719,14 +719,12 @@
 
 	memset(&rq, 0, sizeof(rq));
 	rq.errors = 0;
-	rq.rq_status = RQ_ACTIVE;
 	rq.rq_disk = disk->gd;
 	rq.ref_count = 1;
-	rq.waiting = &wait;
+	rq.end_io_data = &wait;
 	rq.end_io = blk_end_sync_rq;
 	blk_insert_request(disk->gd->queue, &rq, 0, func);
 	wait_for_completion(&wait);
-	rq.waiting = NULL;
 	if (rq.errors)
 		err = -EIO;
 	blk_put_request(&rq);
diff -urN oldtree/drivers/block/pktcdvd.c newtree/drivers/block/pktcdvd.c
--- oldtree/drivers/block/pktcdvd.c	2006-08-02 07:14:15.000000000 -0700
+++ newtree/drivers/block/pktcdvd.c	2006-08-03 13:27:10.000000000 -0700
@@ -375,7 +375,7 @@
 
 	rq->ref_count++;
 	rq->flags |= REQ_NOMERGE;
-	rq->waiting = &wait;
+	rq->end_io_data = &wait;
 	rq->end_io = blk_end_sync_rq;
 	elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1);
 	generic_unplug_device(q);
diff -urN oldtree/drivers/block/swim3.c newtree/drivers/block/swim3.c
--- oldtree/drivers/block/swim3.c	2006-07-15 14:53:08.000000000 -0700
+++ newtree/drivers/block/swim3.c	2006-08-03 13:27:10.000000000 -0700
@@ -319,8 +319,8 @@
 		printk("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%ld buf=%p\n",
 		       req->rq_disk->disk_name, req->cmd,
 		       (long)req->sector, req->nr_sectors, req->buffer);
-		printk("           rq_status=%d errors=%d current_nr_sectors=%ld\n",
-		       req->rq_status, req->errors, req->current_nr_sectors);
+		printk("           errors=%d current_nr_sectors=%ld\n",
+		       req->errors, req->current_nr_sectors);
 #endif
 
 		if (req->sector < 0 || req->sector >= fs->total_secs) {
diff -urN oldtree/drivers/block/swim_iop.c newtree/drivers/block/swim_iop.c
--- oldtree/drivers/block/swim_iop.c	2006-07-15 14:53:08.000000000 -0700
+++ newtree/drivers/block/swim_iop.c	2006-08-03 13:27:10.000000000 -0700
@@ -529,8 +529,8 @@
 		printk("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%ld buf=%p\n",
 		       CURRENT->rq_disk->disk_name, CURRENT->cmd,
 		       CURRENT->sector, CURRENT->nr_sectors, CURRENT->buffer);
-		printk("           rq_status=%d errors=%d current_nr_sectors=%ld\n",
-		       CURRENT->rq_status, CURRENT->errors, CURRENT->current_nr_sectors);
+		printk("           errors=%d current_nr_sectors=%ld\n",
+		      CURRENT->errors, CURRENT->current_nr_sectors);
 #endif
 
 		if (CURRENT->sector < 0 || CURRENT->sector >= fs->total_secs) {
diff -urN oldtree/drivers/fc4/fc.c newtree/drivers/fc4/fc.c
--- oldtree/drivers/fc4/fc.c	2006-08-02 07:14:15.000000000 -0700
+++ newtree/drivers/fc4/fc.c	2006-08-03 13:27:10.000000000 -0700
@@ -974,7 +974,6 @@
 	 */
 
 	fc->rst_pkt->device->host->eh_action = &sem;
-	fc->rst_pkt->request->rq_status = RQ_SCSI_BUSY;
 
 	fc->rst_pkt->done = fcp_scsi_reset_done;
 
diff -urN oldtree/drivers/ide/ide-floppy.c newtree/drivers/ide/ide-floppy.c
--- oldtree/drivers/ide/ide-floppy.c	2006-07-15 14:53:08.000000000 -0700
+++ newtree/drivers/ide/ide-floppy.c	2006-08-03 13:27:10.000000000 -0700
@@ -1281,8 +1281,7 @@
 	idefloppy_pc_t *pc;
 	unsigned long block = (unsigned long)block_s;
 
-	debug_log(KERN_INFO "rq_status: %d, dev: %s, flags: %lx, errors: %d\n",
-			rq->rq_status,
+	debug_log(KERN_INFO "dev: %s, flags: %lx, errors: %d\n",
 			rq->rq_disk ? rq->rq_disk->disk_name : "?",
 			rq->flags, rq->errors);
 	debug_log(KERN_INFO "sector: %ld, nr_sectors: %ld, "
diff -urN oldtree/drivers/ide/ide-io.c newtree/drivers/ide/ide-io.c
--- oldtree/drivers/ide/ide-io.c	2006-08-02 07:14:15.000000000 -0700
+++ newtree/drivers/ide/ide-io.c	2006-08-03 13:27:11.000000000 -0700
@@ -1713,7 +1713,6 @@
 	int must_wait = (action == ide_wait || action == ide_head_wait);
 
 	rq->errors = 0;
-	rq->rq_status = RQ_ACTIVE;
 
 	/*
 	 * we need to hold an extra reference to request for safe inspection
@@ -1721,7 +1720,7 @@
 	 */
 	if (must_wait) {
 		rq->ref_count++;
-		rq->waiting = &wait;
+		rq->end_io_data = &wait;
 		rq->end_io = blk_end_sync_rq;
 	}
 
@@ -1739,7 +1738,6 @@
 	err = 0;
 	if (must_wait) {
 		wait_for_completion(&wait);
-		rq->waiting = NULL;
 		if (rq->errors)
 			err = -EIO;
 
diff -urN oldtree/drivers/ide/ide-tape.c newtree/drivers/ide/ide-tape.c
--- oldtree/drivers/ide/ide-tape.c	2006-07-15 14:53:08.000000000 -0700
+++ newtree/drivers/ide/ide-tape.c	2006-08-03 13:27:11.000000000 -0700
@@ -2423,8 +2423,8 @@
 #if IDETAPE_DEBUG_LOG
 #if 0
 	if (tape->debug_level >= 5)
-		printk(KERN_INFO "ide-tape: rq_status: %d, "
-			"dev: %s, cmd: %ld, errors: %d\n", rq->rq_status,
+		printk(KERN_INFO "ide-tape:  %d, "
+			"dev: %s, cmd: %ld, errors: %d\n",
 			 rq->rq_disk->disk_name, rq->cmd[0], rq->errors);
 #endif
 	if (tape->debug_level >= 2)
@@ -2773,7 +2773,7 @@
 		return;
 	}
 #endif /* IDETAPE_DEBUG_BUGS */
-	rq->waiting = &wait;
+	rq->end_io_data = &wait;
 	rq->end_io = blk_end_sync_rq;
 	spin_unlock_irq(&tape->spinlock);
 	wait_for_completion(&wait);
diff -urN oldtree/drivers/scsi/ide-scsi.c newtree/drivers/scsi/ide-scsi.c
--- oldtree/drivers/scsi/ide-scsi.c	2006-07-15 14:53:08.000000000 -0700
+++ newtree/drivers/scsi/ide-scsi.c	2006-08-03 13:27:11.000000000 -0700
@@ -708,7 +708,7 @@
 static ide_startstop_t idescsi_do_request (ide_drive_t *drive, struct request *rq, sector_t block)
 {
 #if IDESCSI_DEBUG_LOG
-	printk (KERN_INFO "rq_status: %d, dev: %s, cmd: %x, errors: %d\n",rq->rq_status, rq->rq_disk->disk_name,rq->cmd[0],rq->errors);
+	printk (KERN_INFO "dev: %s, cmd: %x, errors: %d\n", rq->rq_disk->disk_name,rq->cmd[0],rq->errors);
 	printk (KERN_INFO "sector: %ld, nr_sectors: %ld, current_nr_sectors: %d\n",rq->sector,rq->nr_sectors,rq->current_nr_sectors);
 #endif /* IDESCSI_DEBUG_LOG */
 
diff -urN oldtree/drivers/scsi/scsi.c newtree/drivers/scsi/scsi.c
--- oldtree/drivers/scsi/scsi.c	2006-08-02 07:14:20.000000000 -0700
+++ newtree/drivers/scsi/scsi.c	2006-08-03 13:27:11.000000000 -0700
@@ -1127,7 +1127,7 @@
 
 	spin_lock_irqsave(&sdev->list_lock, flags);
 	list_for_each_entry(scmd, &sdev->cmd_list, list) {
-		if (scmd->request && scmd->request->rq_status != RQ_INACTIVE) {
+		if (scmd->request) {
 			/*
 			 * If we are unable to remove the timer, it means
 			 * that the command has already timed out or
diff -urN oldtree/fs/ext3/inode.c newtree/fs/ext3/inode.c
--- oldtree/fs/ext3/inode.c	2006-08-02 07:14:21.000000000 -0700
+++ newtree/fs/ext3/inode.c	2006-08-03 13:27:11.000000000 -0700
@@ -36,6 +36,7 @@
 #include <linux/writeback.h>
 #include <linux/mpage.h>
 #include <linux/uio.h>
+#include <linux/bio.h>
 #include "xattr.h"
 #include "acl.h"
 
@@ -1070,7 +1071,7 @@
 		return bh;
 	if (buffer_uptodate(bh))
 		return bh;
-	ll_rw_block(READ, 1, &bh);
+	ll_rw_block(READ_META, 1, &bh);
 	wait_on_buffer(bh);
 	if (buffer_uptodate(bh))
 		return bh;
@@ -2537,7 +2538,7 @@
 		 */
 		get_bh(bh);
 		bh->b_end_io = end_buffer_read_sync;
-		submit_bh(READ, bh);
+		submit_bh(READ_META, bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
 			ext3_error(inode->i_sb, "ext3_get_inode_loc",
diff -urN oldtree/fs/ext3/namei.c newtree/fs/ext3/namei.c
--- oldtree/fs/ext3/namei.c	2006-08-02 07:14:21.000000000 -0700
+++ newtree/fs/ext3/namei.c	2006-08-03 13:27:11.000000000 -0700
@@ -35,6 +35,7 @@
 #include <linux/string.h>
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
+#include <linux/bio.h>
 #include <linux/smp_lock.h>
 
 #include "namei.h"
@@ -870,7 +871,7 @@
 				bh = ext3_getblk(NULL, dir, b++, 0, &err);
 				bh_use[ra_max] = bh;
 				if (bh)
-					ll_rw_block(READ, 1, &bh);
+					ll_rw_block(READ_META, 1, &bh);
 			}
 		}
 		if ((bh = bh_use[ra_ptr++]) == NULL)
diff -urN oldtree/fs/ioprio.c newtree/fs/ioprio.c
--- oldtree/fs/ioprio.c	2006-07-15 14:53:08.000000000 -0700
+++ newtree/fs/ioprio.c	2006-08-03 13:27:11.000000000 -0700
@@ -44,8 +44,8 @@
 	task->ioprio = ioprio;
 
 	ioc = task->io_context;
-	if (ioc && ioc->set_ioprio)
-		ioc->set_ioprio(ioc, ioprio);
+	if (ioc)
+		ioc->ioprio_changed = 1;
 
 	task_unlock(task);
 	return 0;
diff -urN oldtree/include/linux/bio.h newtree/include/linux/bio.h
--- oldtree/include/linux/bio.h	2006-07-15 14:53:08.000000000 -0700
+++ newtree/include/linux/bio.h	2006-08-03 13:27:11.000000000 -0700
@@ -148,6 +148,7 @@
 #define BIO_RW_BARRIER	2
 #define BIO_RW_FAILFAST	3
 #define BIO_RW_SYNC	4
+#define BIO_RW_META	5
 
 /*
  * upper 16 bits of bi_rw define the io priority of this bio
@@ -178,6 +179,7 @@
 #define bio_sync(bio)		((bio)->bi_rw & (1 << BIO_RW_SYNC))
 #define bio_failfast(bio)	((bio)->bi_rw & (1 << BIO_RW_FAILFAST))
 #define bio_rw_ahead(bio)	((bio)->bi_rw & (1 << BIO_RW_AHEAD))
+#define bio_rw_meta(bio)	((bio)->bi_rw & (1 << BIO_RW_META))
 
 /*
  * will die
diff -urN oldtree/include/linux/blkdev.h newtree/include/linux/blkdev.h
--- oldtree/include/linux/blkdev.h	2006-08-02 07:14:22.000000000 -0700
+++ newtree/include/linux/blkdev.h	2006-08-03 13:27:11.000000000 -0700
@@ -90,7 +90,7 @@
 	atomic_t refcount;
 	struct task_struct *task;
 
-	int (*set_ioprio)(struct io_context *, unsigned int);
+	unsigned int ioprio_changed;
 
 	/*
 	 * For request batching
@@ -104,8 +104,7 @@
 
 void put_io_context(struct io_context *ioc);
 void exit_io_context(void);
-struct io_context *current_io_context(gfp_t gfp_flags);
-struct io_context *get_io_context(gfp_t gfp_flags);
+struct io_context *get_io_context(gfp_t gfp_flags, int node);
 void copy_io_context(struct io_context **pdst, struct io_context **psrc);
 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2);
 
@@ -129,30 +128,46 @@
 	struct list_head queuelist;
 	struct list_head donelist;
 
+	request_queue_t *q;
+
 	unsigned long flags;		/* see REQ_ bits below */
 
 	/* Maintain bio traversal state for part by part I/O submission.
 	 * hard_* are block layer internals, no driver should touch them!
 	 */
-
 	sector_t sector;		/* next sector to submit */
+	sector_t hard_sector;		/* next sector to complete */
 	unsigned long nr_sectors;	/* no. of sectors left to submit */
+	unsigned long hard_nr_sectors;	/* no. of sectors left to complete */
+
 	/* no. of sectors left to submit in the current segment */
 	unsigned int current_nr_sectors;
 
-	sector_t hard_sector;		/* next sector to complete */
-	unsigned long hard_nr_sectors;	/* no. of sectors left to complete */
 	/* no. of sectors left to complete in the current segment */
 	unsigned int hard_cur_sectors;
 
 	struct bio *bio;
 	struct bio *biotail;
 
+	struct hlist_node hash;	/* merge hash */
+
+	/*
+	 * The rb_node is only used inside the io scheduler, requests
+	 * are pruned when moved to the dispatch queue. So let the
+	 * completion_data share space with the rb_node.
+	 */
+	union {
+		struct rb_node rb_node;	/* sort/lookup */
+		void *completion_data;
+	};
+
+	/*
+	 * two pointers are available for the IO schedulers, if they need
+	 * more they have to dynamically allocate it.
+	 */
 	void *elevator_private;
-	void *completion_data;
+	void *elevator_private2;
 
-	int rq_status;	/* should split this into a few status bits */
-	int errors;
 	struct gendisk *rq_disk;
 	unsigned long start_time;
 
@@ -170,15 +185,13 @@
 
 	unsigned short ioprio;
 
+	void *special;
+	char *buffer;
+
 	int tag;
+	int errors;
 
 	int ref_count;
-	request_queue_t *q;
-	struct request_list *rl;
-
-	struct completion *waiting;
-	void *special;
-	char *buffer;
 
 	/*
 	 * when request is used as a packet command carrier
@@ -195,7 +208,7 @@
 	int retries;
 
 	/*
-	 * completion callback. end_io_data should be folded in with waiting
+	 * completion callback.
 	 */
 	rq_end_io_fn *end_io;
 	void *end_io_data;
@@ -236,6 +249,8 @@
 	__REQ_PM_SHUTDOWN,	/* shutdown request */
 	__REQ_ORDERED_COLOR,	/* is before or after barrier */
 	__REQ_RW_SYNC,		/* request is sync (O_DIRECT) */
+	__REQ_ALLOCED,		/* request came from our alloc pool */
+	__REQ_RW_META,		/* metadata io request */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -266,6 +281,8 @@
 #define REQ_PM_SHUTDOWN	(1 << __REQ_PM_SHUTDOWN)
 #define REQ_ORDERED_COLOR	(1 << __REQ_ORDERED_COLOR)
 #define REQ_RW_SYNC	(1 << __REQ_RW_SYNC)
+#define REQ_ALLOCED	(1 << __REQ_ALLOCED)
+#define REQ_RW_META	(1 << __REQ_RW_META)
 
 /*
  * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME
@@ -432,9 +449,6 @@
 	struct mutex		sysfs_lock;
 };
 
-#define RQ_INACTIVE		(-1)
-#define RQ_ACTIVE		1
-
 #define QUEUE_FLAG_CLUSTER	0	/* cluster several segments into 1 */
 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */
 #define QUEUE_FLAG_STOPPED	2	/* queue is stopped */
@@ -510,6 +524,12 @@
 
 #define rq_data_dir(rq)		((rq)->flags & 1)
 
+/*
+ * We regard a request as sync, if it's a READ or a SYNC write.
+ */
+#define rq_is_sync(rq)		(rq_data_dir((rq)) == READ || (rq)->flags & REQ_RW_SYNC)
+#define rq_is_meta(rq)		((rq)->flags & REQ_RW_META)
+
 static inline int blk_queue_full(struct request_queue *q, int rw)
 {
 	if (rw == READ)
@@ -544,12 +564,6 @@
 	(!((rq)->flags & RQ_NOMERGE_FLAGS) && blk_fs_request((rq)))
 
 /*
- * noop, requests are automagically marked as active/inactive by I/O
- * scheduler -- see elv_next_request
- */
-#define blk_queue_headactive(q, head_active)
-
-/*
  * q->prep_rq_fn return values
  */
 #define BLKPREP_OK		0	/* serve it */
@@ -586,11 +600,6 @@
 	if ((rq->bio))			\
 		for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
 
-struct sec_size {
-	unsigned block_size;
-	unsigned block_size_bits;
-};
-
 extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
 extern void register_disk(struct gendisk *dev);
@@ -612,6 +621,7 @@
 extern void blk_sync_queue(struct request_queue *q);
 extern void __blk_stop_queue(request_queue_t *q);
 extern void blk_run_queue(request_queue_t *);
+extern void blk_start_queueing(request_queue_t *);
 extern void blk_queue_activity_fn(request_queue_t *, activity_fn *, void *);
 extern int blk_rq_map_user(request_queue_t *, struct request *, void __user *, unsigned int);
 extern int blk_rq_unmap_user(struct bio *, unsigned int);
@@ -656,16 +666,6 @@
 extern void end_request(struct request *req, int uptodate);
 extern void blk_complete_request(struct request *);
 
-static inline int rq_all_done(struct request *rq, unsigned int nr_bytes)
-{
-	if (blk_fs_request(rq))
-		return (nr_bytes >= (rq->hard_nr_sectors << 9));
-	else if (blk_pc_request(rq))
-		return nr_bytes >= rq->data_len;
-
-	return 0;
-}
-
 /*
  * end_that_request_first/chunk() takes an uptodate argument. we account
  * any value <= as an io error. 0 means -EIO for compatability reasons,
@@ -680,21 +680,6 @@
 }
 
 /*
- * This should be in elevator.h, but that requires pulling in rq and q
- */
-static inline void elv_dispatch_add_tail(struct request_queue *q,
-					 struct request *rq)
-{
-	if (q->last_merge == rq)
-		q->last_merge = NULL;
-	q->nr_sorted--;
-
-	q->end_sector = rq_end_sector(rq);
-	q->boundary_rq = rq;
-	list_add_tail(&rq->queuelist, &q->queue_head);
-}
-
-/*
  * Access functions for manipulating queue properties
  */
 extern request_queue_t *blk_init_queue_node(request_fn_proc *rfn,
@@ -785,14 +770,6 @@
 	return retval;
 }
 
-static inline int bdev_dma_aligment(struct block_device *bdev)
-{
-	return queue_dma_alignment(bdev_get_queue(bdev));
-}
-
-#define blk_finished_io(nsects)	do { } while (0)
-#define blk_started_io(nsects)	do { } while (0)
-
 /* assumes size > 256 */
 static inline unsigned int blksize_bits(unsigned int size)
 {
diff -urN oldtree/include/linux/blktrace_api.h newtree/include/linux/blktrace_api.h
--- oldtree/include/linux/blktrace_api.h	2006-07-15 14:53:08.000000000 -0700
+++ newtree/include/linux/blktrace_api.h	2006-08-03 13:27:11.000000000 -0700
@@ -20,6 +20,7 @@
 	BLK_TC_PC	= 1 << 9,	/* pc requests */
 	BLK_TC_NOTIFY	= 1 << 10,	/* special message */
 	BLK_TC_AHEAD	= 1 << 11,	/* readahead */
+	BLK_TC_META	= 1 << 12,	/* metadata */
 
 	BLK_TC_END	= 1 << 15,	/* only 16-bits, reminder */
 };
diff -urN oldtree/include/linux/elevator.h newtree/include/linux/elevator.h
--- oldtree/include/linux/elevator.h	2006-07-15 14:53:08.000000000 -0700
+++ newtree/include/linux/elevator.h	2006-08-03 13:27:11.000000000 -0700
@@ -1,12 +1,14 @@
 #ifndef _LINUX_ELEVATOR_H
 #define _LINUX_ELEVATOR_H
 
+#include <linux/percpu.h>
+
 typedef int (elevator_merge_fn) (request_queue_t *, struct request **,
 				 struct bio *);
 
 typedef void (elevator_merge_req_fn) (request_queue_t *, struct request *, struct request *);
 
-typedef void (elevator_merged_fn) (request_queue_t *, struct request *);
+typedef void (elevator_merged_fn) (request_queue_t *, struct request *, int);
 
 typedef int (elevator_dispatch_fn) (request_queue_t *, int);
 
@@ -14,9 +16,9 @@
 typedef int (elevator_queue_empty_fn) (request_queue_t *);
 typedef struct request *(elevator_request_list_fn) (request_queue_t *, struct request *);
 typedef void (elevator_completed_req_fn) (request_queue_t *, struct request *);
-typedef int (elevator_may_queue_fn) (request_queue_t *, int, struct bio *);
+typedef int (elevator_may_queue_fn) (request_queue_t *, int);
 
-typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, struct bio *, gfp_t);
+typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, gfp_t);
 typedef void (elevator_put_req_fn) (request_queue_t *, struct request *);
 typedef void (elevator_activate_req_fn) (request_queue_t *, struct request *);
 typedef void (elevator_deactivate_req_fn) (request_queue_t *, struct request *);
@@ -82,19 +84,21 @@
 	struct kobject kobj;
 	struct elevator_type *elevator_type;
 	struct mutex sysfs_lock;
+	struct hlist_head *hash;
 };
 
 /*
  * block elevator interface
  */
 extern void elv_dispatch_sort(request_queue_t *, struct request *);
+extern void elv_dispatch_add_tail(request_queue_t *, struct request *);
 extern void elv_add_request(request_queue_t *, struct request *, int, int);
 extern void __elv_add_request(request_queue_t *, struct request *, int, int);
 extern void elv_insert(request_queue_t *, struct request *, int);
 extern int elv_merge(request_queue_t *, struct request **, struct bio *);
 extern void elv_merge_requests(request_queue_t *, struct request *,
 			       struct request *);
-extern void elv_merged_request(request_queue_t *, struct request *);
+extern void elv_merged_request(request_queue_t *, struct request *, int);
 extern void elv_dequeue_request(request_queue_t *, struct request *);
 extern void elv_requeue_request(request_queue_t *, struct request *);
 extern int elv_queue_empty(request_queue_t *);
@@ -103,9 +107,9 @@
 extern struct request *elv_latter_request(request_queue_t *, struct request *);
 extern int elv_register_queue(request_queue_t *q);
 extern void elv_unregister_queue(request_queue_t *q);
-extern int elv_may_queue(request_queue_t *, int, struct bio *);
+extern int elv_may_queue(request_queue_t *, int);
 extern void elv_completed_request(request_queue_t *, struct request *);
-extern int elv_set_request(request_queue_t *, struct request *, struct bio *, gfp_t);
+extern int elv_set_request(request_queue_t *, struct request *, gfp_t);
 extern void elv_put_request(request_queue_t *, struct request *);
 
 /*
@@ -125,6 +129,19 @@
 extern int elv_rq_merge_ok(struct request *, struct bio *);
 
 /*
+ * Helper functions.
+ */
+extern struct request *elv_rb_former_request(request_queue_t *, struct request *);
+extern struct request *elv_rb_latter_request(request_queue_t *, struct request *);
+
+/*
+ * rb support functions.
+ */
+extern struct request *elv_rb_add(struct rb_root *, struct request *);
+extern void elv_rb_del(struct rb_root *, struct request *);
+extern struct request *elv_rb_find(struct rb_root *, sector_t);
+
+/*
  * Return values from elevator merger
  */
 #define ELEVATOR_NO_MERGE	0
@@ -149,5 +166,41 @@
 };
 
 #define rq_end_sector(rq)	((rq)->sector + (rq)->nr_sectors)
+#define rb_entry_rq(node)	rb_entry((node), struct request, rb_node)
+
+/*
+ * Hack to reuse the donelist list_head as the fifo time holder while
+ * the request is in the io scheduler. Saves an unsigned long in rq.
+ */
+#define rq_fifo_time(rq)	((unsigned long) (rq)->donelist.next)
+#define rq_set_fifo_time(rq,exp)	((rq)->donelist.next = (void *) (exp))
+#define rq_entry_fifo(ptr)	list_entry((ptr), struct request, queuelist)
+#define rq_fifo_clear(rq)	do {		\
+	list_del_init(&(rq)->queuelist);	\
+	INIT_LIST_HEAD(&(rq)->donelist);	\
+	} while (0)
+
+/*
+ * io context count accounting
+ */
+#define elv_ioc_count_mod(name, __val)				\
+	do {							\
+		preempt_disable();				\
+		__get_cpu_var(name) += (__val);			\
+		preempt_enable();				\
+	} while (0)
+
+#define elv_ioc_count_inc(name)	elv_ioc_count_mod(name, 1)
+#define elv_ioc_count_dec(name)	elv_ioc_count_mod(name, -1)
+
+#define elv_ioc_count_read(name)				\
+({								\
+	unsigned long __val = 0;				\
+	int __cpu;						\
+	smp_wmb();						\
+	for_each_possible_cpu(__cpu)				\
+		__val += per_cpu(name, __cpu);			\
+	__val;							\
+})
 
 #endif
diff -urN oldtree/include/linux/fs.h newtree/include/linux/fs.h
--- oldtree/include/linux/fs.h	2006-08-02 07:14:22.000000000 -0700
+++ newtree/include/linux/fs.h	2006-08-03 13:27:11.000000000 -0700
@@ -79,8 +79,8 @@
 #define WRITE 1
 #define READA 2		/* read-ahead  - don't block if no resources */
 #define SWRITE 3	/* for ll_rw_block() - wait for buffer lock */
-#define SPECIAL 4	/* For non-blockdevice requests in request queue */
 #define READ_SYNC	(READ | (1 << BIO_RW_SYNC))
+#define READ_META	(READ | (1 << BIO_RW_META))
 #define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNC))
 #define WRITE_BARRIER	((1 << BIO_RW) | (1 << BIO_RW_BARRIER))
 
diff -urN oldtree/include/linux/rbtree.h newtree/include/linux/rbtree.h
--- oldtree/include/linux/rbtree.h	2006-07-15 14:53:08.000000000 -0700
+++ newtree/include/linux/rbtree.h	2006-08-03 13:27:11.000000000 -0700
@@ -133,7 +133,7 @@
 #define	rb_entry(ptr, type, member) container_of(ptr, type, member)
 
 #define RB_EMPTY_ROOT(root)	((root)->rb_node == NULL)
-#define RB_EMPTY_NODE(node)	(rb_parent(node) != node)
+#define RB_EMPTY_NODE(node)	(rb_parent(node) == node)
 #define RB_CLEAR_NODE(node)	(rb_set_parent(node, node))
 
 extern void rb_insert_color(struct rb_node *, struct rb_root *);
diff -urN oldtree/lib/rbtree.c newtree/lib/rbtree.c
--- oldtree/lib/rbtree.c	2006-07-15 14:53:08.000000000 -0700
+++ newtree/lib/rbtree.c	2006-08-03 13:27:11.000000000 -0700
@@ -322,6 +322,9 @@
 {
 	struct rb_node *parent;
 
+	if (rb_parent(node) == node)
+		return NULL;
+
 	/* If we have a right-hand child, go down and then left as far
 	   as we can. */
 	if (node->rb_right) {
@@ -348,6 +351,9 @@
 {
 	struct rb_node *parent;
 
+	if (rb_parent(node) == node)
+		return NULL;
+
 	/* If we have a left-hand child, go down and then right as far
 	   as we can. */
 	if (node->rb_left) {